Files
mongo/db/cap.cpp

458 lines
18 KiB
C++
Raw Normal View History

2011-01-04 00:40:41 -05:00
// @file cap.cpp capped collection related
2010-09-07 05:30:27 -04:00
// the "old" version (<= v1.6)
2010-07-23 23:20:00 -04:00
/**
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "pch.h"
#include "pdfile.h"
#include "db.h"
#include "../util/mmap.h"
#include "../util/hashtab.h"
#include "../scripting/engine.h"
#include "btree.h"
#include <algorithm>
#include <list>
#include "json.h"
2011-06-22 15:51:08 -04:00
#include "clientcursor.h"
2010-07-23 23:20:00 -04:00
2010-07-25 22:50:35 -04:00
/*
 capped collection layout

 d's below won't exist if things align perfectly:

 extent1             -> extent2                 -> extent3
 -------------------    -----------------------    ---------------------
 d r r r r r r r r d    d r r r r d r r r r r d    d r r r r r r r r r d
                                ^   ^
                           oldest   newest

                        ^cappedFirstDeletedInCurExtent()
                   ^cappedLastDelRecLastExtent()
 ^cappedListOfAllDeletedRecords()
*/
2010-07-23 23:20:00 -04:00
namespace mongo {
2010-07-25 16:16:19 -04:00
/* combine adjacent deleted records *for the current extent* of the capped collection
2011-01-04 00:40:41 -05:00
2010-07-23 23:20:00 -04:00
this is O(n^2) but we call it for capped tables where typically n==1 or 2!
(or 3...there will be a little unused sliver at the end of the extent.)
*/
void NamespaceDetails::compact() {
assert(capped);
list<DiskLoc> drecs;
// Pull out capExtent's DRs from deletedList
2010-07-25 22:50:35 -04:00
DiskLoc i = cappedFirstDeletedInCurExtent();
2010-07-23 23:20:00 -04:00
for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted )
drecs.push_back( i );
2010-09-27 12:35:22 -04:00
getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = i;
2010-07-23 23:20:00 -04:00
// This is the O(n^2) part.
drecs.sort();
list<DiskLoc>::iterator j = drecs.begin();
assert( j != drecs.end() );
DiskLoc a = *j;
while ( 1 ) {
j++;
if ( j == drecs.end() ) {
DEBUGGING out() << "TEMP: compact adddelrec\n";
addDeletedRec(a.drec(), a);
break;
}
DiskLoc b = *j;
while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) {
// a & b are adjacent. merge.
getDur().writingInt( a.drec()->lengthWithHeaders ) += b.drec()->lengthWithHeaders;
2010-07-23 23:20:00 -04:00
j++;
if ( j == drecs.end() ) {
DEBUGGING out() << "temp: compact adddelrec2\n";
addDeletedRec(a.drec(), a);
return;
}
b = *j;
}
DEBUGGING out() << "temp: compact adddelrec3\n";
addDeletedRec(a.drec(), a);
a = b;
}
}
2010-07-25 22:50:35 -04:00
/* Returns a reference to the list link that designates the first deleted record
   of the current cap extent: the nextDeleted field of the previous extent's last
   deleted record, or the head of the whole deleted list when there is no such
   record. Being a reference, it can be assigned through to relink the list. */
DiskLoc &NamespaceDetails::cappedFirstDeletedInCurExtent() {
    DiskLoc &lastOfPrevExtent = cappedLastDelRecLastExtent();
    if ( !lastOfPrevExtent.isNull() )
        return lastOfPrevExtent.drec()->nextDeleted;
    return cappedListOfAllDeletedRecords();
}
2010-07-23 23:20:00 -04:00
void NamespaceDetails::cappedCheckMigrate() {
// migrate old NamespaceDetails format
assert( capped );
if ( capExtent.a() == 0 && capExtent.getOfs() == 0 ) {
2010-09-27 12:35:22 -04:00
//capFirstNewRecord = DiskLoc();
capFirstNewRecord.writing().setInvalid();
2010-07-25 16:16:19 -04:00
// put all the DeletedRecords in cappedListOfAllDeletedRecords()
2010-07-23 23:20:00 -04:00
for ( int i = 1; i < Buckets; ++i ) {
DiskLoc first = deletedList[ i ];
if ( first.isNull() )
continue;
DiskLoc last = first;
for (; !last.drec()->nextDeleted.isNull(); last = last.drec()->nextDeleted );
2010-09-27 12:35:22 -04:00
last.drec()->nextDeleted.writing() = cappedListOfAllDeletedRecords();
cappedListOfAllDeletedRecords().writing() = first;
deletedList[i].writing() = DiskLoc();
2010-07-23 23:20:00 -04:00
}
2010-07-25 16:16:19 -04:00
// NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above
2010-07-23 23:20:00 -04:00
// Last, in case we're killed before getting here
2010-09-27 12:35:22 -04:00
capExtent.writing() = firstExtent;
2010-07-23 23:20:00 -04:00
}
}
bool NamespaceDetails::inCapExtent( const DiskLoc &dl ) const {
assert( !dl.isNull() );
// We could have a rec or drec, doesn't matter.
2011-06-06 15:14:51 -04:00
bool res = dl.drec()->myExtentLoc(dl) == capExtent;
DEV {
// old implementation. this check is temp to test works the same. new impl should be a little faster.
assert( res == (dl.drec()->myExtent( dl ) == capExtent.ext()) );
}
return res;
2010-07-23 23:20:00 -04:00
}
bool NamespaceDetails::nextIsInCapExtent( const DiskLoc &dl ) const {
assert( !dl.isNull() );
DiskLoc next = dl.drec()->nextDeleted;
if ( next.isNull() )
return false;
return inCapExtent( next );
}
void NamespaceDetails::advanceCapExtent( const char *ns ) {
2010-07-25 16:16:19 -04:00
// We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent
2010-07-23 23:20:00 -04:00
// (or DiskLoc() if new capExtent == firstExtent)
if ( capExtent == lastExtent )
getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();
2010-07-23 23:20:00 -04:00
else {
2010-07-25 22:50:35 -04:00
DiskLoc i = cappedFirstDeletedInCurExtent();
2010-07-23 23:20:00 -04:00
for (; !i.isNull() && nextIsInCapExtent( i ); i = i.drec()->nextDeleted );
getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = i;
2010-07-23 23:20:00 -04:00
}
getDur().writingDiskLoc( capExtent ) = theCapExtent()->xnext.isNull() ? firstExtent : theCapExtent()->xnext;
2010-07-23 23:20:00 -04:00
/* this isn't true if a collection has been renamed...that is ok just used for diagnostics */
//dassert( theCapExtent()->ns == ns );
theCapExtent()->assertOk();
getDur().writingDiskLoc( capFirstNewRecord ) = DiskLoc();
2010-07-23 23:20:00 -04:00
}
/* Looks in the current cap extent for a deleted record big enough to hold 'len'
   bytes and, if one exists, unlinks it from the deleted list.
   Returns the chosen location, or a null DiskLoc when nothing in the extent fits. */
DiskLoc NamespaceDetails::__capAlloc( int len ) {
    DiskLoc before = cappedLastDelRecLastExtent();   // predecessor of 'cand' on the list
    DiskLoc cand = cappedFirstDeletedInCurExtent();
    DiskLoc found;

    while ( !cand.isNull() && inCapExtent( cand ) ) {
        // At least one deleted record has to remain per extent in
        // cappedListOfAllDeletedRecords(), so demand room for a trailing one on
        // top of 'len' (24 is presumably the minimum deleted-record size -- confirm).
        if ( cand.drec()->lengthWithHeaders >= len + 24 ) {
            found = cand;
            break;
        }
        before = cand;
        cand = cand.drec()->nextDeleted;
    }

    if ( found.isNull() )
        return found;

    /* unlink the chosen record from the deleted list */
    if ( !before.isNull() )
        before.drec()->nextDeleted.writing() = found.drec()->nextDeleted;
    else
        cappedListOfAllDeletedRecords().writing() = found.drec()->nextDeleted;

    found.drec()->nextDeleted.writing().setInvalid(); // defensive.
    assert( found.drec()->extentOfs < found.getOfs() );
    return found;
}
2011-01-04 00:40:41 -05:00
/* Allocates 'len' bytes in the capped collection 'ns', deleting the oldest
   records and advancing through the extents as needed to make room.
   Returns the allocated location, or a null DiskLoc (after maybeComplain()) when
   a full cycle over empty extents finds no space. Raises massert 10345 if more
   than maxPasses records had to be deleted. */
DiskLoc NamespaceDetails::cappedAlloc(const char *ns, int len) {
    // signal done allocating new extents.
    if ( !cappedLastDelRecLastExtent().isValid() )
        getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();

    assert( len < 400000000 );

    // 30 is about the smallest entry that could go in the oplog; the floor of
    // 5000 is kept for backwards safety, since 5000 was the old fixed value.
    const int scaledPasses = ( len / 30 ) + 2;
    const int maxPasses = scaledPasses < 5000 ? 5000 : scaledPasses;
    int passes = 0;

    /* this fails on a rename -- that is ok but must keep commented out */
    //assert( theCapExtent()->ns == ns );
    theCapExtent()->assertOk();

    DiskLoc loc;
    DiskLoc firstEmptyExtent;

    // Delete records until there is room and the record count is under 'max'.
    for (;;) {
        if ( stats.nrecords < max ) {
            loc = __capAlloc( len );
            if ( !loc.isNull() )
                break;
        }

        // Still on the first trip through the extents: never delete, just move on.
        if ( !capFirstNewRecord.isValid() ) {
            advanceCapExtent( ns );

            if ( capExtent != firstExtent )
                capFirstNewRecord.writing().setInvalid();
            // else: leaving it as advanceCapExtent() set it signals that the
            // first trip through the extents is over.
            continue;
        }

        if ( !capFirstNewRecord.isNull() &&
                theCapExtent()->firstRecord == capFirstNewRecord ) {
            // Everything allocated on the previous trip through this extent has
            // been deleted already.
            advanceCapExtent( ns );
            continue;
        }

        if ( theCapExtent()->firstRecord.isNull() ) {
            // Empty extent. Remember the first one seen so that coming back
            // round to it is recognised as "no space anywhere".
            if ( firstEmptyExtent.isNull() )
                firstEmptyExtent = capExtent;
            advanceCapExtent( ns );
            if ( firstEmptyExtent == capExtent ) {
                maybeComplain( ns, len );
                return DiskLoc();
            }
            continue;
        }

        // Drop the extent's first record and coalesce the space it frees.
        DiskLoc oldest = theCapExtent()->firstRecord;
        theDataFileMgr.deleteRecord(ns, oldest.rec(), oldest, true);
        compact();
        if( ++passes > maxPasses ) {
            log() << "passes ns:" << ns << " len:" << len << " maxPasses: " << maxPasses << '\n';
            log() << "passes max:" << max << " nrecords:" << stats.nrecords << " datasize: " << stats.datasize << endl;
            massert( 10345 ,  "passes >= maxPasses in capped collection alloc", false );
        }
    }

    // Remember first record allocated on this iteration through capExtent.
    if ( capFirstNewRecord.isValid() && capFirstNewRecord.isNull() )
        getDur().writingDiskLoc(capFirstNewRecord) = loc;

    return loc;
}
2010-07-25 16:16:19 -04:00
void NamespaceDetails::dumpExtents() {
cout << "dumpExtents:" << endl;
for ( DiskLoc i = firstExtent; !i.isNull(); i = i.ext()->xnext ) {
Extent *e = i.ext();
stringstream ss;
e->dump(ss);
cout << ss.str() << endl;
2010-07-24 00:10:34 -04:00
}
2010-07-25 16:16:19 -04:00
}
2011-01-04 00:40:41 -05:00
void NamespaceDetails::cappedDumpDelInfo() {
2010-07-25 16:16:19 -04:00
cout << "dl[0]: " << deletedList[0].toString() << endl;
2011-01-04 00:40:41 -05:00
for( DiskLoc z = deletedList[0]; !z.isNull(); z = z.drec()->nextDeleted ) {
cout << " drec:" << z.toString() << " dreclen:" << hex << z.drec()->lengthWithHeaders <<
" ext:" << z.drec()->myExtent(z)->myLoc.toString() << endl;
2010-07-25 16:16:19 -04:00
}
cout << "dl[1]: " << deletedList[1].toString() << endl;
}
2010-12-27 18:10:58 -08:00
void NamespaceDetails::cappedTruncateLastDelUpdate() {
if ( capExtent == firstExtent ) {
// Only one extent of the collection is in use, so there
// is no deleted record in a previous extent, so nullify
// cappedLastDelRecLastExtent().
cappedLastDelRecLastExtent().writing() = DiskLoc();
2011-01-04 00:40:41 -05:00
}
else {
2010-12-27 18:10:58 -08:00
// Scan through all deleted records in the collection
// until the last deleted record for the extent prior
// to the new capExtent is found. Then set
// cappedLastDelRecLastExtent() to that deleted record.
DiskLoc i = cappedListOfAllDeletedRecords();
for( ;
2011-01-04 00:40:41 -05:00
!i.drec()->nextDeleted.isNull() &&
!inCapExtent( i.drec()->nextDeleted );
i = i.drec()->nextDeleted );
2010-12-28 11:06:48 -08:00
// In our capped storage model, every extent must have at least one
// deleted record. Here we check that 'i' is not the last deleted
// record. (We expect that there will be deleted records in the new
// capExtent as well.)
2010-12-27 18:10:58 -08:00
assert( !i.drec()->nextDeleted.isNull() );
cappedLastDelRecLastExtent().writing() = i;
2011-01-04 00:40:41 -05:00
}
2010-12-27 18:10:58 -08:00
}
2011-01-04 00:40:41 -05:00
2010-07-26 12:01:15 -07:00
void NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive) {
2010-07-25 16:16:19 -04:00
DEV assert( this == nsdetails(ns) );
2010-07-26 12:01:15 -07:00
assert( cappedLastDelRecLastExtent().isValid() );
2011-01-04 00:40:41 -05:00
2010-12-27 14:56:19 -08:00
// We iteratively remove the newest document until the newest document
// is 'end', then we remove 'end' if requested.
2010-07-26 12:01:15 -07:00
bool foundLast = false;
while( 1 ) {
if ( foundLast ) {
2010-12-27 14:56:19 -08:00
// 'end' has been found and removed, so break.
2010-07-26 12:01:15 -07:00
break;
}
getDur().commitIfNeeded();
// 'curr' will point to the newest document in the collection.
2010-07-26 12:01:15 -07:00
DiskLoc curr = theCapExtent()->lastRecord;
assert( !curr.isNull() );
if ( curr == end ) {
if ( inclusive ) {
2010-12-27 14:56:19 -08:00
// 'end' has been found, so break next iteration.
2010-07-26 12:01:15 -07:00
foundLast = true;
2011-01-04 00:40:41 -05:00
}
else {
2010-12-27 14:56:19 -08:00
// 'end' has been found, so break.
2010-07-26 12:01:15 -07:00
break;
}
}
2011-01-04 00:40:41 -05:00
2010-12-27 14:56:19 -08:00
// TODO The algorithm used in this function cannot generate an
// empty collection, but we could call emptyCappedCollection() in
// this case instead of asserting.
2010-09-27 12:35:22 -04:00
uassert( 13415, "emptying the collection is not allowed", stats.nrecords > 1 );
2011-01-04 00:40:41 -05:00
2010-12-27 18:10:58 -08:00
// Delete the newest record, and coalesce the new deleted
// record with existing deleted records.
theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true);
compact();
2011-01-04 00:40:41 -05:00
2010-12-27 14:56:19 -08:00
// This is the case where we have not yet had to remove any
// documents to make room for other documents, and we are allocating
// documents from free space in fresh extents instead of reusing
// space from familiar extents.
2010-07-26 12:01:15 -07:00
if ( !capLooped() ) {
2011-01-04 00:40:41 -05:00
2010-12-27 14:56:19 -08:00
// We just removed the last record from the 'capExtent', and
// the 'capExtent' can't be empty, so we set 'capExtent' to
// capExtent's prev extent.
2010-07-26 12:01:15 -07:00
if ( theCapExtent()->lastRecord.isNull() ) {
assert( !theCapExtent()->xprev.isNull() );
2010-12-27 14:56:19 -08:00
// NOTE Because we didn't delete the last document, and
// capLooped() is false, capExtent is not the first extent
// so xprev will be nonnull.
capExtent.writing() = theCapExtent()->xprev;
2011-01-04 00:40:41 -05:00
theCapExtent()->assertOk();
2010-12-27 18:10:58 -08:00
// update cappedLastDelRecLastExtent()
cappedTruncateLastDelUpdate();
2010-07-26 12:01:15 -07:00
}
continue;
}
2010-12-27 14:56:19 -08:00
// This is the case where capLooped() is true, and we just deleted
// from capExtent, and we just deleted capFirstNewRecord, which was
// the last record on the fresh side of capExtent.
// NOTE In this comparison, curr and potentially capFirstNewRecord
2010-12-27 18:10:58 -08:00
// may point to invalid data, but we can still compare the
// references themselves.
2010-12-27 14:56:19 -08:00
if ( curr == capFirstNewRecord ) {
2011-01-04 00:40:41 -05:00
// Set 'capExtent' to the first nonempty extent prior to the
// initial capExtent. There must be such an extent because we
// have not deleted the last document in the collection. It is
// possible that all extents other than the capExtent are empty.
// In this case we will keep the initial capExtent and specify
// that all records contained within are on the fresh rather than
// stale side of the extent.
DiskLoc newCapExtent = capExtent;
do {
// Find the previous extent, looping if necessary.
newCapExtent = ( newCapExtent == firstExtent ) ? lastExtent : newCapExtent.ext()->xprev;
newCapExtent.ext()->assertOk();
2011-01-04 00:40:41 -05:00
}
while ( newCapExtent.ext()->firstRecord.isNull() );
capExtent.writing() = newCapExtent;
2011-01-04 00:40:41 -05:00
2010-12-27 14:56:19 -08:00
// Place all documents in the new capExtent on the fresh side
// of the capExtent by setting capFirstNewRecord to the first
// document in the new capExtent.
capFirstNewRecord.writing() = theCapExtent()->firstRecord;
2010-12-27 18:10:58 -08:00
// update cappedLastDelRecLastExtent()
cappedTruncateLastDelUpdate();
2010-07-26 12:01:15 -07:00
}
}
2010-07-23 23:20:00 -04:00
}
2011-01-04 00:40:41 -05:00
2010-08-02 14:26:14 -07:00
void NamespaceDetails::emptyCappedCollection( const char *ns ) {
DEV assert( this == nsdetails(ns) );
massert( 13424, "collection must be capped", capped );
massert( 13425, "background index build in progress", !indexBuildInProgress );
2010-08-02 14:26:14 -07:00
massert( 13426, "indexes present", nIndexes == 0 );
2010-12-27 14:56:19 -08:00
// Clear all references to this namespace.
2010-08-02 14:26:14 -07:00
ClientCursor::invalidate( ns );
2011-01-04 00:40:41 -05:00
NamespaceDetailsTransient::clearForPrefix( ns );
2010-08-02 14:26:14 -07:00
2010-12-27 14:56:19 -08:00
// Get a writeable reference to 'this' and reset all pertinent
// attributes.
NamespaceDetails *t = writingWithoutExtra();
2010-12-27 14:56:19 -08:00
t->cappedLastDelRecLastExtent() = DiskLoc();
t->cappedListOfAllDeletedRecords() = DiskLoc();
2011-01-04 00:40:41 -05:00
2010-08-02 14:26:14 -07:00
// preserve firstExtent/lastExtent
t->capExtent = firstExtent;
t->stats.datasize = stats.nrecords = 0;
2010-08-02 14:26:14 -07:00
// lastExtentSize preserve
// nIndexes preserve 0
// capped preserve true
// max preserve
t->paddingFactor = 1.0;
t->flags = 0;
t->capFirstNewRecord = DiskLoc();
t->capFirstNewRecord.setInvalid();
t->cappedLastDelRecLastExtent().setInvalid();
2010-08-02 14:26:14 -07:00
// dataFileVersion preserve
// indexFileVersion preserve
t->multiKeyIndexBits = 0;
t->reservedA = 0;
t->extraOffset = 0;
// indexBuildInProgress preserve 0
memset(t->reserved, 0, sizeof(t->reserved));
2010-08-02 14:26:14 -07:00
2010-12-27 14:56:19 -08:00
// Reset all existing extents and recreate the deleted list.
2010-08-02 14:26:14 -07:00
for( DiskLoc ext = firstExtent; !ext.isNull(); ext = ext.ext()->xnext ) {
DiskLoc prev = ext.ext()->xprev;
DiskLoc next = ext.ext()->xnext;
DiskLoc empty = ext.ext()->reuse( ns, true );
ext.ext()->xprev.writing() = prev;
ext.ext()->xnext.writing() = next;
2010-08-02 14:26:14 -07:00
addDeletedRec( empty.drec(), empty );
}
}
2010-07-23 23:20:00 -04:00
}