2011-04-07 16:37:58 -04:00
/** @file compact.cpp
2010-10-16 20:10:47 -04:00
compaction of deleted space in pdfiles ( datafiles )
*/
/* NOTE 6Oct2010 : this file PRELIMINARY, EXPERIMENTAL, NOT DONE, NOT USED YET (not in SConstruct) */
2010-10-06 14:09:53 -04:00
/**
* Copyright ( C ) 2010 10 gen Inc .
*
* This program is free software : you can redistribute it and / or modify
* it under the terms of the GNU Affero General Public License , version 3 ,
* as published by the Free Software Foundation .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU Affero General Public License for more details .
*
* You should have received a copy of the GNU Affero General Public License
* along with this program . If not , see < http : //www.gnu.org/licenses/>.
*/
# include "pch.h"
# include "pdfile.h"
# include "concurrency.h"
2010-10-16 20:10:47 -04:00
# include "commands.h"
2010-11-04 09:00:39 -04:00
# include "curop-inl.h"
2011-04-04 12:38:38 -04:00
# include "background.h"
2011-04-04 16:30:09 -04:00
# include "extsort.h"
# include "compact.h"
2010-10-16 20:10:47 -04:00
# include "../util/concurrency/task.h"
2010-10-06 14:09:53 -04:00
2011-01-04 00:40:41 -05:00
namespace mongo {
2010-10-06 14:09:53 -04:00
2011-04-04 12:38:38 -04:00
// Accumulator that the page-in loop in compactExtent() adds bytes into. Nothing in this
// file reads it back; presumably it exists only so those reads are not optimized away.
char faux ;
2010-10-06 14:09:53 -04:00
2011-04-04 12:38:38 -04:00
// Forward declarations of helpers that are used below but defined outside this file.
// compactExtent() calls this on each freshly allocated record before copying data into it.
void addRecordToRecListInExtent ( Record * r , DiskLoc loc ) ;
2011-05-23 20:01:42 -04:00
// Callers in this file treat a null DiskLoc result as "no space available".
DiskLoc allocateSpaceForANewRecord ( const char * ns , NamespaceDetails * d , int lenWHdr , bool god ) ;
2011-04-04 12:38:38 -04:00
// compactExtent() calls this with (ext, ext) to release the single extent it just emptied.
void freeExtents ( DiskLoc firstExt , DiskLoc lastExt ) ;
2010-10-06 14:09:53 -04:00
2011-04-07 16:11:23 -04:00
/** @return number of skipped (invalid) documents */
unsigned compactExtent ( const char * ns , NamespaceDetails * d , const DiskLoc ext , int n ,
2011-04-04 16:30:09 -04:00
const scoped_array < IndexSpec > & indexSpecs ,
2011-04-06 14:09:30 -04:00
scoped_array < SortPhaseOne > & phase1 , int nidx , bool validate )
2011-04-04 16:30:09 -04:00
{
2011-04-04 12:38:38 -04:00
log ( ) < < " compact extent # " < < n < < endl ;
2011-04-07 16:34:10 -04:00
2011-04-04 12:38:38 -04:00
Extent * e = ext . ext ( ) ;
e - > assertOk ( ) ;
assert ( e - > validates ( ) ) ;
2011-04-07 16:11:23 -04:00
unsigned skipped = 0 ;
2011-03-17 16:47:23 -04:00
2010-10-16 20:10:47 -04:00
{
2011-04-04 12:38:38 -04:00
// the next/prev pointers within the extent might not be in order so we first page the whole thing in
// sequentially
log ( ) < < " compact paging in len= " < < e - > length / 1000000.0 < < " MB " < < endl ;
Timer t ;
MAdvise adv ( e , e - > length , MAdvise : : Sequential ) ;
const char * p = ( const char * ) e ;
for ( int i = 0 ; i < e - > length ; i + = 4096 ) {
faux + = * p ;
2011-03-17 16:47:23 -04:00
}
2011-04-04 12:38:38 -04:00
int ms = t . millis ( ) ;
if ( ms > 1000 )
log ( ) < < " compact end paging in " < < ms < < " ms " < < e - > length / 1000000.0 / ms < < " MB/sec " < < endl ;
2011-03-17 16:47:23 -04:00
}
2011-04-04 12:38:38 -04:00
{
log ( ) < < " compact copying records " < < endl ;
unsigned totalSize = 0 ;
int nrecs = 0 ;
DiskLoc L = e - > firstRecord ;
if ( ! L . isNull ( ) )
while ( 1 ) {
Record * recOld = L . rec ( ) ;
L = recOld - > nextInExtent ( L ) ;
nrecs + + ;
BSONObj objOld ( recOld ) ;
2011-04-04 16:30:09 -04:00
2011-04-06 14:09:30 -04:00
if ( ! validate | | objOld . valid ( ) ) {
unsigned sz = objOld . objsize ( ) ;
unsigned lenWHdr = sz + Record : : HeaderSize ;
totalSize + = lenWHdr ;
DiskLoc extentLoc ;
2011-05-23 20:01:42 -04:00
DiskLoc loc = allocateSpaceForANewRecord ( ns , d , lenWHdr , false ) ;
2011-04-06 14:09:30 -04:00
uassert ( 14024 , " compact error out of space during compaction " , ! loc . isNull ( ) ) ;
Record * recNew = loc . rec ( ) ;
recNew = ( Record * ) getDur ( ) . writingPtr ( recNew , lenWHdr ) ;
addRecordToRecListInExtent ( recNew , loc ) ;
memcpy ( recNew - > data , objOld . objdata ( ) , sz ) ;
{
// extract keys for all indexes we will be rebuilding
for ( int x = 0 ; x < nidx ; x + + ) {
phase1 [ x ] . addKeys ( indexSpecs [ x ] , objOld , loc ) ;
}
2011-04-04 16:30:09 -04:00
}
}
2011-04-06 14:09:30 -04:00
else {
2011-04-07 16:11:23 -04:00
if ( + + skipped < = 10 )
log ( ) < < " compact skipping invalid object " < < endl ;
2011-04-06 14:09:30 -04:00
}
2011-04-04 16:30:09 -04:00
2011-04-04 12:38:38 -04:00
if ( L . isNull ( ) ) {
// we just did the very last record from the old extent. it's still pointed to
// by the old extent ext, but that will be fixed below after this loop
2010-10-16 20:10:47 -04:00
break ;
2011-04-04 12:38:38 -04:00
}
2011-04-05 15:59:28 -04:00
// remove the old records (orphan them) periodically so our commit block doesn't get too large
2011-04-07 16:34:10 -04:00
bool stopping = false ;
RARELY stopping = * killCurrentOp . checkForInterruptNoAssert ( false ) ! = 0 ;
if ( stopping | | getDur ( ) . aCommitIsNeeded ( ) ) {
2011-04-05 15:59:28 -04:00
e - > firstRecord . writing ( ) = L ;
Record * r = L . rec ( ) ;
getDur ( ) . writingInt ( r - > prevOfs ) = DiskLoc : : NullOfs ;
getDur ( ) . commitIfNeeded ( ) ;
2011-04-07 16:34:10 -04:00
killCurrentOp . checkForInterrupt ( false ) ;
2011-04-05 15:59:28 -04:00
}
2010-10-16 20:10:47 -04:00
}
2011-04-04 12:38:38 -04:00
assert ( d - > firstExtent = = ext ) ;
assert ( d - > lastExtent ! = ext ) ;
DiskLoc newFirst = e - > xnext ;
d - > firstExtent . writing ( ) = newFirst ;
newFirst . ext ( ) - > xprev . writing ( ) . Null ( ) ;
getDur ( ) . writing ( e ) - > markEmpty ( ) ;
freeExtents ( ext , ext ) ;
2011-04-05 15:59:28 -04:00
getDur ( ) . commitIfNeeded ( ) ;
2011-04-04 12:38:38 -04:00
2011-04-05 14:19:43 -04:00
log ( ) < < " compact " < < nrecs < < " documents " < < totalSize / 1000000.0 < < " MB " < < endl ;
2010-10-16 20:10:47 -04:00
}
2011-04-04 12:38:38 -04:00
2011-04-07 16:11:23 -04:00
return skipped ;
2011-04-04 12:38:38 -04:00
}
2011-04-04 16:30:09 -04:00
// Defined in another translation unit. _compact() points this at the SortPhaseOne of the
// index it is about to re-create and resets it to 0 afterwards (also on exception);
// presumably the index build triggered by that insert picks the pre-extracted keys up from here.
extern SortPhaseOne * precalced ;
2011-04-07 16:11:23 -04:00
bool _compact ( const char * ns , NamespaceDetails * d , string & errmsg , bool validate , BSONObjBuilder & result ) {
2011-04-04 14:16:37 -04:00
//int les = d->lastExtentSize;
2011-04-04 12:38:38 -04:00
// this is a big job, so might as well make things tidy before we start just to be nice.
getDur ( ) . commitNow ( ) ;
2011-04-05 14:14:10 -04:00
list < DiskLoc > extents ;
2011-04-04 12:38:38 -04:00
for ( DiskLoc L = d - > firstExtent ; ! L . isNull ( ) ; L = L . ext ( ) - > xnext )
2011-04-05 14:14:10 -04:00
extents . push_back ( L ) ;
2011-04-04 12:38:38 -04:00
log ( ) < < " compact " < < extents . size ( ) < < " extents " < < endl ;
2011-04-12 12:09:31 -04:00
ProgressMeterHolder pm ( cc ( ) . curop ( ) - > setMessage ( " compact extent " , extents . size ( ) ) ) ;
2011-04-04 12:38:38 -04:00
// same data, but might perform a little different after compact?
NamespaceDetailsTransient : : get_w ( ns ) . clearQueryCache ( ) ;
2011-04-04 16:30:09 -04:00
int nidx = d - > nIndexes ;
scoped_array < IndexSpec > indexSpecs ( new IndexSpec [ nidx ] ) ;
scoped_array < SortPhaseOne > phase1 ( new SortPhaseOne [ nidx ] ) ;
2011-04-04 12:38:38 -04:00
{
NamespaceDetails : : IndexIterator ii = d - > ii ( ) ;
2011-04-04 16:30:09 -04:00
int x = 0 ;
2011-04-04 12:38:38 -04:00
while ( ii . more ( ) ) {
BSONObjBuilder b ;
2011-06-16 16:35:07 -04:00
IndexDetails & idx = ii . next ( ) ;
BSONObj : : iterator i ( idx . info . obj ( ) ) ;
2011-04-04 12:38:38 -04:00
while ( i . more ( ) ) {
BSONElement e = i . next ( ) ;
2011-04-04 17:14:11 -04:00
if ( ! str : : equals ( e . fieldName ( ) , " v " ) & & ! str : : equals ( e . fieldName ( ) , " background " ) ) {
2011-04-04 12:38:38 -04:00
b . append ( e ) ;
2011-04-04 16:30:09 -04:00
}
2010-10-16 20:10:47 -04:00
}
2011-04-04 16:30:09 -04:00
BSONObj o = b . obj ( ) . getOwned ( ) ;
2011-06-16 16:35:07 -04:00
phase1 [ x ] . sorter . reset ( new BSONObjExternalSorter ( idx . idxInterface ( ) , o . getObjectField ( " key " ) ) ) ;
2011-04-04 16:30:09 -04:00
phase1 [ x ] . sorter - > hintNumObjects ( d - > stats . nrecords ) ;
indexSpecs [ x + + ] . reset ( o ) ;
2010-10-16 20:10:47 -04:00
}
2010-10-06 14:09:53 -04:00
}
2011-04-04 12:38:38 -04:00
log ( ) < < " compact orphan deleted lists " < < endl ;
for ( int i = 0 ; i < Buckets ; i + + ) {
d - > deletedList [ i ] . writing ( ) . Null ( ) ;
2011-03-17 16:47:23 -04:00
}
2011-04-04 12:38:38 -04:00
// before dropping indexes, at least make sure we can allocate one extent!
2011-05-23 20:01:42 -04:00
uassert ( 14025 , " compact error no space available to allocate " , ! allocateSpaceForANewRecord ( ns , d , Record : : HeaderSize + 1 , false ) . isNull ( ) ) ;
2010-10-06 14:09:53 -04:00
2011-04-04 12:38:38 -04:00
// note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here
log ( ) < < " compact dropping indexes " < < endl ;
BSONObjBuilder b ;
if ( ! dropIndexes ( d , ns , " * " , errmsg , b , true ) ) {
2011-04-07 16:34:10 -04:00
errmsg = " compact drop indexes failed " ;
log ( ) < < errmsg < < endl ;
2011-04-04 12:38:38 -04:00
return false ;
2010-10-06 14:09:53 -04:00
}
2011-04-04 12:38:38 -04:00
getDur ( ) . commitNow ( ) ;
2011-04-07 16:11:23 -04:00
long long skipped = 0 ;
2011-04-04 12:38:38 -04:00
int n = 0 ;
2011-04-05 14:14:10 -04:00
for ( list < DiskLoc > : : iterator i = extents . begin ( ) ; i ! = extents . end ( ) ; i + + ) {
2011-04-07 16:11:23 -04:00
skipped + = compactExtent ( ns , d , * i , n + + , indexSpecs , phase1 , nidx , validate ) ;
2011-04-12 12:09:31 -04:00
pm . hit ( ) ;
2011-04-07 16:11:23 -04:00
}
if ( skipped ) {
result . append ( " invalidObjects " , skipped ) ;
2010-10-06 14:09:53 -04:00
}
2011-04-04 12:38:38 -04:00
assert ( d - > firstExtent . ext ( ) - > xprev . isNull ( ) ) ;
2011-04-12 12:09:31 -04:00
// indexes will do their own progress meter?
pm . finished ( ) ;
2011-04-04 12:38:38 -04:00
// build indexes
NamespaceString s ( ns ) ;
string si = s . db + " .system.indexes " ;
2011-04-04 16:30:09 -04:00
for ( int i = 0 ; i < nidx ; i + + ) {
2011-04-07 16:34:10 -04:00
killCurrentOp . checkForInterrupt ( false ) ;
2011-04-04 16:30:09 -04:00
BSONObj info = indexSpecs [ i ] . info ;
log ( ) < < " compact create index " < < info [ " key " ] . Obj ( ) . toString ( ) < < endl ;
try {
precalced = & phase1 [ i ] ;
theDataFileMgr . insert ( si . c_str ( ) , info . objdata ( ) , info . objsize ( ) ) ;
}
catch ( . . . ) {
precalced = 0 ;
throw ;
}
precalced = 0 ;
2010-10-06 14:09:53 -04:00
}
2011-04-04 12:38:38 -04:00
return true ;
2010-10-06 14:09:53 -04:00
}
2011-04-07 16:11:23 -04:00
bool compact ( const string & ns , string & errmsg , bool validate , BSONObjBuilder & result ) {
2011-05-23 20:01:42 -04:00
massert ( 14028 , " bad ns " , NamespaceString : : normal ( ns . c_str ( ) ) ) ;
2011-04-04 13:05:46 -04:00
massert ( 14027 , " can't compact a system namespace " , ! str : : contains ( ns , " .system. " ) ) ; // items in system.indexes cannot be moved there are pointers to those disklocs in NamespaceDetails
2011-04-04 12:38:38 -04:00
bool ok ;
{
writelock lk ;
BackgroundOperation : : assertNoBgOpInProgForNs ( ns . c_str ( ) ) ;
Client : : Context ctx ( ns ) ;
NamespaceDetails * d = nsdetails ( ns . c_str ( ) ) ;
massert ( 13660 , str : : stream ( ) < < " namespace " < < ns < < " does not exist " , d ) ;
massert ( 13661 , " cannot compact capped collection " , ! d - > capped ) ;
log ( ) < < " compact " < < ns < < " begin " < < endl ;
try {
2011-04-07 16:11:23 -04:00
ok = _compact ( ns . c_str ( ) , d , errmsg , validate , result ) ;
2011-04-04 12:38:38 -04:00
}
catch ( . . . ) {
log ( ) < < " compact " < < ns < < " end (with error) " < < endl ;
throw ;
}
log ( ) < < " compact " < < ns < < " end " < < endl ;
}
return ok ;
}
// Declared here, defined outside this file. CompactCmd::run() refuses to compact while this
// returns true unless the command carries a true "force" value.
bool isCurrentlyAReplSetPrimary ( ) ;
2010-10-16 20:10:47 -04:00
2011-01-04 00:40:41 -05:00
class CompactCmd : public Command {
2010-10-16 20:10:47 -04:00
public :
2011-03-17 16:47:23 -04:00
virtual LockType locktype ( ) const { return NONE ; }
virtual bool adminOnly ( ) const { return false ; }
virtual bool slaveOk ( ) const { return true ; }
2011-07-26 16:43:21 -04:00
virtual bool maintenanceMode ( ) const { return true ; }
2011-03-17 16:47:23 -04:00
virtual bool logTheOp ( ) { return false ; }
virtual void help ( stringstream & help ) const {
2011-04-04 12:38:38 -04:00
help < < " compact collection \n "
2011-04-07 16:11:23 -04:00
" warning: this operation blocks the server and is slow. you can cancel with cancelOp() \n "
" { compact : <collection_name>, [force:true], [validate:true] } \n "
" force - allows to run on a replica set primary \n "
2011-04-12 12:10:54 -04:00
" validate - check records are noncorrupt before adding to newly compacting extents. slower but safer (default is true in this version) \n " ;
2011-03-17 16:47:23 -04:00
}
virtual bool requiresAuth ( ) { return true ; }
CompactCmd ( ) : Command ( " compact " ) { }
2011-07-18 15:23:37 -04:00
virtual bool run ( const string & db , BSONObj & cmdObj , int , string & errmsg , BSONObjBuilder & result , bool fromRepl ) {
2010-10-16 20:10:47 -04:00
string coll = cmdObj . firstElement ( ) . valuestr ( ) ;
2011-01-04 00:40:41 -05:00
if ( coll . empty ( ) | | db . empty ( ) ) {
2010-10-16 20:10:47 -04:00
errmsg = " no collection name specified " ;
return false ;
}
2011-04-04 12:38:38 -04:00
if ( isCurrentlyAReplSetPrimary ( ) & & ! cmdObj [ " force " ] . trueValue ( ) ) {
errmsg = " will not run compact on an active replica set primary as this is a slow blocking operation. use force:true to force " ;
return false ;
2010-10-16 20:10:47 -04:00
}
2011-03-17 16:47:23 -04:00
2011-04-04 12:38:38 -04:00
string ns = db + ' . ' + coll ;
2011-04-06 14:09:30 -04:00
bool validate = ! cmdObj . hasElement ( " validate " ) | | cmdObj [ " validate " ] . trueValue ( ) ; // default is true at the moment
2011-04-07 16:11:23 -04:00
bool ok = compact ( ns , errmsg , validate , result ) ;
2011-04-04 12:38:38 -04:00
return ok ;
2010-10-16 20:10:47 -04:00
}
} ;
// File-scope instance; constructing it runs the Command constructor with the command name.
// NOTE(review): presumably that is what registers the command - confirm in Command.
static CompactCmd compactCmd ;
2010-10-06 14:09:53 -04:00
}