/* @file compact.cpp
   compaction of deleted space in pdfiles (datafiles)
*/

/* NOTE 6Oct2010 : this file PRELIMINARY, EXPERIMENTAL, NOT DONE, NOT USED YET (not in SConstruct) */

/**
*    Copyright (C) 2010 10gen Inc.
*
*    This program is free software: you can redistribute it and/or  modify
*    it under the terms of the GNU Affero General Public License, version 3,
*    as published by the Free Software Foundation.
*
*    This program is distributed in the hope that it will be useful,
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*    GNU Affero General Public License for more details.
*
*    You should have received a copy of the GNU Affero General Public License
*    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "pch.h"
#include "pdfile.h"
#include "concurrency.h"
#include "commands.h"
#include "curop-inl.h"
#include "background.h"
#include "../util/concurrency/task.h"

namespace mongo {

    // Sink for the page-touching reads in compactExtent().
    // NOTE(review): presumably a global so the compiler cannot discard the reads - confirm.
    char faux;

    // Defined elsewhere in the project.
    void addRecordToRecListInExtent(Record *r, DiskLoc loc);
    DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr);
    void freeExtents(DiskLoc firstExt, DiskLoc lastExt);

    /** Copy every record of one extent into freshly allocated space, then unlink and free the extent.
        The extent must be the first extent of the namespace and must not be its last one (both asserted).

        @param ns   full namespace name, passed through to the allocator
        @param d    details of the namespace being compacted
        @param ext  location of the extent to compact
        @param n    ordinal of the extent, used for logging only
        @throws uassert 14024 when no space can be allocated for a copied record
    */
    void compactExtent(const char *ns, NamespaceDetails *d, DiskLoc ext, int n) {
        log() << "compact extent #" << n << endl;

        Extent *e = ext.ext();
        e->assertOk();
        assert( e->validates() );

        {
            // the next/prev pointers within the extent might not be in order so we first page the whole thing in
            // sequentially
            log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
            Timer t;
            MAdvise adv(e, e->length, MAdvise::Sequential);
            const char *p = (const char *) e;
            for( int i = 0; i < e->length; i += 4096 ) {
                // read one byte every 4096 bytes. (previously this read *p without ever advancing,
                // so only the first byte of the extent was touched.)
                faux += p[i];
            }
            int ms = t.millis();
            if( ms > 1000 ) {
                // bytes / 1000 / ms == (bytes / 1e6) / (ms / 1e3) == MB/sec
                log() << "compact end paging in " << ms << "ms " << e->length/1000.0/ms << "MB/sec" << endl;
            }
        }

        {
            log() << "compact copying records" << endl;
            unsigned totalSize = 0;
            int nrecs = 0;

            DiskLoc L = e->firstRecord;
            if( !L.isNull() ) {
                while( 1 ) {
                    Record *recOld = L.rec();
                    // advance before copying; L now refers to the record after recOld (or is null)
                    L = recOld->nextInExtent(L);
                    nrecs++;

                    BSONObj objOld(recOld);
                    unsigned sz = objOld.objsize();
                    unsigned lenWHdr = sz + Record::HeaderSize;
                    totalSize += lenWHdr;

                    DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr);
                    uassert(14024, "compact error out of space during compaction", !loc.isNull());
                    Record *recNew = loc.rec();
                    recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                    addRecordToRecListInExtent(recNew, loc);
                    memcpy(recNew->data, objOld.objdata(), sz);

                    if( L.isNull() ) {
                        // we just did the very last record from the old extent.  it's still pointed to
                        // by the old extent ext, but that will be fixed below after this loop
                        break;
                    }

                    // remove the old record (orphan it): the following record becomes the head of the extent
                    e->firstRecord.writing() = L;
                    Record *r = L.rec();
                    getDur().writingInt(r->prevOfs) = DiskLoc::NullOfs;
                    getDur().commitIfNeeded();
                }
            }

            assert( d->firstExtent == ext );
            assert( d->lastExtent != ext );

            // drop this extent: unlink it from the head of the extent chain and free it
            DiskLoc newFirst = e->xnext;
            d->firstExtent.writing() = newFirst;
            newFirst.ext()->xprev.writing().Null();
            getDur().writing(e)->markEmpty();
            freeExtents(ext,ext);
            getDur().commitNow();

            log() << "compact " << nrecs << " documents " << totalSize/1000000.0 << "MB" << endl;
        }
    }

    /** Compact every extent that belongs to the namespace at the time of the call, then rebuild its indexes.
        Index specs are saved first (minus the "v" and "background" fields), the indexes are dropped,
        each extent is compacted in chain order, and the saved specs are re-inserted into
        <db>.system.indexes.

        @param ns      full namespace name
        @param d       details of the namespace
        @param errmsg  receives the error text when dropIndexes() fails
        @return false when dropping the indexes failed, true otherwise
        @throws uassert 14025 when not even a minimal record can be allocated
    */
    bool _compact(const char *ns, NamespaceDetails *d, string& errmsg) {
        // this is a big job, so might as well make things tidy before we start just to be nice.
        getDur().commitNow();

        // snapshot the extents in CHAIN order. compactExtent() asserts that the extent it is handed is
        // d->firstExtent, so the order must follow xnext; a sorted container (set) orders by DiskLoc
        // value instead, which need not match.
        list<DiskLoc> extents;
        for( DiskLoc L = d->firstExtent; !L.isNull(); L = L.ext()->xnext )
            extents.push_back(L);
        log() << "compact " << extents.size() << " extents" << endl;

        // same data, but might perform a little different after compact?
        NamespaceDetailsTransient::get_w(ns).clearQueryCache();

        // remember the index specs so they can be recreated afterwards
        list<BSONObj> indexes;
        {
            NamespaceDetails::IndexIterator ii = d->ii();
            while( ii.more() ) {
                BSONObjBuilder b;
                BSONObj::iterator i(ii.next().info.obj());
                while( i.more() ) {
                    BSONElement e = i.next();
                    // copy every field of the spec except "v" and "background"
                    if( strcmp(e.fieldName(), "v") != 0 && strcmp(e.fieldName(), "background") != 0 )
                        b.append(e);
                }
                indexes.push_back( b.obj() );
            }
        }

        log() << "compact orphan deleted lists" << endl;
        for( int i = 0; i < Buckets; i++ ) {
            d->deletedList[i].writing().Null();
        }

        // before dropping indexes, at least make sure we can allocate one extent!
        uassert(14025, "compact error no space available to allocate", !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1).isNull());

        // note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here
        log() << "compact dropping indexes" << endl;
        BSONObjBuilder b;
        if( !dropIndexes(d, ns, "*", errmsg, b, true) ) {
            log() << "compact drop indexes failed" << endl;
            return false;
        }

        getDur().commitNow();

        int n = 0;
        for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); ++i ) {
            compactExtent(ns, d, *i, n++);
        }

        assert( d->firstExtent.ext()->xprev.isNull() );

        // build indexes
        NamespaceString s(ns);
        string si = s.db + ".system.indexes";
        for( list<BSONObj>::iterator i = indexes.begin(); i != indexes.end(); ++i ) {
            log() << "compact create index " << (*i)["key"].Obj().toString() << endl;
            theDataFileMgr.insert(si.c_str(), i->objdata(), i->objsize());
        }

        return true;
    }

    /** Validate the namespace, take the write lock and compact the collection.

        @param ns      full namespace name ("db.collection")
        @param errmsg  receives the error text on a soft failure
        @return the result of _compact()
        @throws massert 14028 bad ns, 14027 system namespace, 13660 namespace missing,
                13661 capped collection; exceptions from _compact() are logged and rethrown
    */
    bool compact(const string& ns, string &errmsg) {
        massert( 14028, "bad ns", isANormalNSName(ns.c_str()) );
        massert( 14027, "can't compact a system namespace", !str::contains(ns, ".system.") ); // items in system.indexes cannot be moved there are pointers to those disklocs in NamespaceDetails

        bool ok;
        {
            writelock lk;
            BackgroundOperation::assertNoBgOpInProgForNs(ns.c_str());
            Client::Context ctx(ns);
            NamespaceDetails *d = nsdetails(ns.c_str());
            massert( 13660, str::stream() << "namespace " << ns << " does not exist", d );
            massert( 13661, "cannot compact capped collection", !d->capped );
            log() << "compact " << ns << " begin" << endl;
            try {
                ok = _compact(ns.c_str(), d, errmsg);
            }
            catch(...) {
                log() << "compact " << ns << " end (with error)" << endl;
                throw;
            }
            log() << "compact " << ns << " end" << endl;
        }
        return ok;
    }

    bool isCurrentlyAReplSetPrimary();

    /** The "compact" database command: { compact : <collection_name>, [force:true] }.
        Declares lock type NONE; compact() takes the write lock itself.
    */
    class CompactCmd : public Command {
    public:
        virtual LockType locktype() const { return NONE; }
        virtual bool adminOnly() const { return false; }
        virtual bool slaveOk() const { return true; }
        virtual bool logTheOp() { return false; }
        virtual void help( stringstream& help ) const {
            help << "compact collection\n"
                 " { compact : <collection_name>, [force:true] }\n"
                 "warning: this operation blocks the server and is slow. you can cancel with cancelOp()";
        }
        virtual bool requiresAuth() { return true; }
        CompactCmd() : Command("compact") { }

        /** Run the command.
            @return false with errmsg set when the collection name is missing, when this node is an
                    active replica set primary and force is not set, or when the temporary "dev"
                    flag is not set; otherwise the result of compact().
        */
        virtual bool run(const string& db, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            // valuestrsafe() yields "" for a non-string element, so { compact : 1 } is reported as a
            // missing collection name instead of reading a non-string value as a C string.
            string coll = cmdObj.firstElement().valuestrsafe();
            if( coll.empty() || db.empty() ) {
                errmsg = "no collection name specified";
                return false;
            }

            if( isCurrentlyAReplSetPrimary() && !cmdObj["force"].trueValue() ) {
                errmsg = "will not run compact on an active replica set primary as this is a slow blocking operation. use force:true to force";
                return false;
            }

            // temp
            if( !cmdObj["dev"].trueValue() ) {
                errmsg = "compact is not yet implemented";
                return false;
            }

            string ns = db + '.' + coll;
            bool ok = compact(ns, errmsg);
            return ok;
        }
    };
    static CompactCmd compactCmd;

}