diff --git a/db/database.h b/db/database.h index 4090917f008..3e1cfe44a61 100644 --- a/db/database.h +++ b/db/database.h @@ -55,7 +55,7 @@ namespace mongo { delete files[i]; } - MongoDataFile* getFile( int n, int sizeNeeded = 0 ) { + MongoDataFile* getFile( int n, int sizeNeeded = 0, bool preallocateOnly = false ) { assert(this); namespaceIndex.init(); @@ -71,9 +71,12 @@ namespace mongo { if ( n > 100 ) out() << "getFile(): n=" << n << "?" << endl; } - while ( n >= (int) files.size() ) - files.push_back(0); - MongoDataFile* p = files[n]; + MongoDataFile* p = 0; + if ( !preallocateOnly ) { + while ( n >= (int) files.size() ) + files.push_back(0); + p = files[n]; + } if ( p == 0 ) { stringstream ss; ss << name << '.' << n; @@ -87,20 +90,26 @@ namespace mongo { if ( sizeNeeded + MDFHeader::headerSize() > minSize ) minSize = sizeNeeded + MDFHeader::headerSize(); try { - p->open( fullNameString.c_str(), minSize ); + p->open( fullNameString.c_str(), minSize, preallocateOnly ); } catch ( AssertionException& ) { delete p; throw; } - files[n] = p; + if ( preallocateOnly ) + delete p; + else + files[n] = p; } - return p; + return preallocateOnly ? 
0 : p; } - MongoDataFile* addAFile( int sizeNeeded = 0 ) { + MongoDataFile* addAFile( int sizeNeeded = 0, bool preallocateNextFile = false ) { int n = (int) files.size(); - return getFile( n, sizeNeeded ); + MongoDataFile *ret = getFile( n, sizeNeeded ); + if ( preallocateNextFile ) + getFile( n + 1, 0, true ); + return ret; } MongoDataFile* suitableFile( int sizeNeeded ) { diff --git a/db/db.cpp b/db/db.cpp index 1260b496a32..f6bb202ecfd 100644 --- a/db/db.cpp +++ b/db/db.cpp @@ -24,6 +24,7 @@ #include "introspect.h" #include "repl.h" #include "../util/unittest.h" +#include "../util/file_allocator.h" #include "dbmessage.h" #include "instance.h" #if !defined(_WIN32) @@ -361,6 +362,8 @@ namespace mongo { acquirePathLock(); + theFileAllocator().start(); + BOOST_CHECK_EXCEPTION( clearTmpFiles() ); clearTmpCollections(); diff --git a/db/instance.cpp b/db/instance.cpp index 08ffc76738f..403479ab9af 100644 --- a/db/instance.cpp +++ b/db/instance.cpp @@ -30,6 +30,7 @@ #include "reccache.h" #include "replset.h" #include "../s/d_logic.h" +#include "../util/file_allocator.h" #if !defined(_WIN32) #include #endif @@ -652,6 +653,11 @@ namespace mongo { /* must do this before unmapping mem or you may get a seg fault */ closeAllSockets(); + // wait until file preallocation finishes + // we would only hang here if the file_allocator code generates a + // synchronous signal, which we don't expect + theFileAllocator().waitUntilFinished(); + stringstream ss3; MemoryMappedFile::closeAllFiles( ss3 ); rawOut( ss3.str() ); diff --git a/db/pdfile.cpp b/db/pdfile.cpp index d3240d00c75..5a34d4a21ed 100644 --- a/db/pdfile.cpp +++ b/db/pdfile.cpp @@ -29,6 +29,7 @@ _ disallow system* manipulations from the database. 
#include "db.h" #include "../util/mmap.h" #include "../util/hashtab.h" +#include "../util/file_allocator.h" #include "btree.h" #include #include @@ -194,7 +195,7 @@ namespace mongo { return size; } - void MongoDataFile::open( const char *filename, int minSize ) { + void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) { { /* check quotas very simple temporary implementation - we will in future look up @@ -228,6 +229,15 @@ namespace mongo { assert( ( size >= 64*1024*1024 ) || ( strstr( filename, "_hudsonSmall" ) ) ); assert( size % 4096 == 0 ); + if ( preallocateOnly ) { +#if !defined(_WIN32) + // if file exists, update 'size' to match existing file size. + MemoryMappedFile::updateLength( filename, size ); + theFileAllocator().requestAllocation( filename, size ); +#endif + return; + } + header = (MDFHeader *) mmf.map(filename, size); if( sizeof(char *) == 4 ) uassert("can't map file memory - mongo requires 64 bit build for larger datasets", header); @@ -272,7 +282,7 @@ assert( !eloc.isNull() ); out() << "warning: loops=" << loops << " fileno:" << fileNo << ' ' << ns << '\n'; } log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n"; - return database->addAFile()->createExtent(ns, approxSize, newCapped, loops+1); + return database->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1); } int offset = header->unused.getOfs(); header->unused.setOfs( fileNo, offset + ExtentSize ); diff --git a/db/pdfile.h b/db/pdfile.h index fc8ba529efb..09a03c2fcb0 100644 --- a/db/pdfile.h +++ b/db/pdfile.h @@ -59,7 +59,7 @@ namespace mongo { friend class BasicCursor; public: MongoDataFile(int fn) : fileNo(fn) { } - void open(const char *filename, int requestedDataSize = 0); + void open(const char *filename, int requestedDataSize = 0, bool preallocateOnly = false); /* allocate a new extent from this datafile. 
@param capped - true if capped collection diff --git a/mongo.xcodeproj/project.pbxproj b/mongo.xcodeproj/project.pbxproj index 2f4b28a56d6..a109824d503 100644 --- a/mongo.xcodeproj/project.pbxproj +++ b/mongo.xcodeproj/project.pbxproj @@ -125,7 +125,6 @@ 9350E1220F8CFFB300B07A1C /* error2.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = error2.js; sourceTree = ""; }; 936AB4BB0F3A5B0300D5015F /* update3.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = update3.js; sourceTree = ""; }; 936AB9350F3C8AB800D5015F /* _lodeRunner.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = _lodeRunner.js; sourceTree = ""; }; - 936ABBAB0F3CBE5400D5015F /* dbNoCreate.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = dbNoCreate.js; sourceTree = ""; }; 936B89590F4C899400934AF2 /* file.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = file.h; sourceTree = ""; }; 936B895A0F4C899400934AF2 /* md5.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = md5.c; sourceTree = ""; }; 936B895B0F4C899400934AF2 /* md5.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = md5.h; sourceTree = ""; }; @@ -238,6 +237,8 @@ 93AB91510F4F1C970020A046 /* _runner_sharding.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = _runner_sharding.js; sourceTree = ""; }; 93AB91520F4F1C970020A046 /* eval1.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = eval1.js; sourceTree = ""; }; 93AB91530F4F1C970020A046 /* eval2.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = eval2.js; sourceTree = ""; }; + 93AE6FB20F9631A200857F1C /* dbNoCreate.js */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = dbNoCreate.js; sourceTree = ""; }; + 93AE6FB30F9631A200857F1C /* fileAllocation.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = fileAllocation.js; sourceTree = ""; }; 93AF75500F216D0300994C66 /* jsontests.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = jsontests.cpp; sourceTree = ""; }; 93B4A81A0F1C01B4000C862C /* security.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = security.cpp; sourceTree = ""; }; 93B4A81B0F1C01D8000C862C /* lasterror.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lasterror.cpp; sourceTree = ""; }; @@ -256,6 +257,7 @@ 93D948210F7BF4FA00C3C768 /* shellfork.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = shellfork.js; sourceTree = ""; }; 93D949B40F7D2A7700C3C768 /* median.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = median.js; sourceTree = ""; }; 93DCDB5B0F93ED98005349BC /* nin.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = nin.js; sourceTree = ""; }; + 93DCDBD30F9515AF005349BC /* file_allocator.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = file_allocator.h; sourceTree = ""; }; 93E3C5310F704C9D0029011E /* repl4.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = repl4.js; sourceTree = ""; }; 93E3C5960F7149F40029011E /* repl5.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = repl5.js; sourceTree = ""; }; 93E559BF0F8BC6AC0027A4A6 /* drop.js */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; path = drop.js; 
sourceTree = ""; }; @@ -463,6 +465,7 @@ 934DD87B0EFAD23B00459CC1 /* util */ = { isa = PBXGroup; children = ( + 93DCDBD30F9515AF005349BC /* file_allocator.h */, 931184DC0F83C95800A6DC44 /* message_server_port.cpp */, 936B89590F4C899400934AF2 /* file.h */, 936B895A0F4C899400934AF2 /* md5.c */, @@ -491,14 +494,6 @@ path = util; sourceTree = ""; }; - 936ABBAA0F3CBE5400D5015F /* recovery */ = { - isa = PBXGroup; - children = ( - 936ABBAB0F3CBE5400D5015F /* dbNoCreate.js */, - ); - path = recovery; - sourceTree = ""; - }; 93A13A200F4620A500AF1B0D /* s */ = { isa = PBXGroup; children = ( @@ -544,6 +539,7 @@ 93A8D1D10F37544800C92B85 /* jstests */ = { isa = PBXGroup; children = ( + 93AE6FB10F9631A200857F1C /* disk */, 93DCDB5B0F93ED98005349BC /* nin.js */, 9350E1220F8CFFB300B07A1C /* error2.js */, 93E55A260F8BE5320027A4A6 /* rename.js */, @@ -562,7 +558,6 @@ 93AB91520F4F1C970020A046 /* eval1.js */, 93AB91530F4F1C970020A046 /* eval2.js */, 93F386400F40E27800967EFA /* hint1.js */, - 936ABBAA0F3CBE5400D5015F /* recovery */, 936AB9350F3C8AB800D5015F /* _lodeRunner.js */, 936AB4BB0F3A5B0300D5015F /* update3.js */, 93A8D8200F38FE2400C92B85 /* autoid.js */, @@ -651,6 +646,15 @@ path = repl; sourceTree = ""; }; + 93AE6FB10F9631A200857F1C /* disk */ = { + isa = PBXGroup; + children = ( + 93AE6FB20F9631A200857F1C /* dbNoCreate.js */, + 93AE6FB30F9631A200857F1C /* fileAllocation.js */, + ); + path = disk; + sourceTree = ""; + }; 93D19B300F5EF09C0084C329 /* clone */ = { isa = PBXGroup; children = ( diff --git a/util/file_allocator.h b/util/file_allocator.h new file mode 100644 index 00000000000..37a08a414c4 --- /dev/null +++ b/util/file_allocator.h @@ -0,0 +1,171 @@ +/** + * Copyright (C) 2009 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "stdafx.h"
+#include <fcntl.h>   // open() and the O_* flags
+
+#ifndef O_NOATIME
+#define O_NOATIME 0
+#endif
+
+namespace mongo {
+
+    // Handles allocation of contiguous files on disk.
+    //
+    // Requests are kept in a list (pending_) guarded by pendingMutex_. A
+    // worker thread launched by start() takes the entry at the front of the
+    // list, creates the file and fills it with zeroes, then removes the entry
+    // and signals pendingUpdated_.
+    class FileAllocator {
+        // The public functions may not be called concurrently. If
+        // allocateAsap() is called for a file after requestAllocation(), the
+        // sizes in each call must be the same.
+    public:
+        // Launches the worker thread. The thread handle is a local that is
+        // never joined, so the worker runs for the rest of the process.
+        void start() {
+            Runner r( *this );
+            boost::thread t( r );
+        }
+
+        // Queues 'name' to be created with 'size' bytes and returns at once.
+        // Does nothing if the file already exists or is already queued.
+        void requestAllocation( const string &name, int size ) {
+            if ( boost::filesystem::exists( name ) )
+                return;
+            {
+                boostlock lk( pendingMutex_ );
+                // A second entry for the same name would make the worker
+                // process the file twice.
+                if ( queued( name ) )
+                    return;
+                pending_.push_back( make_pair( name, size ) );
+            }
+            pendingUpdated_.notify_all();
+        }
+
+        // Moves 'name' to the head of the queue (directly behind the entry
+        // the worker may already be processing) and blocks until the worker
+        // is done with it.
+        //
+        // On return the file normally exists with the requested size. If the
+        // worker failed to allocate it, the file is absent and the caller's
+        // own attempt to open it will report the error; this function does
+        // not wait forever in that case.
+        void allocateAsap( const string &name, int size ) {
+            boostlock lk( pendingMutex_ );
+            if ( allocated( name ) )
+                return;
+
+            pair< string, int > spec( name, size );
+            if ( pending_.empty() )
+                pending_.push_back( spec );
+            else if ( pending_.front() != spec ) {
+                // The front entry may be in progress on the worker thread
+                // (it is popped only after the work is done), so never insert
+                // ahead of it.
+                pending_.remove( spec );
+                list< pair< string, int > >::iterator i = pending_.begin();
+                ++i;
+                pending_.insert( i, spec );
+            }
+            pendingUpdated_.notify_all();
+
+            // Wait on "no longer queued" rather than "file exists": after a
+            // failed allocation the file never appears.
+            while ( queued( name ) )
+                pendingUpdated_.wait( lk );
+        }
+
+        // Blocks until the queue of pending allocations is empty.
+        void waitUntilFinished() const {
+            boostlock lk( pendingMutex_ );
+            while ( !pending_.empty() )
+                pendingUpdated_.wait( lk );
+        }
+
+    private:
+        // caller must hold pendingMutex_ lock
+        // True if an entry for 'name' is still in the queue (waiting or
+        // currently being processed).
+        bool queued( const string &name ) const {
+            for( list< pair< string, int > >::const_iterator i = pending_.begin(); i != pending_.end(); ++i )
+                if ( i->first == name )
+                    return true;
+            return false;
+        }
+
+        // caller must hold pendingMutex_ lock
+        // True if the file is on disk and the worker is finished with it.
+        bool allocated( const string &name ) const {
+            return boost::filesystem::exists( name ) && !queued( name );
+        }
+
+        mutable boost::mutex pendingMutex_;
+        mutable boost::condition_variable pendingUpdated_;
+        list< pair< string, int > > pending_;
+
+        // Body of the worker thread.
+        struct Runner {
+            Runner( FileAllocator &allocator ) : a_( allocator ) {}
+            FileAllocator &a_;
+
+            // Loops forever: sleep until there is a request, allocate the
+            // file at the front of the queue, pop it, wake any waiters.
+            void operator()() {
+                while( 1 ) {
+                    string name;
+                    int size;
+                    {
+                        boostlock lk( a_.pendingMutex_ );
+                        while ( a_.pending_.empty() )
+                            a_.pendingUpdated_.wait( lk );
+                        name = a_.pending_.front().first;
+                        size = a_.pending_.front().second;
+                    }
+
+                    // The lock is not held during the (slow) disk work.
+                    try {
+                        allocate( name, size );
+                    } catch ( ... ) {
+                        problem() << "Failed to allocate new file: " << name
+                                  << ", size: " << size << ", aborting." << endl;
+                    }
+
+                    {
+                        // Still the entry read above: other threads only
+                        // append or insert behind the front.
+                        boostlock lk( a_.pendingMutex_ );
+                        a_.pending_.pop_front();
+                    }
+                    a_.pendingUpdated_.notify_all();
+                }
+            }
+
+        private:
+            // Closes a file descriptor when it goes out of scope, including
+            // when an assertion throws.
+            struct FdCloser {
+                explicit FdCloser( int fd ) : fd_( fd ) {}
+                ~FdCloser() { close( fd_ ); }
+                int fd_;
+            };
+
+            // Makes sure 'name' exists and is at least 'size' bytes long.
+            // Throws (via massert) if the file cannot be opened, already has
+            // a non-zero length shorter than 'size', or cannot be filled.
+            static void allocate( const string &name, int size ) {
+                int fd = open(name.c_str(), O_CREAT | O_RDWR | O_NOATIME, S_IRUSR | S_IWUSR);
+                if ( fd < 0 ) {   // open() reports failure as -1; 0 is a valid descriptor
+                    stringstream ss;
+                    ss << "couldn't open " << name << ' ' << errno;
+                    massert( ss.str(), false );
+                }
+                FdCloser closer( fd );
+
+                /* make sure the file is the full desired length */
+                off_t filelen = lseek(fd, 0, SEEK_END);
+                if ( filelen >= size )
+                    return;
+                massert( "failure mapping new file", filelen == 0 );
+
+                try {
+                    zeroFill( fd, name, size );
+                } catch ( ... ) {
+                    // The file held no data (length was 0). Remove the short
+                    // file so that it is not mistaken for an allocated one.
+                    try {
+                        boost::filesystem::remove( name );
+                    } catch ( ... ) {
+                    }
+                    throw;
+                }
+            }
+
+            // Extends the empty file behind 'fd' to 'size' bytes of zeroes.
+            // Throws (via massert) if the space cannot be written.
+            static void zeroFill( int fd, const string &name, int size ) {
+                // Check for end of disk.
+                massert( "Unable to allocate file of desired size",
+                         size - 1 == lseek(fd, size - 1, SEEK_SET) );
+                massert( "Unable to allocate file of desired size",
+                         1 == write(fd, "", 1) );
+                lseek(fd, 0, SEEK_SET);
+                log() << "allocating new datafile " << name << ", filling with zeroes..." << endl;
+                Timer t;
+                const int z = 8192;
+                char buf[z];
+                memset(buf, 0, z);
+                int left = size;
+                while ( left > 0 ) {
+                    int want = left < z ? left : z;
+                    ssize_t wrote = write(fd, buf, want);
+                    if ( wrote < 0 && errno == EINTR )
+                        continue;   // interrupted before writing anything; retry
+                    // write() may store fewer bytes than asked for; anything
+                    // that makes no progress is a failure.
+                    massert( "Unable to allocate file of desired size", wrote > 0 );
+                    left -= (int) wrote;
+                }
+                log() << "done allocating datafile " << name << ", size: " << size << ", took " << ((double)t.millis())/1000.0 << " secs" << endl;
+            }
+        };
+    };
+
+    FileAllocator &theFileAllocator();
+
+} // namespace mongo
\ No newline at end of file
diff --git a/util/goodies.h b/util/goodies.h
index 80c03052e4e..e9e561e778f 100644
--- a/util/goodies.h
+++ b/util/goodies.h
@@ -113,6 +113,7 @@ namespace mongo {
 } // namespace mongo
 
 #include
+#include
 #include
 
 namespace mongo {
diff --git a/util/mmap.cpp b/util/mmap.cpp
index e0df6ec1ad9..a68cff77fc2 100644
--- a/util/mmap.cpp
+++ b/util/mmap.cpp
@@ -46,7 +46,7 @@ namespace mongo {
         --closingAllFiles;
     }
 
-    void MemoryMappedFile::updateLength( const char *filename, int &length ) const {
+    void MemoryMappedFile::updateLength( const char *filename, int &length ) {
         if ( !boost::filesystem::exists( filename ) )
             return;
         // make sure we map full length if preexisting file.
diff --git a/util/mmap.h b/util/mmap.h index a2ecc138e82..110f041039e 100644 --- a/util/mmap.h +++ b/util/mmap.h @@ -44,7 +44,7 @@ namespace mongo { return len; } - void updateLength( const char *filename, int &length ) const; + static void updateLength( const char *filename, int &length ); private: void created(); diff --git a/util/mmap_posix.cpp b/util/mmap_posix.cpp index 360484554a0..b351b02392f 100644 --- a/util/mmap_posix.cpp +++ b/util/mmap_posix.cpp @@ -18,6 +18,7 @@ #include "stdafx.h" #include "mmap.h" +#include "file_allocator.h" #include #include @@ -53,47 +54,14 @@ namespace mongo { updateLength( filename, length ); len = length; - fd = open(filename, O_CREAT | O_RDWR | O_NOATIME, S_IRUSR | S_IWUSR); + theFileAllocator().allocateAsap( filename, length ); + + fd = open(filename, O_RDWR | O_NOATIME); if ( fd <= 0 ) { out() << "couldn't open " << filename << ' ' << errno << endl; return 0; } - /* make sure the file is the full desired length */ - off_t filelen = lseek(fd, 0, SEEK_END); - if ( filelen < length ) { - // log() << "map: file length=" << (unsigned) filelen << " want:" - // << length - // << endl; - if ( filelen != 0 ) { - problem() << "failure mapping new file " << filename << " length:" << length << endl; - return 0; - } - // Check for end of disk. 
- massert( "Unable to allocate file of desired size", - length - 1 == lseek(fd, length - 1, SEEK_SET) ); - massert( "Unable to allocate file of desired size", - 1 == write(fd, "", 1) ); - lseek(fd, 0, SEEK_SET); - Nullstream &l = log(); - l << "new datafile " << filename << " filling with zeroes..."; - l.flush(); - Timer t; - int z = 8192; - char buf[z]; - memset(buf, 0, z); - int left = length; - while ( 1 ) { - if ( left <= z ) { - write(fd, buf, left); - break; - } - write(fd, buf, z); - left -= z; - } - l << "done " << ((double)t.millis())/1000.0 << " secs" << endl; - } - view = mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if ( view == MAP_FAILED ) { out() << " mmap() failed for " << filename << " len:" << length << " errno:" << errno << endl; diff --git a/util/util.cpp b/util/util.cpp index 4190966849a..ebebf3e8bdf 100644 --- a/util/util.cpp +++ b/util/util.cpp @@ -20,6 +20,7 @@ #include "goodies.h" #include "unittest.h" #include "top.h" +#include "file_allocator.h" namespace mongo { @@ -94,5 +95,9 @@ namespace mongo { Top::UsageMap Top::snapshotB_; Top::UsageMap &Top::snapshot_ = Top::snapshotA_; Top::UsageMap &Top::nextSnapshot_ = Top::snapshotB_; + + // The mutex contained in this object may be held on shutdown. + FileAllocator &theFileAllocator_ = *(new FileAllocator()); + FileAllocator &theFileAllocator() { return theFileAllocator_; } } // namespace mongo