// btree.cpp /** * Copyright (C) 2008 10gen Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ #include "pch.h" #include "db.h" #include "btree.h" #include "pdfile.h" #include "json.h" #include "clientcursor.h" #include "client.h" #include "dbhelpers.h" #include "curop-inl.h" #include "stats/counters.h" namespace mongo { #if !defined(_DURABLE) || !defined(_DEBUG) #define VERIFYTHISLOC dassert( thisLoc.btree() == this ); #else // with _DURABLE, this assert wouldn't work without getting fancier as there are multiple mmap views for _DEBUG mode... #define VERIFYTHISLOC #endif /** * give us a writable version of the btree bucket (declares write intent). * note it is likely more efficient to declare write intent on something smaller when you can. */ BtreeBucket* DiskLoc::btreemod() const { assert( _a != -1 ); BtreeBucket *b = const_cast< BtreeBucket * >( btree() ); return static_cast< BtreeBucket* >( dur::writingPtr( b, BucketSize ) ); } _KeyNode& _KeyNode::writing() const { return *dur::writing( const_cast< _KeyNode* >( this ) ); } KeyNode::KeyNode(const BucketBasics& bb, const _KeyNode &k) : prevChildBucket(k.prevChildBucket), recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs()) { } // largest key size we allow. note we very much need to support bigger keys (somehow) in the future. static const int KeyMax = BucketSize / 10; // We define this value as the maximum number of bytes such that, if we have // fewer than this many bytes, we must be able to either merge with or receive // keys from any neighboring node. If our utilization goes below this value we // know we can bring up the utilization with a simple operation. Ignoring the // 90/10 split policy which is sometimes employed and our 'unused' nodes, this // is a lower bound on bucket utilization for non root buckets. // // Note that the exact value here depends on the implementation of // rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as // follows: We know we cannot merge with the neighbor, so the total data size // for us, the neighbor, and the separator must be at least // BtreeBucket::bodySize() + 1. We must be able to accept one key of any // allowed size, so our size plus storage for that additional key must be // <= BtreeBucket::bodySize() / 2. This way, with the extra key we'll have a // new bucket data size < half the total data size and by the implementation // of rebalancedSeparatorPos() the key must be added. static const int lowWaterMark = BtreeBucket::bodySize() / 2 - KeyMax - sizeof( _KeyNode ) + 1; static const int split_debug = 0; static const int insert_debug = 0; extern int otherTraceLevel; /** * this error is ok/benign when doing a background indexing -- that logic in pdfile checks explicitly * for the 10287 error code. */ static void alreadyInIndex() { // we don't use massert() here as that does logging and this is 'benign' - see catches in _indexRecord() throw MsgAssertionException(10287, "btree: key+recloc already in index"); } /* BucketBasics --------------------------------------------------- */ string BtreeBucket::bucketSummary() const { stringstream ss; ss << " Bucket info:" << endl; ss << " n: " << n << endl; ss << " parent: " << parent.toString() << endl; ss << " nextChild: " << parent.toString() << endl; ss << " flags:" << flags << endl; ss << " emptySize: " << emptySize << " topSize: " << topSize << endl; return ss.str(); } int BucketBasics::Size() const { assert( _wasSize == BucketSize ); return BucketSize; } void BucketBasics::_shape(int level, stringstream& ss) const { for ( int i = 0; i < level; i++ ) ss << ' '; ss << "*\n"; for ( int i = 0; i < n; i++ ) if ( !k(i).prevChildBucket.isNull() ) k(i).prevChildBucket.btree()->_shape(level+1,ss); if ( !nextChild.isNull() ) nextChild.btree()->_shape(level+1,ss); } int bt_fv=0; int bt_dmp=0; void BtreeBucket::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const { bt_dmp=1; fullValidate(thisLoc, order); bt_dmp=0; } int BtreeBucket::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount, bool strict) const { { bool f = false; assert( f = true ); massert( 10281 , "assert is misdefined", f); } killCurrentOp.checkForInterrupt(); assertValid(order, true); if ( bt_dmp ) { out() << thisLoc.toString() << ' '; ((BtreeBucket *) this)->dump(); } // keycount int kc = 0; for ( int i = 0; i < n; i++ ) { const _KeyNode& kn = k(i); if ( kn.isUsed() ) { kc++; } else { if ( unusedCount ) { ++( *unusedCount ); } } if ( !kn.prevChildBucket.isNull() ) { DiskLoc left = kn.prevChildBucket; const BtreeBucket *b = left.btree(); if ( strict ) { assert( b->parent == thisLoc ); } else { wassert( b->parent == thisLoc ); } kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict); } } if ( !nextChild.isNull() ) { const BtreeBucket *b = nextChild.btree(); if ( strict ) { assert( b->parent == thisLoc ); } else { wassert( b->parent == thisLoc ); } kc += b->fullValidate(nextChild, order, unusedCount, strict); } return kc; } int nDumped = 0; void BucketBasics::assertValid(const Ordering &order, bool force) const { if ( !debug && !force ) return; wassert( n >= 0 && n < Size() ); wassert( emptySize >= 0 && emptySize < BucketSize ); wassert( topSize >= n && topSize <= BucketSize ); // this is very slow so don't do often { static int _k; if( ++_k % 128 ) return; } DEV { // slow: for ( int i = 0; i < n-1; i++ ) { BSONObj k1 = keyNode(i).key; BSONObj k2 = keyNode(i+1).key; int z = k1.woCompare(k2, order); //OK if ( z > 0 ) { out() << "ERROR: btree key order corrupt. Keys:" << endl; if ( ++nDumped < 5 ) { for ( int j = 0; j < n; j++ ) { out() << " " << keyNode(j).key.toString() << endl; } ((BtreeBucket *) this)->dump(); } wassert(false); break; } else if ( z == 0 ) { if ( !(k(i).recordLoc < k(i+1).recordLoc) ) { out() << "ERROR: btree key order corrupt (recordloc's wrong). Keys:" << endl; out() << " k(" << i << "):" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl; out() << " k(" << i+1 << "):" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl; wassert( k(i).recordLoc < k(i+1).recordLoc ); } } } } else { //faster: if ( n > 1 ) { BSONObj k1 = keyNode(0).key; BSONObj k2 = keyNode(n-1).key; int z = k1.woCompare(k2, order); //wassert( z <= 0 ); if ( z > 0 ) { problem() << "btree keys out of order" << '\n'; ONCE { ((BtreeBucket *) this)->dump(); } assert(false); } } } } inline void BucketBasics::markUnused(int keypos) { assert( keypos >= 0 && keypos < n ); k(keypos).setUnused(); } inline int BucketBasics::totalDataSize() const { return (int) (Size() - (data-(char*)this)); } void BucketBasics::init() { parent.Null(); nextChild.Null(); _wasSize = BucketSize; _reserved1 = 0; flags = Packed; n = 0; emptySize = totalDataSize(); topSize = 0; reserved = 0; } /** see _alloc */ inline void BucketBasics::_unalloc(int bytes) { topSize -= bytes; emptySize += bytes; } /** * we allocate space from the end of the buffer for data. * the keynodes grow from the front. */ inline int BucketBasics::_alloc(int bytes) { topSize += bytes; emptySize -= bytes; int ofs = totalDataSize() - topSize; assert( ofs > 0 ); return ofs; } void BucketBasics::_delKeyAtPos(int keypos, bool mayEmpty) { assert( keypos >= 0 && keypos <= n ); assert( childForPos(keypos).isNull() ); // TODO audit cases where nextChild is null assert( ( mayEmpty && n > 0 ) || n > 1 || nextChild.isNull() ); emptySize += sizeof(_KeyNode); n--; for ( int j = keypos; j < n; j++ ) k(j) = k(j+1); setNotPacked(); } /** * pull rightmost key from the bucket. this version requires its right child to be null so it * does not bother returning that value. */ void BucketBasics::popBack(DiskLoc& recLoc, BSONObj& key) { massert( 10282 , "n==0 in btree popBack()", n > 0 ); assert( k(n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that KeyNode kn = keyNode(n-1); recLoc = kn.recordLoc; key = kn.key; int keysize = kn.key.objsize(); massert( 10283 , "rchild not null in btree popBack()", nextChild.isNull()); // weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full. nextChild = kn.prevChildBucket; n--; emptySize += sizeof(_KeyNode); _unalloc(keysize); } /** add a key. must be > all existing. be careful to set next ptr right. */ bool BucketBasics::_pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild) { int bytesNeeded = key.objsize() + sizeof(_KeyNode); if ( bytesNeeded > emptySize ) return false; assert( bytesNeeded <= emptySize ); assert( n == 0 || keyNode(n-1).key.woCompare(key, order) <= 0 ); emptySize -= sizeof(_KeyNode); _KeyNode& kn = k(n++); kn.prevChildBucket = prevChild; kn.recordLoc = recordLoc; kn.setKeyDataOfs( (short) _alloc(key.objsize()) ); char *p = dataAt(kn.keyDataOfs()); memcpy(p, key.objdata(), key.objsize()); return true; } /** * insert a key in a bucket with no complexity -- no splits required * @return false if a split is required. */ bool BucketBasics::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order) const { assert( keypos >= 0 && keypos <= n ); int bytesNeeded = key.objsize() + sizeof(_KeyNode); if ( bytesNeeded > emptySize ) { _pack(thisLoc, order, keypos); if ( bytesNeeded > emptySize ) return false; } BucketBasics *b; { const char *p = (const char *) &k(keypos); const char *q = (const char *) &k(n+1); // declare that we will write to [k(keypos),k(n)] // todo: this writes a medium amount to the journal. we may want to add a verb "shift" to the redo log so // we can log a very small amount. b = (BucketBasics*) dur::writingAtOffset((void *) this, p-(char*)this, q-p); // e.g. n==3, keypos==2 // 1 4 9 // -> // 1 4 _ 9 for ( int j = n; j > keypos; j-- ) // make room b->k(j) = b->k(j-1); } dur::declareWriteIntent(&b->emptySize, 12); b->emptySize -= sizeof(_KeyNode); b->n++; _KeyNode& kn = b->k(keypos); kn.prevChildBucket.Null(); kn.recordLoc = recordLoc; kn.setKeyDataOfs((short) b->_alloc(key.objsize()) ); char *p = b->dataAt(kn.keyDataOfs()); dur::declareWriteIntent(p, key.objsize()); memcpy(p, key.objdata(), key.objsize()); return true; } /** with this implementation, refPos == 0 disregards effect of refPos */ bool BucketBasics::mayDropKey( int index, int refPos ) const { return index > 0 && ( index != refPos ) && k( index ).isUnused() && k( index ).prevChildBucket.isNull(); } int BucketBasics::packedDataSize( int refPos ) const { if ( flags & Packed ) { return BucketSize - emptySize - headerSize(); } int size = 0; for( int j = 0; j < n; ++j ) { if ( mayDropKey( j, refPos ) ) { continue; } size += keyNode( j ).key.objsize() + sizeof( _KeyNode ); } return size; } /** * when we delete things we just leave empty space until the node is * full and then we repack it. */ void BucketBasics::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const { if ( flags & Packed ) return; thisLoc.btreemod()->_packReadyForMod(order, refPos); } void BucketBasics::_packReadyForMod( const Ordering &order, int &refPos ) { if ( flags & Packed ) return; int tdz = totalDataSize(); char temp[BucketSize]; int ofs = tdz; topSize = 0; int i = 0; for ( int j = 0; j < n; j++ ) { if( mayDropKey( j, refPos ) ) { continue; // key is unused and has no children - drop it } if( i != j ) { if ( refPos == j ) { refPos = i; // i < j so j will never be refPos again } k( i ) = k( j ); } short ofsold = k(i).keyDataOfs(); int sz = keyNode(i).key.objsize(); ofs -= sz; topSize += sz; memcpy(temp+ofs, dataAt(ofsold), sz); k(i).setKeyDataOfsSavingUse( ofs ); ++i; } if ( refPos == n ) { refPos = i; } n = i; int dataUsed = tdz - ofs; memcpy(data + ofs, temp + ofs, dataUsed); emptySize = tdz - dataUsed - n * sizeof(_KeyNode); assert( emptySize >= 0 ); setPacked(); assertValid( order ); } inline void BucketBasics::truncateTo(int N, const Ordering &order, int &refPos) { n = N; setNotPacked(); _packReadyForMod( order, refPos ); } /** * In the standard btree algorithm, we would split based on the * existing keys _and_ the new key. But that's more work to * implement, so we split the existing keys and then add the new key. * * There are several published heuristic algorithms for doing splits, * but basically what you want are (1) even balancing between the two * sides and (2) a small split key so the parent can have a larger * branching factor. * * We just have a simple algorithm right now: if a key includes the * halfway point (or 10% way point) in terms of bytes, split on that key; * otherwise split on the key immediately to the left of the halfway * point. * * This function is expected to be called on a packed bucket. */ int BucketBasics::splitPos( int keypos ) const { assert( n > 2 ); int split = 0; int rightSize = 0; // when splitting a btree node, if the new key is greater than all the other keys, we should not do an even split, but a 90/10 split. // see SERVER-983 int rightSizeLimit = ( topSize + sizeof( _KeyNode ) * n ) / ( keypos == n ? 10 : 2 ); for( int i = n - 1; i > -1; --i ) { rightSize += keyNode( i ).key.objsize() + sizeof( _KeyNode ); if ( rightSize > rightSizeLimit ) { split = i; break; } } // safeguards - we must not create an empty bucket if ( split < 1 ) { split = 1; } else if ( split > n - 2 ) { split = n - 2; } return split; } void BucketBasics::reserveKeysFront( int nAdd ) { assert( emptySize >= int( sizeof( _KeyNode ) * nAdd ) ); emptySize -= sizeof( _KeyNode ) * nAdd; for( int i = n - 1; i > -1; --i ) { k( i + nAdd ) = k( i ); } n += nAdd; } void BucketBasics::setKey( int i, const DiskLoc recordLoc, const BSONObj &key, const DiskLoc prevChildBucket ) { _KeyNode &kn = k( i ); kn.recordLoc = recordLoc; kn.prevChildBucket = prevChildBucket; short ofs = (short) _alloc( key.objsize() ); kn.setKeyDataOfs( ofs ); char *p = dataAt( ofs ); memcpy( p, key.objdata(), key.objsize() ); } void BucketBasics::dropFront( int nDrop, const Ordering &order, int &refpos ) { for( int i = nDrop; i < n; ++i ) { k( i - nDrop ) = k( i ); } n -= nDrop; setNotPacked(); _packReadyForMod( order, refpos ); } /* - BtreeBucket --------------------------------------------------- */ /** @return largest key in the subtree. */ void BtreeBucket::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) { DiskLoc loc = thisLoc; while ( 1 ) { const BtreeBucket *b = loc.btree(); if ( !b->nextChild.isNull() ) { loc = b->nextChild; continue; } assert(b->n>0); largestLoc = loc; largestKey = b->n-1; break; } } /** * NOTE Currently the Ordering implementation assumes a compound index will * not have more keys than an unsigned variable has bits. The same * assumption is used in the implementation below with respect to the 'mask' * variable. */ int BtreeBucket::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) { BSONObjIterator ll( l ); BSONObjIterator rr( rBegin ); vector< const BSONElement * >::const_iterator rr2 = rEnd.begin(); vector< bool >::const_iterator inc = rEndInclusive.begin(); unsigned mask = 1; for( int i = 0; i < rBeginLen; ++i, mask <<= 1 ) { BSONElement lll = ll.next(); BSONElement rrr = rr.next(); ++rr2; ++inc; int x = lll.woCompare( rrr, false ); if ( o.descending( mask ) ) x = -x; if ( x != 0 ) return x; } if ( rSup ) { return -direction; } for( ; ll.more(); mask <<= 1 ) { BSONElement lll = ll.next(); BSONElement rrr = **rr2; ++rr2; int x = lll.woCompare( rrr, false ); if ( o.descending( mask ) ) x = -x; if ( x != 0 ) return x; if ( !*inc ) { return -direction; } ++inc; } return 0; } bool BtreeBucket::exists(const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order) const { int pos; bool found; DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc); // skip unused keys while ( 1 ) { if( b.isNull() ) break; const BtreeBucket *bucket = b.btree(); const _KeyNode& kn = bucket->k(pos); if ( kn.isUsed() ) return bucket->keyAt(pos).woEqual(key); b = bucket->advance(b, pos, 1, "BtreeBucket::exists"); } return false; } /** * @param self - don't complain about ourself already being in the index case. * @return true = there is a duplicate. */ bool BtreeBucket::wouldCreateDup( const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order, const DiskLoc &self) const { int pos; bool found; DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc); while ( !b.isNull() ) { // we skip unused keys const BtreeBucket *bucket = b.btree(); const _KeyNode& kn = bucket->k(pos); if ( kn.isUsed() ) { if( bucket->keyAt(pos).woEqual(key) ) return kn.recordLoc != self; break; } b = bucket->advance(b, pos, 1, "BtreeBucket::dupCheck"); } return false; } string BtreeBucket::dupKeyError( const IndexDetails& idx , const BSONObj& key ){ stringstream ss; ss << "E11000 duplicate key error "; ss << "index: " << idx.indexNamespace() << " "; ss << "dup key: " << key; return ss.str(); } /** * Find a key withing this btree bucket. * * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the * key. That assures that even when there are many duplicates (e.g., 1 million) for a key, * our performance is still good. * * assertIfDup: if the key exists (ignoring the recordLoc), uassert * * pos: for existing keys k0...kn-1. * returns # it goes BEFORE. so key[pos-1] < key < key[pos] * returns n if it goes after the last existing key. * note result might be an Unused location! */ char foo; bool BtreeBucket::find(const IndexDetails& idx, const BSONObj& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const { #if defined(_EXPERIMENT1) { char *z = (char *) this; int i = 0; while( 1 ) { i += 4096; if( i >= BucketSize ) break; foo += z[i]; } } #endif globalIndexCounters.btree( (char*)this ); // binary search for this key bool dupsChecked = false; int l=0; int h=n-1; while ( l <= h ) { int m = (l+h)/2; KeyNode M = keyNode(m); int x = key.woCompare(M.key, order); if ( x == 0 ) { if( assertIfDup ) { if( k(m).isUnused() ) { // ok that key is there if unused. but we need to check that there aren't other // entries for the key then. as it is very rare that we get here, we don't put any // coding effort in here to make this particularly fast if( !dupsChecked ) { dupsChecked = true; if( idx.head.btree()->exists(idx, idx.head, key, order) ) { if( idx.head.btree()->wouldCreateDup(idx, idx.head, key, order, recordLoc) ) uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) ); else alreadyInIndex(); } } } else { if( M.recordLoc == recordLoc ) alreadyInIndex(); uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) ); } } // dup keys allowed. use recordLoc as if it is part of the key DiskLoc unusedRL = M.recordLoc; unusedRL.GETOFS() &= ~1; // so we can test equality without the used bit messing us up x = recordLoc.compare(unusedRL); } if ( x < 0 ) // key < M.key h = m-1; else if ( x > 0 ) l = m+1; else { // found it. pos = m; return true; } } // not found pos = l; if ( pos != n ) { BSONObj keyatpos = keyNode(pos).key; wassert( key.woCompare(keyatpos, order) <= 0 ); if ( pos > 0 ) { wassert( keyNode(pos-1).key.woCompare(key, order) <= 0 ); } } return false; } void BtreeBucket::delBucket(const DiskLoc thisLoc, const IndexDetails& id) { ClientCursor::informAboutToDeleteBucket(thisLoc); // slow... assert( !isHead() ); const BtreeBucket *p = parent.btree(); int parentIdx = indexInParent( thisLoc ); p->childForPos( parentIdx ).writing().Null(); deallocBucket( thisLoc, id ); } void BtreeBucket::deallocBucket(const DiskLoc thisLoc, const IndexDetails &id) { #if 0 // as a temporary defensive measure, we zap the whole bucket, AND don't truly delete // it (meaning it is ineligible for reuse). memset(this, 0, Size()); #else // defensive: n = -1; parent.Null(); string ns = id.indexNamespace(); theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), thisLoc.rec(), thisLoc); #endif } /** note: may delete the entire bucket! this invalid upon return sometimes. */ void BtreeBucket::delKeyAtPos( const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order) { assert(n>0); DiskLoc left = childForPos(p); if ( n == 1 ) { if ( left.isNull() && nextChild.isNull() ) { if ( isHead() ) { _delKeyAtPos(p); // we don't delete the top bucket ever } else { delBucket(thisLoc, id); } return; } markUnused(p); return; } if ( left.isNull() ) { _delKeyAtPos(p); balanceWithNeighbors( thisLoc, id, order ); } else { markUnused(p); } } void BtreeBucket::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) { assert( n == 0 && !nextChild.isNull() ); if ( parent.isNull() ) { assert( id.head == thisLoc ); id.head.writing() = nextChild; } else { parent.btree()->childForPos( indexInParent( thisLoc ) ).writing() = nextChild; } nextChild.btree()->parent.writing() = parent; ClientCursor::informAboutToDeleteBucket( thisLoc ); deallocBucket( thisLoc, id ); } bool BtreeBucket::mayMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const { assert( leftIndex >= 0 && leftIndex < n ); DiskLoc leftNodeLoc = childForPos( leftIndex ); DiskLoc rightNodeLoc = childForPos( leftIndex + 1 ); if ( leftNodeLoc.isNull() || rightNodeLoc.isNull() ) { // TODO if this situation is possible in long term implementation, maybe we should compact somehow anyway return false; } int pos = 0; { const BtreeBucket *l = leftNodeLoc.btree(); const BtreeBucket *r = rightNodeLoc.btree(); if ( ( headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.objsize() + sizeof(_KeyNode) > unsigned( BucketSize ) ) ) { return false; } } return true; } /** * This implementation must respect the meaning and value of lowWaterMark. * Also see comments in splitPos(). */ int BtreeBucket::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const { int split = -1; int rightSize = 0; const BtreeBucket *l = childForPos( leftIndex ).btree(); const BtreeBucket *r = childForPos( leftIndex + 1 ).btree(); int KNS = sizeof( _KeyNode ); int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.objsize() + KNS + r->topSize + r->n * KNS ) / 2; // This constraint should be ensured by only calling this function // if we go below the low water mark. assert( rightSizeLimit < BtreeBucket::bodySize() ); for( int i = r->n - 1; i > -1; --i ) { rightSize += r->keyNode( i ).key.objsize() + KNS; if ( rightSize > rightSizeLimit ) { split = l->n + 1 + i; break; } } if ( split == -1 ) { rightSize += keyNode( leftIndex ).key.objsize() + KNS; if ( rightSize > rightSizeLimit ) { split = l->n; } } if ( split == -1 ) { for( int i = l->n - 1; i > -1; --i ) { rightSize += l->keyNode( i ).key.objsize() + KNS; if ( rightSize > rightSizeLimit ) { split = i; break; } } } // safeguards - we must not create an empty bucket if ( split < 1 ) { split = 1; } else if ( split > l->n + 1 + r->n - 2 ) { split = l->n + 1 + r->n - 2; } return split; } void BtreeBucket::doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) { DiskLoc leftNodeLoc = childForPos( leftIndex ); DiskLoc rightNodeLoc = childForPos( leftIndex + 1 ); BtreeBucket *l = leftNodeLoc.btreemod(); BtreeBucket *r = rightNodeLoc.btreemod(); int pos = 0; l->_packReadyForMod( order, pos ); r->_packReadyForMod( order, pos ); // pack r in case there are droppable keys int oldLNum = l->n; { KeyNode kn = keyNode( leftIndex ); l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child } for( int i = 0; i < r->n; ++i ) { KeyNode kn = r->keyNode( i ); l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket ); } l->nextChild = r->nextChild; l->fixParentPtrs( leftNodeLoc, oldLNum ); r->delBucket( rightNodeLoc, id ); childForPos( leftIndex + 1 ) = leftNodeLoc; childForPos( leftIndex ) = DiskLoc(); _delKeyAtPos( leftIndex, true ); if ( n == 0 ) { // will trash this and thisLoc replaceWithNextChild( thisLoc, id ); } else { // balance recursively - maybe we should do this even when n == 0? balanceWithNeighbors( thisLoc, id, order ); } } int BtreeBucket::indexInParent( const DiskLoc &thisLoc ) const { assert( !parent.isNull() ); const BtreeBucket *p = parent.btree(); if ( p->nextChild == thisLoc ) { return p->n; } else { for( int i = 0; i < p->n; ++i ) { if ( p->k( i ).prevChildBucket == thisLoc ) { return i; } } } out() << "ERROR: can't find ref to child bucket.\n"; out() << "child: " << thisLoc << "\n"; dump(); out() << "Parent: " << parent << "\n"; p->dump(); assert(false); return -1; // just to compile } bool BtreeBucket::tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const { // If we can merge, then we must merge rather than balance to preserve // bucket utilization constraints. if ( mayMergeChildren( thisLoc, leftIndex ) ) { return false; } thisLoc.btreemod()->doBalanceChildren( thisLoc, leftIndex, id, order ); return true; } void BtreeBucket::doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split, BtreeBucket *l, const DiskLoc lchild, BtreeBucket *r, const DiskLoc rchild, IndexDetails &id, const Ordering &order ) { // TODO maybe do some audits the same way pushBack() does? int rAdd = l->n - split; r->reserveKeysFront( rAdd ); for( int i = split + 1, j = 0; i < l->n; ++i, ++j ) { KeyNode kn = l->keyNode( i ); r->setKey( j, kn.recordLoc, kn.key, kn.prevChildBucket ); } { KeyNode kn = keyNode( leftIndex ); r->setKey( rAdd - 1, kn.recordLoc, kn.key, l->nextChild ); // left child's right child becomes old parent key's left child } r->fixParentPtrs( rchild, 0, rAdd - 1 ); { KeyNode kn = l->keyNode( split ); l->nextChild = kn.prevChildBucket; setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id ); } int zeropos = 0; l->truncateTo( split, order, zeropos ); } void BtreeBucket::doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split, BtreeBucket *l, const DiskLoc lchild, BtreeBucket *r, const DiskLoc rchild, IndexDetails &id, const Ordering &order ) { int lN = l->n; { KeyNode kn = keyNode( leftIndex ); l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child } for( int i = 0; i < split - lN - 1; ++i ) { KeyNode kn = r->keyNode( i ); l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket ); } { KeyNode kn = r->keyNode( split - lN - 1 ); l->nextChild = kn.prevChildBucket; l->fixParentPtrs( lchild, lN + 1, l->n ); setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id ); } int zeropos = 0; r->dropFront( split - lN, order, zeropos ); } void BtreeBucket::doBalanceChildren( DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) { DiskLoc lchild = childForPos( leftIndex ); DiskLoc rchild = childForPos( leftIndex + 1 ); int zeropos = 0; BtreeBucket *l = lchild.btreemod(); l->_packReadyForMod( order, zeropos ); BtreeBucket *r = rchild.btreemod(); r->_packReadyForMod( order, zeropos ); int split = rebalancedSeparatorPos( thisLoc, leftIndex ); // By definition, if we are below the low water mark and cannot merge // then we must actively balance. assert( split != l->n ); if ( split < l->n ) { doBalanceLeftToRight( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order ); } else { doBalanceRightToLeft( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order ); } } void BtreeBucket::balanceWithNeighbors( const DiskLoc thisLoc, IndexDetails &id, const Ordering &order ) const { if ( parent.isNull() ) { // we are root, there are no neighbors return; } if ( packedDataSize( 0 ) >= lowWaterMark ) { return; } const BtreeBucket *p = parent.btree(); int parentIdx = indexInParent( thisLoc ); // TODO will missing neighbor case be possible long term? Should we try to merge/balance somehow in that case if so? bool mayBalanceRight = ( ( parentIdx < p->n ) && !p->childForPos( parentIdx + 1 ).isNull() ); bool mayBalanceLeft = ( ( parentIdx > 0 ) && !p->childForPos( parentIdx - 1 ).isNull() ); // Balance if possible on one side - we merge only if absolutely necessary // to preserve btree bucket utilization constraints since that's a more // heavy duty operation (especially if we must re-split later). if ( mayBalanceRight && p->tryBalanceChildren( parent, parentIdx, id, order ) ) { return; } if ( mayBalanceLeft && p->tryBalanceChildren( parent, parentIdx - 1, id, order ) ) { return; } BtreeBucket *pm = parent.btreemod(); if ( mayBalanceRight ) { pm->doMergeChildren( parent, parentIdx, id, order ); } else if ( mayBalanceLeft ) { pm->doMergeChildren( parent, parentIdx - 1, id, order ); } } /** remove a key from the index */ bool BtreeBucket::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const { if ( key.objsize() > KeyMax ) { OCCASIONALLY problem() << "unindex: key too large to index, skipping " << id.indexNamespace() << /* ' ' << key.toString() << */ endl; return false; } int pos; bool found; DiskLoc loc = locate(id, thisLoc, key, Ordering::make(id.keyPattern()), pos, found, recordLoc, 1); if ( found ) { loc.btreemod()->delKeyAtPos(loc, id, pos, Ordering::make(id.keyPattern())); return true; } return false; } BtreeBucket* BtreeBucket::allocTemp() { BtreeBucket *b = (BtreeBucket*) malloc(BucketSize); b->init(); return b; } inline void BtreeBucket::fix(const DiskLoc thisLoc, const DiskLoc child) { if ( !child.isNull() ) { if ( insert_debug ) out() << " " << child.toString() << ".parent=" << thisLoc.toString() << endl; child.btree()->parent.writing() = thisLoc; } } /** this sucks. maybe get rid of parent ptrs. */ void BtreeBucket::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const { VERIFYTHISLOC if ( lastIndex == -1 ) { lastIndex = n; } for ( int i = firstIndex; i <= lastIndex; i++ ) { fix(thisLoc, childForPos(i)); } } void BtreeBucket::setInternalKey( const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const BSONObj &key, const Ordering &order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx ) { childForPos( keypos ).Null(); // This may leave the bucket empty (n == 0) which is ok only as a // transient state. In the instant case, the implementation of // insertHere behaves correctly when n == 0 and as a side effect // increments n. _delKeyAtPos( keypos, true ); // just set temporarily, required to pass validation in insertHere() childForPos( keypos ) = lchild; insertHere( thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx ); } /** * insert a key in this bucket, splitting if necessary. * @keypos - where to insert the key i3n range 0..n. 0=make leftmost, n=make rightmost. * NOTE this function may free some data, and as a result the value passed for keypos may * be invalid after calling insertHere() */ void BtreeBucket::insertHere( const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) const { if ( insert_debug ) out() << " " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' ' << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl; DiskLoc oldLoc = thisLoc; if ( !basicInsert(thisLoc, keypos, recordLoc, key, order) ) { thisLoc.btreemod()->split(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx); return; } { const _KeyNode *_kn = &k(keypos); _KeyNode *kn = (_KeyNode *) dur::alreadyDeclared((_KeyNode*) _kn); // already declared intent in basicInsert() if ( keypos+1 == n ) { // last key if ( nextChild != lchild ) { out() << "ERROR nextChild != lchild" << endl; out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl; out() << " keyPos: " << keypos << " n:" << n << endl; out() << " nextChild: " << nextChild.toString() << " lchild: " << lchild.toString() << endl; out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl; out() << " key: " << key.toString() << endl; dump(); assert(false); } kn->prevChildBucket = nextChild; assert( kn->prevChildBucket == lchild ); nextChild.writing() = rchild; if ( !rchild.isNull() ) rchild.btree()->parent.writing() = thisLoc; } else { kn->prevChildBucket = lchild; if ( k(keypos+1).prevChildBucket != lchild ) { out() << "ERROR k(keypos+1).prevChildBucket != lchild" << endl; out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl; out() << " keyPos: " << keypos << " n:" << n << endl; out() << " k(keypos+1).pcb: " << k(keypos+1).prevChildBucket.toString() << " lchild: " << lchild.toString() << endl; out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl; out() << " key: " << key.toString() << endl; dump(); assert(false); } const DiskLoc *pc = &k(keypos+1).prevChildBucket; *dur::alreadyDeclared((DiskLoc*) pc) = rchild; // declared in basicInsert() if ( !rchild.isNull() ) rchild.btree()->parent.writing() = thisLoc; } return; } } void BtreeBucket::split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) { if ( split_debug ) out() << " " << thisLoc.toString() << ".split" << endl; int split = splitPos( keypos ); DiskLoc rLoc = addBucket(idx); BtreeBucket *r = rLoc.btreemod(); if ( split_debug ) out() << " split:" << split << ' ' << keyNode(split).key.toString() << " n:" << n << endl; for ( int i = split+1; i < n; i++ ) { KeyNode kn = keyNode(i); r->pushBack(kn.recordLoc, kn.key, order, kn.prevChildBucket); } r->nextChild = nextChild; r->assertValid( order ); if ( split_debug ) out() << " new rLoc:" << rLoc.toString() << endl; r = 0; rLoc.btree()->fixParentPtrs(rLoc); { KeyNode splitkey = keyNode(split); nextChild = splitkey.prevChildBucket; // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r) if ( split_debug ) { out() << " splitkey key:" << splitkey.key.toString() << endl; } // promote splitkey to a parent node if ( parent.isNull() ) { // make a new parent if we were the root DiskLoc L = addBucket(idx); BtreeBucket *p = L.btreemod(); p->pushBack(splitkey.recordLoc, splitkey.key, order, thisLoc); p->nextChild = rLoc; p->assertValid( order ); parent = idx.head.writing() = L; if ( split_debug ) out() << " we were root, making new root:" << hex << parent.getOfs() << dec << endl; rLoc.btree()->parent.writing() = parent; } else { // set this before calling _insert - if it splits it will do fixParent() logic and change the value. rLoc.btree()->parent.writing() = parent; if ( split_debug ) out() << " promoting splitkey key " << splitkey.key.toString() << endl; parent.btree()->_insert(parent, splitkey.recordLoc, splitkey.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx); } } int newpos = keypos; // note this may trash splitkey.key. thus we had to promote it before finishing up here. truncateTo(split, order, newpos); // note this may trash splitkey.key. thus we had to promote it before finishing up here. // add our new key, there is room now { if ( keypos <= split ) { if ( split_debug ) out() << " keypos=0); rLoc.btree()->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx); } } if ( split_debug ) out() << " split end " << hex << thisLoc.getOfs() << dec << endl; } /** start a new index off, empty */ DiskLoc BtreeBucket::addBucket(const IndexDetails& id) { string ns = id.indexNamespace(); DiskLoc loc = theDataFileMgr.insert(ns.c_str(), 0, BucketSize, true); BtreeBucket *b = loc.btreemod(); b->init(); return loc; } void BtreeBucket::renameIndexNamespace(const char *oldNs, const char *newNs) { renameNamespace( oldNs, newNs ); } const DiskLoc BtreeBucket::getHead(const DiskLoc& thisLoc) const { DiskLoc p = thisLoc; while ( !p.btree()->isHead() ) p = p.btree()->parent; return p; } DiskLoc BtreeBucket::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const { if ( keyOfs < 0 || keyOfs >= n ) { out() << "ASSERT failure BtreeBucket::advance, caller: " << caller << endl; out() << " thisLoc: " << thisLoc.toString() << endl; out() << " keyOfs: " << keyOfs << " n:" << n << " direction: " << direction << endl; out() << bucketSummary() << endl; assert(false); } int adj = direction < 0 ? 1 : 0; int ko = keyOfs + direction; DiskLoc nextDown = childForPos(ko+adj); if ( !nextDown.isNull() ) { while ( 1 ) { keyOfs = direction>0 ? 0 : nextDown.btree()->n - 1; DiskLoc loc = nextDown.btree()->childForPos(keyOfs + adj); if ( loc.isNull() ) break; nextDown = loc; } return nextDown; } if ( ko < n && ko >= 0 ) { keyOfs = ko; return thisLoc; } // end of bucket. traverse back up. DiskLoc childLoc = thisLoc; DiskLoc ancestor = parent; while ( 1 ) { if ( ancestor.isNull() ) break; const BtreeBucket *an = ancestor.btree(); for ( int i = 0; i < an->n; i++ ) { if ( an->childForPos(i+adj) == childLoc ) { keyOfs = i; return ancestor; } } assert( direction<0 || an->nextChild == childLoc ); // parent exhausted also, keep going up childLoc = ancestor; ancestor = an->parent; } return DiskLoc(); } DiskLoc BtreeBucket::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const { int p; found = find(idx, key, recordLoc, order, p, /*assertIfDup*/ false); if ( found ) { pos = p; return thisLoc; } DiskLoc child = childForPos(p); if ( !child.isNull() ) { DiskLoc l = child.btree()->locate(idx, child, key, order, pos, found, recordLoc, direction); if ( !l.isNull() ) return l; } pos = p; if ( direction < 0 ) return --pos == -1 ? DiskLoc() /*theend*/ : thisLoc; else return pos == n ? DiskLoc() /*theend*/ : thisLoc; } bool BtreeBucket::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const { while( 1 ) { if ( l + 1 == h ) { keyOfs = ( direction > 0 ) ? h : l; DiskLoc next = thisLoc.btree()->k( h ).prevChildBucket; if ( !next.isNull() ) { bestParent = make_pair( thisLoc, keyOfs ); thisLoc = next; return true; } else { return false; } } int m = l + ( h - l ) / 2; int cmp = customBSONCmp( thisLoc.btree()->keyNode( m ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ); if ( cmp < 0 ) { l = m; } else if ( cmp > 0 ) { h = m; } else { if ( direction < 0 ) { l = m; } else { h = m; } } } } /** * find smallest/biggest value greater-equal/less-equal than specified * starting thisLoc + keyOfs will be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd * All the direction checks below allowed me to refactor the code, but possibly separate forward and reverse implementations would be more efficient */ void BtreeBucket::advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const { int l,h; bool dontGoUp; if ( direction > 0 ) { l = keyOfs; h = n - 1; dontGoUp = ( customBSONCmp( keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ); } else { l = 0; h = keyOfs; dontGoUp = ( customBSONCmp( keyNode( l ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ); } pair< DiskLoc, int > bestParent; if ( dontGoUp ) { // this comparison result assures h > l if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, thisLoc, keyOfs, bestParent ) ) { return; } } else { // go up parents until rightmost/leftmost node is >=/<= target or at top while( !thisLoc.btree()->parent.isNull() ) { thisLoc = thisLoc.btree()->parent; if ( direction > 0 ) { if ( customBSONCmp( thisLoc.btree()->keyNode( thisLoc.btree()->n - 1 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) { break; } } else { if ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) { break; } } } } customLocate( thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, bestParent ); } void BtreeBucket::customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const { if ( thisLoc.btree()->n == 0 ) { thisLoc = DiskLoc(); return; } // go down until find smallest/biggest >=/<= target while( 1 ) { int l = 0; int h = thisLoc.btree()->n - 1; // leftmost/rightmost key may possibly be >=/<= search key bool firstCheck; if ( direction > 0 ) { firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ); } else { firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ); } if ( firstCheck ) { DiskLoc next; if ( direction > 0 ) { next = thisLoc.btree()->k( 0 ).prevChildBucket; keyOfs = 0; } else { next = thisLoc.btree()->nextChild; keyOfs = h; } if ( !next.isNull() ) { bestParent = pair< DiskLoc, int >( thisLoc, keyOfs ); thisLoc = next; continue; } else { return; } } bool secondCheck; if ( direction > 0 ) { secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) < 0 ); } else { secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) > 0 ); } if ( secondCheck ) { DiskLoc next; if ( direction > 0 ) { next = thisLoc.btree()->nextChild; } else { next = thisLoc.btree()->k( 0 ).prevChildBucket; } if ( next.isNull() ) { // if bestParent is null, we've hit the end and thisLoc gets set to DiskLoc() thisLoc = bestParent.first; keyOfs = bestParent.second; return; } else { thisLoc = next; continue; } } if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, thisLoc, keyOfs, bestParent ) ) { return; } } } /** @thisLoc disk location of *this */ int BtreeBucket::_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, bool dupsAllowed, const DiskLoc lChild, const DiskLoc rChild, IndexDetails& idx) const { if ( key.objsize() > KeyMax ) { problem() << "ERROR: key too large len:" << key.objsize() << " max:" << KeyMax << ' ' << key.objsize() << ' ' << idx.indexNamespace() << endl; return 2; } assert( key.objsize() > 0 ); int pos; bool found = find(idx, key, recordLoc, order, pos, !dupsAllowed); if ( insert_debug ) { out() << " " << thisLoc.toString() << '.' << "_insert " << key.toString() << '/' << recordLoc.toString() << " l:" << lChild.toString() << " r:" << rChild.toString() << endl; out() << " found:" << found << " pos:" << pos << " n:" << n << endl; } if ( found ) { const _KeyNode& kn = k(pos); if ( kn.isUnused() ) { log(4) << "btree _insert: reusing unused key" << endl; massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull()); massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull()); kn.writing().setUsed(); return 0; } DEV { log() << "_insert(): key already exists in index (ok for background:true)\n"; log() << " " << idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n'; log() << " " << key.toString() << '\n'; log() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl; log() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl; log() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; } alreadyInIndex(); } DEBUGGING out() << "TEMP: key: " << key.toString() << endl; DiskLoc child = childForPos(pos); if ( insert_debug ) out() << " getChild(" << pos << "): " << child.toString() << endl; if ( child.isNull() || !rChild.isNull() /* means an 'internal' insert */ ) { insertHere(thisLoc, pos, recordLoc, key, order, lChild, rChild, idx); return 0; } return child.btree()->bt_insert(child, recordLoc, key, order, dupsAllowed, idx, /*toplevel*/false); } void BtreeBucket::dump() const { out() << "DUMP btreebucket n:" << n; out() << " parent:" << hex << parent.getOfs() << dec; for ( int i = 0; i < n; i++ ) { out() << '\n'; KeyNode k = keyNode(i); out() << '\t' << i << '\t' << k.key.toString() << "\tleft:" << hex << k.prevChildBucket.getOfs() << "\tRecLoc:" << k.recordLoc.toString() << dec; if ( this->k(i).isUnused() ) out() << " UNUSED"; } out() << " right:" << hex << nextChild.getOfs() << dec << endl; } /** todo: meaning of return code unclear clean up */ int BtreeBucket::bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, bool dupsAllowed, IndexDetails& idx, bool toplevel) const { if ( toplevel ) { if ( key.objsize() > KeyMax ) { problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.objsize() << ' ' << key.toString() << endl; return 3; } } int x = _insert(thisLoc, recordLoc, key, order, dupsAllowed, DiskLoc(), DiskLoc(), idx); assertValid( order ); return x; } void BtreeBucket::shape(stringstream& ss) const { _shape(0, ss); } int BtreeBucket::getLowWaterMark() { return lowWaterMark; } int BtreeBucket::getKeyMax() { return KeyMax; } DiskLoc BtreeBucket::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const { int pos; bool found; // TODO: is it really ok here that the order is a default? Ordering o = Ordering::make(BSONObj()); DiskLoc bucket = locate( indexdetails , indexdetails.head , key , o , pos , found , minDiskLoc ); if ( bucket.isNull() ) return bucket; const BtreeBucket *b = bucket.btree(); while ( 1 ){ const _KeyNode& knraw = b->k(pos); if ( knraw.isUsed() ) break; bucket = b->advance( bucket , pos , 1 , "findSingle" ); if ( bucket.isNull() ) return bucket; b = bucket.btree(); } KeyNode kn = b->keyNode( pos ); if ( key.woCompare( kn.key ) != 0 ) return DiskLoc(); return kn.recordLoc; } } // namespace mongo #include "db.h" #include "dbhelpers.h" namespace mongo { void BtreeBucket::a_test(IndexDetails& id) { BtreeBucket *b = id.head.btreemod(); // record locs for testing DiskLoc A(1, 20); DiskLoc B(1, 30); DiskLoc C(1, 40); DiskLoc rl; BSONObj key = fromjson("{x:9}"); BSONObj orderObj = fromjson("{}"); Ordering order = Ordering::make(orderObj); b->bt_insert(id.head, A, key, order, true, id); A.GETOFS() += 2; b->bt_insert(id.head, A, key, order, true, id); A.GETOFS() += 2; b->bt_insert(id.head, A, key, order, true, id); A.GETOFS() += 2; b->bt_insert(id.head, A, key, order, true, id); A.GETOFS() += 2; assert( b->k(0).isUsed() ); // b->k(0).setUnused(); b->k(1).setUnused(); b->k(2).setUnused(); b->k(3).setUnused(); b->dumpTree(id.head, orderObj); /* b->bt_insert(id.head, B, key, order, false, id); b->k(1).setUnused(); b->dumpTree(id.head, order); b->bt_insert(id.head, A, key, order, false, id); b->dumpTree(id.head, order); */ // this should assert. does it? (it might "accidentally" though, not asserting proves a problem, asserting proves nothing) b->bt_insert(id.head, C, key, order, false, id); // b->dumpTree(id.head, order); } /* --- BtreeBuilder --- */ BtreeBuilder::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) : dupsAllowed(_dupsAllowed), idx(_idx), n(0), order( idx.keyPattern() ), ordering( Ordering::make(idx.keyPattern()) ) { first = cur = BtreeBucket::addBucket(idx); b = cur.btreemod(); committed = false; } void BtreeBuilder::newBucket() { DiskLoc L = BtreeBucket::addBucket(idx); b->tempNext() = L; cur = L; b = cur.btreemod(); } void BtreeBuilder::addKey(BSONObj& key, DiskLoc loc) { if( !dupsAllowed ) { if( n > 0 ) { int cmp = keyLast.woCompare(key, order); massert( 10288 , "bad key order in BtreeBuilder - server internal error", cmp <= 0 ); if( cmp == 0 ) { //if( !dupsAllowed ) uasserted( ASSERT_ID_DUPKEY , BtreeBucket::dupKeyError( idx , keyLast ) ); } } keyLast = key; } if ( ! b->_pushBack(loc, key, ordering, DiskLoc()) ){ // no room if ( key.objsize() > KeyMax ) { problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.objsize() << ' ' << key.toString() << endl; } else { // bucket was full newBucket(); b->pushBack(loc, key, ordering, DiskLoc()); } } n++; } void BtreeBuilder::buildNextLevel(DiskLoc loc) { int levels = 1; while( 1 ) { if( loc.btree()->tempNext().isNull() ) { // only 1 bucket at this level. we are done. dur::writingDiskLoc(idx.head) = loc; break; } levels++; DiskLoc upLoc = BtreeBucket::addBucket(idx); DiskLoc upStart = upLoc; BtreeBucket *up = upLoc.btreemod(); DiskLoc xloc = loc; while( !xloc.isNull() ) { BtreeBucket *x = xloc.btreemod(); BSONObj k; DiskLoc r; x->popBack(r,k); bool keepX = ( x->n != 0 ); DiskLoc keepLoc = keepX ? xloc : x->nextChild; if ( ! up->_pushBack(r, k, ordering, keepLoc) ){ // current bucket full DiskLoc n = BtreeBucket::addBucket(idx); up->tempNext() = n; upLoc = n; up = upLoc.btreemod(); up->pushBack(r, k, ordering, keepLoc); } DiskLoc nextLoc = x->tempNext(); // get next in chain at current level if ( keepX ) { x->parent = upLoc; } else { if ( !x->nextChild.isNull() ) x->nextChild.btreemod()->parent = upLoc; x->deallocBucket( xloc, idx ); } xloc = nextLoc; } loc = upStart; } if( levels > 1 ) log(2) << "btree levels: " << levels << endl; } /** when all addKeys are done, we then build the higher levels of the tree */ void BtreeBuilder::commit() { buildNextLevel(first); committed = true; } BtreeBuilder::~BtreeBuilder() { if( !committed ) { log(2) << "Rolling back partially built index space" << endl; DiskLoc x = first; while( !x.isNull() ) { DiskLoc next = x.btree()->tempNext(); string ns = idx.indexNamespace(); theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), x.rec(), x); x = next; } assert( idx.head.isNull() ); log(2) << "done rollback" << endl; } } }