Files
mongo/db/repl/rs_rollback.cpp

343 lines
12 KiB
C++
Raw Normal View History

2010-07-21 14:15:48 -04:00
/* @file rs_rollback.cpp
2010-07-21 01:45:04 -04:00
*
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "pch.h"
#include "../client.h"
#include "../../client/dbclient.h"
#include "rs.h"
#include "../repl.h"
2010-07-23 15:48:30 -04:00
#include "../query.h"
2010-07-21 01:45:04 -04:00
/* Scenarios
We went offline with ops not replicated out.
F = node that failed and coming back.
P = node that took over, new primary
#1:
F : a b c d e f g
P : a b c d q
The design is "keep P". One could argue here that "keep F" has some merits, however, in most cases P
will have significantly more data. Also note that P may have a proper subset of F's stream if there were
2010-07-21 12:50:03 -04:00
no subsequent writes.
2010-07-21 01:45:04 -04:00
For now the model is simply : get F back in sync with P. If P was really behind or something, we should have
just chosen not to fail over anyway.
#2:
F : a b c d e f g -> a b c d
P : a b c d
#3:
F : a b c d e f g -> a b c d q r s t u v w x z
P : a b c d.q r s t u v w x z
Steps
find an event in common. 'd'.
undo our events beyond that by:
(1) taking copy from other server of those objects
2010-07-21 12:50:03 -04:00
(2) do not consider the copy valid until we reach an optime later than the point at which we fetched the new version of the object
2010-07-21 01:45:04 -04:00
-- i.e., reset minvalid.
2010-07-21 12:50:03 -04:00
(3) we could skip operations on objects that are previous in time to our capture of the object as an optimization.
2010-07-21 01:45:04 -04:00
*/
namespace mongo {
2010-07-21 14:15:48 -04:00
using namespace bson;
2010-07-26 15:29:18 -04:00
/* Identifies one document: the namespace it lives in plus its _id element.
   Ordered by namespace first (strcmp), then by _id, so it can be used as a set key. */
struct DocID {
    const char *ns;
    be _id;

    /* strict weak ordering: namespace decides unless both are equal, then _id decides */
    bool operator<(const DocID& d) const {
        const int nsOrder = strcmp(ns, d.ns);
        if( nsOrder != 0 )
            return nsOrder < 0;
        return _id < d._id;
    }
};
2010-07-21 15:39:59 -04:00
struct HowToFixUp {
2010-07-26 15:29:18 -04:00
/* note this is a set -- if there are many $inc's on a single document we need to rollback, we only
need to refetch it once. */
set<DocID> toRefetch;
2010-07-21 22:20:34 -04:00
2010-07-21 15:39:59 -04:00
OpTime commonPoint;
2010-07-26 15:29:18 -04:00
DiskLoc commonPointOurDiskloc;
2010-07-21 15:39:59 -04:00
};
2010-07-27 02:13:13 -04:00
static void refetch(HowToFixUp& h, const BSONObj& ourObj) {
const char *op = ourObj.getStringField("op");
if( *op == 'n' )
return;
unsigned long long totSize = 0;
totSize += ourObj.objsize();
if( totSize > 512 * 1024 * 1024 )
throw "rollback too large";
DocID d;
d.ns = ourObj.getStringField("ns");
if( *d.ns == 0 ) {
log() << "replSet WARNING ignoring op on rollback TODO : " << ourObj.toString() << rsLog;
return;
}
bo o = ourObj.getObjectField(*op=='u' ? "o2" : "o");
if( o.isEmpty() ) {
log() << "replSet warning ignoring op on rollback : " << ourObj.toString() << rsLog;
return;
}
d._id = o["_id"];
if( d._id.eoo() ) {
log() << "replSet WARNING ignoring op on rollback no _id TODO : " << ourObj.toString() << rsLog;
return;
}
h.toRefetch.insert(d);
}
2010-07-26 15:29:18 -04:00
/* Walk our oplog and the remote oplog backwards in step until an entry with the same
   "ts" and "h" is found on both sides. Every entry of ours passed over on the way is
   handed to refetch().

   @param them  connection to the node we are syncing from
   @param h     out: toRefetch, commonPoint and commonPointOurDiskloc are filled in
   @throws const char*  for the expected failure cases (called again too soon, empty or
                        unreadable oplog on either side, rollback span over one hour,
                        ran off the beginning of either oplog)

   The caller must hold at least a read lock on dbMutex (asserted below).
*/
static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) {
    static time_t last;
    if( time(0)-last < 60 ) {
        // this could put a lot of load on someone else, don't repeat too often
        sleepsecs(10);
        throw "findcommonpoint waiting a while before trying again";
    }
    last = time(0);

    assert( dbMutex.atLeastReadLocked() );
    Client::Context c(rsoplog, dbpath, 0, false);
    NamespaceDetails *nsd = nsdetails(rsoplog);
    assert(nsd);
    ReverseCappedCursor u(nsd);
    if( !u.ok() )
        throw "our oplog empty or unreadable";

    const Query q = Query().sort(reverseNaturalObj);
    const bo fields = BSON( "ts" << 1 << "h" << 1 );

    auto_ptr<DBClientCursor> t = them->query(rsoplog, q, 0, 0, &fields, 0, 0);
    // query() can hand back a null cursor when the request fails; don't dereference it
    if( t.get() == 0 || !t->more() ) throw "remote oplog empty or unreadable";

    BSONObj ourObj = u.current();
    OpTime ourTime = ourObj["ts"]._opTime();
    BSONObj theirObj = t->nextSafe();
    OpTime theirTime = theirObj["ts"]._opTime();

    {
        long long diff = (long long) ourTime.getSecs() - ((long long) theirTime.getSecs());
        /* diff could be positive, negative, or zero */
        log() << "replSet info syncRollback diff in end of log times : " << diff << " seconds" << rsLog;
        if( diff > 3600 ) {
            log() << "replSet syncRollback too long a time period for a rollback." << rsLog;
            throw "error not willing to roll back more than one hour of data";
        }
    }

    unsigned long long scanned = 0;
    while( 1 ) {
        scanned++;
        /* todo add code to assure no excessive scanning for too long */
        if( ourTime == theirTime ) {
            if( ourObj["h"].Long() == theirObj["h"].Long() ) {
                // found the point back in time where we match.
                // todo : check a few more just to be careful about hash collisions.
                log() << "replSet rollback found matching events at " << ourTime.toStringPretty() << rsLog;
                log() << "replSet rollback findcommonpoint scanned : " << scanned << rsLog;
                h.commonPoint = ourTime;
                h.commonPointOurDiskloc = u.currLoc();
                return;
            }

            // same timestamp but different hash: our entry is not shared, step both sides back
            refetch(h, ourObj);

            if( !t->more() ) throw "reached beginning of remote oplog";
            theirObj = t->nextSafe();
            theirTime = theirObj["ts"]._opTime();

            u.advance();
            if( !u.ok() ) throw "reached beginning of local oplog";
            ourObj = u.current();
            ourTime = ourObj["ts"]._opTime();
        }
        else if( theirTime > ourTime ) {
            // their entry is newer than ours; nothing of ours to undo, just step them back
            if( !t->more() ) throw "reached beginning of remote oplog";
            theirObj = t->nextSafe();
            theirTime = theirObj["ts"]._opTime();
        }
        else {
            // theirTime < ourTime
            refetch(h, ourObj);
            u.advance();
            if( !u.ok() ) throw "reached beginning of local oplog";
            ourObj = u.current();
            ourTime = ourObj["ts"]._opTime();
        }
    }
}
2010-07-23 16:19:39 -04:00
/* Not referenced by any code visible in this file.
   NOTE(review): judging by the field names this presumably pairs an oplog op with the
   good version of the document it touched -- confirm whether it is still needed. */
struct X {
const bson::bo *op;
bson::bo goodVersionOfObject;
};
2010-07-21 22:20:34 -04:00
/* Apply the rollback described by h.

   Phase 1: fetch the current version of every document in h.toRefetch from `them`.
   Phase 2: under the write lock, overwrite (upsert) or delete each of those documents locally.
   Phase 3: truncate our oplog after h.commonPointOurDiskloc.

   @param h     result of syncRollbackFindCommonPoint()
   @param them  connection to the node holding the good versions
   @throws DBException  if a fetch fails or more than 300MB would have to be fetched (13410),
                        or if the oplog namespace cannot be found (13412)

   Failures while applying an individual document are logged and reported through the
   heartbeat message but do not abort the rollback.
*/
void ReplSetImpl::syncFixUp(HowToFixUp& h, DBClientConnection *them) {
    // fetch all first so we needn't handle interruption in a fancy way

    unsigned long long totSize = 0;

    list< pair<DocID,bo> > goodVersions;

    // d and n live outside the try block so the catch handler can report how far we got
    DocID d;
    unsigned long long n = 0;
    try {
        for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); ++i ) {
            d = *i;

            assert( !d._id.eoo() );

            {
                /* TODO : slow.  lots of round trips. */
                n++;
                bo good= them->findOne(d.ns, d._id.wrap()).getOwned();
                totSize += good.objsize();
                uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 );

                // note good might be empty, indicating we should delete it
                goodVersions.push_back(pair<DocID,bo>(d,good));
            }
        }
    }
    catch(DBException& e) {
        sethbmsg(str::stream() << "syncRollback re-get objects: " << e.toString(),0);
        log() << "syncRollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog;
        // rethrow the original exception object; `throw e;` would copy it as a plain DBException
        throw;
    }

    // update them
    sethbmsg(str::stream() << "syncRollback 4 n:" << goodVersions.size());

    bool warn = false;

    assert( !h.commonPointOurDiskloc.isNull() );

    MemoryMappedFile::flushAll(true);

    dbMutex.assertWriteLocked();

    Client::Context c(rsoplog, dbpath, 0, /*doauth*/false);
    NamespaceDetails *oplogDetails = nsdetails(rsoplog);
    uassert(13412, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails);

    for( list<pair<DocID,bo> >::iterator i = goodVersions.begin(); i != goodVersions.end(); ++i ) {
        const DocID& d = i->first;
        bo pattern = d._id.wrap(); // { _id : ... }
        try {
            assert( d.ns && *d.ns );
            // todo: lots of overhead in context, this can be faster
            Client::Context c(d.ns, dbpath, 0, /*doauth*/false);
            if( i->second.isEmpty() ) {
                // wasn't on the primary; delete.
                /* TODO1.6 : can't delete from a capped collection.  need to handle that here. */
                try {
                    deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true);
                }
                catch(...) {
                    // deliberate best effort: log and carry on with the remaining documents
                    log() << "replSet rollback delete failed - todo finish capped collection support ns:" << d.ns << rsLog;
                }
            }
            else {
                // todo faster...
                OpDebug debug;
                _updateObjects(/*god*/true, d.ns, i->second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug);
            }
        }
        catch(DBException& e) {
            log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << rsLog;
            warn = true;
        }
    }

    sethbmsg("syncRollback 5");
    MemoryMappedFile::flushAll(true);
    sethbmsg("syncRollback 6");

    // clean up oplog
    oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);

    sethbmsg("syncRollback 7");
    MemoryMappedFile::flushAll(true);

    // done
    if( warn )
        sethbmsg("issues during syncRollback, see log");
    else
        sethbmsg("syncRollback done");
}
2010-07-21 13:13:36 -04:00
/* Entry point for a rollback: take the write lock, find the common point between our
   oplog and the oplog of the node `r` is connected to, then fix up our data.

   @param r  oplog reader connected to the node we are syncing from; its cursor is reset
   @throws DBException  rethrown from syncRollbackFindCommonPoint() after a 1 minute sleep;
                        exceptions from syncFixUp() propagate unchanged

   Must be called without lockedByMe() and without dbMutex held (asserted below).
   Expected failures (lock not obtained, const char* errors from the common point search)
   are reported via sethbmsg() and the function returns after a short back-off.
*/
void ReplSetImpl::syncRollback(OplogReader&r) {
    assert( !lockedByMe() );
    assert( !dbMutex.atLeastReadLocked() );

    sethbmsg("syncRollback 0");

    // seconds to wait before returning after a soft failure. the wait is performed after the
    // scope holding lk has ended so that we are not sleeping with the write lock taken.
    int backoffSecs = 0;
    {
        writelocktry lk(rsoplog, 20000);
        if( !lk.got() ) {
            sethbmsg("syncRollback couldn't get write lock in a reasonable time");
            backoffSecs = 2;
        }
        else {
            HowToFixUp how;
            sethbmsg("syncRollback 1");
            r.resetCursor();

            sethbmsg("syncRollback 2 FindCommonPoint");
            try {
                syncRollbackFindCommonPoint(r.conn(), how);
            }
            catch( const char *p ) {
                sethbmsg(string("syncRollback 2 error ") + p);
                backoffSecs = 10;
            }
            catch( DBException& e ) {
                sethbmsg(string("syncRollback 2 exception ") + e.toString() + "; sleeping 1 min");
                // NOTE(review): this sleep still happens while lk is in scope -- consider
                // releasing the lock before waiting here as well.
                sleepsecs(60);
                throw;
            }

            if( backoffSecs == 0 ) {
                sethbmsg("replSet syncRollback 3 fixup");
                syncFixUp(how, r.conn());
            }
        }
    }

    if( backoffSecs > 0 )
        sleepsecs(backoffSecs);
}
2010-07-21 01:45:04 -04:00
}