Files
mongo/db/repl/rs_rollback.cpp

668 lines
26 KiB
C++
Raw Normal View History

2010-07-21 14:15:48 -04:00
/* @file rs_rollback.cpp
2011-01-04 00:40:41 -05:00
*
2010-07-21 01:45:04 -04:00
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "pch.h"
#include "../client.h"
#include "../../client/dbclient.h"
#include "rs.h"
#include "../repl.h"
2011-06-22 15:51:08 -04:00
#include "../ops/query.h"
#include "../cloner.h"
2011-06-20 16:23:54 -04:00
#include "../ops/update.h"
#include "../ops/delete.h"
2010-07-21 01:45:04 -04:00
/* Scenarios
We went offline with ops not replicated out.
2011-01-04 00:40:41 -05:00
2010-07-21 01:45:04 -04:00
F = node that failed and coming back.
P = node that took over, new primary
#1:
F : a b c d e f g
P : a b c d q
2011-01-04 00:40:41 -05:00
The design is "keep P". One could argue here that "keep F" has some merits, however, in most cases P
will have significantly more data. Also note that P may have a proper subset of F's stream if there were
2010-07-21 12:50:03 -04:00
no subsequent writes.
2010-07-21 01:45:04 -04:00
2011-01-04 00:40:41 -05:00
For now the model is simply : get F back in sync with P. If P was really behind or something, we should have
2010-07-21 01:45:04 -04:00
just chosen not to fail over anyway.
#2:
F : a b c d e f g -> a b c d
P : a b c d
#3:
F : a b c d e f g -> a b c d q r s t u v w x z
P : a b c d.q r s t u v w x z
Steps
find an event in common. 'd'.
2011-01-04 00:40:41 -05:00
undo our events beyond that by:
2010-07-21 01:45:04 -04:00
(1) taking copy from other server of those objects
2011-01-04 00:40:41 -05:00
(2) do not consider copy valid until we pass reach an optime after when we fetched the new version of object
2010-07-21 01:45:04 -04:00
-- i.e., reset minvalid.
2010-07-21 12:50:03 -04:00
(3) we could skip operations on objects that are previous in time to our capture of the object as an optimization.
2010-07-21 01:45:04 -04:00
*/
namespace mongo {
2010-07-21 14:15:48 -04:00
using namespace bson;
2010-08-16 10:26:07 -04:00
void incRBID();
2010-08-16 10:01:26 -04:00
2011-01-04 00:40:41 -05:00
class rsfatal : public std::exception {
2010-08-16 09:17:14 -04:00
public:
2011-01-04 00:40:41 -05:00
virtual const char* what() const throw() { return "replica set fatal exception"; }
2010-08-16 09:17:14 -04:00
};
2010-07-26 15:29:18 -04:00
struct DocID {
const char *ns;
be _id;
2011-01-04 00:40:41 -05:00
bool operator<(const DocID& d) const {
2010-07-26 15:29:18 -04:00
int c = strcmp(ns, d.ns);
if( c < 0 ) return true;
if( c > 0 ) return false;
return _id < d._id;
}
};
2010-07-21 15:39:59 -04:00
struct HowToFixUp {
2011-01-04 00:40:41 -05:00
/* note this is a set -- if there are many $inc's on a single document we need to rollback, we only
2010-07-26 15:29:18 -04:00
need to refetch it once. */
set<DocID> toRefetch;
2010-07-21 22:20:34 -04:00
2010-08-03 15:00:56 -04:00
/* collections to drop */
set<string> toDrop;
set<string> collectionsToResync;
2010-07-21 15:39:59 -04:00
OpTime commonPoint;
2010-07-26 15:29:18 -04:00
DiskLoc commonPointOurDiskloc;
int rbid; // remote server's current rollback sequence #
2010-07-21 15:39:59 -04:00
};
2011-01-04 00:40:41 -05:00
static void refetch(HowToFixUp& h, const BSONObj& ourObj) {
2010-07-27 02:13:13 -04:00
const char *op = ourObj.getStringField("op");
2011-01-04 00:40:41 -05:00
if( *op == 'n' )
2010-07-27 02:13:13 -04:00
return;
unsigned long long totSize = 0;
totSize += ourObj.objsize();
if( totSize > 512 * 1024 * 1024 )
throw "rollback too large";
DocID d;
2011-01-31 15:58:37 -08:00
// NOTE The assigned ns value may become invalid if we yield.
2010-07-27 02:13:13 -04:00
d.ns = ourObj.getStringField("ns");
2011-01-04 00:40:41 -05:00
if( *d.ns == 0 ) {
2010-08-03 15:00:56 -04:00
log() << "replSet WARNING ignoring op on rollback no ns TODO : " << ourObj.toString() << rsLog;
2010-07-27 02:13:13 -04:00
return;
}
bo o = ourObj.getObjectField(*op=='u' ? "o2" : "o");
2011-01-04 00:40:41 -05:00
if( o.isEmpty() ) {
2010-07-27 02:13:13 -04:00
log() << "replSet warning ignoring op on rollback : " << ourObj.toString() << rsLog;
return;
}
2011-01-04 00:40:41 -05:00
if( *op == 'c' ) {
2010-08-03 15:19:34 -04:00
be first = o.firstElement();
2010-08-03 15:00:56 -04:00
NamespaceString s(d.ns); // foo.$cmd
2010-08-16 09:17:14 -04:00
string cmdname = first.fieldName();
Command *cmd = Command::findCommand(cmdname.c_str());
2011-01-04 00:40:41 -05:00
if( cmd == 0 ) {
2010-08-16 08:57:28 -04:00
log() << "replSet warning rollback no suchcommand " << first.fieldName() << " - different mongod versions perhaps?" << rsLog;
2010-08-03 15:19:34 -04:00
return;
}
2010-08-16 08:57:28 -04:00
else {
/* findandmodify - tranlated?
2011-01-04 00:40:41 -05:00
godinsert?,
2010-08-16 09:17:14 -04:00
renamecollection a->b. just resync a & b
*/
if( cmdname == "create" ) {
2011-01-04 00:40:41 -05:00
/* Create collection operation
2010-08-22 10:30:26 -04:00
{ ts: ..., h: ..., op: "c", ns: "foo.$cmd", o: { create: "abc", ... } }
2010-08-16 08:57:28 -04:00
*/
string ns = s.db + '.' + o["create"].String(); // -> foo.abc
h.toDrop.insert(ns);
return;
}
2011-01-04 00:40:41 -05:00
else if( cmdname == "drop" ) {
string ns = s.db + '.' + first.valuestr();
h.collectionsToResync.insert(ns);
return;
}
2011-01-04 00:40:41 -05:00
else if( cmdname == "dropIndexes" || cmdname == "deleteIndexes" ) {
/* TODO: this is bad. we simply full resync the collection here, which could be very slow. */
log() << "replSet info rollback of dropIndexes is slow in this version of mongod" << rsLog;
string ns = s.db + '.' + first.valuestr();
h.collectionsToResync.insert(ns);
return;
}
2011-01-04 00:40:41 -05:00
else if( cmdname == "renameCollection" ) {
/* TODO: slow. */
log() << "replSet info rollback of renameCollection is slow in this version of mongod" << rsLog;
string from = first.valuestr();
string to = o["to"].String();
h.collectionsToResync.insert(from);
h.collectionsToResync.insert(to);
return;
}
2011-01-04 00:40:41 -05:00
else if( cmdname == "reIndex" ) {
2010-08-16 09:17:14 -04:00
return;
}
2011-01-04 00:40:41 -05:00
else if( cmdname == "dropDatabase" ) {
2010-08-31 05:06:53 -04:00
log() << "replSet error rollback : can't rollback drop database full resync will be required" << rsLog;
2010-08-16 09:17:14 -04:00
log() << "replSet " << o.toString() << rsLog;
throw rsfatal();
}
2011-01-04 00:40:41 -05:00
else {
2010-08-31 05:06:53 -04:00
log() << "replSet error can't rollback this command yet: " << o.toString() << rsLog;
log() << "replSet cmdname=" << cmdname << rsLog;
2010-08-16 09:17:14 -04:00
throw rsfatal();
2010-08-16 08:57:28 -04:00
}
2010-08-03 15:19:34 -04:00
}
2010-08-03 15:00:56 -04:00
}
2010-07-27 02:13:13 -04:00
d._id = o["_id"];
if( d._id.eoo() ) {
2010-08-03 15:19:34 -04:00
log() << "replSet WARNING ignoring op on rollback no _id TODO : " << d.ns << ' '<< ourObj.toString() << rsLog;
2010-07-27 02:13:13 -04:00
return;
}
h.toRefetch.insert(d);
}
int getRBID(DBClientConnection*);
2011-01-04 00:40:41 -05:00
static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) {
2010-07-26 15:29:18 -04:00
static time_t last;
2011-01-04 00:40:41 -05:00
if( time(0)-last < 60 ) {
2010-07-26 15:29:18 -04:00
throw "findcommonpoint waiting a while before trying again";
}
last = time(0);
assert( dbMutex.atLeastReadLocked() );
2011-01-19 16:41:57 -05:00
Client::Context c(rsoplog);
2010-07-26 15:29:18 -04:00
NamespaceDetails *nsd = nsdetails(rsoplog);
assert(nsd);
ReverseCappedCursor u(nsd);
if( !u.ok() )
throw "our oplog empty or unreadable";
2010-07-23 14:52:14 -04:00
const Query q = Query().sort(reverseNaturalObj);
2010-07-21 14:15:48 -04:00
const bo fields = BSON( "ts" << 1 << "h" << 1 );
2010-07-26 15:29:18 -04:00
//auto_ptr<DBClientCursor> u = us->query(rsoplog, q, 0, 0, &fields, 0, 0);
2010-08-02 14:44:53 -04:00
h.rbid = getRBID(them);
2010-07-21 14:15:48 -04:00
auto_ptr<DBClientCursor> t = them->query(rsoplog, q, 0, 0, &fields, 0, 0);
2010-08-02 14:44:53 -04:00
if( t.get() == 0 || !t->more() ) throw "remote oplog empty or unreadable";
2010-07-21 14:15:48 -04:00
2010-07-26 15:29:18 -04:00
BSONObj ourObj = u.current();
2010-07-21 15:39:59 -04:00
OpTime ourTime = ourObj["ts"]._opTime();
2010-07-21 14:15:48 -04:00
BSONObj theirObj = t->nextSafe();
2010-07-21 15:39:59 -04:00
OpTime theirTime = theirObj["ts"]._opTime();
2010-07-21 14:15:48 -04:00
2010-08-09 10:00:13 -04:00
{
2010-07-21 14:15:48 -04:00
long long diff = (long long) ourTime.getSecs() - ((long long) theirTime.getSecs());
/* diff could be positive, negative, or zero */
2010-08-17 16:31:21 -04:00
log() << "replSet info rollback our last optime: " << ourTime.toStringPretty() << rsLog;
log() << "replSet info rollback their last optime: " << theirTime.toStringPretty() << rsLog;
log() << "replSet info rollback diff in end of log times: " << diff << " seconds" << rsLog;
2011-04-20 13:18:10 -04:00
if( diff > 1800 ) {
2010-08-17 16:31:21 -04:00
log() << "replSet rollback too long a time period for a rollback." << rsLog;
2011-04-20 13:18:10 -04:00
throw "error not willing to roll back more than 30 minutes of data";
2010-07-26 15:29:18 -04:00
}
2010-07-21 14:15:48 -04:00
}
2010-07-21 15:39:59 -04:00
unsigned long long scanned = 0;
while( 1 ) {
scanned++;
/* todo add code to assure no excessive scanning for too long */
2011-01-04 00:40:41 -05:00
if( ourTime == theirTime ) {
if( ourObj["h"].Long() == theirObj["h"].Long() ) {
2010-07-21 15:39:59 -04:00
// found the point back in time where we match.
// todo : check a few more just to be careful about hash collisions.
log() << "replSet rollback found matching events at " << ourTime.toStringPretty() << rsLog;
2010-07-28 20:24:21 -04:00
log() << "replSet rollback findcommonpoint scanned : " << scanned << rsLog;
2010-07-21 15:39:59 -04:00
h.commonPoint = ourTime;
2010-07-26 15:29:18 -04:00
h.commonPointOurDiskloc = u.currLoc();
2010-07-21 15:39:59 -04:00
return;
}
2010-07-27 02:13:13 -04:00
refetch(h, ourObj);
2011-01-04 00:40:41 -05:00
if( !t->more() ) {
log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog;
log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
2010-08-18 11:50:52 -04:00
throw "RS100 reached beginning of remote oplog [2]";
2010-08-08 18:02:12 -04:00
}
2010-07-21 15:39:59 -04:00
theirObj = t->nextSafe();
theirTime = theirObj["ts"]._opTime();
2010-07-27 02:13:13 -04:00
2010-07-26 15:29:18 -04:00
u.advance();
if( !u.ok() ) {
log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog;
log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
2010-08-18 11:50:52 -04:00
throw "RS101 reached beginning of local oplog [1]";
}
2010-07-26 15:29:18 -04:00
ourObj = u.current();
2010-07-21 15:39:59 -04:00
ourTime = ourObj["ts"]._opTime();
}
2011-01-04 00:40:41 -05:00
else if( theirTime > ourTime ) {
if( !t->more() ) {
log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog;
log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
2010-08-18 11:50:52 -04:00
throw "RS100 reached beginning of remote oplog [1]";
2010-08-08 18:02:12 -04:00
}
2010-07-21 15:39:59 -04:00
theirObj = t->nextSafe();
theirTime = theirObj["ts"]._opTime();
}
2011-01-04 00:40:41 -05:00
else {
2010-07-21 15:39:59 -04:00
// theirTime < ourTime
2010-07-27 02:13:13 -04:00
refetch(h, ourObj);
2010-07-26 15:29:18 -04:00
u.advance();
2011-01-04 00:40:41 -05:00
if( !u.ok() ) {
log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog;
log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog;
log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog;
log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog;
2010-08-18 11:50:52 -04:00
throw "RS101 reached beginning of local oplog [2]";
}
2010-07-26 15:29:18 -04:00
ourObj = u.current();
2010-07-21 15:39:59 -04:00
ourTime = ourObj["ts"]._opTime();
}
2010-07-21 14:15:48 -04:00
}
2010-07-21 13:13:36 -04:00
}
2011-01-04 00:40:41 -05:00
struct X {
2010-07-23 16:19:39 -04:00
const bson::bo *op;
bson::bo goodVersionOfObject;
};
2011-01-04 00:40:41 -05:00
static void setMinValid(bo newMinValid) {
try {
log() << "replSet minvalid=" << newMinValid["ts"]._opTime().toStringLong() << rsLog;
}
catch(...) { }
{
Helpers::putSingleton("local.replset.minvalid", newMinValid);
Client::Context cx( "local." );
cx.db()->flushFiles(true);
}
2010-08-16 12:21:49 -04:00
}
void ReplSetImpl::syncFixUp(HowToFixUp& h, OplogReader& r) {
2011-01-04 00:40:41 -05:00
DBClientConnection *them = r.conn();
// fetch all first so we needn't handle interruption in a fancy way
unsigned long long totSize = 0;
list< pair<DocID,bo> > goodVersions;
bo newMinValid;
/* fetch all the goodVersions of each document from current primary */
DocID d;
unsigned long long n = 0;
try {
for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) {
d = *i;
assert( !d._id.eoo() );
{
/* TODO : slow. lots of round trips. */
n++;
bo good= them->findOne(d.ns, d._id.wrap(), NULL, QueryOption_SlaveOk).getOwned();
2011-01-04 00:40:41 -05:00
totSize += good.objsize();
uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 );
// note good might be eoo, indicating we should delete it
goodVersions.push_back(pair<DocID,bo>(d,good));
}
}
newMinValid = r.getLastOp(rsoplog);
if( newMinValid.isEmpty() ) {
sethbmsg("rollback error newMinValid empty?");
return;
}
}
catch(DBException& e) {
sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0);
log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog;
throw e;
}
MemoryMappedFile::flushAll(true);
sethbmsg("rollback 3.5");
if( h.rbid != getRBID(r.conn()) ) {
// our source rolled back itself. so the data we received isn't necessarily consistent.
sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt");
return;
}
// update them
sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size());
bool warn = false;
assert( !h.commonPointOurDiskloc.isNull() );
dbMutex.assertWriteLocked();
/* we have items we are writing that aren't from a point-in-time. thus best not to come online
until we get to that point in freshness. */
setMinValid(newMinValid);
/** any full collection resyncs required? */
if( !h.collectionsToResync.empty() ) {
for( set<string>::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) {
string ns = *i;
sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns);
Client::Context c(ns);
{
2011-01-04 00:40:41 -05:00
bob res;
string errmsg;
dropCollection(ns, errmsg, res);
{
dbtemprelease r;
bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, errmsg);
uassert(15909, str::stream() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg, ok);
2011-01-04 00:40:41 -05:00
}
}
}
/* we did more reading from primary, so check it again for a rollback (which would mess us up), and
make minValid newer.
*/
sethbmsg("rollback 4.2");
{
string err;
try {
newMinValid = r.getLastOp(rsoplog);
if( newMinValid.isEmpty() ) {
err = "can't get minvalid from primary";
}
else {
setMinValid(newMinValid);
}
}
2011-10-25 14:37:25 -04:00
catch (DBException&) {
2011-01-04 00:40:41 -05:00
err = "can't get/set minvalid";
}
if( h.rbid != getRBID(r.conn()) ) {
// our source rolled back itself. so the data we received isn't necessarily consistent.
// however, we've now done writes. thus we have a problem.
err += "rbid at primary changed during resync/rollback";
}
if( !err.empty() ) {
log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog;
/* todo: reset minvalid so that we are permanently in fatal state */
/* todo: don't be fatal, but rather, get all the data first. */
sethbmsg("rollback error");
throw rsfatal();
}
}
sethbmsg("rollback 4.3");
}
sethbmsg("rollback 4.6");
/** drop collections to drop before doing individual fixups - that might make things faster below actually if there were subsequent inserts to rollback */
for( set<string>::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) {
2011-01-19 16:41:57 -05:00
Client::Context c(*i);
2011-01-04 00:40:41 -05:00
try {
bob res;
string errmsg;
log(1) << "replSet rollback drop: " << *i << rsLog;
dropCollection(*i, errmsg, res);
}
catch(...) {
log() << "replset rollback error dropping collection " << *i << rsLog;
}
}
sethbmsg("rollback 4.7");
2011-01-19 16:41:57 -05:00
Client::Context c(rsoplog);
2011-01-04 00:40:41 -05:00
NamespaceDetails *oplogDetails = nsdetails(rsoplog);
uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails);
map<string,shared_ptr<RemoveSaver> > removeSavers;
unsigned deletes = 0, updates = 0;
for( list<pair<DocID,bo> >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) {
const DocID& d = i->first;
bo pattern = d._id.wrap(); // { _id : ... }
try {
assert( d.ns && *d.ns );
if( h.collectionsToResync.count(d.ns) ) {
/* we just synced this entire collection */
continue;
}
2011-03-11 15:59:18 -05:00
getDur().commitIfNeeded();
2011-01-04 00:40:41 -05:00
/* keep an archive of items rolled back */
shared_ptr<RemoveSaver>& rs = removeSavers[d.ns];
if ( ! rs )
rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) );
// todo: lots of overhead in context, this can be faster
2011-01-19 16:41:57 -05:00
Client::Context c(d.ns);
2011-01-04 00:40:41 -05:00
if( i->second.isEmpty() ) {
// wasn't on the primary; delete.
/* TODO1.6 : can't delete from a capped collection. need to handle that here. */
deletes++;
NamespaceDetails *nsd = nsdetails(d.ns);
if( nsd ) {
if( nsd->capped ) {
/* can't delete from a capped collection - so we truncate instead. if this item must go,
so must all successors!!! */
try {
/** todo: IIRC cappedTrunateAfter does not handle completely empty. todo. */
// this will crazy slow if no _id index.
long long start = Listener::getElapsedTimeMillis();
DiskLoc loc = Helpers::findOne(d.ns, pattern, false);
if( Listener::getElapsedTimeMillis() - start > 200 )
log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog;
//would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern);
if( !loc.isNull() ) {
try {
nsd->cappedTruncateAfter(d.ns, loc, true);
}
catch(DBException& e) {
if( e.getCode() == 13415 ) {
// hack: need to just make cappedTruncate do this...
nsd->emptyCappedCollection(d.ns);
}
else {
throw;
}
}
}
}
catch(DBException& e) {
log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog;
}
}
else {
try {
deletes++;
deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() );
}
catch(...) {
log() << "replSet error rollback delete failed ns:" << d.ns << rsLog;
}
}
// did we just empty the collection? if so let's check if it even exists on the source.
if( nsd->stats.nrecords == 0 ) {
try {
string sys = cc().database()->name + ".system.namespaces";
bo o = them->findOne(sys, QUERY("name"<<d.ns));
if( o.isEmpty() ) {
// we should drop
try {
bob res;
string errmsg;
dropCollection(d.ns, errmsg, res);
}
catch(...) {
log() << "replset error rolling back collection " << d.ns << rsLog;
}
}
}
catch(DBException& ) {
/* this isn't *that* big a deal, but is bad. */
log() << "replSet warning rollback error querying for existence of " << d.ns << " at the primary, ignoring" << rsLog;
}
}
}
}
else {
// todo faster...
OpDebug debug;
updates++;
_updateObjects(/*god*/true, d.ns, i->second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() );
}
}
catch(DBException& e) {
log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << " ndeletes:" << deletes << rsLog;
warn = true;
}
}
removeSavers.clear(); // this effectively closes all of them
sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates);
MemoryMappedFile::flushAll(true);
sethbmsg("rollback 6");
// clean up oplog
2011-07-16 12:43:51 -04:00
LOG(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog;
2011-01-04 00:40:41 -05:00
// todo: fatal error if this throws?
oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false);
/* reset cached lastoptimewritten and h value */
loadLastOpTimeWritten();
sethbmsg("rollback 7");
MemoryMappedFile::flushAll(true);
// done
if( warn )
sethbmsg("issues during syncRollback, see log");
else
sethbmsg("rollback done");
}
void ReplSetImpl::syncRollback(OplogReader&r) {
unsigned s = _syncRollback(r);
2011-01-04 00:40:41 -05:00
if( s )
sleepsecs(s);
}
2011-01-04 00:40:41 -05:00
unsigned ReplSetImpl::_syncRollback(OplogReader&r) {
2010-07-21 14:15:48 -04:00
assert( !lockedByMe() );
assert( !dbMutex.atLeastReadLocked() );
2010-08-17 16:31:21 -04:00
sethbmsg("rollback 0");
2010-07-26 15:29:18 -04:00
writelocktry lk(rsoplog, 20000);
if( !lk.got() ) {
2010-08-17 16:31:21 -04:00
sethbmsg("rollback couldn't get write lock in a reasonable time");
return 2;
2010-07-26 15:29:18 -04:00
}
2011-05-02 18:26:08 -04:00
if( state().secondary() ) {
/* by doing this, we will not service reads (return an error as we aren't in secondary staate.
2011-01-04 00:40:41 -05:00
that perhaps is moot becasue of the write lock above, but that write lock probably gets deferred
or removed or yielded later anyway.
also, this is better for status reporting - we know what is happening.
*/
2011-05-02 18:26:08 -04:00
changeState(MemberState::RS_ROLLBACK);
}
2010-07-21 15:39:59 -04:00
HowToFixUp how;
2010-08-17 16:31:21 -04:00
sethbmsg("rollback 1");
2010-07-21 13:50:53 -04:00
{
r.resetCursor();
2010-07-21 15:39:59 -04:00
2010-08-17 16:31:21 -04:00
sethbmsg("rollback 2 FindCommonPoint");
2010-07-21 14:15:48 -04:00
try {
2010-07-26 15:29:18 -04:00
syncRollbackFindCommonPoint(r.conn(), how);
2010-07-21 14:15:48 -04:00
}
2011-01-04 00:40:41 -05:00
catch( const char *p ) {
2010-08-17 16:31:21 -04:00
sethbmsg(string("rollback 2 error ") + p);
return 10;
2010-07-21 14:15:48 -04:00
}
2011-01-04 00:40:41 -05:00
catch( rsfatal& ) {
2010-08-16 09:17:14 -04:00
_fatal();
return 2;
2010-08-16 09:17:14 -04:00
}
2011-01-04 00:40:41 -05:00
catch( DBException& e ) {
2010-08-17 16:31:21 -04:00
sethbmsg(string("rollback 2 exception ") + e.toString() + "; sleeping 1 min");
dbtemprelease r;
2010-07-21 15:39:59 -04:00
sleepsecs(60);
throw;
}
2010-07-21 13:50:53 -04:00
}
2010-07-21 13:13:36 -04:00
2010-08-17 16:31:21 -04:00
sethbmsg("replSet rollback 3 fixup");
2010-07-26 15:29:18 -04:00
2010-08-16 10:26:07 -04:00
{
incRBID();
2011-01-04 00:40:41 -05:00
try {
2010-08-16 10:26:07 -04:00
syncFixUp(how, r);
}
2011-01-04 00:40:41 -05:00
catch( rsfatal& ) {
2010-09-02 11:26:38 -04:00
sethbmsg("rollback fixup error");
_fatal();
return 2;
}
2011-01-04 00:40:41 -05:00
catch(...) {
2010-08-16 10:26:07 -04:00
incRBID(); throw;
}
incRBID();
2011-01-04 00:40:41 -05:00
/* success - leave "ROLLBACK" state
can go to SECONDARY once minvalid is achieved
*/
2011-05-02 18:26:08 -04:00
changeState(MemberState::RS_RECOVERING);
2010-08-16 10:26:07 -04:00
}
return 0;
2010-07-21 13:13:36 -04:00
}
2010-07-21 01:45:04 -04:00
}