Files
mongo/db/repl/rs_sync.cpp

528 lines
20 KiB
C++
Raw Normal View History

2010-08-18 14:39:50 -04:00
/**
2010-07-16 10:42:53 -04:00
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "pch.h"
#include "../client.h"
#include "../../client/dbclient.h"
#include "rs.h"
2010-07-19 20:09:00 -04:00
#include "../repl.h"
#include "connections.h"
2010-07-16 12:51:01 -04:00
namespace mongo {
2010-07-16 10:42:53 -04:00
2010-07-30 15:00:19 -04:00
using namespace bson;
2010-08-29 16:03:17 -04:00
extern unsigned replSetForceInitialSyncFailure;
2010-10-30 14:56:56 -04:00
/* apply the log op that is in param o */
2010-07-19 22:32:43 -04:00
void ReplSetImpl::syncApply(const BSONObj &o) {
2010-09-07 12:18:44 -04:00
char db[MaxDatabaseNameLen];
2010-07-19 22:32:43 -04:00
const char *ns = o.getStringField("ns");
nsToDatabase(ns, db);
if ( *ns == '.' || *ns == 0 ) {
2011-01-04 00:40:41 -05:00
if( *o.getStringField("op") == 'n' )
return;
2010-07-19 22:32:43 -04:00
log() << "replSet skipping bad op in oplog: " << o.toString() << endl;
return;
}
Client::Context ctx(ns);
ctx.getClient()->curop()->reset();
/* todo : if this asserts, do we want to ignore or not? */
applyOperation_inlock(o);
}
2011-01-04 00:40:41 -05:00
/* initial oplog application, during initial sync, after cloning.
@return false on failure.
2010-10-30 14:56:56 -04:00
this method returns an error and doesn't throw exceptions (i think).
*/
bool ReplSetImpl::initialSyncOplogApplication(
const Member *source,
OpTime applyGTE,
2011-01-04 00:40:41 -05:00
OpTime minValid) {
if( source == 0 ) return false;
2010-08-02 18:48:24 -04:00
const string hn = source->h().toString();
OplogReader r;
try {
2011-01-04 00:40:41 -05:00
if( !r.connect(hn) ) {
2010-10-30 14:41:49 -04:00
log() << "replSet initial sync error can't connect to " << hn << " to read " << rsoplog << rsLog;
return false;
}
r.queryGTE( rsoplog, applyGTE );
assert( r.haveCursor() );
2010-08-02 18:48:24 -04:00
{
2011-01-04 00:40:41 -05:00
if( !r.more() ) {
2010-08-02 18:48:24 -04:00
sethbmsg("replSet initial sync error reading remote oplog");
2010-10-30 14:41:49 -04:00
log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
2010-08-02 18:48:24 -04:00
return false;
}
bo op = r.next();
OpTime t = op["ts"]._opTime();
r.putBack(op);
2010-10-30 14:41:49 -04:00
2011-01-04 00:40:41 -05:00
if( op.firstElement().fieldName() == string("$err") ) {
2010-10-30 14:41:49 -04:00
log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
return false;
}
uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
2010-08-02 18:48:24 -04:00
if( t > applyGTE ) {
sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
2010-10-30 14:41:49 -04:00
log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
2010-08-02 18:48:24 -04:00
return false;
}
sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
<< t.toStringPretty() << " to " << minValid.toStringPretty());
2010-08-02 18:48:24 -04:00
}
}
catch(DBException& e) {
log() << "replSet initial sync failing: " << e.toString() << rsLog;
return false;
}
2010-08-02 18:48:24 -04:00
/* we lock outside the loop to avoid the overhead of locking on every operation. */
writelock lk("");
2010-08-29 16:03:17 -04:00
// todo : use exhaust
OpTime ts;
time_t start = time(0);
unsigned long long n = 0;
while( 1 ) {
try {
if( !r.more() )
break;
BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
{
ts = o["ts"]._opTime();
/* if we have become primary, we dont' want to apply things from elsewhere
2011-01-04 00:40:41 -05:00
anymore. assumePrimary is in the db lock so we are safe as long as
we check after we locked above. */
if( (source->state() != MemberState::RS_PRIMARY &&
2011-01-04 00:40:41 -05:00
source->state() != MemberState::RS_SECONDARY) ||
replSetForceInitialSyncFailure ) {
2010-08-29 16:03:17 -04:00
int f = replSetForceInitialSyncFailure;
if( f > 0 ) {
replSetForceInitialSyncFailure = f-1;
log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
throw DBException("forced error",0);
2010-08-29 16:03:17 -04:00
}
log() << "replSet we are now primary" << rsLog;
throw DBException("primary changed",0);
}
if( ts >= applyGTE ) {
// optimes before we started copying need not be applied.
syncApply(o);
}
_logOpObjRS(o); /* with repl sets we write the ops to our oplog too */
}
if ( ++n % 1000 == 0 ) {
time_t now = time(0);
if (now - start > 10) {
// simple progress metering
log() << "initialSyncOplogApplication applied " << n << " operations, synced to "
<< ts.toStringPretty() << rsLog;
start = now;
}
2010-08-02 13:19:18 -04:00
}
getDur().commitIfNeeded();
}
catch (DBException& e) {
// skip duplicate key exceptions
if( e.getCode() == 11000 || e.getCode() == 11001 ) {
continue;
}
// handle cursor not found (just requery)
if( e.getCode() == 13127 ) {
r.resetCursor();
r.queryGTE(rsoplog, ts);
if( r.haveCursor() ) {
continue;
}
}
// TODO: handle server restart
if( ts <= minValid ) {
// didn't make it far enough
log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog;
return false;
}
// otherwise, whatever
break;
}
}
return true;
}
2011-01-04 00:40:41 -05:00
/* should be in RECOVERING state on arrival here.
readlocks
@return true if transitioned to SECONDARY
*/
2011-01-04 00:40:41 -05:00
bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) {
bool golive = false;
{
readlock lk("local.replset.minvalid");
BSONObj mv;
2011-01-04 00:40:41 -05:00
if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
minvalid = mv["ts"]._opTime();
2011-01-04 00:40:41 -05:00
if( minvalid <= lastOpTimeWritten ) {
golive=true;
}
}
2011-01-04 00:40:41 -05:00
else
golive = true; /* must have been the original member */
}
if( golive ) {
sethbmsg("");
changeState(MemberState::RS_SECONDARY);
}
return golive;
}
/**
* Checks if the oplog given is too far ahead to read from.
*
* @param r the oplog
* @param hn the hostname (for log messages)
*
* @return if we are stale compared to the oplog on hn
*/
bool ReplSetImpl::_isStale(OplogReader& r, const string& hn) {
BSONObj remoteOldestOp = r.findOne(rsoplog, Query());
OpTime ts = remoteOldestOp["ts"]._opTime();
DEV log() << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
2011-01-04 00:40:41 -05:00
DEV {
// debugging sync1.js...
log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog;
log() << "replSet our state: " << state().toString() << rsLog;
}
2011-01-04 00:40:41 -05:00
if( lastOpTimeWritten < ts ) {
log() << "replSet error RS102 too stale to catch up, at least from " << hn << rsLog;
log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog;
log() << "replSet oldest at " << hn << " : " << ts.toStringLong() << rsLog;
log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
sethbmsg("error RS102 too stale to catch up");
changeState(MemberState::RS_RECOVERING);
sleepsecs(120);
return true;
}
return false;
}
/**
* Tries to connect the oplog reader to a potential sync source. If
* successful, it checks that we are not stale compared to this source.
*
* @param r reader to populate
* @param hn hostname to try
*
* @return if both checks pass, it returns true, otherwise false.
*/
bool ReplSetImpl::_getOplogReader(OplogReader& r, string& hn) {
assert(r.conn() == 0);
2011-01-04 00:40:41 -05:00
if( !r.connect(hn) ) {
log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog;
r.resetConnection();
return false;
}
if( _isStale(r, hn)) {
r.resetConnection();
return false;
}
return true;
2011-01-04 00:40:41 -05:00
}
/* tail an oplog. ok to return, will be re-called. */
2011-01-04 00:40:41 -05:00
void ReplSetImpl::syncTail() {
2010-07-19 22:32:43 -04:00
// todo : locking vis a vis the mgr...
2010-07-19 20:09:00 -04:00
OplogReader r;
string hn;
const Member *target = 0;
2011-01-04 00:40:41 -05:00
// if we cannot reach the master but someone else is more up-to-date
// than we are, sync from them.
target = getMemberToSyncTo();
if (target != 0) {
hn = target->h().toString();
if (!_getOplogReader(r, hn)) {
// we might be stale wrt the primary, but could still sync from
// a secondary
target = 0;
}
}
// no server found
if (target == 0) {
// if there is no one to sync from
OpTime minvalid;
tryToGoLiveAsASecondary(minvalid);
return;
2010-07-21 15:39:59 -04:00
}
2010-07-19 22:32:43 -04:00
r.tailingQueryGTE(rsoplog, lastOpTimeWritten);
assert( r.haveCursor() );
uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );
2010-07-19 22:32:43 -04:00
{
2010-07-30 11:35:28 -04:00
if( !r.more() ) {
2010-07-30 15:00:19 -04:00
/* maybe we are ahead and need to roll back? */
try {
bo theirLastOp = r.getLastOp(rsoplog);
if( theirLastOp.isEmpty() ) {
log() << "replSet error empty query result from " << hn << " oplog" << rsLog;
sleepsecs(2);
return;
}
OpTime theirTS = theirLastOp["ts"]._opTime();
2011-01-04 00:40:41 -05:00
if( theirTS < lastOpTimeWritten ) {
2010-07-30 15:00:19 -04:00
log() << "replSet we are ahead of the primary, will try to roll back" << rsLog;
syncRollback(r);
return;
}
/* we're not ahead? maybe our new query got fresher data. best to come back and try again */
log() << "replSet syncTail condition 1" << rsLog;
sleepsecs(1);
}
2011-01-04 00:40:41 -05:00
catch(DBException& e) {
2010-07-30 15:00:19 -04:00
log() << "replSet error querying " << hn << ' ' << e.toString() << rsLog;
sleepsecs(2);
}
return;
/*
2010-07-30 11:35:28 -04:00
log() << "replSet syncTail error querying oplog >= " << lastOpTimeWritten.toString() << " from " << hn << rsLog;
try {
log() << "replSet " << hn << " last op: " << r.getLastOp(rsoplog).toString() << rsLog;
}
catch(...) { }
sleepsecs(1);
2010-07-30 15:00:19 -04:00
return;*/
2010-07-30 11:35:28 -04:00
}
2010-07-30 15:00:19 -04:00
2010-07-19 22:32:43 -04:00
BSONObj o = r.nextSafe();
OpTime ts = o["ts"]._opTime();
long long h = o["h"].numberLong();
2011-01-04 00:40:41 -05:00
if( ts != lastOpTimeWritten || h != lastH ) {
2010-10-30 14:41:49 -04:00
log() << "replSet our last op time written: " << lastOpTimeWritten.toStringPretty() << endl;
log() << "replset source's GTE: " << ts.toStringPretty() << endl;
2010-07-21 13:13:36 -04:00
syncRollback(r);
2010-07-19 22:32:43 -04:00
return;
}
}
/* we have now checked if we need to rollback and we either don't have to or did it. */
{
OpTime minvalid;
tryToGoLiveAsASecondary(minvalid);
}
2010-08-18 14:11:42 -04:00
while( 1 ) {
2010-07-20 11:05:27 -04:00
while( 1 ) {
2011-01-04 00:40:41 -05:00
if( !r.moreInCurrentBatch() ) {
/* we need to occasionally check some things. between
2010-07-20 11:05:27 -04:00
batches is probably a good time. */
/* perhaps we should check this earlier? but not before the rollback checks. */
2011-01-04 00:40:41 -05:00
if( state().recovering() ) {
2010-07-29 15:01:41 -04:00
/* can we go to RS_SECONDARY state? we can if not too old and if minvalid achieved */
OpTime minvalid;
bool golive = ReplSetImpl::tryToGoLiveAsASecondary(minvalid);
2010-07-29 15:01:41 -04:00
if( golive ) {
;
2010-07-29 15:01:41 -04:00
}
2011-01-04 00:40:41 -05:00
else {
sethbmsg(str::stream() << "still syncing, not yet to minValid optime" << minvalid.toString());
2010-07-29 15:01:41 -04:00
}
2010-07-20 11:05:27 -04:00
/* todo: too stale capability */
}
2011-01-04 00:40:41 -05:00
{
const Member *primary = box.getPrimary();
if( !target->hbinfo().hbstate.readable() ||
// if we are not syncing from the primary, return (if
// it's up) so that we can try accessing it again
(target != primary && primary != 0)) {
return;
}
}
2010-07-20 11:05:27 -04:00
}
if( !r.more() )
break;
2011-01-04 00:40:41 -05:00
{
2010-07-20 11:05:27 -04:00
BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
2010-07-30 12:30:34 -04:00
2010-08-18 14:39:50 -04:00
int sd = myConfig().slaveDelay;
// ignore slaveDelay if the box is still initializing. once
// it becomes secondary we can worry about it.
2011-01-04 00:40:41 -05:00
if( sd && box.getState().secondary() ) {
2010-08-18 14:11:42 -04:00
const OpTime ts = o["ts"]._opTime();
long long a = ts.getSecs();
long long b = time(0);
long long lag = b - a;
long long sleeptime = sd - lag;
if( sleeptime > 0 ) {
uassert(12000, "rs slaveDelay differential too big check clocks and systems", sleeptime < 0x40000000);
log() << "replSet temp slavedelay sleep:" << sleeptime << rsLog;
2010-08-18 14:29:04 -04:00
if( sleeptime < 60 ) {
2010-08-18 14:11:42 -04:00
sleepsecs((int) sleeptime);
}
else {
// sleep(hours) would prevent reconfigs from taking effect & such!
long long waitUntil = b + sleeptime;
while( 1 ) {
2010-08-18 14:29:04 -04:00
sleepsecs(6);
2010-08-18 14:11:42 -04:00
if( time(0) >= waitUntil )
break;
if( !target->hbinfo().hbstate.readable() ) {
2010-08-18 14:11:42 -04:00
break;
}
2010-08-18 14:11:42 -04:00
if( myConfig().slaveDelay != sd ) // reconf
break;
}
}
}
2011-01-04 00:40:41 -05:00
}
{
writelock lk("");
/* if we have become primary, we dont' want to apply things from elsewhere
2011-01-04 00:40:41 -05:00
anymore. assumePrimary is in the db lock so we are safe as long as
we check after we locked above. */
if( box.getState().primary() ) {
log(0) << "replSet stopping syncTail we are now primary" << rsLog;
return;
}
syncApply(o);
2011-01-04 00:40:41 -05:00
_logOpObjRS(o); /* with repl sets we write the ops to our oplog too: */
2010-08-18 14:11:42 -04:00
}
2010-07-19 22:32:43 -04:00
}
}
2010-07-20 13:37:09 -04:00
r.tailCheck();
if( !r.haveCursor() ) {
2010-08-18 14:11:42 -04:00
log(1) << "replSet end syncTail pass with " << hn << rsLog;
// TODO : reuse our connection to the primary.
2010-07-20 11:05:27 -04:00
return;
2010-07-20 13:37:09 -04:00
}
if( !target->hbinfo().hbstate.readable() ) {
2010-07-20 11:05:27 -04:00
return;
}
2010-07-19 22:32:43 -04:00
// looping back is ok because this is a tailable cursor
}
2010-07-19 20:09:00 -04:00
}
2010-07-19 13:08:24 -04:00
void ReplSetImpl::_syncThread() {
2010-07-22 17:50:54 -04:00
StateBox::SP sp = box.get();
2010-07-30 11:57:30 -04:00
if( sp.state.primary() ) {
sleepsecs(1);
2010-07-19 13:08:24 -04:00
return;
2010-07-30 11:57:30 -04:00
}
2011-01-04 00:40:41 -05:00
if( sp.state.fatal() ) {
2010-09-02 10:16:52 -04:00
sleepsecs(5);
return;
}
2010-07-19 13:08:24 -04:00
2010-07-19 18:05:44 -04:00
/* do we have anything at all? */
if( lastOpTimeWritten.isNull() ) {
syncDoInitialSync();
2010-07-19 18:09:00 -04:00
return; // _syncThread will be recalled, starts from top again in case sync failed.
2010-07-19 18:05:44 -04:00
}
2010-07-19 18:09:00 -04:00
/* we have some data. continue tailing. */
2010-07-19 20:09:00 -04:00
syncTail();
2010-07-19 13:08:24 -04:00
}
void ReplSetImpl::syncThread() {
/* test here was to force a receive timeout
ScopedConn c("localhost");
bo info;
try {
log() << "this is temp" << endl;
c.runCommand("admin", BSON("sleep"<<120), info);
log() << info.toString() << endl;
c.runCommand("admin", BSON("sleep"<<120), info);
log() << "temp" << endl;
}
2011-01-04 00:40:41 -05:00
catch( DBException& e ) {
log() << e.toString() << endl;
c.runCommand("admin", BSON("sleep"<<120), info);
log() << "temp" << endl;
}
*/
while( 1 ) {
if( myConfig().arbiterOnly )
return;
2011-01-04 00:40:41 -05:00
2010-07-19 17:23:11 -04:00
try {
_syncThread();
2010-07-19 13:08:24 -04:00
}
2011-01-04 00:40:41 -05:00
catch(DBException& e) {
2010-07-30 15:00:19 -04:00
sethbmsg("syncThread: " + e.toString());
2010-07-19 13:08:24 -04:00
sleepsecs(10);
}
2011-01-04 00:40:41 -05:00
catch(...) {
2010-07-20 13:37:09 -04:00
sethbmsg("unexpected exception in syncThread()");
2010-10-30 14:56:56 -04:00
// TODO : SET NOT SECONDARY here?
2010-07-20 13:37:09 -04:00
sleepsecs(60);
}
2010-07-30 11:57:30 -04:00
sleepsecs(1);
2011-01-04 00:40:41 -05:00
/* normally msgCheckNewState gets called periodically, but in a single node repl set there
are no heartbeat threads, so we do it here to be sure. this is relevant if the singleton
member has done a stepDown() and needs to come back up.
*/
OCCASIONALLY mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
2010-07-19 13:08:24 -04:00
}
2010-07-16 12:51:01 -04:00
}
2010-07-16 10:42:53 -04:00
2010-10-30 14:56:56 -04:00
void startSyncThread() {
static int n;
2010-10-31 12:11:38 -04:00
if( n != 0 ) {
log() << "replSet ERROR : more than one sync thread?" << rsLog;
assert( n == 0 );
}
2010-10-30 14:56:56 -04:00
n++;
Client::initThread("replica set sync");
2010-10-30 14:56:56 -04:00
cc().iAmSyncThread();
2010-12-27 16:05:40 -05:00
if (!noauth) {
cc().getAuthenticationInfo()->authorize("local");
}
2010-10-30 14:56:56 -04:00
theReplSet->syncThread();
cc().shutdown();
}
2010-07-16 10:42:53 -04:00
}