2010-08-18 14:39:50 -04:00
|
|
|
/**
|
2010-07-16 10:42:53 -04:00
|
|
|
* Copyright (C) 2008 10gen Inc.
|
|
|
|
|
*
|
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU Affero General Public License, version 3,
|
|
|
|
|
* as published by the Free Software Foundation.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU Affero General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "pch.h"
|
|
|
|
|
#include "../client.h"
|
|
|
|
|
#include "../../client/dbclient.h"
|
|
|
|
|
#include "rs.h"
|
2010-07-19 20:09:00 -04:00
|
|
|
#include "../repl.h"
|
2010-10-31 19:12:12 -04:00
|
|
|
#include "connections.h"
|
2011-05-23 10:35:58 -04:00
|
|
|
|
2010-07-16 12:51:01 -04:00
|
|
|
namespace mongo {
|
2010-07-16 10:42:53 -04:00
|
|
|
|
2010-07-30 15:00:19 -04:00
|
|
|
using namespace bson;
|
2010-08-29 16:03:17 -04:00
|
|
|
extern unsigned replSetForceInitialSyncFailure;
|
|
|
|
|
|
2011-05-23 10:35:58 -04:00
|
|
|
void NOINLINE_DECL blank(const BSONObj& o) {
|
|
|
|
|
if( *o.getStringField("op") != 'n' ) {
|
|
|
|
|
log() << "replSet skipping bad op in oplog: " << o.toString() << rsLog;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-09-18 17:23:28 -04:00
|
|
|
/* apply the log op that is in param o
|
|
|
|
|
@return bool failedUpdate
|
|
|
|
|
*/
|
|
|
|
|
bool ReplSetImpl::syncApply(const BSONObj &o) {
|
2010-07-19 22:32:43 -04:00
|
|
|
const char *ns = o.getStringField("ns");
|
|
|
|
|
if ( *ns == '.' || *ns == 0 ) {
|
2011-05-23 10:35:58 -04:00
|
|
|
blank(o);
|
2011-09-18 17:23:28 -04:00
|
|
|
return false;
|
2010-07-19 22:32:43 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Client::Context ctx(ns);
|
|
|
|
|
ctx.getClient()->curop()->reset();
|
2011-09-18 17:23:28 -04:00
|
|
|
return applyOperation_inlock(o);
|
2010-07-19 22:32:43 -04:00
|
|
|
}
|
|
|
|
|
|
2011-01-04 00:40:41 -05:00
|
|
|
/* initial oplog application, during initial sync, after cloning.
|
|
|
|
|
@return false on failure.
|
2010-10-30 14:56:56 -04:00
|
|
|
this method returns an error and doesn't throw exceptions (i think).
|
|
|
|
|
*/
|
2011-09-21 17:40:31 -04:00
|
|
|
bool ReplSetImpl::initialSyncOplogApplication(const OpTime& applyGTE, const OpTime& minValid) {
|
2011-09-21 15:05:31 -04:00
|
|
|
Member *source = 0;
|
|
|
|
|
OplogReader r;
|
|
|
|
|
|
2011-09-21 17:40:31 -04:00
|
|
|
// keep trying to initial sync from oplog until we run out of targets
|
|
|
|
|
while ((source = _getOplogReader(r, applyGTE.isNull())) != 0) {
|
|
|
|
|
if (_initialSyncOplogApplication(r, source, applyGTE, minValid)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
r.resetConnection();
|
|
|
|
|
veto(source->fullName(), 60);
|
|
|
|
|
log() << "replSet applying oplog from " << source->fullName() << " failed, trying again" << endl;
|
2011-09-21 15:05:31 -04:00
|
|
|
}
|
2010-08-02 18:48:24 -04:00
|
|
|
|
2011-09-21 17:40:31 -04:00
|
|
|
log() << "replSet initial sync error: couldn't find oplog to sync from" << rsLog;
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool ReplSetImpl::_initialSyncOplogApplication(OplogReader& r, const Member *source,
|
|
|
|
|
const OpTime& applyGTE, const OpTime& minValid) {
|
|
|
|
|
|
|
|
|
|
const string hn = source->fullName();
|
2011-09-18 17:23:28 -04:00
|
|
|
OplogReader missingObjReader;
|
2010-08-02 14:03:33 -04:00
|
|
|
try {
|
2011-07-06 10:21:06 -04:00
|
|
|
r.tailingQueryGTE( rsoplog, applyGTE );
|
2011-05-17 15:03:22 -04:00
|
|
|
if ( !r.haveCursor() ) {
|
|
|
|
|
log() << "replSet initial sync oplog query error" << rsLog;
|
|
|
|
|
return false;
|
|
|
|
|
}
|
2010-08-02 14:03:33 -04:00
|
|
|
|
2010-08-02 18:48:24 -04:00
|
|
|
{
|
2011-01-04 00:40:41 -05:00
|
|
|
if( !r.more() ) {
|
2010-08-02 18:48:24 -04:00
|
|
|
sethbmsg("replSet initial sync error reading remote oplog");
|
2010-10-30 14:41:49 -04:00
|
|
|
log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
|
2010-08-02 18:48:24 -04:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
bo op = r.next();
|
|
|
|
|
OpTime t = op["ts"]._opTime();
|
|
|
|
|
r.putBack(op);
|
2010-10-30 14:41:49 -04:00
|
|
|
|
2011-05-23 10:35:58 -04:00
|
|
|
if( op.firstElementFieldName() == string("$err") ) {
|
2010-10-30 14:41:49 -04:00
|
|
|
log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
|
2010-08-02 18:48:24 -04:00
|
|
|
if( t > applyGTE ) {
|
|
|
|
|
sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
|
2010-10-30 14:41:49 -04:00
|
|
|
log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
|
|
|
|
|
log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
|
2010-08-02 18:48:24 -04:00
|
|
|
return false;
|
|
|
|
|
}
|
2011-03-21 15:59:05 -04:00
|
|
|
|
|
|
|
|
sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
|
|
|
|
|
<< t.toStringPretty() << " to " << minValid.toStringPretty());
|
2010-08-02 18:48:24 -04:00
|
|
|
}
|
2011-01-13 13:30:13 -05:00
|
|
|
}
|
|
|
|
|
catch(DBException& e) {
|
|
|
|
|
log() << "replSet initial sync failing: " << e.toString() << rsLog;
|
|
|
|
|
return false;
|
|
|
|
|
}
|
2010-08-02 18:48:24 -04:00
|
|
|
|
2011-01-13 13:30:13 -05:00
|
|
|
/* we lock outside the loop to avoid the overhead of locking on every operation. */
|
|
|
|
|
writelock lk("");
|
2010-08-29 16:03:17 -04:00
|
|
|
|
2011-01-13 13:30:13 -05:00
|
|
|
// todo : use exhaust
|
|
|
|
|
OpTime ts;
|
2011-03-21 15:59:05 -04:00
|
|
|
time_t start = time(0);
|
2011-01-13 13:30:13 -05:00
|
|
|
unsigned long long n = 0;
|
|
|
|
|
while( 1 ) {
|
|
|
|
|
try {
|
2010-08-02 14:03:33 -04:00
|
|
|
if( !r.more() )
|
|
|
|
|
break;
|
|
|
|
|
BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
|
2011-09-19 13:06:04 -04:00
|
|
|
ts = o["ts"]._opTime();
|
2010-08-02 14:03:33 -04:00
|
|
|
|
2011-09-19 13:06:04 -04:00
|
|
|
{
|
2010-11-09 17:28:32 -05:00
|
|
|
if( (source->state() != MemberState::RS_PRIMARY &&
|
2011-01-04 00:40:41 -05:00
|
|
|
source->state() != MemberState::RS_SECONDARY) ||
|
|
|
|
|
replSetForceInitialSyncFailure ) {
|
|
|
|
|
|
2010-08-29 16:03:17 -04:00
|
|
|
int f = replSetForceInitialSyncFailure;
|
|
|
|
|
if( f > 0 ) {
|
|
|
|
|
replSetForceInitialSyncFailure = f-1;
|
|
|
|
|
log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
|
2010-11-09 17:28:32 -05:00
|
|
|
throw DBException("forced error",0);
|
2010-08-29 16:03:17 -04:00
|
|
|
}
|
2010-11-09 17:28:32 -05:00
|
|
|
log() << "replSet we are now primary" << rsLog;
|
2010-08-02 14:03:33 -04:00
|
|
|
throw DBException("primary changed",0);
|
|
|
|
|
}
|
|
|
|
|
|
2011-09-18 17:23:28 -04:00
|
|
|
if( ts >= applyGTE ) { // optimes before we started copying need not be applied.
|
|
|
|
|
bool failedUpdate = syncApply(o);
|
|
|
|
|
if( failedUpdate ) {
|
|
|
|
|
// we don't have the object yet, which is possible on initial sync. get it.
|
|
|
|
|
log() << "replSet info adding missing object" << endl; // rare enough we can log
|
|
|
|
|
if( !missingObjReader.connect(hn) ) { // ok to call more than once
|
|
|
|
|
log() << "replSet initial sync fails, couldn't connect to " << hn << endl;
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
const char *ns = o.getStringField("ns");
|
2011-09-19 21:38:45 -04:00
|
|
|
BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj(); // might be more than just _id in the update criteria
|
|
|
|
|
BSONObj missingObj;
|
|
|
|
|
try {
|
|
|
|
|
missingObj = missingObjReader.findOne(
|
|
|
|
|
ns,
|
|
|
|
|
query );
|
|
|
|
|
} catch(...) {
|
|
|
|
|
log() << "replSet assertion fetching missing object" << endl;
|
|
|
|
|
throw;
|
|
|
|
|
}
|
2011-09-22 22:34:54 -04:00
|
|
|
if( missingObj.isEmpty() ) {
|
|
|
|
|
log() << "replSet missing object not found on source. presumably deleted later in oplog" << endl;
|
|
|
|
|
log() << "replSet o2: " << o.getObjectField("o2").toString() << endl;
|
|
|
|
|
log() << "replSet o firstfield: " << o.getObjectField("o").firstElementFieldName() << endl;
|
2011-09-19 21:38:45 -04:00
|
|
|
}
|
2011-09-22 22:34:54 -04:00
|
|
|
else {
|
|
|
|
|
Client::Context ctx(ns);
|
|
|
|
|
try {
|
|
|
|
|
DiskLoc d = theDataFileMgr.insert(ns, (void*) missingObj.objdata(), missingObj.objsize());
|
|
|
|
|
assert( !d.isNull() );
|
|
|
|
|
} catch(...) {
|
|
|
|
|
log() << "replSet assertion during insert of missing object" << endl;
|
|
|
|
|
throw;
|
|
|
|
|
}
|
|
|
|
|
// now reapply the update from above
|
|
|
|
|
bool failed = syncApply(o);
|
|
|
|
|
if( failed ) {
|
|
|
|
|
log() << "replSet update still fails after adding missing object " << ns << endl;
|
|
|
|
|
assert(false);
|
|
|
|
|
}
|
2011-09-18 17:23:28 -04:00
|
|
|
}
|
|
|
|
|
}
|
2010-08-02 14:03:33 -04:00
|
|
|
}
|
|
|
|
|
_logOpObjRS(o); /* with repl sets we write the ops to our oplog too */
|
|
|
|
|
}
|
2011-03-21 15:59:05 -04:00
|
|
|
|
|
|
|
|
if ( ++n % 1000 == 0 ) {
|
|
|
|
|
time_t now = time(0);
|
|
|
|
|
if (now - start > 10) {
|
|
|
|
|
// simple progress metering
|
2011-04-28 15:47:10 -04:00
|
|
|
log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
|
2011-03-21 15:59:05 -04:00
|
|
|
<< ts.toStringPretty() << rsLog;
|
|
|
|
|
start = now;
|
|
|
|
|
}
|
2010-08-02 13:19:18 -04:00
|
|
|
}
|
2011-09-19 13:06:04 -04:00
|
|
|
|
|
|
|
|
if ( ts > minValid ) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
2011-03-02 01:28:17 -05:00
|
|
|
getDur().commitIfNeeded();
|
2010-08-02 14:03:33 -04:00
|
|
|
}
|
2011-01-13 13:30:13 -05:00
|
|
|
catch (DBException& e) {
|
2011-03-11 13:52:02 -05:00
|
|
|
// skip duplicate key exceptions
|
2011-01-13 13:30:13 -05:00
|
|
|
if( e.getCode() == 11000 || e.getCode() == 11001 ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2011-03-11 13:52:02 -05:00
|
|
|
|
|
|
|
|
// handle cursor not found (just requery)
|
|
|
|
|
if( e.getCode() == 13127 ) {
|
|
|
|
|
r.resetCursor();
|
2011-07-06 10:21:06 -04:00
|
|
|
r.tailingQueryGTE(rsoplog, ts);
|
2011-03-11 13:52:02 -05:00
|
|
|
if( r.haveCursor() ) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// TODO: handle server restart
|
2011-01-13 13:30:13 -05:00
|
|
|
|
|
|
|
|
if( ts <= minValid ) {
|
|
|
|
|
// didn't make it far enough
|
2011-09-22 22:44:06 -04:00
|
|
|
log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog;
|
2011-01-13 13:30:13 -05:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// otherwise, whatever
|
|
|
|
|
break;
|
2010-08-02 14:03:33 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2011-01-04 00:40:41 -05:00
|
|
|
/* should be in RECOVERING state on arrival here.
|
2010-08-30 10:56:17 -04:00
|
|
|
readlocks
|
|
|
|
|
@return true if transitioned to SECONDARY
|
|
|
|
|
*/
|
2011-01-04 00:40:41 -05:00
|
|
|
bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) {
|
2010-12-23 13:51:25 -05:00
|
|
|
bool golive = false;
|
2011-07-26 16:43:21 -04:00
|
|
|
|
|
|
|
|
{
|
|
|
|
|
lock lk( this );
|
|
|
|
|
|
|
|
|
|
if (_maintenanceMode > 0) {
|
|
|
|
|
// we're not actually going live
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2010-08-30 10:56:17 -04:00
|
|
|
{
|
|
|
|
|
readlock lk("local.replset.minvalid");
|
|
|
|
|
BSONObj mv;
|
2011-01-04 00:40:41 -05:00
|
|
|
if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
|
2010-08-30 10:56:17 -04:00
|
|
|
minvalid = mv["ts"]._opTime();
|
2011-01-04 00:40:41 -05:00
|
|
|
if( minvalid <= lastOpTimeWritten ) {
|
2010-08-30 10:56:17 -04:00
|
|
|
golive=true;
|
|
|
|
|
}
|
|
|
|
|
}
|
2011-01-04 00:40:41 -05:00
|
|
|
else
|
2010-08-30 10:56:17 -04:00
|
|
|
golive = true; /* must have been the original member */
|
|
|
|
|
}
|
|
|
|
|
if( golive ) {
|
|
|
|
|
sethbmsg("");
|
|
|
|
|
changeState(MemberState::RS_SECONDARY);
|
|
|
|
|
}
|
|
|
|
|
return golive;
|
|
|
|
|
}
|
|
|
|
|
|
2011-09-21 15:05:31 -04:00
|
|
|
bool ReplSetImpl::_isStale(OplogReader& r, const OpTime& startTs, BSONObj& remoteOldestOp) {
|
|
|
|
|
remoteOldestOp = r.findOne(rsoplog, Query());
|
|
|
|
|
OpTime remoteTs = remoteOldestOp["ts"]._opTime();
|
|
|
|
|
DEV log() << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
|
|
|
|
|
else LOG(3) << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog;
|
2011-01-04 00:40:41 -05:00
|
|
|
DEV {
|
2010-12-22 18:42:08 -05:00
|
|
|
log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog;
|
|
|
|
|
log() << "replSet our state: " << state().toString() << rsLog;
|
|
|
|
|
}
|
2011-09-21 15:05:31 -04:00
|
|
|
if( startTs >= remoteTs ) {
|
2011-05-27 12:37:27 -04:00
|
|
|
return false;
|
2010-12-22 18:42:08 -05:00
|
|
|
}
|
2011-05-27 12:37:27 -04:00
|
|
|
|
|
|
|
|
return true;
|
2010-12-22 18:42:08 -05:00
|
|
|
}
|
|
|
|
|
|
2011-09-21 15:05:31 -04:00
|
|
|
Member* ReplSetImpl::_getOplogReader(OplogReader& r, const OpTime& minTS) {
|
|
|
|
|
Member *target = 0, *stale = 0;
|
|
|
|
|
BSONObj oldest;
|
|
|
|
|
|
2010-12-23 13:51:25 -05:00
|
|
|
assert(r.conn() == 0);
|
2011-01-04 00:40:41 -05:00
|
|
|
|
2011-09-21 15:05:31 -04:00
|
|
|
while ((target = getMemberToSyncTo()) != 0) {
|
|
|
|
|
string current = target->fullName();
|
|
|
|
|
|
|
|
|
|
if( !r.connect(current) ) {
|
|
|
|
|
log(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
|
|
|
|
|
r.resetConnection();
|
|
|
|
|
veto(current);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if( !minTS.isNull() && _isStale(r, minTS, oldest) ) {
|
|
|
|
|
r.resetConnection();
|
|
|
|
|
veto(current, 600);
|
|
|
|
|
stale = target;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// if we made it here, the target is up and not stale
|
|
|
|
|
return target;
|
2010-12-22 18:42:08 -05:00
|
|
|
}
|
2011-09-21 15:05:31 -04:00
|
|
|
|
|
|
|
|
// the only viable sync target was stale
|
|
|
|
|
if (stale) {
|
|
|
|
|
log() << "replSet error RS102 too stale to catch up, at least from " << stale->fullName() << rsLog;
|
|
|
|
|
log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog;
|
|
|
|
|
log() << "replSet oldest at " << stale->fullName() << " : " << oldest["ts"]._opTime().toStringLong() << rsLog;
|
|
|
|
|
log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
|
|
|
|
|
|
|
|
|
|
// reset minvalid so that we can't become primary prematurely
|
|
|
|
|
{
|
|
|
|
|
writelock lk("local.replset.minvalid");
|
|
|
|
|
Helpers::putSingleton("local.replset.minvalid", oldest);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sethbmsg("error RS102 too stale to catch up");
|
|
|
|
|
changeState(MemberState::RS_RECOVERING);
|
|
|
|
|
sleepsecs(120);
|
2010-12-22 18:42:08 -05:00
|
|
|
}
|
2011-09-21 15:05:31 -04:00
|
|
|
|
|
|
|
|
return 0;
|
2011-01-04 00:40:41 -05:00
|
|
|
}
|
|
|
|
|
|
2010-12-23 13:51:25 -05:00
|
|
|
/* tail an oplog. ok to return, will be re-called. */
|
2011-01-04 00:40:41 -05:00
|
|
|
void ReplSetImpl::syncTail() {
|
2010-07-19 22:32:43 -04:00
|
|
|
// todo : locking vis a vis the mgr...
|
2010-07-19 20:09:00 -04:00
|
|
|
OplogReader r;
|
2010-12-22 18:42:08 -05:00
|
|
|
string hn;
|
2011-09-21 15:05:31 -04:00
|
|
|
|
|
|
|
|
// find a target to sync from the last op time written
|
|
|
|
|
Member* target = _getOplogReader(r, lastOpTimeWritten);
|
|
|
|
|
|
2011-04-06 12:40:53 -04:00
|
|
|
// no server found
|
|
|
|
|
if (target == 0) {
|
|
|
|
|
// if there is no one to sync from
|
|
|
|
|
OpTime minvalid;
|
|
|
|
|
tryToGoLiveAsASecondary(minvalid);
|
|
|
|
|
return;
|
2010-07-21 15:39:59 -04:00
|
|
|
}
|
2011-04-06 12:40:53 -04:00
|
|
|
|
2010-07-19 22:32:43 -04:00
|
|
|
r.tailingQueryGTE(rsoplog, lastOpTimeWritten);
|
2011-05-17 15:03:22 -04:00
|
|
|
// if target cut connections between connecting and querying (for
|
|
|
|
|
// example, because it stepped down) we might not have a cursor
|
|
|
|
|
if ( !r.haveCursor() ) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
2010-11-19 11:05:29 -05:00
|
|
|
|
|
|
|
|
uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );
|
2010-07-19 22:32:43 -04:00
|
|
|
|
|
|
|
|
{
|
2010-07-30 11:35:28 -04:00
|
|
|
if( !r.more() ) {
|
2010-07-30 15:00:19 -04:00
|
|
|
/* maybe we are ahead and need to roll back? */
|
|
|
|
|
try {
|
|
|
|
|
bo theirLastOp = r.getLastOp(rsoplog);
|
|
|
|
|
if( theirLastOp.isEmpty() ) {
|
|
|
|
|
log() << "replSet error empty query result from " << hn << " oplog" << rsLog;
|
|
|
|
|
sleepsecs(2);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
OpTime theirTS = theirLastOp["ts"]._opTime();
|
2011-01-04 00:40:41 -05:00
|
|
|
if( theirTS < lastOpTimeWritten ) {
|
2010-07-30 15:00:19 -04:00
|
|
|
log() << "replSet we are ahead of the primary, will try to roll back" << rsLog;
|
|
|
|
|
syncRollback(r);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
/* we're not ahead? maybe our new query got fresher data. best to come back and try again */
|
|
|
|
|
log() << "replSet syncTail condition 1" << rsLog;
|
|
|
|
|
sleepsecs(1);
|
|
|
|
|
}
|
2011-01-04 00:40:41 -05:00
|
|
|
catch(DBException& e) {
|
2010-07-30 15:00:19 -04:00
|
|
|
log() << "replSet error querying " << hn << ' ' << e.toString() << rsLog;
|
2011-09-21 15:05:31 -04:00
|
|
|
veto(target->fullName());
|
2010-07-30 15:00:19 -04:00
|
|
|
sleepsecs(2);
|
|
|
|
|
}
|
|
|
|
|
return;
|
2010-07-30 11:35:28 -04:00
|
|
|
}
|
2010-07-30 15:00:19 -04:00
|
|
|
|
2010-07-19 22:32:43 -04:00
|
|
|
BSONObj o = r.nextSafe();
|
|
|
|
|
OpTime ts = o["ts"]._opTime();
|
|
|
|
|
long long h = o["h"].numberLong();
|
2011-01-04 00:40:41 -05:00
|
|
|
if( ts != lastOpTimeWritten || h != lastH ) {
|
2011-04-28 15:47:10 -04:00
|
|
|
log() << "replSet our last op time written: " << lastOpTimeWritten.toStringPretty() << rsLog;
|
|
|
|
|
log() << "replset source's GTE: " << ts.toStringPretty() << rsLog;
|
2010-07-21 13:13:36 -04:00
|
|
|
syncRollback(r);
|
2010-07-19 22:32:43 -04:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2010-08-30 10:56:17 -04:00
|
|
|
/* we have now checked if we need to rollback and we either don't have to or did it. */
|
|
|
|
|
{
|
|
|
|
|
OpTime minvalid;
|
|
|
|
|
tryToGoLiveAsASecondary(minvalid);
|
|
|
|
|
}
|
|
|
|
|
|
2010-08-18 14:11:42 -04:00
|
|
|
while( 1 ) {
|
2010-07-20 11:05:27 -04:00
|
|
|
while( 1 ) {
|
2011-01-04 00:40:41 -05:00
|
|
|
if( !r.moreInCurrentBatch() ) {
|
|
|
|
|
/* we need to occasionally check some things. between
|
2010-07-20 11:05:27 -04:00
|
|
|
batches is probably a good time. */
|
|
|
|
|
|
|
|
|
|
/* perhaps we should check this earlier? but not before the rollback checks. */
|
2011-01-04 00:40:41 -05:00
|
|
|
if( state().recovering() ) {
|
2010-07-29 15:01:41 -04:00
|
|
|
/* can we go to RS_SECONDARY state? we can if not too old and if minvalid achieved */
|
2010-08-02 14:03:33 -04:00
|
|
|
OpTime minvalid;
|
2010-08-30 10:56:17 -04:00
|
|
|
bool golive = ReplSetImpl::tryToGoLiveAsASecondary(minvalid);
|
2010-07-29 15:01:41 -04:00
|
|
|
if( golive ) {
|
2010-08-30 10:56:17 -04:00
|
|
|
;
|
2010-07-29 15:01:41 -04:00
|
|
|
}
|
2011-01-04 00:40:41 -05:00
|
|
|
else {
|
2010-08-30 10:56:17 -04:00
|
|
|
sethbmsg(str::stream() << "still syncing, not yet to minValid optime" << minvalid.toString());
|
2010-07-29 15:01:41 -04:00
|
|
|
}
|
2010-07-20 11:05:27 -04:00
|
|
|
|
|
|
|
|
/* todo: too stale capability */
|
|
|
|
|
}
|
2011-01-04 00:40:41 -05:00
|
|
|
|
2011-04-07 17:22:13 -04:00
|
|
|
if( !target->hbinfo().hbstate.readable() ) {
|
|
|
|
|
return;
|
2010-12-22 18:42:08 -05:00
|
|
|
}
|
2010-07-20 11:05:27 -04:00
|
|
|
}
|
|
|
|
|
if( !r.more() )
|
|
|
|
|
break;
|
2011-01-04 00:40:41 -05:00
|
|
|
{
|
2010-07-20 11:05:27 -04:00
|
|
|
BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
|
2010-07-30 12:30:34 -04:00
|
|
|
|
2010-08-18 14:39:50 -04:00
|
|
|
int sd = myConfig().slaveDelay;
|
2010-10-04 14:52:20 -04:00
|
|
|
// ignore slaveDelay if the box is still initializing. once
|
|
|
|
|
// it becomes secondary we can worry about it.
|
2011-01-04 00:40:41 -05:00
|
|
|
if( sd && box.getState().secondary() ) {
|
2010-08-18 14:11:42 -04:00
|
|
|
const OpTime ts = o["ts"]._opTime();
|
|
|
|
|
long long a = ts.getSecs();
|
|
|
|
|
long long b = time(0);
|
|
|
|
|
long long lag = b - a;
|
|
|
|
|
long long sleeptime = sd - lag;
|
|
|
|
|
if( sleeptime > 0 ) {
|
|
|
|
|
uassert(12000, "rs slaveDelay differential too big check clocks and systems", sleeptime < 0x40000000);
|
2010-08-18 14:29:04 -04:00
|
|
|
if( sleeptime < 60 ) {
|
2010-08-18 14:11:42 -04:00
|
|
|
sleepsecs((int) sleeptime);
|
|
|
|
|
}
|
|
|
|
|
else {
|
2011-06-01 11:05:52 -04:00
|
|
|
log() << "replSet slavedelay sleep long time: " << sleeptime << rsLog;
|
2010-08-18 14:11:42 -04:00
|
|
|
// sleep(hours) would prevent reconfigs from taking effect & such!
|
|
|
|
|
long long waitUntil = b + sleeptime;
|
|
|
|
|
while( 1 ) {
|
2010-08-18 14:29:04 -04:00
|
|
|
sleepsecs(6);
|
2010-08-18 14:11:42 -04:00
|
|
|
if( time(0) >= waitUntil )
|
|
|
|
|
break;
|
2011-05-04 18:46:16 -04:00
|
|
|
|
2011-05-14 11:43:14 -04:00
|
|
|
if( !target->hbinfo().hbstate.readable() ) {
|
|
|
|
|
break;
|
2010-12-22 18:42:08 -05:00
|
|
|
}
|
2011-05-04 18:46:16 -04:00
|
|
|
|
2010-08-18 14:11:42 -04:00
|
|
|
if( myConfig().slaveDelay != sd ) // reconf
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2011-01-04 00:40:41 -05:00
|
|
|
|
2010-10-01 18:02:03 -04:00
|
|
|
}
|
|
|
|
|
|
2011-07-27 12:46:31 -04:00
|
|
|
try {
|
2010-10-01 18:02:03 -04:00
|
|
|
writelock lk("");
|
|
|
|
|
|
|
|
|
|
/* if we have become primary, we dont' want to apply things from elsewhere
|
2011-01-04 00:40:41 -05:00
|
|
|
anymore. assumePrimary is in the db lock so we are safe as long as
|
2010-10-01 18:02:03 -04:00
|
|
|
we check after we locked above. */
|
2010-12-22 18:42:08 -05:00
|
|
|
if( box.getState().primary() ) {
|
|
|
|
|
log(0) << "replSet stopping syncTail we are now primary" << rsLog;
|
2010-10-01 18:02:03 -04:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
syncApply(o);
|
2011-05-21 10:07:22 -04:00
|
|
|
_logOpObjRS(o); // with repl sets we write the ops to our oplog too
|
2010-08-18 14:11:42 -04:00
|
|
|
}
|
2011-07-27 12:46:31 -04:00
|
|
|
catch (DBException& e) {
|
|
|
|
|
sethbmsg(str::stream() << "syncTail: " << e.toString() << ", syncing: " << o);
|
2011-09-21 15:05:31 -04:00
|
|
|
veto(target->fullName(), 300);
|
2011-07-27 12:46:31 -04:00
|
|
|
sleepsecs(30);
|
|
|
|
|
return;
|
|
|
|
|
}
|
2010-07-19 22:32:43 -04:00
|
|
|
}
|
|
|
|
|
}
|
2010-07-20 13:37:09 -04:00
|
|
|
r.tailCheck();
|
|
|
|
|
if( !r.haveCursor() ) {
|
2011-07-16 12:43:51 -04:00
|
|
|
LOG(1) << "replSet end syncTail pass with " << hn << rsLog;
|
2010-08-09 12:01:27 -04:00
|
|
|
// TODO : reuse our connection to the primary.
|
2010-07-20 11:05:27 -04:00
|
|
|
return;
|
2010-07-20 13:37:09 -04:00
|
|
|
}
|
2011-05-04 18:46:16 -04:00
|
|
|
|
2011-05-14 11:43:14 -04:00
|
|
|
if( !target->hbinfo().hbstate.readable() ) {
|
|
|
|
|
return;
|
2010-12-22 18:42:08 -05:00
|
|
|
}
|
2010-07-19 22:32:43 -04:00
|
|
|
// looping back is ok because this is a tailable cursor
|
|
|
|
|
}
|
2010-07-19 20:09:00 -04:00
|
|
|
}
|
|
|
|
|
|
2010-07-19 13:08:24 -04:00
|
|
|
void ReplSetImpl::_syncThread() {
|
2010-07-22 17:50:54 -04:00
|
|
|
StateBox::SP sp = box.get();
|
2010-07-30 11:57:30 -04:00
|
|
|
if( sp.state.primary() ) {
|
|
|
|
|
sleepsecs(1);
|
2010-07-19 13:08:24 -04:00
|
|
|
return;
|
2010-07-30 11:57:30 -04:00
|
|
|
}
|
2011-10-05 16:55:55 -04:00
|
|
|
if( _blockSync || sp.state.fatal() || sp.state.startup() ) {
|
2010-09-02 10:16:52 -04:00
|
|
|
sleepsecs(5);
|
|
|
|
|
return;
|
|
|
|
|
}
|
2010-07-19 13:08:24 -04:00
|
|
|
|
2010-07-19 18:05:44 -04:00
|
|
|
/* do we have anything at all? */
|
|
|
|
|
if( lastOpTimeWritten.isNull() ) {
|
|
|
|
|
syncDoInitialSync();
|
2010-07-19 18:09:00 -04:00
|
|
|
return; // _syncThread will be recalled, starts from top again in case sync failed.
|
2010-07-19 18:05:44 -04:00
|
|
|
}
|
2010-07-19 18:09:00 -04:00
|
|
|
|
|
|
|
|
/* we have some data. continue tailing. */
|
2010-07-19 20:09:00 -04:00
|
|
|
syncTail();
|
2010-07-19 13:08:24 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ReplSetImpl::syncThread() {
|
2010-10-20 16:34:08 -04:00
|
|
|
while( 1 ) {
|
2011-04-27 09:29:45 -07:00
|
|
|
// After a reconfig, we may not be in the replica set anymore, so
|
|
|
|
|
// check that we are in the set (and not an arbiter) before
|
|
|
|
|
// trying to sync with other replicas.
|
2011-05-21 10:07:22 -04:00
|
|
|
if( ! _self ) {
|
|
|
|
|
log() << "replSet warning did not detect own host and port, not syncing, config: " << theReplSet->config() << rsLog;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
if( myConfig().arbiterOnly ) {
|
2011-04-28 15:47:10 -04:00
|
|
|
return;
|
2011-04-28 15:08:25 -04:00
|
|
|
}
|
2011-01-04 00:40:41 -05:00
|
|
|
|
2010-07-19 17:23:11 -04:00
|
|
|
try {
|
|
|
|
|
_syncThread();
|
2010-07-19 13:08:24 -04:00
|
|
|
}
|
2011-01-04 00:40:41 -05:00
|
|
|
catch(DBException& e) {
|
2011-07-27 12:46:31 -04:00
|
|
|
sethbmsg(str::stream() << "syncThread: " << e.toString());
|
2010-07-19 13:08:24 -04:00
|
|
|
sleepsecs(10);
|
|
|
|
|
}
|
2011-01-04 00:40:41 -05:00
|
|
|
catch(...) {
|
2010-07-20 13:37:09 -04:00
|
|
|
sethbmsg("unexpected exception in syncThread()");
|
2010-10-30 14:56:56 -04:00
|
|
|
// TODO : SET NOT SECONDARY here?
|
2010-07-20 13:37:09 -04:00
|
|
|
sleepsecs(60);
|
|
|
|
|
}
|
2010-07-30 11:57:30 -04:00
|
|
|
sleepsecs(1);
|
2010-10-04 11:22:40 -04:00
|
|
|
|
2011-01-04 00:40:41 -05:00
|
|
|
/* normally msgCheckNewState gets called periodically, but in a single node repl set there
|
|
|
|
|
are no heartbeat threads, so we do it here to be sure. this is relevant if the singleton
|
|
|
|
|
member has done a stepDown() and needs to come back up.
|
2010-10-04 11:22:40 -04:00
|
|
|
*/
|
2011-04-26 14:19:57 -04:00
|
|
|
OCCASIONALLY {
|
|
|
|
|
mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
|
|
|
|
|
}
|
2010-07-19 13:08:24 -04:00
|
|
|
}
|
2010-07-16 12:51:01 -04:00
|
|
|
}
|
2010-07-16 10:42:53 -04:00
|
|
|
|
2010-10-30 14:56:56 -04:00
|
|
|
void startSyncThread() {
|
|
|
|
|
static int n;
|
2010-10-31 12:11:38 -04:00
|
|
|
if( n != 0 ) {
|
|
|
|
|
log() << "replSet ERROR : more than one sync thread?" << rsLog;
|
|
|
|
|
assert( n == 0 );
|
|
|
|
|
}
|
2010-10-30 14:56:56 -04:00
|
|
|
n++;
|
|
|
|
|
|
2011-07-16 18:18:03 -04:00
|
|
|
Client::initThread("rsSync");
|
2011-05-21 10:07:22 -04:00
|
|
|
cc().iAmSyncThread(); // for isSyncThread() (which is used not used much, is used in secondary create index code
|
2011-05-10 14:41:01 -04:00
|
|
|
replLocalAuth();
|
2010-10-30 14:56:56 -04:00
|
|
|
theReplSet->syncThread();
|
|
|
|
|
cc().shutdown();
|
|
|
|
|
}
|
|
|
|
|
|
2011-07-18 17:01:44 -04:00
|
|
|
void GhostSync::starting() {
|
|
|
|
|
Client::initThread("rsGhostSync");
|
|
|
|
|
replLocalAuth();
|
|
|
|
|
}
|
|
|
|
|
|
2011-10-05 16:55:55 -04:00
|
|
|
void ReplSetImpl::blockSync(bool block) {
|
|
|
|
|
_blockSync = block;
|
|
|
|
|
if (_blockSync) {
|
|
|
|
|
// syncing is how we get into SECONDARY state, so we'll be stuck in
|
|
|
|
|
// RECOVERING until we unblock
|
|
|
|
|
changeState(MemberState::RS_RECOVERING);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-07-16 08:03:52 -04:00
|
|
|
void GhostSync::associateSlave(const BSONObj& id, const int memberId) {
|
|
|
|
|
const OID rid = id["_id"].OID();
|
|
|
|
|
rwlock lk( _lock , true );
|
|
|
|
|
GhostSlave &slave = _ghostCache[rid];
|
2011-06-09 15:05:34 -04:00
|
|
|
if (slave.init) {
|
2011-07-15 10:59:24 -04:00
|
|
|
LOG(1) << "tracking " << slave.slave->h().toString() << " as " << rid << rsLog;
|
2011-06-09 15:05:34 -04:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
slave.slave = (Member*)rs->findById(memberId);
|
|
|
|
|
if (slave.slave != 0) {
|
|
|
|
|
slave.init = true;
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
log() << "replset couldn't find a slave with id " << memberId
|
|
|
|
|
<< ", not tracking " << rid << rsLog;
|
2011-05-10 19:14:41 -04:00
|
|
|
}
|
|
|
|
|
}
|
2011-06-09 15:05:34 -04:00
|
|
|
|
2011-07-16 08:03:52 -04:00
|
|
|
void GhostSync::updateSlave(const mongo::OID& rid, const OpTime& last) {
|
|
|
|
|
rwlock lk( _lock , false );
|
|
|
|
|
MAP::iterator i = _ghostCache.find( rid );
|
|
|
|
|
if ( i == _ghostCache.end() ) {
|
2011-08-30 11:18:30 -04:00
|
|
|
OCCASIONALLY warning() << "couldn't update slave " << rid << " no entry" << rsLog;
|
2011-07-16 08:03:52 -04:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
GhostSlave& slave = i->second;
|
2011-06-09 15:05:34 -04:00
|
|
|
if (!slave.init) {
|
2011-07-16 08:03:52 -04:00
|
|
|
OCCASIONALLY log() << "couldn't update slave " << rid << " not init" << rsLog;
|
2011-06-09 15:05:34 -04:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
((ReplSetConfig::MemberCfg)slave.slave->config()).updateGroups(last);
|
|
|
|
|
}
|
|
|
|
|
|
2011-07-16 08:03:52 -04:00
|
|
|
void GhostSync::percolate(const BSONObj& id, const OpTime& last) {
|
|
|
|
|
const OID rid = id["_id"].OID();
|
2011-07-18 18:07:47 -04:00
|
|
|
GhostSlave* slave;
|
|
|
|
|
{
|
|
|
|
|
rwlock lk( _lock , false );
|
2011-07-16 08:03:52 -04:00
|
|
|
|
2011-07-18 18:07:47 -04:00
|
|
|
MAP::iterator i = _ghostCache.find( rid );
|
|
|
|
|
if ( i == _ghostCache.end() ) {
|
|
|
|
|
OCCASIONALLY log() << "couldn't percolate slave " << rid << " no entry" << rsLog;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
slave = &(i->second);
|
|
|
|
|
if (!slave->init) {
|
|
|
|
|
OCCASIONALLY log() << "couldn't percolate slave " << rid << " not init" << rsLog;
|
|
|
|
|
return;
|
|
|
|
|
}
|
2011-07-16 08:03:52 -04:00
|
|
|
}
|
|
|
|
|
|
2011-07-18 18:07:47 -04:00
|
|
|
assert(slave->slave);
|
2011-05-13 10:55:21 -04:00
|
|
|
|
2011-06-09 15:05:34 -04:00
|
|
|
const Member *target = rs->_currentSyncTarget;
|
|
|
|
|
if (!target || rs->box.getState().primary()
|
2011-05-12 15:14:19 -04:00
|
|
|
// we are currently syncing from someone who's syncing from us
|
|
|
|
|
// the target might end up with a new Member, but s.slave never
|
|
|
|
|
// changes so we'll compare the names
|
2011-07-18 18:07:47 -04:00
|
|
|
|| target == slave->slave || target->fullName() == slave->slave->fullName()) {
|
2011-07-16 12:43:51 -04:00
|
|
|
LOG(1) << "replica set ghost target no good" << endl;
|
2011-05-10 19:14:41 -04:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
try {
|
2011-07-18 18:07:47 -04:00
|
|
|
if (!slave->reader.haveCursor()) {
|
|
|
|
|
if (!slave->reader.connect(id, slave->slave->id(), target->fullName())) {
|
2011-05-10 19:14:41 -04:00
|
|
|
// error message logged in OplogReader::connect
|
|
|
|
|
return;
|
|
|
|
|
}
|
2011-07-18 18:07:47 -04:00
|
|
|
slave->reader.ghostQueryGTE(rsoplog, last);
|
2011-05-10 19:14:41 -04:00
|
|
|
}
|
|
|
|
|
|
2011-07-16 12:43:51 -04:00
|
|
|
LOG(1) << "replSet last: " << slave->last.toString() << " to " << last.toString() << rsLog;
|
2011-07-18 18:07:47 -04:00
|
|
|
if (slave->last > last) {
|
2011-05-10 19:14:41 -04:00
|
|
|
return;
|
|
|
|
|
}
|
2011-07-18 18:07:47 -04:00
|
|
|
|
|
|
|
|
while (slave->last <= last) {
|
|
|
|
|
if (!slave->reader.more()) {
|
2011-05-16 14:52:40 -04:00
|
|
|
// we'll be back
|
|
|
|
|
return;
|
2011-05-10 19:14:41 -04:00
|
|
|
}
|
2011-07-18 18:07:47 -04:00
|
|
|
|
|
|
|
|
BSONObj o = slave->reader.nextSafe();
|
|
|
|
|
slave->last = o["ts"]._opTime();
|
2011-05-10 19:14:41 -04:00
|
|
|
}
|
2011-07-16 12:43:51 -04:00
|
|
|
LOG(2) << "now last is " << slave->last.toString() << rsLog;
|
2011-05-10 19:14:41 -04:00
|
|
|
}
|
|
|
|
|
catch (DBException& e) {
|
|
|
|
|
// we'll be back
|
2011-07-16 12:43:51 -04:00
|
|
|
LOG(2) << "replSet ghost sync error: " << e.what() << " for "
|
2011-07-18 18:07:47 -04:00
|
|
|
<< slave->slave->fullName() << rsLog;
|
|
|
|
|
slave->reader.resetConnection();
|
2011-05-10 19:14:41 -04:00
|
|
|
}
|
|
|
|
|
}
|
2010-07-16 10:42:53 -04:00
|
|
|
}
|