2010-05-11 15:58:44 -04:00
|
|
|
/* @file manager.cpp
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Copyright (C) 2008 10gen Inc.
|
|
|
|
|
*
|
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU Affero General Public License, version 3,
|
|
|
|
|
* as published by the Free Software Foundation.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU Affero General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "pch.h"
|
2010-05-29 15:45:47 -04:00
|
|
|
#include "rs.h"
|
2010-06-26 13:05:32 -04:00
|
|
|
#include "../client.h"
|
2010-05-11 15:58:44 -04:00
|
|
|
|
|
|
|
|
namespace mongo {
|
|
|
|
|
|
|
|
|
|
/* Negative sentinel values. NOPRIMARY is what the Manager constructor stores
   in _primary.
   NOTE(review): SELFPRIMARY is not used in this part of the file; presumably
   it marks "this node is the primary" -- confirm against its users. */
enum { NOPRIMARY = -2, SELFPRIMARY = -1 };
|
|
|
|
|
|
2010-05-12 17:43:21 -04:00
|
|
|
/* check members OTHER THAN US to see if they think they are primary */
|
2010-08-17 12:42:50 -04:00
|
|
|
const Member * Manager::findOtherPrimary(bool& two) {
|
|
|
|
|
two = false;
|
2010-05-24 17:11:47 -04:00
|
|
|
Member *m = rs->head();
|
2010-05-12 17:43:21 -04:00
|
|
|
Member *p = 0;
|
|
|
|
|
while( m ) {
|
2010-08-17 12:42:50 -04:00
|
|
|
DEV assert( m != rs->_self );
|
2010-07-22 17:50:54 -04:00
|
|
|
if( m->state().primary() && m->hbinfo().up() ) {
|
2010-08-17 12:42:50 -04:00
|
|
|
if( p ) {
|
|
|
|
|
two = true;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
2010-05-12 17:43:21 -04:00
|
|
|
p = m;
|
|
|
|
|
}
|
|
|
|
|
m = m->next();
|
|
|
|
|
}
|
2010-05-24 17:11:47 -04:00
|
|
|
if( p )
|
|
|
|
|
noteARemoteIsPrimary(p);
|
2010-05-12 17:43:21 -04:00
|
|
|
return p;
|
2010-05-12 16:03:09 -04:00
|
|
|
}
|
|
|
|
|
|
2010-06-01 16:25:47 -04:00
|
|
|
/* Construct the manager for replica set _rs.
   The task::Server base is named "rs Manager"; we start out not busy with an
   election and with _primary set to NOPRIMARY. */
Manager::Manager(ReplSetImpl *_rs)
    : task::Server("rs Manager"),
      rs(_rs),
      busyWithElectSelf(false),
      _primary(NOPRIMARY)
{ }
|
2010-08-02 14:37:10 -04:00
|
|
|
|
2010-07-28 11:06:04 -04:00
|
|
|
/* Logs that the manager is going away and clears the replset's pointer to it. */
Manager::~Manager() {
    /* The replset object we sit in is deliberately NOT destroyed here.
       NOTE(review): the original remark read "the destructor could have
       thrown on init" -- presumably the constructor was meant; confirm.
       The log line below is only a reminder to come back, review this code
       further and make it cleaner.
    */
    log() << "info: ~Manager called" << rsLog;

    // the replset must no longer refer to this (now dying) manager
    rs->mgr = 0;
}
|
|
|
|
|
|
2010-06-26 13:05:32 -04:00
|
|
|
/* Calls Client::initThread() with the name "rs Manager".
   NOTE(review): presumably invoked by the task::Server base on the manager's
   own thread as it starts -- confirm against task::Server. */
void Manager::starting() {
    Client::initThread( "rs Manager" );
}
|
|
|
|
|
|
2010-06-01 16:25:47 -04:00
|
|
|
void Manager::noteARemoteIsPrimary(const Member *m) {
|
2010-07-22 17:50:54 -04:00
|
|
|
if( rs->box.getPrimary() == m )
|
2010-07-20 13:37:09 -04:00
|
|
|
return;
|
2010-05-24 17:11:47 -04:00
|
|
|
rs->_self->lhb() = "";
|
2010-08-12 16:06:16 -04:00
|
|
|
if( rs->iAmArbiterOnly() ) {
|
|
|
|
|
rs->box.set(MemberState::RS_ARBITER, m);
|
|
|
|
|
} else {
|
|
|
|
|
rs->box.noteRemoteIsPrimary(m);
|
|
|
|
|
}
|
2010-05-12 17:43:21 -04:00
|
|
|
}
|
|
|
|
|
|
2010-05-12 16:03:09 -04:00
|
|
|
/** called as the health threads get new results */
|
2010-06-01 16:25:47 -04:00
|
|
|
void Manager::msgCheckNewState() {
|
2010-06-29 15:52:35 -04:00
|
|
|
{
|
2010-07-26 21:44:17 -04:00
|
|
|
theReplSet->assertValid();
|
|
|
|
|
rs->assertValid();
|
|
|
|
|
|
2010-06-29 15:52:35 -04:00
|
|
|
RSBase::lock lk(rs);
|
|
|
|
|
|
|
|
|
|
if( busyWithElectSelf ) return;
|
|
|
|
|
|
2010-07-22 17:50:54 -04:00
|
|
|
const Member *p = rs->box.getPrimary();
|
2010-07-23 13:35:27 -04:00
|
|
|
if( p && p != rs->_self ) {
|
|
|
|
|
if( !p->hbinfo().up() ||
|
|
|
|
|
!p->hbinfo().hbstate.primary() )
|
|
|
|
|
{
|
|
|
|
|
p = 0;
|
|
|
|
|
rs->box.setOtherPrimary(0);
|
|
|
|
|
}
|
2010-07-20 14:58:51 -04:00
|
|
|
}
|
|
|
|
|
|
2010-06-29 15:52:35 -04:00
|
|
|
const Member *p2;
|
2010-08-17 12:42:50 -04:00
|
|
|
{
|
|
|
|
|
bool two;
|
|
|
|
|
p2 = findOtherPrimary(two);
|
|
|
|
|
if( two ) {
|
|
|
|
|
/* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */
|
2010-08-24 13:18:47 -04:00
|
|
|
log() << "replSet info two primaries (transiently)" << rsLog;
|
2010-08-17 12:42:50 -04:00
|
|
|
return;
|
|
|
|
|
}
|
2010-06-29 15:52:35 -04:00
|
|
|
}
|
2010-05-24 17:11:47 -04:00
|
|
|
|
2010-07-20 14:58:51 -04:00
|
|
|
if( p2 ) {
|
2010-06-29 15:52:35 -04:00
|
|
|
/* someone else thinks they are primary. */
|
2010-07-22 20:15:30 -04:00
|
|
|
if( p == p2 ) {
|
|
|
|
|
// we thought the same; all set.
|
2010-07-19 10:33:30 -04:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
if( p == 0 ) {
|
|
|
|
|
noteARemoteIsPrimary(p2);
|
2010-06-22 20:05:11 -04:00
|
|
|
return;
|
2010-07-19 10:33:30 -04:00
|
|
|
}
|
2010-07-22 15:28:02 -04:00
|
|
|
// todo xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
2010-07-19 10:33:30 -04:00
|
|
|
if( p != rs->_self ) {
|
2010-07-22 20:15:30 -04:00
|
|
|
// switch primary from oldremotep->newremotep2
|
2010-07-19 10:33:30 -04:00
|
|
|
noteARemoteIsPrimary(p2);
|
|
|
|
|
return;
|
|
|
|
|
}
|
2010-06-29 15:52:35 -04:00
|
|
|
/* we thought we were primary, yet now someone else thinks they are. */
|
2010-07-19 10:33:30 -04:00
|
|
|
if( !rs->elect.aMajoritySeemsToBeUp() ) {
|
2010-07-22 20:15:30 -04:00
|
|
|
/* we can't see a majority. so the other node is probably the right choice. */
|
2010-07-19 10:33:30 -04:00
|
|
|
noteARemoteIsPrimary(p2);
|
|
|
|
|
return;
|
|
|
|
|
}
|
2010-07-22 20:15:30 -04:00
|
|
|
/* ignore for now, keep thinking we are master.
|
|
|
|
|
this could just be timing (we poll every couple seconds) or could indicate
|
|
|
|
|
a problem? if it happens consistently for a duration of time we should
|
|
|
|
|
alert the sysadmin.
|
|
|
|
|
*/
|
2010-06-29 15:52:35 -04:00
|
|
|
return;
|
2010-06-22 20:05:11 -04:00
|
|
|
}
|
2010-06-29 15:52:35 -04:00
|
|
|
|
2010-07-22 20:15:30 -04:00
|
|
|
/* didn't find anyone who wants to be primary */
|
2010-07-20 13:37:09 -04:00
|
|
|
|
2010-06-29 15:52:35 -04:00
|
|
|
if( p ) {
|
2010-07-26 16:19:08 -04:00
|
|
|
/* we are already primary */
|
2010-07-23 13:35:27 -04:00
|
|
|
|
|
|
|
|
if( p != rs->_self ) {
|
|
|
|
|
rs->sethbmsg("error p != rs->self in checkNewState");
|
|
|
|
|
log() << "replSet " << p->fullName() << rsLog;
|
|
|
|
|
log() << "replSet " << rs->_self->fullName() << rsLog;
|
2010-07-26 16:19:08 -04:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2010-08-09 12:01:27 -04:00
|
|
|
if( rs->elect.shouldRelinquish() ) {
|
2010-07-26 16:19:08 -04:00
|
|
|
log() << "replSet can't see a majority of the set, relinquishing primary" << rsLog;
|
|
|
|
|
rs->relinquish();
|
2010-07-23 13:35:27 -04:00
|
|
|
}
|
2010-07-26 16:19:08 -04:00
|
|
|
|
2010-06-29 15:52:35 -04:00
|
|
|
return;
|
2010-06-22 20:05:11 -04:00
|
|
|
}
|
2010-06-29 15:52:35 -04:00
|
|
|
|
2010-07-22 20:15:30 -04:00
|
|
|
if( !rs->iAmPotentiallyHot() ) // if not we never try to be primary
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
/* TODO : CHECK PRIORITY HERE. can't be elected if priority zero. */
|
|
|
|
|
|
2010-06-29 15:52:35 -04:00
|
|
|
/* no one seems to be primary. shall we try to elect ourself? */
|
|
|
|
|
if( !rs->elect.aMajoritySeemsToBeUp() ) {
|
2010-07-28 20:24:21 -04:00
|
|
|
static time_t last;
|
2010-07-23 13:35:27 -04:00
|
|
|
static int n;
|
2010-07-28 20:24:21 -04:00
|
|
|
int ll = 0;
|
|
|
|
|
if( ++n > 5 ) ll++;
|
|
|
|
|
if( last + 60 > time(0 ) ) ll++;
|
|
|
|
|
log(ll) << "replSet can't see a majority, will not try to elect self" << rsLog;
|
|
|
|
|
last = time(0);
|
2010-06-29 15:52:35 -04:00
|
|
|
return;
|
2010-06-22 20:05:11 -04:00
|
|
|
}
|
2010-06-29 15:52:35 -04:00
|
|
|
|
|
|
|
|
busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one.
|
|
|
|
|
}
|
|
|
|
|
try {
|
|
|
|
|
rs->elect.electSelf();
|
|
|
|
|
}
|
|
|
|
|
catch(RetryAfterSleepException&) {
|
|
|
|
|
/* we want to process new inbounds before trying this again. so we just put a checkNewstate in the queue for eval later. */
|
|
|
|
|
requeue();
|
|
|
|
|
}
|
|
|
|
|
catch(...) {
|
|
|
|
|
log() << "replSet error unexpected assertion in rs manager" << rsLog;
|
|
|
|
|
}
|
|
|
|
|
busyWithElectSelf = false;
|
2010-05-24 17:11:47 -04:00
|
|
|
}
|
2010-05-19 14:21:41 -04:00
|
|
|
|
2010-05-11 15:58:44 -04:00
|
|
|
}
|