2010-04-14 17:25:03 -04:00
|
|
|
/**
|
|
|
|
|
* Copyright (C) 2008 10gen Inc.
|
|
|
|
|
*
|
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU Affero General Public License, version 3,
|
|
|
|
|
* as published by the Free Software Foundation.
|
|
|
|
|
*
|
2010-04-18 12:30:40 -04:00
|
|
|
* This program is distributed in the hope that it will be useful,
|
2010-04-14 17:25:03 -04:00
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU Affero General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
*/
|
|
|
|
|
|
2010-04-27 15:27:52 -04:00
|
|
|
#include "pch.h"
|
2010-04-14 17:25:03 -04:00
|
|
|
#include "replset.h"
|
|
|
|
|
#include "health.h"
|
2010-04-18 12:30:40 -04:00
|
|
|
#include "../../util/background.h"
|
|
|
|
|
#include "../../client/dbclient.h"
|
|
|
|
|
#include "../commands.h"
|
|
|
|
|
#include "../../util/concurrency/value.h"
|
2010-05-07 16:42:55 -04:00
|
|
|
#include "../../util/mongoutils/html.h"
|
2010-05-05 16:33:35 -04:00
|
|
|
#include "../../util/goodies.h"
|
2010-05-07 16:42:55 -04:00
|
|
|
#include "../../util/ramlog.h"
|
2010-05-06 10:05:48 -04:00
|
|
|
#include "../helpers/dblogger.h"
|
2010-05-06 16:48:07 -04:00
|
|
|
#include "connections.h"
|
|
|
|
|
|
|
|
|
|
namespace mongo {
|
|
|
|
|
/* decls for connections.h */
|
|
|
|
|
ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M());
|
|
|
|
|
mutex ScopedConn::mapMutex;
|
|
|
|
|
}
|
2010-04-14 17:25:03 -04:00
|
|
|
|
|
|
|
|
namespace mongo {
|
|
|
|
|
|
2010-05-05 16:33:35 -04:00
|
|
|
using namespace mongoutils::html;
|
2010-05-13 17:18:17 -04:00
|
|
|
using namespace bson;
|
2010-05-05 16:33:35 -04:00
|
|
|
|
2010-05-07 16:42:55 -04:00
|
|
|
static RamLog _rsLog;
|
|
|
|
|
Tee *rsLog = &_rsLog;
|
|
|
|
|
|
2010-04-22 16:17:18 -04:00
|
|
|
/* { replSetHeartbeat : <setname> } */
|
2010-04-18 12:30:40 -04:00
|
|
|
class CmdReplSetHeartbeat : public Command {
|
|
|
|
|
public:
|
2010-04-23 15:50:49 -04:00
|
|
|
virtual bool slaveOk() const { return true; }
|
2010-04-23 16:41:56 -04:00
|
|
|
virtual bool adminOnly() const { return false; }
|
2010-04-18 12:30:40 -04:00
|
|
|
virtual bool logTheOp() { return false; }
|
2010-04-23 15:50:49 -04:00
|
|
|
virtual LockType locktype() const { return NONE; }
|
2010-04-23 16:41:56 -04:00
|
|
|
virtual void help( stringstream &help ) const { help<<"internal"; }
|
2010-04-18 12:30:40 -04:00
|
|
|
CmdReplSetHeartbeat() : Command("replSetHeartbeat") { }
|
2010-05-03 16:25:34 -04:00
|
|
|
virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
|
2010-04-21 21:19:37 -04:00
|
|
|
if( !replSet ) {
|
2010-04-18 12:30:40 -04:00
|
|
|
errmsg = "not a replset member";
|
|
|
|
|
return false;
|
|
|
|
|
}
|
2010-05-13 11:03:23 -04:00
|
|
|
if( cmdObj["pv"].Int() != 1 ) {
|
|
|
|
|
errmsg = "incompatible replset protocol version";
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
string s = string(cmdObj.getStringField("replSetHeartbeat"))+'/';
|
|
|
|
|
if( !startsWith(cmdLine.replSet, s ) ) {
|
2010-05-08 14:12:24 -04:00
|
|
|
errmsg = "repl set names do not match";
|
2010-05-13 11:03:23 -04:00
|
|
|
cout << "cmdline: " << cmdLine.replSet << endl;
|
|
|
|
|
cout << "s: " << s << endl;
|
2010-05-08 14:12:24 -04:00
|
|
|
result.append("mismatch", true);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
2010-05-13 17:18:17 -04:00
|
|
|
result.append("rs", true);
|
2010-04-21 21:19:37 -04:00
|
|
|
if( theReplSet == 0 ) {
|
|
|
|
|
errmsg = "still initializing";
|
|
|
|
|
return false;
|
|
|
|
|
}
|
2010-05-13 17:18:17 -04:00
|
|
|
|
|
|
|
|
if( theReplSet->name() != cmdObj.getStringField("replSetHeartbeat") ) {
|
2010-05-08 14:12:24 -04:00
|
|
|
errmsg = "repl set names do not match (2)";
|
|
|
|
|
result.append("mismatch", true);
|
2010-04-20 15:30:37 -04:00
|
|
|
return false;
|
|
|
|
|
}
|
2010-05-13 17:18:17 -04:00
|
|
|
result.append("set", theReplSet->name());
|
|
|
|
|
result.append("state", theReplSet->state());
|
|
|
|
|
int v = theReplSet->config().version;
|
|
|
|
|
result.append("v", v);
|
|
|
|
|
// if( v > cmdObj["v"].Int() )
|
|
|
|
|
// result << "config" << theReplSet->config().asBson();
|
2010-04-18 12:30:40 -04:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
} cmdReplSetHeartbeat;
|
|
|
|
|
|
2010-05-09 17:29:35 -04:00
|
|
|
/* throws dbexception */
|
2010-05-13 11:03:23 -04:00
|
|
|
bool requestHeartbeat(string setName, string memberFullName, BSONObj& result, int myCfgVersion, int& theirCfgVersion) {
|
|
|
|
|
BSONObj cmd = BSON( "replSetHeartbeat" << setName << "v" << myCfgVersion << "pv" << 1 );
|
2010-05-08 14:12:24 -04:00
|
|
|
ScopedConn conn(memberFullName);
|
|
|
|
|
return conn->runCommand("admin", cmd, result);
|
|
|
|
|
}
|
|
|
|
|
|
2010-04-18 12:35:37 -04:00
|
|
|
/* poll every other set member to check its status */
|
2010-04-18 12:30:40 -04:00
|
|
|
class FeedbackThread : public BackgroundJob {
|
|
|
|
|
public:
|
2010-05-11 15:58:44 -04:00
|
|
|
ReplSet::Member *m;
|
2010-04-20 15:30:37 -04:00
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
void down() {
|
2010-04-22 11:58:19 -04:00
|
|
|
m->_health = 0.0;
|
|
|
|
|
if( m->_upSince ) {
|
|
|
|
|
m->_upSince = 0;
|
2010-05-07 16:42:55 -04:00
|
|
|
log() << "replSet " << m->fullName() << " is now down" << rsLog;
|
2010-04-20 15:30:37 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public:
|
2010-04-18 12:30:40 -04:00
|
|
|
void run() {
|
2010-04-20 15:30:37 -04:00
|
|
|
mongo::lastError.reset( new LastError() );
|
2010-04-18 12:30:40 -04:00
|
|
|
while( 1 ) {
|
|
|
|
|
try {
|
|
|
|
|
BSONObj info;
|
2010-05-13 11:03:23 -04:00
|
|
|
int theirConfigVersion = -10000;
|
2010-05-13 17:18:17 -04:00
|
|
|
bool ok = requestHeartbeat(theReplSet->name(), m->fullName(), info, theReplSet->config().version, theirConfigVersion);
|
2010-05-08 14:12:24 -04:00
|
|
|
m->_lastHeartbeat = time(0); // we set this on any response - we don't get this far if couldn't connect because exception is thrown
|
2010-05-13 17:18:17 -04:00
|
|
|
{
|
|
|
|
|
be state = info["state"];
|
|
|
|
|
if( state.ok() )
|
|
|
|
|
m->_state = (ReplSet::State) state.Int();
|
|
|
|
|
}
|
2010-04-18 12:35:37 -04:00
|
|
|
if( ok ) {
|
2010-04-22 11:58:19 -04:00
|
|
|
if( m->_upSince == 0 ) {
|
2010-05-07 16:42:55 -04:00
|
|
|
log() << "replSet " << m->fullName() << " is now up" << rsLog;
|
2010-04-22 11:58:19 -04:00
|
|
|
m->_upSince = m->_lastHeartbeat;
|
2010-04-20 15:30:37 -04:00
|
|
|
}
|
2010-04-22 11:58:19 -04:00
|
|
|
m->_health = 1.0;
|
|
|
|
|
m->_lastHeartbeatErrMsg.set("");
|
2010-05-13 17:18:17 -04:00
|
|
|
|
|
|
|
|
be cfg = info["config"];
|
|
|
|
|
if( cfg.ok() ) {
|
|
|
|
|
//theReplSet->receivedNewConfig(cfg.Obj());
|
|
|
|
|
}
|
2010-04-20 15:30:37 -04:00
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
down();
|
2010-04-22 11:58:19 -04:00
|
|
|
m->_lastHeartbeatErrMsg.set(info.getStringField("errmsg"));
|
2010-04-18 12:35:37 -04:00
|
|
|
}
|
2010-04-18 12:30:40 -04:00
|
|
|
}
|
|
|
|
|
catch(...) {
|
2010-04-20 15:30:37 -04:00
|
|
|
down();
|
2010-04-22 11:58:19 -04:00
|
|
|
m->_lastHeartbeatErrMsg.set("connect/transport error");
|
2010-04-18 12:30:40 -04:00
|
|
|
}
|
2010-05-11 15:58:44 -04:00
|
|
|
theReplSet->_mgr.checkNewState();
|
2010-04-18 12:30:40 -04:00
|
|
|
sleepsecs(2);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
};
|
2010-05-11 13:33:55 -04:00
|
|
|
|
2010-05-13 17:18:17 -04:00
|
|
|
/* Human readable age of the timestamp t, measured against time(0).

   t == 0            -> ""           (treated as "no timestamp")
   under 3 minutes   -> "<n> secs"   ("1 sec" for exactly one)
   under 1 hour      -> "<m> mins"   (two significant digits)
   otherwise         -> "<h> hrs"    (two significant digits below 99.5 hours,
                                      whole hours from there on)
*/
string ago(time_t t) {
    if( t == 0 ) return "";

    const time_t elapsed = time(0) - t;
    stringstream out;
    if( elapsed < 180 ) {
        out << elapsed << " sec";
        if( elapsed != 1 ) out << 's';
    }
    else if( elapsed < 3600 ) {
        out.precision(2);
        out << elapsed / 60.0 << " mins";
    }
    else {
        const double hrs = elapsed / 3600.0;
        if( hrs < 99.5 ) {
            out.precision(2);
        }
        else {
            // two significant digits in the default float format would print
            // "1e+02" from here on, so switch to fixed notation
            out << std::fixed;
            out.precision(0);
        }
        out << hrs << " hrs";
    }
    return out.str();
}
|
|
|
|
|
|
2010-05-05 16:33:35 -04:00
|
|
|
/* Appends one <tr> describing this member to s. Cells, in order: name (as a
   link), health, time since upSince(), last heartbeat, votes, state, and the
   last heartbeat error message. */
void ReplSet::Member::summarizeAsHtml(stringstream& s) const {
    s << tr();

    // name cell links to /_replSet on the member's port + 1000
    stringstream url;
    url << "http://" << _h.host() << ':' << (_h.port() + 1000) << "/_replSet";
    s << td( a(url.str(), "", fullName()) );

    double healthNow = health();
    s << td(healthNow);
    s << td(ago(upSince()));

    time_t hb = lastHeartbeat();
    string hbText;
    if( hb == 0 )
        hbText = "never";
    else
        hbText = ago(hb) + " ago";
    s << td(hbText);

    s << td(config().votes);
    s << td(ReplSet::stateAsStr(state()));

    // the message is passed to red() with a flag that is set when health is not positive
    bool notOk = !( healthNow > 0 );
    s << td( red(_lastHeartbeatErrMsg.get(), notOk) );

    s << _tr();
}
|
|
|
|
|
|
2010-05-07 15:35:16 -04:00
|
|
|
/* Returns the state name wrapped by a("", <description>, <name>) from the
   html helpers; returns "" for a state not listed here. */
string ReplSet::stateAsHtml(State s) {
    switch( s ) {
    case STARTUP:
        return a("", "serving still starting up, or still trying to initiate the set", "STARTUP");
    case PRIMARY:
        return a("", "this server thinks it is primary", "PRIMARY");
    case SECONDARY:
        return a("", "this server thinks it is a secondary (slave mode)", "SECONDARY");
    case RECOVERING:
        return a("", "recovering/resyncing; after recovery usually auto-transitions to secondary", "RECOVERING");
    case FATAL:
        return a("", "something bad has occurred and server is not completely offline with regard to the replica set. fatal error.", "FATAL");
    case STARTUP2:
        return a("", "loaded config, still determining who is primary", "STARTUP2");
    default:
        break;
    }
    return "";
}
|
|
|
|
|
|
|
|
|
|
/* Plain-text name of a state; "" for a state not listed here. */
string ReplSet::stateAsStr(State s) {
    switch( s ) {
    case STARTUP:    return "STARTUP";
    case PRIMARY:    return "PRIMARY";
    case SECONDARY:  return "SECONDARY";
    case RECOVERING: return "RECOVERING";
    case FATAL:      return "FATAL";
    case STARTUP2:   return "STARTUP2";
    default:         break;
    }
    return "";
}
|
|
|
|
|
|
2010-05-13 17:18:17 -04:00
|
|
|
extern time_t started;
|
|
|
|
|
|
2010-05-05 16:33:35 -04:00
|
|
|
void ReplSet::summarizeAsHtml(stringstream& s) const {
|
2010-05-06 10:05:48 -04:00
|
|
|
s << table(0, false);
|
2010-05-06 16:48:07 -04:00
|
|
|
s << tr("Set name:", _name);
|
2010-05-06 21:50:52 -04:00
|
|
|
s << tr("Majority up:", elect.aMajoritySeemsToBeUp()?"yes":"no" );
|
2010-05-06 10:05:48 -04:00
|
|
|
s << _table();
|
|
|
|
|
|
2010-05-13 17:18:17 -04:00
|
|
|
const char *h[] = {"Member", "Up",
|
|
|
|
|
"<a title=\"length of time we have been continuously connected to the other member with no reconnects\">cctime</a>",
|
2010-05-05 16:33:35 -04:00
|
|
|
"<a title=\"when this server last received a heartbeat response - includes error code responses\">Last heartbeat</a>",
|
2010-05-11 15:58:44 -04:00
|
|
|
"Votes", "State", "Status", 0};
|
2010-05-05 16:33:35 -04:00
|
|
|
s << table(h);
|
2010-05-12 17:43:21 -04:00
|
|
|
{
|
|
|
|
|
/* self row */
|
|
|
|
|
s << tr() << td(_self->fullName()) <<
|
|
|
|
|
td("1") <<
|
2010-05-13 17:18:17 -04:00
|
|
|
td(ago(started)) <<
|
|
|
|
|
td("(self)") <<
|
2010-05-12 17:43:21 -04:00
|
|
|
td(ToString(_self->config().votes)) <<
|
|
|
|
|
td(stateAsHtml(_myState));
|
|
|
|
|
s << td( _self->_lastHeartbeatErrMsg.get() );
|
|
|
|
|
s << _tr();
|
|
|
|
|
}
|
2010-05-05 16:33:35 -04:00
|
|
|
Member *m = head();
|
|
|
|
|
while( m ) {
|
|
|
|
|
m->summarizeAsHtml(s);
|
|
|
|
|
m = m->next();
|
|
|
|
|
}
|
|
|
|
|
s << _table();
|
|
|
|
|
}
|
|
|
|
|
|
2010-05-08 14:12:24 -04:00
|
|
|
/* Detects whether the lines starting at v[i] repeat the lines just before them.

   Lines are compared from their 21st character on, i.e. the first 20
   characters of every line are ignored. The nearest earlier line j (at most 7
   lines back) whose text equals v[i] is taken as the only candidate; the run
   v[j..i-1] must then appear again, line for line, as v[i..i+(i-j)-1].

   Returns j when it does, -1 otherwise (including when the repeated run would
   extend past the end of v).
*/
static int repeats(const vector<const char *>& v, int i) {
    const int skip = 20;        // ignored characters at the start of each line
    const int window = 8;       // candidate must satisfy j + window > i
    const int total = (int) v.size();

    // look backwards for the closest line with the same text
    int j = i - 1;
    while( j >= 0 && j + window > i && strcmp(v[i]+skip, v[j]+skip) != 0 )
        j--;
    if( j < 0 || j + window <= i )
        return -1;

    // the rest of the candidate run has to match as well
    const int len = i - j;
    for( int x = 1; x < len; x++ ) {
        if( i + x >= total )
            return -1;
        if( strcmp(v[i+x]+skip, v[j+x]+skip) != 0 )
            return -1;
    }
    return j;
}
|
|
|
|
|
|
2010-05-09 15:16:14 -04:00
|
|
|
/* Returns the text to display for log line i.

   line is the text to show; when empty it defaults to v[i]. If v[i] starts
   with the same 11 characters as the previous line (the date part of the
   timestamp), those 11 characters of line are replaced by blanks so that
   repeated dates are not shown and columns stay aligned inside <pre>.

   Otherwise, or when line is shorter than 11 characters, line is returned
   unchanged.
*/
static string clean(const vector<const char *>& v, int i, string line="") {
    const size_t dateLen = 11;
    if( line.empty() ) line = v[i];

    const bool sameDate = i > 0 && strncmp(v[i], v[i-1], dateLen) == 0;
    // the size check keeps substr() from throwing on an unexpectedly short line
    if( sameDate && line.size() >= dateLen )
        return string(dateLen, ' ') + line.substr(dateLen);

    // return the text we were asked to show, not always the raw v[i]
    return line;
}
|
|
|
|
|
|
2010-05-10 11:26:02 -04:00
|
|
|
static bool isWarning(const char *line) {
|
|
|
|
|
const char *p = strstr(line, "replSet ");
|
|
|
|
|
if( p ) {
|
|
|
|
|
p += 8;
|
|
|
|
|
return startsWith(p, "warning") || startsWith(p, "error");
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2010-05-07 16:42:55 -04:00
|
|
|
/* Writes the contents of _rsLog to s as an HTML <pre> block.

   Lines for which repeats() finds no repetition are written through clean()
   and passed to red() together with the isWarning() flag. A run that repeats
   the preceding nr lines is collapsed into a single marker: the first 20
   characters of the line followed by nr dots, wrapped by a("", tip, marker)
   where tip says how many lines were repeated.
*/
void fillRsLog(stringstream& s) {
    s << "<pre>\n";
    vector<const char *> v = _rsLog.get();
    const int total = (int) v.size();
    for( int i = 0; i < total; i++ ) {
        assert( strlen(v[i]) > 20 );

        const int start = repeats(v, i);
        if( start < 0 ) {
            // NOTE(review): no '\n' is written on this path -- presumably the
            // stored lines carry their own line break; confirm against RamLog.
            s << red( clean(v,i), isWarning(v[i]) );
            continue;
        }

        const int nr = i - start;        // number of lines in the repeated run
        const int last = i + nr - 1;     // last line of the repetition; repeats() checked it is in range

        stringstream marker;
        marker << string(v[i], 0, 20);
        for( int k = 0; k < nr; k++ ) marker << '.';

        stringstream tip;
        if( nr == 1 ) tip << "repeat last line";
        else tip << "repeats last " << nr << " lines; ends " << string(v[last]+4,0,15);

        s << a("", tip.str(), clean(v,i,marker.str()));
        s << '\n';

        i = last;   // skip the lines that were just summarized
    }
    s << "</pre>\n";
}
|
|
|
|
|
|
2010-04-20 12:29:00 -04:00
|
|
|
/* Appends the set status to b: "set", "date", "myState" and a "members" array.
   The array starts with an entry for this server ("self" : true) and then has
   one entry per member with name, health, uptime, lastHeartbeat and errmsg. */
void ReplSet::summarizeStatus(BSONObjBuilder& b) const {
    vector<BSONObj> v;

    // add self
    {
        HostAndPort h(getHostName(), cmdLine.port);
        BSONObj self = BSON( "name" << h.toString() << "self" << true <<
                             "errmsg" << _self->_lastHeartbeatErrMsg.get() );
        v.push_back(self);
    }

    // then every member of the list
    for( Member *m = _members.head(); m; m = m->next() ) {
        BSONObjBuilder bb;
        bb.append("name", m->fullName());
        bb.append("health", m->health());
        // uptime is reported as 0 while upSince() is 0
        bb.append("uptime", (unsigned) (m->upSince() ? (time(0)-m->upSince()) : 0));
        bb.appendDate("lastHeartbeat", m->lastHeartbeat());
        bb.append("errmsg", m->_lastHeartbeatErrMsg.get());
        v.push_back(bb.obj());
    }

    b.append("set", name());
    b.appendDate("date", time(0));
    b.append("myState", _myState);
    b.append("members", v);
}
|
|
|
|
|
|
2010-04-18 12:30:40 -04:00
|
|
|
void ReplSet::startHealthThreads() {
|
2010-05-05 14:57:49 -04:00
|
|
|
Member* m = _members.head();
|
2010-04-18 12:30:40 -04:00
|
|
|
while( m ) {
|
|
|
|
|
FeedbackThread *f = new FeedbackThread();
|
|
|
|
|
f->m = m;
|
2010-04-20 15:30:37 -04:00
|
|
|
f->go();
|
2010-04-18 12:30:40 -04:00
|
|
|
m = m->next();
|
|
|
|
|
}
|
2010-04-14 20:50:15 -04:00
|
|
|
}
|
2010-04-14 17:25:03 -04:00
|
|
|
|
|
|
|
|
}
|
2010-04-18 12:30:40 -04:00
|
|
|
|
|
|
|
|
/* todo:
|
|
|
|
|
stop bg job and delete on removefromset
|
|
|
|
|
*/
|