Files
mongo/db/repl/health.cpp

352 lines
12 KiB
C++
Raw Normal View History

2010-04-14 17:25:03 -04:00
/**
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
2010-04-18 12:30:40 -04:00
* This program is distributed in the hope that it will be useful,
2010-04-14 17:25:03 -04:00
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
2010-04-27 15:27:52 -04:00
#include "pch.h"
2010-04-14 17:25:03 -04:00
#include "replset.h"
#include "health.h"
2010-04-18 12:30:40 -04:00
#include "../../util/background.h"
#include "../../client/dbclient.h"
#include "../commands.h"
#include "../../util/concurrency/value.h"
2010-05-07 16:42:55 -04:00
#include "../../util/mongoutils/html.h"
2010-05-05 16:33:35 -04:00
#include "../../util/goodies.h"
2010-05-07 16:42:55 -04:00
#include "../../util/ramlog.h"
2010-05-06 10:05:48 -04:00
#include "../helpers/dblogger.h"
2010-05-06 16:48:07 -04:00
#include "connections.h"
namespace mongo {
/* decls for connections.h */
// Out-of-class definitions for two static members of ScopedConn.
// _map is a reference bound to a heap-allocated ScopedConn::M that this file never deletes.
// NOTE(review): presumably deliberate so the map is never destroyed during static teardown - confirm.
ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M());
// NOTE(review): presumably the mutex guarding _map - confirm against connections.h.
mutex ScopedConn::mapMutex;
}
2010-04-14 17:25:03 -04:00
namespace mongo {
2010-05-05 16:33:35 -04:00
using namespace mongoutils::html;
2010-05-13 17:18:17 -04:00
using namespace bson;
2010-05-05 16:33:35 -04:00
2010-05-07 16:42:55 -04:00
// Log object for replica set messages; fillRsLog() reads its lines back through _rsLog.get().
static RamLog _rsLog;
// Tee pointer to the same object; streamed into log() statements in this file (log() << ... << rsLog).
Tee *rsLog = &_rsLog;
2010-04-22 16:17:18 -04:00
/* { replSetHeartbeat : <setname> } */
2010-04-18 12:30:40 -04:00
class CmdReplSetHeartbeat : public Command {
public:
2010-04-23 15:50:49 -04:00
virtual bool slaveOk() const { return true; }
2010-04-23 16:41:56 -04:00
virtual bool adminOnly() const { return false; }
2010-04-18 12:30:40 -04:00
virtual bool logTheOp() { return false; }
2010-04-23 15:50:49 -04:00
virtual LockType locktype() const { return NONE; }
2010-04-23 16:41:56 -04:00
virtual void help( stringstream &help ) const { help<<"internal"; }
2010-04-18 12:30:40 -04:00
CmdReplSetHeartbeat() : Command("replSetHeartbeat") { }
virtual bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
2010-04-21 21:19:37 -04:00
if( !replSet ) {
2010-04-18 12:30:40 -04:00
errmsg = "not a replset member";
return false;
}
2010-05-13 11:03:23 -04:00
if( cmdObj["pv"].Int() != 1 ) {
errmsg = "incompatible replset protocol version";
return false;
}
string s = string(cmdObj.getStringField("replSetHeartbeat"))+'/';
if( !startsWith(cmdLine.replSet, s ) ) {
2010-05-08 14:12:24 -04:00
errmsg = "repl set names do not match";
2010-05-13 11:03:23 -04:00
cout << "cmdline: " << cmdLine.replSet << endl;
cout << "s: " << s << endl;
2010-05-08 14:12:24 -04:00
result.append("mismatch", true);
return false;
}
2010-05-13 17:18:17 -04:00
result.append("rs", true);
2010-04-21 21:19:37 -04:00
if( theReplSet == 0 ) {
errmsg = "still initializing";
return false;
}
2010-05-13 17:18:17 -04:00
if( theReplSet->name() != cmdObj.getStringField("replSetHeartbeat") ) {
2010-05-08 14:12:24 -04:00
errmsg = "repl set names do not match (2)";
result.append("mismatch", true);
2010-04-20 15:30:37 -04:00
return false;
}
2010-05-13 17:18:17 -04:00
result.append("set", theReplSet->name());
result.append("state", theReplSet->state());
int v = theReplSet->config().version;
result.append("v", v);
// if( v > cmdObj["v"].Int() )
// result << "config" << theReplSet->config().asBson();
2010-04-18 12:30:40 -04:00
return true;
}
} cmdReplSetHeartbeat;
2010-05-09 17:29:35 -04:00
/* throws dbexception */
2010-05-13 11:03:23 -04:00
bool requestHeartbeat(string setName, string memberFullName, BSONObj& result, int myCfgVersion, int& theirCfgVersion) {
BSONObj cmd = BSON( "replSetHeartbeat" << setName << "v" << myCfgVersion << "pv" << 1 );
2010-05-08 14:12:24 -04:00
ScopedConn conn(memberFullName);
return conn->runCommand("admin", cmd, result);
}
2010-04-18 12:35:37 -04:00
/* poll every other set member to check its status */
2010-04-18 12:30:40 -04:00
class FeedbackThread : public BackgroundJob {
public:
2010-05-11 15:58:44 -04:00
ReplSet::Member *m;
2010-04-20 15:30:37 -04:00
private:
void down() {
2010-04-22 11:58:19 -04:00
m->_health = 0.0;
if( m->_upSince ) {
m->_upSince = 0;
2010-05-07 16:42:55 -04:00
log() << "replSet " << m->fullName() << " is now down" << rsLog;
2010-04-20 15:30:37 -04:00
}
}
public:
2010-04-18 12:30:40 -04:00
void run() {
2010-04-20 15:30:37 -04:00
mongo::lastError.reset( new LastError() );
2010-04-18 12:30:40 -04:00
while( 1 ) {
try {
BSONObj info;
2010-05-13 11:03:23 -04:00
int theirConfigVersion = -10000;
2010-05-13 17:18:17 -04:00
bool ok = requestHeartbeat(theReplSet->name(), m->fullName(), info, theReplSet->config().version, theirConfigVersion);
2010-05-08 14:12:24 -04:00
m->_lastHeartbeat = time(0); // we set this on any response - we don't get this far if couldn't connect because exception is thrown
2010-05-13 17:18:17 -04:00
{
be state = info["state"];
if( state.ok() )
m->_state = (ReplSet::State) state.Int();
}
2010-04-18 12:35:37 -04:00
if( ok ) {
2010-04-22 11:58:19 -04:00
if( m->_upSince == 0 ) {
2010-05-07 16:42:55 -04:00
log() << "replSet " << m->fullName() << " is now up" << rsLog;
2010-04-22 11:58:19 -04:00
m->_upSince = m->_lastHeartbeat;
2010-04-20 15:30:37 -04:00
}
2010-04-22 11:58:19 -04:00
m->_health = 1.0;
m->_lastHeartbeatErrMsg.set("");
2010-05-13 17:18:17 -04:00
be cfg = info["config"];
if( cfg.ok() ) {
//theReplSet->receivedNewConfig(cfg.Obj());
}
2010-04-20 15:30:37 -04:00
}
else {
down();
2010-04-22 11:58:19 -04:00
m->_lastHeartbeatErrMsg.set(info.getStringField("errmsg"));
2010-04-18 12:35:37 -04:00
}
2010-04-18 12:30:40 -04:00
}
catch(...) {
2010-04-20 15:30:37 -04:00
down();
2010-04-22 11:58:19 -04:00
m->_lastHeartbeatErrMsg.set("connect/transport error");
2010-04-18 12:30:40 -04:00
}
2010-05-11 15:58:44 -04:00
theReplSet->_mgr.checkNewState();
2010-04-18 12:30:40 -04:00
sleepsecs(2);
}
}
};
2010-05-11 13:33:55 -04:00
2010-05-13 17:18:17 -04:00
/* Describe how long ago t was, relative to now.

   @param t a time() value; 0 means "no time recorded"
   @return "" when t is 0, otherwise the age as
             "<n> sec" / "<n> secs"   under 3 minutes
             "<n> mins"               under 1 hour  (2 significant digits)
             "<n> hrs"                otherwise     (2 significant digits below ~100 hours,
                                                     whole hours above that)
*/
string ago(time_t t) {
    if( t == 0 ) return "";
    time_t x = time(0) - t;
    // t in the future (e.g. the clock was adjusted): report zero rather than a negative age
    if( x < 0 ) x = 0;
    stringstream s;
    if( x < 180 ) {
        s << x << " sec";
        if( x != 1 ) s << 's';
    }
    else if( x < 3600 ) {
        s.precision(2);
        s << x / 60.0 << " mins";
    }
    else {
        const double hrs = x / 3600.0;
        if( hrs < 99.5 ) {
            s.precision(2);
        }
        else {
            // two significant digits in the default float format would switch to
            // scientific notation here ("1e+02 hrs"); print whole hours instead
            s.setf(std::ios_base::fixed, std::ios_base::floatfield);
            s.precision(0);
        }
        s << hrs << " hrs";
    }
    return s.str();
}
2010-05-05 16:33:35 -04:00
/* Append one html table row describing this member to s.
   Cells, in order: link to the member's _replSet page, health, time since upSince(),
   time since the last heartbeat, votes, state, last heartbeat error message
   (passed through red() with a flag that is true when health is not above zero).
*/
void ReplSet::Member::summarizeAsHtml(stringstream& s) const {
    s << tr();

    // the link targets the member's port + 1000
    stringstream url;
    url << "http://" << _h.host() << ':' << (_h.port() + 1000) << "/_replSet";
    s << td( a(url.str(), "", fullName()) );

    double healthNow = health();
    bool up = healthNow > 0;
    s << td(healthNow);

    s << td(ago(upSince()));

    time_t hb = lastHeartbeat();
    string hbText = ( hb == 0 ) ? string("never") : ago(hb) + " ago";
    s << td(hbText);

    s << td(config().votes);
    s << td(ReplSet::stateAsStr(state()));
    s << td( red(_lastHeartbeatErrMsg.get(),!up) );

    s << _tr();
}
2010-05-07 15:35:16 -04:00
/* Html for a state: the state name wrapped by a() with an explanatory title.
   @return "" for a state not listed here
*/
string ReplSet::stateAsHtml(State s) {
    switch( s ) {
    case STARTUP:
        return a("", "serving still starting up, or still trying to initiate the set", "STARTUP");
    case PRIMARY:
        return a("", "this server thinks it is primary", "PRIMARY");
    case SECONDARY:
        return a("", "this server thinks it is a secondary (slave mode)", "SECONDARY");
    case RECOVERING:
        return a("", "recovering/resyncing; after recovery usually auto-transitions to secondary", "RECOVERING");
    case FATAL:
        return a("", "something bad has occurred and server is not completely offline with regard to the replica set. fatal error.", "FATAL");
    case STARTUP2:
        return a("", "loaded config, still determining who is primary", "STARTUP2");
    default:
        break;
    }
    return "";
}
/* Plain-text name of a state.
   @return "" for a state not listed here
*/
string ReplSet::stateAsStr(State s) {
    static const struct {
        State state;
        const char *label;
    } names[] = {
        { STARTUP,    "STARTUP" },
        { PRIMARY,    "PRIMARY" },
        { SECONDARY,  "SECONDARY" },
        { RECOVERING, "RECOVERING" },
        { FATAL,      "FATAL" },
        { STARTUP2,   "STARTUP2" }
    };
    for( unsigned k = 0; k < sizeof(names)/sizeof(names[0]); k++ ) {
        if( s == names[k].state )
            return names[k].label;
    }
    return "";
}
2010-05-13 17:18:17 -04:00
extern time_t started;
2010-05-05 16:33:35 -04:00
void ReplSet::summarizeAsHtml(stringstream& s) const {
2010-05-06 10:05:48 -04:00
s << table(0, false);
2010-05-06 16:48:07 -04:00
s << tr("Set name:", _name);
2010-05-06 21:50:52 -04:00
s << tr("Majority up:", elect.aMajoritySeemsToBeUp()?"yes":"no" );
2010-05-06 10:05:48 -04:00
s << _table();
2010-05-13 17:18:17 -04:00
const char *h[] = {"Member", "Up",
"<a title=\"length of time we have been continuously connected to the other member with no reconnects\">cctime</a>",
2010-05-05 16:33:35 -04:00
"<a title=\"when this server last received a heartbeat response - includes error code responses\">Last heartbeat</a>",
2010-05-11 15:58:44 -04:00
"Votes", "State", "Status", 0};
2010-05-05 16:33:35 -04:00
s << table(h);
2010-05-12 17:43:21 -04:00
{
/* self row */
s << tr() << td(_self->fullName()) <<
td("1") <<
2010-05-13 17:18:17 -04:00
td(ago(started)) <<
td("(self)") <<
2010-05-12 17:43:21 -04:00
td(ToString(_self->config().votes)) <<
td(stateAsHtml(_myState));
s << td( _self->_lastHeartbeatErrMsg.get() );
s << _tr();
}
2010-05-05 16:33:35 -04:00
Member *m = head();
while( m ) {
m->summarizeAsHtml(s);
m = m->next();
}
s << _table();
}
2010-05-08 14:12:24 -04:00
/* Detect whether the lines starting at v[i] repeat the lines immediately before them.
   Lines are compared from character 20 on (the leading 20 characters are skipped).

   Looks back at most 7 lines for the nearest line whose text equals v[i]'s. If one
   is found at index j, the run v[j..i-1] must be followed by an identical run
   v[i..i+(i-j)-1]; only that nearest candidate is tried.

   @return j when the run repeats, otherwise -1
*/
static int repeats(const vector<const char *>& v, int i) {
    const int skip = 20;
    for( int start = i-1; start >= 0 && start > i-8; start-- ) {
        if( strcmp(v[i]+skip, v[start]+skip) != 0 )
            continue;

        // v[start] matches v[i]; verify the remaining lines of the run
        const int period = i - start;
        for( int off = 1; off < period; off++ ) {
            if( i+off >= (int) v.size() ) return -1;
            if( strcmp(v[i+off]+skip, v[start+off]+skip) != 0 ) return -1;
        }
        return start;
    }
    return -1;
}
2010-05-09 15:16:14 -04:00
/* Text to display for log line i.

   @param v    all log lines
   @param i    index of the line being displayed
   @param line replacement text to show instead of v[i]; empty means use v[i]
   @return the text with its first 11 characters replaced by a blank when v[i]
           and v[i-1] agree on their first 11 characters; otherwise the text unchanged
*/
static string clean(const vector<const char *>& v, int i, string line="") {
    if( line.empty() ) line = v[i];
    // the size check keeps substr() from throwing on text shorter than the prefix
    if( i > 0 && strncmp(v[i], v[i-1], 11) == 0 && line.size() >= 11 )
        return string(" ") + line.substr(11);
    // return the chosen text, so a caller-supplied line is not replaced by raw v[i]
    return line;
}
2010-05-10 11:26:02 -04:00
static bool isWarning(const char *line) {
const char *p = strstr(line, "replSet ");
if( p ) {
p += 8;
return startsWith(p, "warning") || startsWith(p, "error");
}
return false;
}
2010-05-07 16:42:55 -04:00
/* Write the contents of the replica set log to s inside a <pre> block.
   A run of lines that repeats the lines just before it (see repeats()) is
   collapsed into one marker line: the first 20 characters of the line, one
   '.' per collapsed line, and a title describing what was collapsed.
*/
void fillRsLog(stringstream& s) {
    s << "<pre>\n";
    vector<const char *> lines = _rsLog.get();
    const int total = (int) lines.size();
    int i = 0;
    while( i < total ) {
        assert( strlen(lines[i]) > 20 );
        const int start = repeats(lines, i);
        if( start < 0 ) {
            // ordinary line
            s << red( clean(lines,i), isWarning(lines[i]) );
            i++;
            continue;
        }

        // lines[i..last] repeat lines[start..i-1]
        const int nr = i - start;
        const int last = i + nr - 1;

        stringstream marker;
        marker << string(lines[i], 0, 20);
        for( int k = 0; k < nr; k++ ) marker << '.';

        stringstream tip;
        if( nr == 1 ) tip << "repeat last line";
        else tip << "repeats last " << nr << " lines; ends " << string(lines[last]+4,0,15);

        s << a("", tip.str(), clean(lines,i,marker.str()));
        s << '\n';
        i = last + 1; // skip the collapsed lines
    }
    s << "</pre>\n";
}
2010-04-20 12:29:00 -04:00
/* Append a status report for the set to b:
     set     : set name
     date    : current time
     myState : this server's state
     members : array with one entry for this server ({name, self, errmsg}) followed by
               one entry per other member ({name, health, uptime, lastHeartbeat, errmsg})
   uptime is in seconds and is 0 when the member's upSince() is 0.
*/
void ReplSet::summarizeStatus(BSONObjBuilder& b) const {
    vector<BSONObj> v;

    // add self
    {
        HostAndPort h(getHostName(), cmdLine.port);
        v.push_back(
            BSON( "name" << h.toString() << "self" << true <<
                  "errmsg" << _self->_lastHeartbeatErrMsg.get() ) );
    }

    for( Member *m = _members.head(); m; m = m->next() ) {
        BSONObjBuilder bb;
        bb.append("name", m->fullName());
        bb.append("health", m->health());
        bb.append("uptime", (unsigned) (m->upSince() ? (time(0)-m->upSince()) : 0));
        // a BSON date is milliseconds since the epoch; lastHeartbeat() is a time_t in seconds
        bb.appendDate("lastHeartbeat", 1000ULL * m->lastHeartbeat());
        bb.append("errmsg", m->_lastHeartbeatErrMsg.get());
        v.push_back(bb.obj());
    }

    b.append("set", name());
    // seconds -> milliseconds, as above
    b.appendDate("date", 1000ULL * time(0));
    b.append("myState", _myState);
    b.append("members", v);
}
2010-04-18 12:30:40 -04:00
void ReplSet::startHealthThreads() {
2010-05-05 14:57:49 -04:00
Member* m = _members.head();
2010-04-18 12:30:40 -04:00
while( m ) {
FeedbackThread *f = new FeedbackThread();
f->m = m;
2010-04-20 15:30:37 -04:00
f->go();
2010-04-18 12:30:40 -04:00
m = m->next();
}
2010-04-14 20:50:15 -04:00
}
2010-04-14 17:25:03 -04:00
}
2010-04-18 12:30:40 -04:00
/* todo:
stop the background job and delete it when a member is removed from the set
*/