// NOTE(review): The following commit message and file metadata were pasted at the
// top of this file by mistake — they describe router changes, not this module.
// Preserved here as a comment pending removal:
//   "Mount Tailscale router at /tailscale prefix so all 10 routes resolve to
//   /api/tailscale/* as expected by middleware, audit logger, and frontend.
//   Previously 5 routes (status, config, check-connection, devices,
//   protect-service) resolved to /api/* instead, with config colliding with the
//   settings route. Strip redundant /tailscale/ prefix from OAuth routes that
//   were compensating for the missing mount prefix. Increase default health
//   check timeout from 10s to 20s to reduce false positives on slower services."
/**
 * Health Check Dashboard Module
 *
 * Monitors service health, response times, and uptime.
 * Provides SLA tracking and incident management.
 */
|
|
|
|
const https = require('https');
|
|
const http = require('http');
|
|
const EventEmitter = require('events');
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
|
|
// Persistent store locations and tunables; every value can be overridden
// through the environment.
const HEALTH_CONFIG_FILE = process.env.HEALTH_CONFIG_FILE || path.join(__dirname, 'health-config.json');

const HEALTH_HISTORY_FILE = process.env.HEALTH_HISTORY_FILE || path.join(__dirname, 'health-history.json');

// Base polling interval used by start() for the checkAll() sweep.
const CHECK_INTERVAL = parseInt(process.env.HEALTH_CHECK_INTERVAL || '30000', 10); // 30 seconds

// Upper bound for per-service exponential backoff (see getBackoffInterval()).
const MAX_CHECK_INTERVAL = parseInt(process.env.HEALTH_CHECK_MAX_INTERVAL || '300000', 10); // 5 minutes max backoff

// Per-check history entries older than this are pruned by cleanupHistory().
const HISTORY_RETENTION_DAYS = parseInt(process.env.HEALTH_HISTORY_RETENTION || '30', 10);
|
|
|
|
class HealthChecker extends EventEmitter {
  /**
   * Periodically probes configured HTTP(S) endpoints, records per-service
   * status history, tracks uptime/SLA statistics, and opens/resolves
   * incidents on state transitions.
   *
   * Events emitted:
   *   'status-check'      (status)         - after every individual check
   *   'incident-created'  (incident)       - a new incident was opened
   *   'incident-resolved' (incident)       - an open incident was closed
   *   'log'               (level, message) - diagnostic logging
   */
  constructor() {
    super();
    this.config = this.loadConfig();       // { services: { serviceId: config } }
    this.history = this.loadHistory();     // serviceId -> [status, ...] (chronological)
    this.currentStatus = new Map();        // serviceId -> most recent status object
    this.incidents = [];
    this.checking = false;
    this.checkInterval = null;
    this.consecutiveFailures = new Map(); // serviceId -> failure count
    this.serviceTimers = new Map(); // serviceId -> timer for per-service backoff
  }

  /**
   * Start health checking: run one immediate sweep, then poll every
   * CHECK_INTERVAL ms. Idempotent while already running.
   */
  start() {
    if (this.checking) return;

    this.checking = true;

    // Initial check
    this.checkAll();

    // Schedule periodic checks
    this.checkInterval = setInterval(() => this.checkAll(), CHECK_INTERVAL);
  }

  /**
   * Stop health checking and cancel all pending timers. Idempotent.
   */
  stop() {
    if (!this.checking) return;

    this.checking = false;

    if (this.checkInterval) {
      clearInterval(this.checkInterval);
      this.checkInterval = null;
    }

    // Clear per-service backoff timers
    for (const timer of this.serviceTimers.values()) {
      clearTimeout(timer);
    }
    this.serviceTimers.clear();
  }

  /**
   * Get the backoff interval for a service based on consecutive failures.
   * Doubles the interval for each failure, capped at MAX_CHECK_INTERVAL.
   *
   * @param {string} serviceId
   * @returns {number} interval in milliseconds
   */
  getBackoffInterval(serviceId) {
    const failures = this.consecutiveFailures.get(serviceId) || 0;
    if (failures === 0) return CHECK_INTERVAL;
    return Math.min(CHECK_INTERVAL * Math.pow(2, failures), MAX_CHECK_INTERVAL);
  }

  /**
   * Check all enabled services sequentially, then prune stale history.
   * Services with `enabled: false` are skipped.
   */
  async checkAll() {
    const services = Object.entries(this.config.services || {});

    for (const [serviceId, config] of services) {
      if (config.enabled !== false) {
        try {
          await this.checkService(serviceId, config);
        } catch (error) {
          // checkService handles its own failures and records them; this
          // guard only keeps one unexpected error from aborting the sweep.
        }
      }
    }

    // Cleanup old history
    this.cleanupHistory();
  }

  /**
   * Check a single service, record the outcome, and evaluate incidents.
   *
   * @param {string} serviceId
   * @param {object} config - service check configuration (url, timeout, ...)
   * @returns {Promise<object>} the recorded status object
   */
  async checkService(serviceId, config) {
    const startTime = Date.now();

    // Capture the previous status BEFORE recordStatus() overwrites
    // currentStatus. Previously checkForIncidents() read currentStatus
    // after the new status was already recorded, so "previous" was always
    // identical to the new status and up<->down transition incidents were
    // never created or resolved.
    const previous = this.currentStatus.get(serviceId);

    try {
      const result = await this.performHealthCheck(config);
      const responseTime = Date.now() - startTime;

      const status = {
        serviceId,
        timestamp: new Date().toISOString(),
        status: result.healthy ? 'up' : 'down',
        responseTime,
        statusCode: result.statusCode,
        message: result.message,
        details: result.details
      };

      // Track consecutive failures for exponential backoff
      if (result.healthy) {
        this.consecutiveFailures.delete(serviceId);
      } else {
        this.consecutiveFailures.set(serviceId, (this.consecutiveFailures.get(serviceId) || 0) + 1);
      }

      this.recordStatus(serviceId, status);
      this.checkForIncidents(serviceId, status, config, previous);

      return status;
    } catch (error) {
      const responseTime = Date.now() - startTime;

      // Increment failure count for backoff
      this.consecutiveFailures.set(serviceId, (this.consecutiveFailures.get(serviceId) || 0) + 1);

      const status = {
        serviceId,
        timestamp: new Date().toISOString(),
        status: 'down',
        responseTime,
        error: error.message
      };

      this.recordStatus(serviceId, status);
      this.checkForIncidents(serviceId, status, config, previous);

      return status;
    }
  }

  /**
   * Perform the actual health check request.
   * Falls back to GET when a HEAD probe is rejected with 405/501.
   *
   * @param {object} config - check configuration (url, method, timeout, ...)
   * @returns {Promise<{healthy: boolean, statusCode: number, message: string, details: object}>}
   */
  async performHealthCheck(config) {
    const result = await this._doRequest(config, config.method || 'GET');
    // Fall back to GET if HEAD is not supported
    if ((result.statusCode === 501 || result.statusCode === 405) && (config.method || '').toUpperCase() === 'HEAD') {
      return this._doRequest({ ...config, method: 'GET' }, 'GET');
    }
    return result;
  }

  /**
   * Issue one HTTP(S) request and evaluate the response.
   * Rejects on network errors and on timeout (default 20s).
   */
  _doRequest(config, method) {
    return new Promise((resolve, reject) => {
      const url = new URL(config.url);
      const protocol = url.protocol === 'https:' ? https : http;

      const options = {
        hostname: url.hostname,
        port: url.port || (url.protocol === 'https:' ? 443 : 80),
        path: url.pathname + url.search,
        method,
        timeout: config.timeout || 20000,
        headers: config.headers || {},
        rejectUnauthorized: false // Trust internal CA certs (.sami TLD)
      };

      const req = protocol.request(options, (res) => {
        let data = '';

        res.on('data', chunk => {
          data += chunk;
        });

        res.on('end', () => {
          const healthy = this.evaluateHealth(res.statusCode, data, config);

          resolve({
            healthy,
            statusCode: res.statusCode,
            message: healthy ? 'Service is healthy' : 'Service check failed',
            details: {
              headers: res.headers,
              bodyLength: data.length
            }
          });
        });
      });

      req.on('error', (error) => {
        reject(error);
      });

      req.on('timeout', () => {
        req.destroy();
        reject(new Error('Health check timeout'));
      });

      if (config.body) {
        req.write(JSON.stringify(config.body));
      }

      req.end();
    });
  }

  /**
   * Evaluate if service is healthy based on response.
   *
   * @param {number} statusCode - HTTP status code of the response
   * @param {string} body - full response body
   * @param {object} config - may provide expectedStatusCodes,
   *   expectedBodyPattern (regex source), expectedBodyContains (substring)
   * @returns {boolean}
   */
  evaluateHealth(statusCode, body, config) {
    // Check status code
    const expectedCodes = config.expectedStatusCodes || [200, 201, 204, 301, 302, 303, 307, 308];
    if (!expectedCodes.includes(statusCode)) {
      return false;
    }

    // Check response body if pattern specified
    if (config.expectedBodyPattern) {
      const regex = new RegExp(config.expectedBodyPattern);
      if (!regex.test(body)) {
        return false;
      }
    }

    // Check response body contains expected text
    if (config.expectedBodyContains) {
      if (!body.includes(config.expectedBodyContains)) {
        return false;
      }
    }

    return true;
  }

  /**
   * Record service status: update the current-status map, append to
   * history, and emit 'status-check'.
   */
  recordStatus(serviceId, status) {
    // Update current status
    this.currentStatus.set(serviceId, status);

    // Add to history
    if (!this.history[serviceId]) {
      this.history[serviceId] = [];
    }

    this.history[serviceId].push(status);

    // Emit status event
    this.emit('status-check', status);

    // Save history periodically
    if (Math.random() < 0.05) { // 5% chance (every ~20 checks)
      this.saveHistory();
    }
  }

  /**
   * Check for incidents (downtime, slow response, SLA violations).
   *
   * @param {string} serviceId
   * @param {object} status - the status just produced by checkService
   * @param {object} config - service configuration
   * @param {object} [previous] - status from the preceding check; defaults to
   *   the current-status map for backward compatibility with older callers
   *   (note: checkService must pass it explicitly, since by the time this
   *   runs the map already holds the NEW status).
   */
  checkForIncidents(serviceId, status, config, previous = this.currentStatus.get(serviceId)) {
    // Check for status change (up -> down or down -> up)
    if (previous && previous.status !== status.status) {
      if (status.status === 'down') {
        this.createIncident(serviceId, 'outage', 'Service is down', status);
      } else if (status.status === 'up') {
        this.resolveIncident(serviceId, 'outage', status);
      }
    }

    // Check for slow response time
    const slowThreshold = config.slowResponseThreshold || 5000; // 5 seconds
    if (status.responseTime > slowThreshold) {
      this.createIncident(serviceId, 'slow-response',
        `Response time ${status.responseTime}ms exceeds threshold ${slowThreshold}ms`,
        status);
    }

    // Check SLA violations
    const sla = config.sla;
    if (sla) {
      const uptime = this.calculateUptime(serviceId, sla.period || 24);
      if (uptime < sla.target) {
        this.createIncident(serviceId, 'sla-violation',
          `Uptime ${uptime.toFixed(2)}% below SLA target ${sla.target}%`,
          status);
      }
    }
  }

  /**
   * Create a new incident, or bump the occurrence counter of an existing
   * open incident of the same type for the same service.
   */
  createIncident(serviceId, type, message, status) {
    // Check if similar incident already exists
    const existing = this.incidents.find(i =>
      i.serviceId === serviceId &&
      i.type === type &&
      i.status === 'open'
    );

    if (existing) {
      // Update existing incident
      existing.lastOccurrence = status.timestamp;
      existing.occurrences++;
      return;
    }

    // Create new incident
    const incident = {
      id: `incident-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`,
      serviceId,
      type,
      message,
      status: 'open',
      severity: this.calculateSeverity(type),
      createdAt: status.timestamp,
      lastOccurrence: status.timestamp,
      occurrences: 1,
      details: status
    };

    this.incidents.push(incident);
    this.emit('incident-created', incident);

    this.emit('log', 'info', `Incident created: ${incident.id} - ${message}`);
  }

  /**
   * Resolve the open incident of the given type for a service, if any,
   * recording resolution time and total duration in milliseconds.
   */
  resolveIncident(serviceId, type, status) {
    const incident = this.incidents.find(i =>
      i.serviceId === serviceId &&
      i.type === type &&
      i.status === 'open'
    );

    if (incident) {
      incident.status = 'resolved';
      incident.resolvedAt = status.timestamp;
      incident.duration = new Date(incident.resolvedAt) - new Date(incident.createdAt);

      this.emit('incident-resolved', incident);
      this.emit('log', 'info', `Incident resolved: ${incident.id}`);
    }
  }

  /**
   * Map an incident type to a severity level.
   */
  calculateSeverity(type) {
    switch (type) {
      case 'outage':
        return 'critical';
      case 'sla-violation':
        return 'high';
      case 'slow-response':
        return 'medium';
      default:
        return 'low';
    }
  }

  /**
   * Calculate uptime percentage for a service over the last N hours.
   * Returns 100 when there is no history for the window.
   */
  calculateUptime(serviceId, hours = 24) {
    const history = this.getServiceHistory(serviceId, hours);
    if (history.length === 0) return 100;

    const upChecks = history.filter(h => h.status === 'up').length;
    return (upChecks / history.length) * 100;
  }

  /**
   * Calculate average response time (ms) over the last N hours.
   * Returns 0 when there is no history for the window.
   */
  calculateAverageResponseTime(serviceId, hours = 24) {
    const history = this.getServiceHistory(serviceId, hours);
    if (history.length === 0) return 0;

    const total = history.reduce((sum, h) => sum + (h.responseTime || 0), 0);
    return total / history.length;
  }

  /**
   * Get service history entries newer than the given number of hours.
   */
  getServiceHistory(serviceId, hours = 24) {
    const cutoffTime = Date.now() - (hours * 60 * 60 * 1000);
    const history = this.history[serviceId] || [];

    return history.filter(h =>
      new Date(h.timestamp).getTime() > cutoffTime
    );
  }

  /**
   * Get current status for all services, enriched with display name,
   * 24h/7d uptime, average response time, and SLA config.
   */
  getCurrentStatus() {
    const result = {};
    // Guard against a config file that lacks a `services` key.
    const services = this.config.services || {};

    for (const [serviceId, status] of this.currentStatus.entries()) {
      const config = services[serviceId];
      const uptime24h = this.calculateUptime(serviceId, 24);
      const uptime7d = this.calculateUptime(serviceId, 168);
      const avgResponseTime = this.calculateAverageResponseTime(serviceId, 24);

      result[serviceId] = {
        ...status,
        name: config?.name || serviceId,
        uptime: {
          '24h': uptime24h,
          '7d': uptime7d
        },
        avgResponseTime,
        sla: config?.sla
      };
    }

    return result;
  }

  /**
   * Get aggregate statistics for a service over the last N hours,
   * including avg/min/max/p95/p99 response times. Returns null when
   * there is no history for the window.
   */
  getServiceStats(serviceId, hours = 24) {
    const history = this.getServiceHistory(serviceId, hours);
    if (history.length === 0) return null;

    const upChecks = history.filter(h => h.status === 'up').length;
    const downChecks = history.length - upChecks;
    const responseTimes = history.map(h => h.responseTime || 0);

    return {
      serviceId,
      period: `${hours}h`,
      totalChecks: history.length,
      upChecks,
      downChecks,
      uptime: (upChecks / history.length) * 100,
      responseTime: {
        avg: responseTimes.reduce((a, b) => a + b, 0) / responseTimes.length,
        min: Math.min(...responseTimes),
        max: Math.max(...responseTimes),
        p95: this.calculatePercentile(responseTimes, 95),
        p99: this.calculatePercentile(responseTimes, 99)
      }
    };
  }

  /**
   * Calculate the nearest-rank percentile of a list of numbers.
   * Returns 0 for an empty list. Does not mutate the input.
   */
  calculatePercentile(values, percentile) {
    const sorted = values.slice().sort((a, b) => a - b);
    const index = Math.ceil((percentile / 100) * sorted.length) - 1;
    return sorted[index] || 0;
  }

  /**
   * Get all currently open incidents.
   */
  getOpenIncidents() {
    return this.incidents.filter(i => i.status === 'open');
  }

  /**
   * Get the most recent incidents, newest first.
   */
  getIncidentHistory(limit = 50) {
    return this.incidents.slice(-limit).reverse();
  }

  /**
   * Configure (create or replace) the health check for a service and
   * persist the configuration.
   */
  configureService(serviceId, config) {
    if (!this.config.services) {
      this.config.services = {};
    }

    this.config.services[serviceId] = {
      enabled: config.enabled !== false,
      name: config.name || serviceId,
      url: config.url,
      method: config.method || 'GET',
      timeout: config.timeout || 20000,
      expectedStatusCodes: config.expectedStatusCodes || [200],
      expectedBodyPattern: config.expectedBodyPattern,
      expectedBodyContains: config.expectedBodyContains,
      slowResponseThreshold: config.slowResponseThreshold || 5000,
      sla: config.sla,
      headers: config.headers || {},
      body: config.body
    };

    this.saveConfig();
  }

  /**
   * Remove a service's configuration, current status, and history.
   */
  removeService(serviceId) {
    if (this.config.services) {
      delete this.config.services[serviceId];
      this.saveConfig();
    }

    this.currentStatus.delete(serviceId);
    delete this.history[serviceId];
  }

  /**
   * Drop history entries older than the retention window.
   */
  cleanupHistory() {
    const cutoffTime = Date.now() - (HISTORY_RETENTION_DAYS * 24 * 60 * 60 * 1000);

    for (const serviceId in this.history) {
      this.history[serviceId] = this.history[serviceId].filter(h =>
        new Date(h.timestamp).getTime() > cutoffTime
      );
    }
  }

  /**
   * Load configuration from disk; falls back to an empty service map on
   * any read/parse error (logged via the 'log' event).
   */
  loadConfig() {
    try {
      if (fs.existsSync(HEALTH_CONFIG_FILE)) {
        return JSON.parse(fs.readFileSync(HEALTH_CONFIG_FILE, 'utf8'));
      }
    } catch (error) {
      this.emit('log', 'error', `Error loading config: ${error.message}`);
    }
    return { services: {} };
  }

  /**
   * Persist configuration to disk (best-effort; errors are logged).
   */
  saveConfig() {
    try {
      fs.writeFileSync(HEALTH_CONFIG_FILE, JSON.stringify(this.config, null, 2));
    } catch (error) {
      this.emit('log', 'error', `Error saving config: ${error.message}`);
    }
  }

  /**
   * Load status history from disk; falls back to an empty object on any
   * read/parse error (logged via the 'log' event).
   */
  loadHistory() {
    try {
      if (fs.existsSync(HEALTH_HISTORY_FILE)) {
        return JSON.parse(fs.readFileSync(HEALTH_HISTORY_FILE, 'utf8'));
      }
    } catch (error) {
      this.emit('log', 'error', `Error loading history: ${error.message}`);
    }
    return {};
  }

  /**
   * Persist status history to disk (best-effort; errors are logged).
   */
  saveHistory() {
    try {
      fs.writeFileSync(HEALTH_HISTORY_FILE, JSON.stringify(this.history, null, 2));
    } catch (error) {
      this.emit('log', 'error', `Error saving history: ${error.message}`);
    }
  }
}
|
|
|
|
// Export a shared singleton so every consumer observes the same checker state.
const healthChecker = new HealthChecker();

module.exports = healthChecker;
|