Files
dashcaddy/dashcaddy-api/health-checker.js
Sami f61e85d9a7 Initial commit: DashCaddy v1.0
Full codebase including API server (32 modules + routes), dashboard frontend,
DashCA certificate distribution, installer script, and deployment skills.
2026-03-05 02:26:12 -08:00

592 lines
16 KiB
JavaScript

/**
* Health Check Dashboard Module
* Monitors service health, response times, and uptime
* Provides SLA tracking and incident management
*/
const https = require('https');
const http = require('http');
const EventEmitter = require('events');
const fs = require('fs');
const path = require('path');
// Locations of the persisted service configuration and check history.
// Both can be overridden through the environment; by default they sit next to this module.
const HEALTH_CONFIG_FILE = process.env.HEALTH_CONFIG_FILE || path.join(__dirname, 'health-config.json');
const HEALTH_HISTORY_FILE = process.env.HEALTH_HISTORY_FILE || path.join(__dirname, 'health-history.json');

/**
 * Read a non-negative integer setting from the environment.
 *
 * Only plain decimal digits are accepted. parseInt() would silently turn "30s" into 30 and
 * "abc" into NaN, and timers treat NaN/0 delays as ~1ms, which would hammer every service.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset, malformed, or below `min`.
 * @param {number} min - Smallest acceptable value.
 * @returns {number} The parsed value or `fallback`.
 */
function readIntEnv(name, fallback, min) {
  const text = String(process.env[name] || '').trim();
  if (!/^\d+$/.test(text)) {
    return fallback;
  }
  const value = Number(text);
  return Number.isSafeInteger(value) && value >= min ? value : fallback;
}

const CHECK_INTERVAL = readIntEnv('HEALTH_CHECK_INTERVAL', 30000, 1); // 30 seconds
const MAX_CHECK_INTERVAL = readIntEnv('HEALTH_CHECK_MAX_INTERVAL', 300000, 1); // 5 minutes max backoff
const HISTORY_RETENTION_DAYS = readIntEnv('HEALTH_HISTORY_RETENTION', 30, 0); // 0 keeps no history
/**
 * Probes configured HTTP(S) endpoints, keeps a per-service history of results,
 * derives uptime / response-time statistics and tracks incidents.
 *
 * Events emitted:
 *  - 'status-check'      (status)          after every recorded check
 *  - 'incident-created'  (incident)        when a new incident is opened
 *  - 'incident-resolved' (incident)        when an open incident is closed
 *  - 'log'               (level, message)  diagnostic messages
 *
 * NOTE(review): consecutiveFailures / getBackoffInterval / serviceTimers are maintained
 * here, but nothing in this class schedules checks from them; every enabled service is
 * probed on each cycle. Confirm whether a caller elsewhere consumes the backoff state.
 */
class HealthChecker extends EventEmitter {
  constructor() {
    super();
    this.config = this.loadConfig();
    this.history = this.loadHistory();
    this.currentStatus = new Map(); // serviceId -> most recent status
    this.previousStatus = new Map(); // serviceId -> status that currentStatus replaced
    this.incidents = [];
    this.checking = false;
    this.checkInterval = null;
    this.consecutiveFailures = new Map(); // serviceId -> failure count
    this.serviceTimers = new Map(); // serviceId -> timer for per-service backoff
    this.checksSinceSave = 0; // recorded checks since history was last written
    this.activeRun = null; // promise of the checkAll() cycle currently in flight
  }

  /**
   * Emit a 'log' event. Errors fall back to console.error when nobody is listening,
   * e.g. for load failures raised inside the constructor before listeners can attach.
   *
   * @param {string} level - 'info' or 'error'.
   * @param {string} message - Human-readable message.
   */
  _log(level, message) {
    const delivered = this.emit('log', level, message);
    if (!delivered && level === 'error') {
      console.error(`[health-checker] ${message}`);
    }
  }

  /**
   * Start health checking: run one cycle immediately, then one every CHECK_INTERVAL ms.
   * Calling start() while already running is a no-op.
   */
  start() {
    if (this.checking) return;
    this.checking = true;

    const runCycle = () => {
      // Never leave the cycle promise floating: a rejection would be unhandled.
      this.checkAll().catch((error) => {
        this._log('error', `Health check cycle failed: ${error.message}`);
      });
    };

    runCycle();
    this.checkInterval = setInterval(runCycle, CHECK_INTERVAL);
  }

  /**
   * Stop health checking, cancel all timers and flush history to disk.
   */
  stop() {
    if (!this.checking) return;
    this.checking = false;

    if (this.checkInterval) {
      clearInterval(this.checkInterval);
      this.checkInterval = null;
    }

    // Clear per-service backoff timers
    for (const timer of this.serviceTimers.values()) {
      clearTimeout(timer);
    }
    this.serviceTimers.clear();

    // History is otherwise only written every few checks; do not lose the tail on shutdown.
    this.saveHistory();
  }

  /**
   * Get the backoff interval for a service based on consecutive failures.
   * Doubles the interval for each failure, capped at MAX_CHECK_INTERVAL.
   *
   * @param {string} serviceId
   * @returns {number} Interval in milliseconds.
   */
  getBackoffInterval(serviceId) {
    const failures = this.consecutiveFailures.get(serviceId) || 0;
    if (failures === 0) return CHECK_INTERVAL;
    return Math.min(CHECK_INTERVAL * 2 ** failures, MAX_CHECK_INTERVAL);
  }

  /**
   * Check all configured, enabled services one after another, then prune old history.
   * If a cycle is already running, its promise is returned instead of starting a second,
   * overlapping cycle (slow or timing-out services can make a cycle outlast the interval).
   *
   * @returns {Promise<void>}
   */
  checkAll() {
    if (this.activeRun) return this.activeRun;
    this.activeRun = this._runAllChecks().finally(() => {
      this.activeRun = null;
    });
    return this.activeRun;
  }

  /**
   * Body of a single checkAll() cycle.
   *
   * @returns {Promise<void>}
   */
  async _runAllChecks() {
    const services = Object.entries(this.config.services || {});
    for (const [serviceId, config] of services) {
      if (!config || config.enabled === false) continue;
      try {
        await this.checkService(serviceId, config);
      } catch (error) {
        // checkService converts probe failures into 'down' statuses itself; anything
        // arriving here came from recording/incident handling (e.g. a throwing listener).
        this._log('error', `Health check for ${serviceId} failed unexpectedly: ${error.message}`);
      }
    }
    // Cleanup old history
    this.cleanupHistory();
  }

  /**
   * Check a single service, record the outcome and update incidents.
   * A failed or rejected probe yields a 'down' status rather than an exception.
   *
   * @param {string} serviceId
   * @param {object} config - Service configuration (see configureService).
   * @returns {Promise<object>} The recorded status.
   */
  async checkService(serviceId, config) {
    const startTime = Date.now();
    let status;
    try {
      const result = await this.performHealthCheck(config);
      status = {
        serviceId,
        timestamp: new Date().toISOString(),
        status: result.healthy ? 'up' : 'down',
        responseTime: Date.now() - startTime,
        statusCode: result.statusCode,
        message: result.message,
        details: result.details
      };
    } catch (error) {
      status = {
        serviceId,
        timestamp: new Date().toISOString(),
        status: 'down',
        responseTime: Date.now() - startTime,
        error: error.message
      };
    }

    // Track consecutive failures for exponential backoff
    if (status.status === 'up') {
      this.consecutiveFailures.delete(serviceId);
    } else {
      this.consecutiveFailures.set(serviceId, (this.consecutiveFailures.get(serviceId) || 0) + 1);
    }

    // Recording stays outside the try block so that an error thrown while recording
    // (e.g. by an event listener) is not misreported as the service being down.
    this.recordStatus(serviceId, status);
    this.checkForIncidents(serviceId, status, config);
    return status;
  }

  /**
   * Perform the actual health check request.
   * A HEAD probe answered with 405/501 is retried once as GET.
   *
   * @param {object} config - Service configuration.
   * @returns {Promise<{healthy: boolean, statusCode: number, message: string, details: object}>}
   */
  async performHealthCheck(config) {
    const method = (config.method || 'GET').toUpperCase();
    const result = await this._doRequest(config, method);
    // Fall back to GET if HEAD is not supported
    if (method === 'HEAD' && (result.statusCode === 501 || result.statusCode === 405)) {
      return this._doRequest({ ...config, method: 'GET' }, 'GET');
    }
    return result;
  }

  /**
   * Issue one HTTP(S) request and evaluate the response against the service config.
   *
   * @param {object} config - Service configuration (url, timeout, headers, body, ...).
   * @param {string} method - HTTP method to use.
   * @returns {Promise<object>} Resolves with the evaluation result; rejects on network
   *   errors, timeout, an invalid URL or an invalid expectedBodyPattern.
   */
  _doRequest(config, method) {
    return new Promise((resolve, reject) => {
      const url = new URL(config.url);
      const isHttps = url.protocol === 'https:';
      const transport = isHttps ? https : http;
      // URL.hostname keeps the brackets of IPv6 literals, which the request API does not accept.
      const hostname = url.hostname.startsWith('[') ? url.hostname.slice(1, -1) : url.hostname;
      const options = {
        hostname,
        port: url.port || (isHttps ? 443 : 80),
        path: url.pathname + url.search,
        method,
        timeout: config.timeout || 10000,
        headers: config.headers || {},
        // SECURITY: certificate verification is off by default to trust internal CA
        // certs (.sami TLD). Set rejectUnauthorized: true on a service to enforce it.
        rejectUnauthorized: config.rejectUnauthorized === true
      };

      const req = transport.request(options, (res) => {
        let data = '';
        // Decode as a stream so multi-byte characters split across chunks stay intact.
        res.setEncoding('utf8');
        res.on('data', (chunk) => {
          data += chunk;
        });
        // A response that breaks mid-body must settle the promise instead of being unhandled.
        res.on('error', reject);
        res.on('end', () => {
          try {
            const healthy = this.evaluateHealth(res.statusCode, data, config);
            resolve({
              healthy,
              statusCode: res.statusCode,
              message: healthy ? 'Service is healthy' : 'Service check failed',
              details: {
                headers: res.headers,
                bodyLength: data.length
              }
            });
          } catch (error) {
            // e.g. invalid expectedBodyPattern; throwing inside this handler would be uncaught.
            reject(error);
          }
        });
      });

      req.on('error', reject);
      req.on('timeout', () => {
        // Reject first so the timeout message wins over the socket error caused by destroy().
        reject(new Error('Health check timeout'));
        req.destroy();
      });

      if (config.body) {
        req.write(JSON.stringify(config.body));
      }
      req.end();
    });
  }

  /**
   * Evaluate if service is healthy based on response.
   *
   * @param {number} statusCode - HTTP status code of the response.
   * @param {string} body - Response body.
   * @param {object} config - Uses expectedStatusCodes, expectedBodyPattern, expectedBodyContains.
   * @returns {boolean} True when every configured expectation is met.
   * @throws {SyntaxError} When expectedBodyPattern is not a valid regular expression.
   */
  evaluateHealth(statusCode, body, config) {
    // Check status code
    const expectedCodes = config.expectedStatusCodes || [200, 201, 204, 301, 302, 303, 307, 308];
    if (!expectedCodes.includes(statusCode)) {
      return false;
    }

    // Check response body if pattern specified
    if (config.expectedBodyPattern && !new RegExp(config.expectedBodyPattern).test(body)) {
      return false;
    }

    // Check response body contains expected text
    if (config.expectedBodyContains && !body.includes(config.expectedBodyContains)) {
      return false;
    }

    return true;
  }

  /**
   * Record a service status: update the current status, append to history,
   * emit 'status-check' and persist history every 20 recorded checks.
   *
   * @param {string} serviceId
   * @param {object} status
   */
  recordStatus(serviceId, status) {
    const SAVE_EVERY = 20;

    // Remember what is being replaced so checkForIncidents can still detect transitions.
    const replaced = this.currentStatus.get(serviceId);
    if (replaced !== undefined && replaced !== status) {
      this.previousStatus.set(serviceId, replaced);
    }
    this.currentStatus.set(serviceId, status);

    // Add to history
    if (!this.history[serviceId]) {
      this.history[serviceId] = [];
    }
    this.history[serviceId].push(status);

    // Emit status event
    this.emit('status-check', status);

    // Save history periodically (deterministic counter instead of a random roll)
    this.checksSinceSave = (this.checksSinceSave || 0) + 1;
    if (this.checksSinceSave >= SAVE_EVERY) {
      this.checksSinceSave = 0;
      this.saveHistory();
    }
  }

  /**
   * Open or resolve incidents (outage, slow response, SLA violation) for a status.
   * Works whether it is called before or after recordStatus() for the same status.
   *
   * @param {string} serviceId
   * @param {object} status - The status just observed.
   * @param {object} config - Uses slowResponseThreshold and sla ({ target, period }).
   */
  checkForIncidents(serviceId, status, config) {
    // If `status` has already been recorded, currentStatus holds `status` itself and the
    // status to compare against is the one it replaced.
    const latest = this.currentStatus.get(serviceId);
    const previous = latest === status ? this.previousStatus.get(serviceId) : latest;

    // Outage handling. A service that is down on its very first check has no previous
    // status; that is treated as a transition into 'down' so the outage is not missed.
    if (status.status === 'down') {
      if (!previous || previous.status !== 'down') {
        this.createIncident(serviceId, 'outage', 'Service is down', status);
      }
    } else if (status.status === 'up') {
      this.resolveIncident(serviceId, 'outage', status); // no-op when none is open
    }

    // Check for slow response time
    const slowThreshold = config.slowResponseThreshold || 5000; // 5 seconds
    if (status.responseTime > slowThreshold) {
      this.createIncident(serviceId, 'slow-response',
        `Response time ${status.responseTime}ms exceeds threshold ${slowThreshold}ms`,
        status);
    } else if (status.status === 'up') {
      // A healthy response within the threshold closes the incident.
      this.resolveIncident(serviceId, 'slow-response', status);
    }

    // Check SLA violations
    const sla = config.sla;
    if (sla) {
      const uptime = this.calculateUptime(serviceId, sla.period || 24);
      if (uptime < sla.target) {
        this.createIncident(serviceId, 'sla-violation',
          `Uptime ${uptime.toFixed(2)}% below SLA target ${sla.target}%`,
          status);
      } else {
        this.resolveIncident(serviceId, 'sla-violation', status);
      }
    }
  }

  /**
   * Create a new incident, or bump the occurrence count of an open incident
   * of the same type for the same service.
   *
   * @param {string} serviceId
   * @param {string} type - 'outage', 'slow-response' or 'sla-violation'.
   * @param {string} message
   * @param {object} status - Status that triggered the incident.
   */
  createIncident(serviceId, type, message, status) {
    // Check if similar incident already exists
    const existing = this.incidents.find((i) =>
      i.serviceId === serviceId &&
      i.type === type &&
      i.status === 'open'
    );
    if (existing) {
      // Update existing incident
      existing.lastOccurrence = status.timestamp;
      existing.occurrences++;
      return;
    }

    // Create new incident
    const incident = {
      id: `incident-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`,
      serviceId,
      type,
      message,
      status: 'open',
      severity: this.calculateSeverity(type),
      createdAt: status.timestamp,
      lastOccurrence: status.timestamp,
      occurrences: 1,
      details: status
    };
    this.incidents.push(incident);
    this.emit('incident-created', incident);
    this._log('info', `Incident created: ${incident.id} - ${message}`);
  }

  /**
   * Resolve the open incident of the given type for a service, if there is one.
   *
   * @param {string} serviceId
   * @param {string} type
   * @param {object} status - Status that resolved the incident; supplies resolvedAt.
   */
  resolveIncident(serviceId, type, status) {
    const incident = this.incidents.find((i) =>
      i.serviceId === serviceId &&
      i.type === type &&
      i.status === 'open'
    );
    if (!incident) return;

    incident.status = 'resolved';
    incident.resolvedAt = status.timestamp;
    incident.duration = new Date(incident.resolvedAt) - new Date(incident.createdAt); // ms
    this.emit('incident-resolved', incident);
    this._log('info', `Incident resolved: ${incident.id}`);
  }

  /**
   * Calculate incident severity.
   *
   * @param {string} type - Incident type.
   * @returns {string} 'critical', 'high', 'medium' or 'low'.
   */
  calculateSeverity(type) {
    switch (type) {
      case 'outage':
        return 'critical';
      case 'sla-violation':
        return 'high';
      case 'slow-response':
        return 'medium';
      default:
        return 'low';
    }
  }

  /**
   * Calculate uptime percentage for a service.
   *
   * @param {string} serviceId
   * @param {number} [hours=24] - Look-back window.
   * @returns {number} Percentage in [0, 100]; 100 when there is no history in the window.
   */
  calculateUptime(serviceId, hours = 24) {
    const history = this.getServiceHistory(serviceId, hours);
    if (history.length === 0) return 100;
    const upChecks = history.filter((h) => h.status === 'up').length;
    return (upChecks / history.length) * 100;
  }

  /**
   * Calculate average response time.
   *
   * @param {string} serviceId
   * @param {number} [hours=24] - Look-back window.
   * @returns {number} Mean response time in ms; 0 when there is no history in the window.
   */
  calculateAverageResponseTime(serviceId, hours = 24) {
    const history = this.getServiceHistory(serviceId, hours);
    if (history.length === 0) return 0;
    const total = history.reduce((sum, h) => sum + (h.responseTime || 0), 0);
    return total / history.length;
  }

  /**
   * Get service history for specified time period.
   *
   * @param {string} serviceId
   * @param {number} [hours=24] - Look-back window.
   * @returns {object[]} Recorded statuses newer than the cutoff.
   */
  getServiceHistory(serviceId, hours = 24) {
    const cutoffTime = Date.now() - (hours * 60 * 60 * 1000);
    const history = this.history[serviceId] || [];
    return history.filter((h) => new Date(h.timestamp).getTime() > cutoffTime);
  }

  /**
   * Get current status for all services, enriched with name, uptime (24h / 7d),
   * average response time and SLA settings.
   *
   * @returns {Object<string, object>} Keyed by serviceId.
   */
  getCurrentStatus() {
    const result = {};
    for (const [serviceId, status] of this.currentStatus.entries()) {
      // The config may lack a services map entirely (hand-edited or empty config file).
      const config = this.config.services?.[serviceId];
      result[serviceId] = {
        ...status,
        name: config?.name || serviceId,
        uptime: {
          '24h': this.calculateUptime(serviceId, 24),
          '7d': this.calculateUptime(serviceId, 168)
        },
        avgResponseTime: this.calculateAverageResponseTime(serviceId, 24),
        sla: config?.sla
      };
    }
    return result;
  }

  /**
   * Get service statistics.
   *
   * @param {string} serviceId
   * @param {number} [hours=24] - Look-back window.
   * @returns {object|null} Aggregated statistics, or null when there is no history in the window.
   */
  getServiceStats(serviceId, hours = 24) {
    const history = this.getServiceHistory(serviceId, hours);
    if (history.length === 0) return null;

    // Single pass; avoids Math.min(...array), which overflows the argument limit on long histories.
    const responseTimes = [];
    let upChecks = 0;
    let total = 0;
    let min = Infinity;
    let max = -Infinity;
    for (const entry of history) {
      const responseTime = entry.responseTime || 0;
      responseTimes.push(responseTime);
      total += responseTime;
      if (responseTime < min) min = responseTime;
      if (responseTime > max) max = responseTime;
      if (entry.status === 'up') upChecks++;
    }

    return {
      serviceId,
      period: `${hours}h`,
      totalChecks: history.length,
      upChecks,
      downChecks: history.length - upChecks,
      uptime: (upChecks / history.length) * 100,
      responseTime: {
        avg: total / history.length,
        min,
        max,
        p95: this.calculatePercentile(responseTimes, 95),
        p99: this.calculatePercentile(responseTimes, 99)
      }
    };
  }

  /**
   * Calculate percentile (nearest-rank method).
   *
   * @param {number[]} values
   * @param {number} percentile - 0..100.
   * @returns {number} The percentile value; 0 for an empty list.
   */
  calculatePercentile(values, percentile) {
    const sorted = values.slice().sort((a, b) => a - b);
    const index = Math.ceil((percentile / 100) * sorted.length) - 1;
    return sorted[index] || 0;
  }

  /**
   * Get open incidents.
   *
   * @returns {object[]}
   */
  getOpenIncidents() {
    return this.incidents.filter((i) => i.status === 'open');
  }

  /**
   * Get incident history, newest first.
   *
   * @param {number} [limit=50] - Maximum number of incidents to return.
   * @returns {object[]} Empty when limit is not a positive number.
   */
  getIncidentHistory(limit = 50) {
    const count = Math.floor(Number(limit));
    // slice(-0) is slice(0) and would return everything, so guard non-positive limits.
    if (!(count > 0)) return [];
    return this.incidents.slice(-count).reverse();
  }

  /**
   * Configure health check for a service and persist the configuration.
   *
   * @param {string} serviceId
   * @param {object} config - url is required; all other fields fall back to defaults.
   */
  configureService(serviceId, config) {
    if (!this.config.services) {
      this.config.services = {};
    }
    this.config.services[serviceId] = {
      enabled: config.enabled !== false,
      name: config.name || serviceId,
      url: config.url,
      method: config.method || 'GET',
      timeout: config.timeout || 10000,
      expectedStatusCodes: config.expectedStatusCodes || [200],
      expectedBodyPattern: config.expectedBodyPattern,
      expectedBodyContains: config.expectedBodyContains,
      slowResponseThreshold: config.slowResponseThreshold || 5000,
      sla: config.sla,
      headers: config.headers || {},
      body: config.body,
      // Optional opt-in to TLS certificate verification; omitted from the file when unset.
      rejectUnauthorized: config.rejectUnauthorized
    };
    this.saveConfig();
  }

  /**
   * Remove service configuration together with its in-memory state and history.
   *
   * @param {string} serviceId
   */
  removeService(serviceId) {
    if (this.config.services) {
      delete this.config.services[serviceId];
      this.saveConfig();
    }
    this.currentStatus.delete(serviceId);
    this.previousStatus.delete(serviceId);
    this.consecutiveFailures.delete(serviceId);

    const timer = this.serviceTimers.get(serviceId);
    if (timer !== undefined) {
      clearTimeout(timer);
      this.serviceTimers.delete(serviceId);
    }

    delete this.history[serviceId];
  }

  /**
   * Cleanup old history: drop entries older than HISTORY_RETENTION_DAYS.
   */
  cleanupHistory() {
    const cutoffTime = Date.now() - (HISTORY_RETENTION_DAYS * 24 * 60 * 60 * 1000);
    for (const serviceId of Object.keys(this.history)) {
      const entries = this.history[serviceId];
      if (!Array.isArray(entries)) continue;
      this.history[serviceId] = entries.filter((h) =>
        new Date(h.timestamp).getTime() > cutoffTime
      );
    }
  }

  /**
   * Load configuration from HEALTH_CONFIG_FILE.
   *
   * @returns {object} Parsed config, or `{ services: {} }` when the file is missing,
   *   unreadable or does not contain a JSON object.
   */
  loadConfig() {
    try {
      if (fs.existsSync(HEALTH_CONFIG_FILE)) {
        const parsed = JSON.parse(fs.readFileSync(HEALTH_CONFIG_FILE, 'utf8'));
        if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
          return parsed;
        }
        this._log('error', 'Error loading config: expected a JSON object');
      }
    } catch (error) {
      this._log('error', `Error loading config: ${error.message}`);
    }
    return { services: {} };
  }

  /**
   * Save configuration to HEALTH_CONFIG_FILE. Failures are logged, not thrown.
   */
  saveConfig() {
    try {
      fs.writeFileSync(HEALTH_CONFIG_FILE, JSON.stringify(this.config, null, 2));
    } catch (error) {
      this._log('error', `Error saving config: ${error.message}`);
    }
  }

  /**
   * Load history from HEALTH_HISTORY_FILE.
   *
   * @returns {Object<string, object[]>} History keyed by serviceId; entries that are not
   *   arrays are dropped. Empty when the file is missing, unreadable or malformed.
   */
  loadHistory() {
    try {
      if (fs.existsSync(HEALTH_HISTORY_FILE)) {
        const parsed = JSON.parse(fs.readFileSync(HEALTH_HISTORY_FILE, 'utf8'));
        if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
          const history = {};
          for (const [serviceId, entries] of Object.entries(parsed)) {
            if (Array.isArray(entries)) {
              history[serviceId] = entries;
            }
          }
          return history;
        }
        this._log('error', 'Error loading history: expected a JSON object');
      }
    } catch (error) {
      this._log('error', `Error loading history: ${error.message}`);
    }
    return {};
  }

  /**
   * Save history to HEALTH_HISTORY_FILE. Failures are logged, not thrown.
   */
  saveHistory() {
    try {
      fs.writeFileSync(HEALTH_HISTORY_FILE, JSON.stringify(this.history, null, 2));
    } catch (error) {
      this._log('error', `Error saving history: ${error.message}`);
    }
  }
}
// Build the one shared checker at load time and export that instance (not the class).
const healthChecker = new HealthChecker();
module.exports = healthChecker;