/**
 * Health Check Dashboard Module
 *
 * Monitors service health, response times, and uptime.
 * Provides SLA tracking and incident management.
 *
 * Exported as a singleton. Emits:
 *   'status-check'      (status)   — after every individual check
 *   'incident-created'  (incident) — when a new incident is opened
 *   'incident-resolved' (incident) — when an open incident is closed
 *   'log'               (level, message)
 */
const https = require('https');
const http = require('http');
const EventEmitter = require('events');
const fs = require('fs');
const path = require('path');

const HEALTH_CONFIG_FILE = process.env.HEALTH_CONFIG_FILE || path.join(__dirname, 'health-config.json');
const HEALTH_HISTORY_FILE = process.env.HEALTH_HISTORY_FILE || path.join(__dirname, 'health-history.json');
const CHECK_INTERVAL = parseInt(process.env.HEALTH_CHECK_INTERVAL || '30000', 10); // 30 seconds
const MAX_CHECK_INTERVAL = parseInt(process.env.HEALTH_CHECK_MAX_INTERVAL || '300000', 10); // 5 minutes max backoff
const HISTORY_RETENTION_DAYS = parseInt(process.env.HEALTH_HISTORY_RETENTION || '30', 10);

// Persist history to disk after this many checks. Deterministic replacement
// for the previous `Math.random() < 0.05` sampling, which could go
// arbitrarily long without saving.
const SAVE_EVERY_N_CHECKS = 20;

class HealthChecker extends EventEmitter {
  constructor() {
    super();
    this.config = this.loadConfig();
    this.history = this.loadHistory();
    this.currentStatus = new Map();       // serviceId -> most recent status object
    this.incidents = [];
    this.checking = false;
    this.checkInterval = null;
    this.consecutiveFailures = new Map(); // serviceId -> failure streak (drives backoff)
    this.serviceTimers = new Map();       // serviceId -> timer for per-service backoff
    this.nextCheckAt = new Map();         // serviceId -> epoch ms before which checks are skipped
    this.checksSinceSave = 0;             // checks recorded since the last history save
  }

  /**
   * Start health checking: run an immediate pass, then repeat every
   * CHECK_INTERVAL. Individual failing services are re-checked less often
   * (exponential backoff, see getBackoffInterval).
   */
  start() {
    if (this.checking) return;
    this.checking = true;

    // Initial check. checkAll() swallows per-service errors internally, but
    // guard the promise anyway so nothing is left floating.
    this.checkAll().catch((err) => this.emit('log', 'error', `Health check pass failed: ${err.message}`));

    // Schedule periodic checks
    this.checkInterval = setInterval(() => {
      this.checkAll().catch((err) => this.emit('log', 'error', `Health check pass failed: ${err.message}`));
    }, CHECK_INTERVAL);
  }

  /**
   * Stop health checking and clear all scheduled work.
   */
  stop() {
    if (!this.checking) return;
    this.checking = false;

    if (this.checkInterval) {
      clearInterval(this.checkInterval);
      this.checkInterval = null;
    }

    // Clear per-service backoff timers and due times
    for (const timer of this.serviceTimers.values()) {
      clearTimeout(timer);
    }
    this.serviceTimers.clear();
    this.nextCheckAt.clear();
  }

  /**
   * Get the backoff interval for a service based on consecutive failures.
   * Doubles the interval for each failure, capped at MAX_CHECK_INTERVAL.
   *
   * @param {string} serviceId
   * @returns {number} interval in milliseconds
   */
  getBackoffInterval(serviceId) {
    const failures = this.consecutiveFailures.get(serviceId) || 0;
    if (failures === 0) return CHECK_INTERVAL;
    return Math.min(CHECK_INTERVAL * Math.pow(2, failures), MAX_CHECK_INTERVAL);
  }

  /**
   * Check all configured services (sequentially, to avoid bursts), honoring
   * per-service exponential backoff: a failing service is skipped until its
   * backoff window has elapsed. Healthy services are checked on every pass.
   */
  async checkAll() {
    const services = Object.entries(this.config.services || {});
    for (const [serviceId, config] of services) {
      if (config.enabled === false) continue;

      // Skip services that failed recently and are not yet due for a re-check.
      const dueAt = this.nextCheckAt.get(serviceId) || 0;
      if (Date.now() < dueAt) continue;

      try {
        await this.checkService(serviceId, config);
      } catch (error) {
        // checkService records failures itself; nothing more to do here.
      }

      // Only failing services get a future due time; a healthy service is
      // eligible again on the next interval tick.
      if ((this.consecutiveFailures.get(serviceId) || 0) > 0) {
        this.nextCheckAt.set(serviceId, Date.now() + this.getBackoffInterval(serviceId));
      } else {
        this.nextCheckAt.delete(serviceId);
      }
    }

    // Cleanup old history
    this.cleanupHistory();
  }

  /**
   * Check a single service and record the outcome.
   *
   * @param {string} serviceId
   * @param {object} config - Service configuration (url, method, timeout, ...).
   * @returns {Promise<object>} the recorded status object (never rejects).
   */
  async checkService(serviceId, config) {
    const startTime = Date.now();

    // Capture the previous status BEFORE recordStatus overwrites it.
    // (Previously checkForIncidents read currentStatus after it had already
    // been updated, so it always compared a status to itself and no
    // transition was ever detected.)
    const previous = this.currentStatus.get(serviceId);

    let status;
    try {
      const result = await this.performHealthCheck(config);
      status = {
        serviceId,
        timestamp: new Date().toISOString(),
        status: result.healthy ? 'up' : 'down',
        responseTime: Date.now() - startTime,
        statusCode: result.statusCode,
        message: result.message,
        details: result.details
      };

      // Track consecutive failures for exponential backoff
      if (result.healthy) {
        this.consecutiveFailures.delete(serviceId);
      } else {
        this.consecutiveFailures.set(serviceId, (this.consecutiveFailures.get(serviceId) || 0) + 1);
      }
    } catch (error) {
      // Network/timeout errors count toward the failure streak too.
      this.consecutiveFailures.set(serviceId, (this.consecutiveFailures.get(serviceId) || 0) + 1);
      status = {
        serviceId,
        timestamp: new Date().toISOString(),
        status: 'down',
        responseTime: Date.now() - startTime,
        error: error.message
      };
    }

    this.recordStatus(serviceId, status);
    this.checkForIncidents(serviceId, status, config, previous);
    return status;
  }

  /**
   * Perform the actual health check request. If the configured method is
   * HEAD and the server rejects it (405/501), retry once with GET.
   *
   * @returns {Promise<{healthy: boolean, statusCode: number, message: string, details: object}>}
   */
  async performHealthCheck(config) {
    const result = await this._doRequest(config, config.method || 'GET');
    // Fall back to GET if HEAD is not supported
    if ((result.statusCode === 501 || result.statusCode === 405) &&
        (config.method || '').toUpperCase() === 'HEAD') {
      return this._doRequest({ ...config, method: 'GET' }, 'GET');
    }
    return result;
  }

  /**
   * Issue one HTTP(S) request and evaluate the response.
   * Rejects on connection errors and on timeout.
   */
  _doRequest(config, method) {
    return new Promise((resolve, reject) => {
      const url = new URL(config.url);
      const protocol = url.protocol === 'https:' ? https : http;

      const options = {
        hostname: url.hostname,
        port: url.port || (url.protocol === 'https:' ? 443 : 80),
        path: url.pathname + url.search,
        method,
        timeout: config.timeout || 20000,
        headers: config.headers || {},
        // SECURITY: certificate validation is disabled on purpose to trust
        // internal CA certs (.sami TLD). Do not reuse this client for
        // external endpoints without removing this flag.
        rejectUnauthorized: false
      };

      const req = protocol.request(options, (res) => {
        let data = '';
        res.on('data', (chunk) => { data += chunk; });
        res.on('end', () => {
          const healthy = this.evaluateHealth(res.statusCode, data, config);
          resolve({
            healthy,
            statusCode: res.statusCode,
            message: healthy ? 'Service is healthy' : 'Service check failed',
            details: { headers: res.headers, bodyLength: data.length }
          });
        });
      });

      req.on('error', (error) => {
        reject(error);
      });

      req.on('timeout', () => {
        req.destroy();
        reject(new Error('Health check timeout'));
      });

      if (config.body) {
        req.write(JSON.stringify(config.body));
      }
      req.end();
    });
  }

  /**
   * Evaluate if a service is healthy based on status code and response body.
   *
   * @param {number} statusCode
   * @param {string} body
   * @param {object} config - May define expectedStatusCodes,
   *   expectedBodyPattern (regex source), expectedBodyContains (substring).
   * @returns {boolean}
   */
  evaluateHealth(statusCode, body, config) {
    // Check status code
    const expectedCodes = config.expectedStatusCodes || [200, 201, 204, 301, 302, 303, 307, 308];
    if (!expectedCodes.includes(statusCode)) {
      return false;
    }

    // Check response body against pattern if specified
    if (config.expectedBodyPattern) {
      const regex = new RegExp(config.expectedBodyPattern);
      if (!regex.test(body)) {
        return false;
      }
    }

    // Check response body contains expected text
    if (config.expectedBodyContains && !body.includes(config.expectedBodyContains)) {
      return false;
    }

    return true;
  }

  /**
   * Record a service status: update the current-status map, append to
   * history, emit 'status-check', and periodically persist history.
   */
  recordStatus(serviceId, status) {
    // Update current status
    this.currentStatus.set(serviceId, status);

    // Add to history
    if (!this.history[serviceId]) {
      this.history[serviceId] = [];
    }
    this.history[serviceId].push(status);

    // Emit status event
    this.emit('status-check', status);

    // Persist deterministically so a crash loses at most
    // SAVE_EVERY_N_CHECKS entries.
    this.checksSinceSave += 1;
    if (this.checksSinceSave >= SAVE_EVERY_N_CHECKS) {
      this.checksSinceSave = 0;
      this.saveHistory();
    }
  }

  /**
   * Check for incidents (downtime, slow response, SLA violations) and
   * resolve open incidents whose condition has cleared.
   *
   * @param {string} serviceId
   * @param {object} status - The status just recorded for this check.
   * @param {object} config - Service configuration.
   * @param {object} [previous] - Status from the check before this one
   *   (undefined on the first check of a service).
   */
  checkForIncidents(serviceId, status, config, previous = undefined) {
    // Outage detection: entering 'down' (including on the very first
    // observation) opens an incident; returning to 'up' resolves it.
    if (status.status === 'down') {
      if (!previous || previous.status !== 'down') {
        this.createIncident(serviceId, 'outage', 'Service is down', status);
      }
    } else if (previous && previous.status === 'down') {
      this.resolveIncident(serviceId, 'outage', status);
    }

    // Slow-response detection; resolves once response time is back under
    // the threshold (previously these incidents were never resolved).
    const slowThreshold = config.slowResponseThreshold || 5000; // 5 seconds
    if (status.responseTime > slowThreshold) {
      this.createIncident(serviceId, 'slow-response',
        `Response time ${status.responseTime}ms exceeds threshold ${slowThreshold}ms`, status);
    } else if (status.status === 'up') {
      this.resolveIncident(serviceId, 'slow-response', status);
    }

    // SLA violation detection; resolves once uptime recovers above target.
    const sla = config.sla;
    if (sla) {
      const uptime = this.calculateUptime(serviceId, sla.period || 24);
      if (uptime < sla.target) {
        this.createIncident(serviceId, 'sla-violation',
          `Uptime ${uptime.toFixed(2)}% below SLA target ${sla.target}%`, status);
      } else {
        this.resolveIncident(serviceId, 'sla-violation', status);
      }
    }
  }

  /**
   * Create a new incident, or bump the occurrence count of an already-open
   * incident of the same type for the same service.
   */
  createIncident(serviceId, type, message, status) {
    // Check if similar incident already exists
    const existing = this.incidents.find((i) =>
      i.serviceId === serviceId && i.type === type && i.status === 'open'
    );

    if (existing) {
      // Update existing incident
      existing.lastOccurrence = status.timestamp;
      existing.occurrences++;
      return;
    }

    // Create new incident
    const incident = {
      id: `incident-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`,
      serviceId,
      type,
      message,
      status: 'open',
      severity: this.calculateSeverity(type),
      createdAt: status.timestamp,
      lastOccurrence: status.timestamp,
      occurrences: 1,
      details: status
    };

    this.incidents.push(incident);
    this.emit('incident-created', incident);
    this.emit('log', 'info', `Incident created: ${incident.id} - ${message}`);
  }

  /**
   * Resolve an open incident of the given type for a service, if any.
   */
  resolveIncident(serviceId, type, status) {
    const incident = this.incidents.find((i) =>
      i.serviceId === serviceId && i.type === type && i.status === 'open'
    );

    if (incident) {
      incident.status = 'resolved';
      incident.resolvedAt = status.timestamp;
      incident.duration = new Date(incident.resolvedAt) - new Date(incident.createdAt);
      this.emit('incident-resolved', incident);
      this.emit('log', 'info', `Incident resolved: ${incident.id}`);
    }
  }

  /**
   * Map an incident type to a severity label.
   */
  calculateSeverity(type) {
    switch (type) {
      case 'outage': return 'critical';
      case 'sla-violation': return 'high';
      case 'slow-response': return 'medium';
      default: return 'low';
    }
  }

  /**
   * Calculate uptime percentage for a service over the given window.
   * Returns 100 when there is no history (no evidence of downtime).
   */
  calculateUptime(serviceId, hours = 24) {
    const history = this.getServiceHistory(serviceId, hours);
    if (history.length === 0) return 100;
    const upChecks = history.filter((h) => h.status === 'up').length;
    return (upChecks / history.length) * 100;
  }

  /**
   * Calculate average response time (ms) over the given window.
   */
  calculateAverageResponseTime(serviceId, hours = 24) {
    const history = this.getServiceHistory(serviceId, hours);
    if (history.length === 0) return 0;
    const total = history.reduce((sum, h) => sum + (h.responseTime || 0), 0);
    return total / history.length;
  }

  /**
   * Get service history entries newer than `hours` ago.
   */
  getServiceHistory(serviceId, hours = 24) {
    const cutoffTime = Date.now() - (hours * 60 * 60 * 1000);
    const history = this.history[serviceId] || [];
    return history.filter((h) => new Date(h.timestamp).getTime() > cutoffTime);
  }

  /**
   * Get current status for all services, augmented with uptime and
   * average response time.
   */
  getCurrentStatus() {
    const result = {};
    for (const [serviceId, status] of this.currentStatus.entries()) {
      const config = this.config.services[serviceId];
      result[serviceId] = {
        ...status,
        name: config?.name || serviceId,
        uptime: {
          '24h': this.calculateUptime(serviceId, 24),
          '7d': this.calculateUptime(serviceId, 168)
        },
        avgResponseTime: this.calculateAverageResponseTime(serviceId, 24),
        sla: config?.sla
      };
    }
    return result;
  }

  /**
   * Get aggregate statistics for a service over the given window.
   * Returns null when there is no history in the window.
   */
  getServiceStats(serviceId, hours = 24) {
    const history = this.getServiceHistory(serviceId, hours);
    if (history.length === 0) return null;

    const upChecks = history.filter((h) => h.status === 'up').length;
    const downChecks = history.length - upChecks;
    const responseTimes = history.map((h) => h.responseTime || 0);

    return {
      serviceId,
      period: `${hours}h`,
      totalChecks: history.length,
      upChecks,
      downChecks,
      uptime: (upChecks / history.length) * 100,
      responseTime: {
        avg: responseTimes.reduce((a, b) => a + b, 0) / responseTimes.length,
        min: Math.min(...responseTimes),
        max: Math.max(...responseTimes),
        p95: this.calculatePercentile(responseTimes, 95),
        p99: this.calculatePercentile(responseTimes, 99)
      }
    };
  }

  /**
   * Calculate a percentile (nearest-rank method). Returns 0 for empty input.
   */
  calculatePercentile(values, percentile) {
    const sorted = values.slice().sort((a, b) => a - b);
    const index = Math.ceil((percentile / 100) * sorted.length) - 1;
    return sorted[index] || 0;
  }

  /**
   * Get all currently open incidents.
   */
  getOpenIncidents() {
    return this.incidents.filter((i) => i.status === 'open');
  }

  /**
   * Get the most recent incidents, newest first.
   */
  getIncidentHistory(limit = 50) {
    return this.incidents.slice(-limit).reverse();
  }

  /**
   * Configure (create or replace) the health check for a service and
   * persist the configuration.
   */
  configureService(serviceId, config) {
    if (!this.config.services) {
      this.config.services = {};
    }

    this.config.services[serviceId] = {
      enabled: config.enabled !== false,
      name: config.name || serviceId,
      url: config.url,
      method: config.method || 'GET',
      timeout: config.timeout || 20000,
      expectedStatusCodes: config.expectedStatusCodes || [200],
      expectedBodyPattern: config.expectedBodyPattern,
      expectedBodyContains: config.expectedBodyContains,
      slowResponseThreshold: config.slowResponseThreshold || 5000,
      sla: config.sla,
      headers: config.headers || {},
      body: config.body
    };

    this.saveConfig();
  }

  /**
   * Remove a service's configuration, current status, and history.
   */
  removeService(serviceId) {
    if (this.config.services) {
      delete this.config.services[serviceId];
      this.saveConfig();
    }
    this.currentStatus.delete(serviceId);
    this.consecutiveFailures.delete(serviceId);
    this.nextCheckAt.delete(serviceId);
    delete this.history[serviceId];
  }

  /**
   * Drop history entries older than HISTORY_RETENTION_DAYS.
   */
  cleanupHistory() {
    const cutoffTime = Date.now() - (HISTORY_RETENTION_DAYS * 24 * 60 * 60 * 1000);
    for (const serviceId in this.history) {
      this.history[serviceId] = this.history[serviceId].filter((h) =>
        new Date(h.timestamp).getTime() > cutoffTime
      );
    }
  }

  /**
   * Load configuration from disk; falls back to an empty config on any error.
   */
  loadConfig() {
    try {
      if (fs.existsSync(HEALTH_CONFIG_FILE)) {
        return JSON.parse(fs.readFileSync(HEALTH_CONFIG_FILE, 'utf8'));
      }
    } catch (error) {
      // NOTE: emitted from the constructor, so listeners attached after
      // construction will not see this event.
      this.emit('log', 'error', `Error loading config: ${error.message}`);
    }
    return { services: {} };
  }

  /**
   * Persist configuration to disk (best-effort; errors are logged).
   */
  saveConfig() {
    try {
      fs.writeFileSync(HEALTH_CONFIG_FILE, JSON.stringify(this.config, null, 2));
    } catch (error) {
      this.emit('log', 'error', `Error saving config: ${error.message}`);
    }
  }

  /**
   * Load history from disk; falls back to empty history on any error.
   */
  loadHistory() {
    try {
      if (fs.existsSync(HEALTH_HISTORY_FILE)) {
        return JSON.parse(fs.readFileSync(HEALTH_HISTORY_FILE, 'utf8'));
      }
    } catch (error) {
      this.emit('log', 'error', `Error loading history: ${error.message}`);
    }
    return {};
  }

  /**
   * Persist history to disk (best-effort; errors are logged).
   */
  saveHistory() {
    try {
      fs.writeFileSync(HEALTH_HISTORY_FILE, JSON.stringify(this.history, null, 2));
    } catch (error) {
      this.emit('log', 'error', `Error saving history: ${error.message}`);
    }
  }
}

// Export singleton instance
module.exports = new HealthChecker();