/**
 * Container Resource Monitoring Module
 * Tracks CPU, memory, disk, and network usage for Docker containers
 * Provides alerts and historical data
 */

const Docker = require('dockerode');
const EventEmitter = require('events');
const fs = require('fs');
const path = require('path');

const docker = new Docker();

/**
 * Parse a positive integer setting.
 *
 * @param {string|undefined} rawValue - Raw text, typically from process.env.
 * @param {number} fallback - Value used when rawValue is missing, not a
 *   number, or not greater than zero.
 * @returns {number} The parsed integer or the fallback.
 */
function readPositiveInt(rawValue, fallback) {
  const parsed = Number.parseInt(rawValue ?? '', 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

// Configuration
const STATS_FILE = process.env.STATS_FILE || path.join(__dirname, 'container-stats.json');
const ALERT_CONFIG_FILE = process.env.ALERT_CONFIG_FILE || path.join(__dirname, 'alert-config.json');
// Validated so that a malformed value can neither wipe the history (NaN
// retention makes every cutoff comparison false) nor turn the polling timer
// into a busy loop (NaN or 0 interval).
const STATS_RETENTION_HOURS = readPositiveInt(process.env.STATS_RETENTION_HOURS, 168); // 7 days default
const MONITORING_INTERVAL = readPositiveInt(process.env.MONITORING_INTERVAL, 10000); // 10 seconds

const MS_PER_HOUR = 60 * 60 * 1000;
const BYTES_PER_MB = 1024 * 1024;
// Stats are written to disk once every this many collection passes.
const SAVE_EVERY_N_COLLECTIONS = 10;

/** Round a number to two decimal places. */
const roundTo2 = (value) => Math.round(value * 100) / 100;

/** Epoch milliseconds of a recorded sample's ISO timestamp. */
const toTimeMs = (sample) => new Date(sample.timestamp).getTime();

/** Keep only the samples recorded strictly after cutoffMs. */
const pruneHistory = (history, cutoffMs) => history.filter((sample) => toTimeMs(sample) > cutoffMs);

/**
 * Convert a raw Docker stats payload into the sample shape stored by the monitor.
 *
 * CPU percent is the container's CPU-time delta divided by the system CPU-time
 * delta, times 100 (it is not multiplied by the number of CPUs). Network and
 * block I/O figures are the counters as reported in the payload, summed over
 * all interfaces / entries.
 *
 * @param {object} raw - Stats payload passed to the container.stats callback.
 * @returns {object} Sample with timestamp, cpu, memory, network, disk and pids.
 * @throws {Error} When raw is not an object.
 */
function summarizeStats(raw) {
  if (!raw || typeof raw !== 'object') {
    throw new Error('Docker returned no stats payload');
  }

  const cpuStats = raw.cpu_stats ?? {};
  const previousCpuStats = raw.precpu_stats ?? {};
  const totalCpuUsage = cpuStats.cpu_usage?.total_usage ?? 0;
  const cpuDelta = totalCpuUsage - (previousCpuStats.cpu_usage?.total_usage || 0);
  const systemDelta = (cpuStats.system_cpu_usage ?? 0) - (previousCpuStats.system_cpu_usage || 0);
  // A non-positive delta (missing previous reading, counter reset) yields 0
  // rather than a negative or infinite percentage.
  const cpuPercent = systemDelta > 0 && cpuDelta > 0 ? (cpuDelta / systemDelta) * 100 : 0;

  const memoryUsage = raw.memory_stats?.usage || 0;
  const memoryLimit = raw.memory_stats?.limit || 0;
  const memoryPercent = memoryLimit > 0 ? (memoryUsage / memoryLimit) * 100 : 0;

  let networkRx = 0;
  let networkTx = 0;
  for (const net of Object.values(raw.networks ?? {})) {
    networkRx += net.rx_bytes || 0;
    networkTx += net.tx_bytes || 0;
  }

  let blockRead = 0;
  let blockWrite = 0;
  for (const entry of raw.blkio_stats?.io_service_bytes_recursive ?? []) {
    // Compared case-insensitively: the casing of op names is not the same on
    // every Docker host, and an exact 'Read'/'Write' match silently reports 0.
    const op = String(entry.op ?? '').toLowerCase();
    if (op === 'read') {
      blockRead += entry.value || 0;
    } else if (op === 'write') {
      blockWrite += entry.value || 0;
    }
  }

  return {
    timestamp: new Date().toISOString(),
    cpu: {
      percent: roundTo2(cpuPercent),
      usage: totalCpuUsage,
    },
    memory: {
      usage: memoryUsage,
      limit: memoryLimit,
      percent: roundTo2(memoryPercent),
      usageMB: Math.round(memoryUsage / BYTES_PER_MB),
      limitMB: Math.round(memoryLimit / BYTES_PER_MB),
    },
    network: {
      rxBytes: networkRx,
      txBytes: networkTx,
      rxMB: roundTo2(networkRx / BYTES_PER_MB),
      txMB: roundTo2(networkTx / BYTES_PER_MB),
    },
    disk: {
      readBytes: blockRead,
      writeBytes: blockWrite,
      readMB: roundTo2(blockRead / BYTES_PER_MB),
      writeMB: roundTo2(blockWrite / BYTES_PER_MB),
    },
    pids: raw.pids_stats?.current || 0,
  };
}

/**
 * Average disk throughput (read + write) between the newest earlier sample in
 * history and the given sample.
 *
 * @param {object[]} history - Recorded samples for one container.
 * @param {object} stats - The sample being evaluated (may itself be in history).
 * @returns {number|null} MB per second, or null when there is no earlier
 *   sample or the byte counters went backwards (e.g. they were reset).
 */
function computeDiskRateMBps(history, stats) {
  const currentMs = toTimeMs(stats);
  const currentBytes = stats.disk.readBytes + stats.disk.writeBytes;

  for (let i = history.length - 1; i >= 0; i -= 1) {
    const previous = history[i];
    if (previous === stats) continue;

    const elapsedSeconds = (currentMs - toTimeMs(previous)) / 1000;
    // Written as a negated comparison so NaN timestamps are skipped too.
    if (!(elapsedSeconds > 0)) continue;

    const previousBytes = (previous.disk?.readBytes ?? 0) + (previous.disk?.writeBytes ?? 0);
    const deltaBytes = currentBytes - previousBytes;
    if (!(deltaBytes >= 0)) return null;

    return deltaBytes / BYTES_PER_MB / elapsedSeconds;
  }

  return null;
}

/**
 * Summarize a non-empty numeric series in a single pass.
 * Avoids Math.max(...values), which passes every sample as a call argument
 * and can exceed the engine's argument limit on long histories.
 *
 * @param {number[]} values - Series in chronological order.
 * @returns {{current: number, avg: number, max: number, min: number}}
 */
function summarizeSeries(values) {
  let sum = 0;
  let max = -Infinity;
  let min = Infinity;
  for (const value of values) {
    sum += value;
    max = Math.max(max, value);
    min = Math.min(min, value);
  }
  return {
    current: values[values.length - 1],
    avg: sum / values.length,
    max,
    min,
  };
}

/**
 * Build the containerId -> { name, history } map from a plain object,
 * dropping entries that do not carry a history array (the rest of the class
 * dereferences entry.history unconditionally).
 *
 * @param {object} data - Parsed stats object keyed by container id.
 * @returns {Map<string, {name: string, history: object[]}>}
 */
function toStatsMap(data) {
  const result = new Map();
  for (const [containerId, entry] of Object.entries(data)) {
    if (entry && Array.isArray(entry.history)) {
      result.set(containerId, entry);
    } else {
      console.warn(`[ResourceMonitor] Skipping malformed stats entry for ${containerId}`);
    }
  }
  return result;
}

/**
 * Polls running Docker containers, keeps a per-container history of samples,
 * and emits 'alert' / 'auto-restart' events based on per-container thresholds.
 */
class ResourceMonitor extends EventEmitter {
  constructor() {
    super();
    this.monitoring = false;
    this.monitoringInterval = null;
    this.collecting = false; // true while a collectStats pass is in flight
    this.collectionsSinceSave = 0; // passes completed since the last saveStats
    this.stats = new Map(); // containerId -> { name, history: array of stats }
    this.alerts = new Map(); // containerId -> alert config
    this.lastAlerts = new Map(); // containerId -> last alert timestamp

    this.loadStats();
    this.loadAlertConfig();
  }

  /**
   * Start monitoring all containers
   */
  start() {
    if (this.monitoring) {
      console.log('[ResourceMonitor] Already monitoring');
      return;
    }

    console.log('[ResourceMonitor] Starting container monitoring');
    this.monitoring = true;
    this.monitoringInterval = setInterval(() => this.collectStats(), MONITORING_INTERVAL);

    // Initial collection (collectStats handles its own errors)
    this.collectStats();
  }

  /**
   * Stop monitoring and persist the collected stats
   */
  stop() {
    if (!this.monitoring) return;

    console.log('[ResourceMonitor] Stopping container monitoring');
    this.monitoring = false;

    if (this.monitoringInterval) {
      clearInterval(this.monitoringInterval);
      this.monitoringInterval = null;
    }

    this.saveStats();
  }

  /**
   * Collect stats from all running containers.
   * Records a sample and evaluates alerts per container, prunes expired
   * samples, and saves to disk every SAVE_EVERY_N_COLLECTIONS passes.
   * A failure for one container is logged and does not stop the others.
   *
   * @returns {Promise<void>}
   */
  async collectStats() {
    // Containers are awaited one after another, so a pass can outlast
    // MONITORING_INTERVAL; skip this tick instead of running two passes at once.
    if (this.collecting) return;
    this.collecting = true;

    try {
      const containers = await docker.listContainers({ all: false });

      for (const containerInfo of containers) {
        const containerName = containerInfo.Names?.[0] ?? containerInfo.Id;
        try {
          const container = docker.getContainer(containerInfo.Id);
          const stats = await this.getContainerStats(container);

          if (stats) {
            this.recordStats(containerInfo.Id, containerName, stats);
            this.checkAlerts(containerInfo.Id, containerName, stats);
          }
        } catch (error) {
          console.error(`[ResourceMonitor] Error collecting stats for ${containerName}:`, error.message);
        }
      }

      // Cleanup old stats
      this.cleanupOldStats();

      // Persist stats periodically (saveStats resets the counter on success)
      this.collectionsSinceSave += 1;
      if (this.collectionsSinceSave >= SAVE_EVERY_N_COLLECTIONS) {
        this.saveStats();
      }
    } catch (error) {
      console.error('[ResourceMonitor] Error collecting container stats:', error.message);
    } finally {
      this.collecting = false;
    }
  }

  /**
   * Get stats for a single container
   *
   * @param {object} container - Container handle exposing stats(options, callback).
   * @returns {Promise<object>} Resolves with the summarized sample; rejects
   *   with the callback's error or with any error raised while summarizing.
   */
  async getContainerStats(container) {
    return new Promise((resolve, reject) => {
      container.stats({ stream: false }, (err, rawStats) => {
        if (err) {
          reject(err);
          return;
        }

        // The callback runs outside the executor, so an exception thrown here
        // would not reject the promise on its own; convert it explicitly.
        try {
          resolve(summarizeStats(rawStats));
        } catch (error) {
          reject(error);
        }
      });
    });
  }

  /**
   * Record stats for a container
   *
   * @param {string} containerId
   * @param {string} containerName
   * @param {object} stats - Sample as produced by getContainerStats.
   */
  recordStats(containerId, containerName, stats) {
    if (!this.stats.has(containerId)) {
      this.stats.set(containerId, { name: containerName, history: [] });
    }

    const containerStats = this.stats.get(containerId);
    containerStats.name = containerName; // Update name in case it changed
    containerStats.history.push(stats);

    // Keep only recent stats (based on retention policy)
    const cutoffTime = Date.now() - STATS_RETENTION_HOURS * MS_PER_HOUR;
    containerStats.history = pruneHistory(containerStats.history, cutoffTime);
  }

  /**
   * Check if any alerts should be triggered.
   * Emits a single 'alert' event carrying every exceeded threshold and, when
   * the config asks for it, restarts the container. Nothing happens while the
   * container's cooldown period is still running.
   *
   * @param {string} containerId
   * @param {string} containerName
   * @param {object} stats - Sample as produced by getContainerStats.
   */
  checkAlerts(containerId, containerName, stats) {
    const alertConfig = this.alerts.get(containerId);
    if (!alertConfig || !alertConfig.enabled) return;

    const now = Date.now();
    const lastAlert = this.lastAlerts.get(containerId) || 0;
    const cooldown = (alertConfig.cooldownMinutes || 15) * 60 * 1000;

    // Don't spam alerts - respect cooldown period
    if (now - lastAlert < cooldown) return;

    const alerts = [];

    // Check CPU threshold
    if (alertConfig.cpuThreshold && stats.cpu.percent > alertConfig.cpuThreshold) {
      alerts.push({
        type: 'cpu',
        severity: 'warning',
        message: `CPU usage ${stats.cpu.percent.toFixed(1)}% exceeds threshold ${alertConfig.cpuThreshold}%`,
        value: stats.cpu.percent,
        threshold: alertConfig.cpuThreshold,
      });
    }

    // Check memory threshold
    if (alertConfig.memoryThreshold && stats.memory.percent > alertConfig.memoryThreshold) {
      alerts.push({
        type: 'memory',
        severity: 'warning',
        message: `Memory usage ${stats.memory.percent.toFixed(1)}% exceeds threshold ${alertConfig.memoryThreshold}%`,
        value: stats.memory.percent,
        threshold: alertConfig.memoryThreshold,
      });
    }

    // Check disk I/O threshold (MB/s). The sample holds cumulative byte
    // counters, so the rate is derived from the previous recorded sample;
    // without one the check is skipped for this pass.
    if (alertConfig.diskIOThreshold) {
      const history = this.stats.get(containerId)?.history ?? [];
      const diskIO = computeDiskRateMBps(history, stats);
      if (diskIO !== null && diskIO > alertConfig.diskIOThreshold) {
        alerts.push({
          type: 'disk',
          severity: 'warning',
          message: `Disk I/O ${diskIO.toFixed(1)} MB/s exceeds threshold ${alertConfig.diskIOThreshold} MB/s`,
          value: diskIO,
          threshold: alertConfig.diskIOThreshold,
        });
      }
    }

    if (alerts.length === 0) return;

    this.lastAlerts.set(containerId, now);

    this.emit('alert', {
      containerId,
      containerName,
      timestamp: new Date().toISOString(),
      alerts,
      stats,
      config: alertConfig,
    });

    // Auto-restart if configured (restartContainer logs its own failures)
    if (alertConfig.autoRestart) {
      this.restartContainer(containerId, containerName, alerts);
    }
  }

  /**
   * Restart a container due to resource alerts.
   * Emits 'auto-restart' on success; a failure is logged, not thrown.
   *
   * @param {string} containerId
   * @param {string} containerName
   * @param {object[]} alerts - Alerts that triggered the restart.
   * @returns {Promise<void>}
   */
  async restartContainer(containerId, containerName, alerts) {
    try {
      console.log(`[ResourceMonitor] Auto-restarting ${containerName} due to alerts:`,
        alerts.map((alert) => alert.type).join(', '));

      const container = docker.getContainer(containerId);
      await container.restart();

      this.emit('auto-restart', {
        containerId,
        containerName,
        timestamp: new Date().toISOString(),
        reason: alerts,
      });
    } catch (error) {
      console.error(`[ResourceMonitor] Failed to restart ${containerName}:`, error.message);
    }
  }

  /**
   * Get current stats for a container
   *
   * @param {string} containerId
   * @returns {object|null} The most recently recorded sample, or null.
   */
  getCurrentStats(containerId) {
    const containerStats = this.stats.get(containerId);
    if (!containerStats || containerStats.history.length === 0) {
      return null;
    }
    return containerStats.history[containerStats.history.length - 1];
  }

  /**
   * Get historical stats for a container
   *
   * @param {string} containerId
   * @param {number} [hours=24] - Size of the look-back window.
   * @returns {object[]} Samples newer than the window start (possibly empty).
   */
  getHistoricalStats(containerId, hours = 24) {
    const containerStats = this.stats.get(containerId);
    if (!containerStats) return [];

    const cutoffTime = Date.now() - hours * MS_PER_HOUR;
    return pruneHistory(containerStats.history, cutoffTime);
  }

  /**
   * Get aggregated stats for a container
   *
   * @param {string} containerId
   * @param {number} [hours=24] - Size of the look-back window.
   * @returns {object|null} current/avg/max/min for CPU and memory percent,
   *   plus dataPoints and timeRange; null when the window holds no samples.
   */
  getAggregatedStats(containerId, hours = 24) {
    const history = this.getHistoricalStats(containerId, hours);
    if (history.length === 0) return null;

    return {
      cpu: summarizeSeries(history.map((sample) => sample.cpu.percent)),
      memory: summarizeSeries(history.map((sample) => sample.memory.percent)),
      dataPoints: history.length,
      timeRange: hours,
    };
  }

  /**
   * Get stats for all containers
   *
   * @returns {object} containerId -> { name, current, aggregated, alertConfig }.
   */
  getAllStats() {
    const result = {};

    for (const [containerId, data] of this.stats.entries()) {
      result[containerId] = {
        name: data.name,
        current: this.getCurrentStats(containerId),
        aggregated: this.getAggregatedStats(containerId, 24),
        alertConfig: this.alerts.get(containerId),
      };
    }

    return result;
  }

  /**
   * Configure alerts for a container and persist the configuration.
   * Falsy thresholds (including 0) are stored as null, i.e. that check is off.
   *
   * @param {string} containerId
   * @param {object} [config] - Partial alert settings; omitted fields get defaults.
   */
  setAlertConfig(containerId, config = {}) {
    this.alerts.set(containerId, {
      enabled: config.enabled !== false,
      cpuThreshold: config.cpuThreshold || null,
      memoryThreshold: config.memoryThreshold || null,
      diskIOThreshold: config.diskIOThreshold || null,
      cooldownMinutes: config.cooldownMinutes || 15,
      autoRestart: config.autoRestart || false,
      notificationChannels: config.notificationChannels || [],
    });

    this.saveAlertConfig();
  }

  /**
   * Get alert configuration for a container
   *
   * @param {string} containerId
   * @returns {object|null}
   */
  getAlertConfig(containerId) {
    return this.alerts.get(containerId) || null;
  }

  /**
   * Remove alert configuration (and its cooldown state), then persist
   *
   * @param {string} containerId
   */
  removeAlertConfig(containerId) {
    this.alerts.delete(containerId);
    this.lastAlerts.delete(containerId);
    this.saveAlertConfig();
  }

  /**
   * Cleanup old stats beyond retention period
   */
  cleanupOldStats() {
    const cutoffTime = Date.now() - STATS_RETENTION_HOURS * MS_PER_HOUR;

    for (const [containerId, data] of this.stats.entries()) {
      data.history = pruneHistory(data.history, cutoffTime);

      // Remove container stats if no recent data
      if (data.history.length === 0) {
        this.stats.delete(containerId);
      }
    }
  }

  /**
   * Load stats from disk. A missing file is not an error; read or parse
   * failures are logged and leave the current stats untouched.
   */
  loadStats() {
    try {
      if (fs.existsSync(STATS_FILE)) {
        const data = JSON.parse(fs.readFileSync(STATS_FILE, 'utf8'));
        this.stats = toStatsMap(data);
        console.log(`[ResourceMonitor] Loaded stats for ${this.stats.size} containers`);
      }
    } catch (error) {
      console.error('[ResourceMonitor] Error loading stats:', error.message);
    }
  }

  /**
   * Save stats to disk. Failures are logged, not thrown.
   */
  saveStats() {
    try {
      const data = Object.fromEntries(this.stats);
      fs.writeFileSync(STATS_FILE, JSON.stringify(data, null, 2));
      this.collectionsSinceSave = 0;
    } catch (error) {
      console.error('[ResourceMonitor] Error saving stats:', error.message);
    }
  }

  /**
   * Load alert configuration from disk. A missing file is not an error; read
   * or parse failures are logged and leave the current configuration untouched.
   */
  loadAlertConfig() {
    try {
      if (fs.existsSync(ALERT_CONFIG_FILE)) {
        const data = JSON.parse(fs.readFileSync(ALERT_CONFIG_FILE, 'utf8'));
        this.alerts = new Map(Object.entries(data));
        console.log(`[ResourceMonitor] Loaded alert config for ${this.alerts.size} containers`);
      }
    } catch (error) {
      console.error('[ResourceMonitor] Error loading alert config:', error.message);
    }
  }

  /**
   * Save alert configuration to disk. Failures are logged, not thrown.
   */
  saveAlertConfig() {
    try {
      const data = Object.fromEntries(this.alerts);
      fs.writeFileSync(ALERT_CONFIG_FILE, JSON.stringify(data, null, 2));
    } catch (error) {
      console.error('[ResourceMonitor] Error saving alert config:', error.message);
    }
  }

  /**
   * Export stats for backup
   *
   * @returns {{stats: object, alerts: object, exportedAt: string}}
   */
  exportStats() {
    return {
      stats: Object.fromEntries(this.stats),
      alerts: Object.fromEntries(this.alerts),
      exportedAt: new Date().toISOString(),
    };
  }

  /**
   * Import stats from backup and persist both stats and alert configuration.
   * Stats entries without a history array are skipped.
   *
   * @param {{stats?: object, alerts?: object}} data - Object shaped like the
   *   result of exportStats.
   * @throws {TypeError} When data is not an object.
   */
  importStats(data) {
    if (!data || typeof data !== 'object') {
      throw new TypeError('importStats expects an object with optional stats and alerts');
    }

    if (data.stats) {
      this.stats = toStatsMap(data.stats);
    }
    if (data.alerts) {
      this.alerts = new Map(Object.entries(data.alerts));
    }
    this.saveStats();
    this.saveAlertConfig();
  }
}

// Export singleton instance
module.exports = new ResourceMonitor();