// Package alerts compares metric samples against configured thresholds,
// reports severity changes as events, and renders a plain-text summary of a
// sample.
package alerts

import (
	"fmt"
	"sort"
	"strings"
	"time"

	"heartbeat/internal/config"
	"heartbeat/internal/metrics"
)

// Severity is the health level assigned to a single check.
type Severity string

// Severity levels a check can be in.
const (
	SeverityHealthy  Severity = "healthy"
	SeverityWarning  Severity = "warning"
	SeverityCritical Severity = "critical"
)

// Event describes a change in severity for one check.
type Event struct {
	// Key identifies the check, e.g. "disk_used" or "site:<name>".
	Key string
	// Severity is the level the check has just moved to.
	Severity Severity
	// Title is a short headline; recoveries are prefixed with "Recovered: ".
	Title string
	// Body is a one-sentence description of the current reading.
	Body string
}

// Evaluator remembers the last severity recorded for every check so that
// Evaluate reports only changes. The zero value is ready to use. Evaluator
// performs no locking of its own.
type Evaluator struct {
	states map[string]Severity
}

// NewEvaluator returns an Evaluator with no recorded state.
func NewEvaluator() *Evaluator {
	return &Evaluator{states: make(map[string]Severity)}
}

// Evaluate classifies every metric in sample against the thresholds in cfg,
// classifies every site in sample.Sites as healthy or critical, and returns
// one Event for each check whose severity changed since the previous call.
//
// A check that is healthy the first time it is seen produces no event. The
// returned slice is never nil and is sorted by Event.Key.
func (e *Evaluator) Evaluate(cfg config.Config, sample metrics.Sample) []Event {
	th := cfg.Thresholds

	// check is one fully-rendered metric reading awaiting state comparison.
	type check struct {
		key      string
		severity Severity
		title    string
		body     string
	}

	checks := []check{
		{
			key:      "disk_used",
			severity: compareHigh(sample.RootUsedPercent, th.DiskUsedPercentWarning, th.DiskUsedPercentCritical),
			title:    fmt.Sprintf("Root disk usage %.1f%%", sample.RootUsedPercent),
			body:     fmt.Sprintf("Root partition is %.1f%% used with %.1f GB free.", sample.RootUsedPercent, sample.RootFreeGB),
		},
		{
			// Free space is the only metric where lower is worse.
			key:      "free_gb",
			severity: compareLow(sample.RootFreeGB, th.FreeGBWarning, th.FreeGBCritical),
			title:    fmt.Sprintf("Low disk space %.1f GB", sample.RootFreeGB),
			body:     fmt.Sprintf("Root partition free space is %.1f GB.", sample.RootFreeGB),
		},
		{
			key:      "inode_used",
			severity: compareHigh(sample.InodeUsedPercent, th.InodeUsedPercentWarning, th.InodeUsedPercentCritical),
			title:    fmt.Sprintf("Inode usage %.1f%%", sample.InodeUsedPercent),
			body:     fmt.Sprintf("Root partition inode usage is %.1f%%.", sample.InodeUsedPercent),
		},
		{
			key:      "memory_used",
			severity: compareHigh(sample.MemoryUsedPercent, th.MemoryUsedPercentWarning, th.MemoryUsedPercentCritical),
			title:    fmt.Sprintf("Memory usage %.1f%%", sample.MemoryUsedPercent),
			body:     fmt.Sprintf("Memory usage is %.1f%%.", sample.MemoryUsedPercent),
		},
		{
			key:      "swap_used",
			severity: compareHigh(sample.SwapUsedPercent, th.SwapUsedPercentWarning, th.SwapUsedPercentCritical),
			title:    fmt.Sprintf("Swap usage %.1f%%", sample.SwapUsedPercent),
			body:     fmt.Sprintf("Swap usage is %.1f%%.", sample.SwapUsedPercent),
		},
		{
			key:      "cpu_avg_15m",
			severity: compareHigh(sample.CPUAvg15mPercent, th.CPUAvg15mWarning, th.CPUAvg15mCritical),
			title:    fmt.Sprintf("CPU 15m average %.1f%%", sample.CPUAvg15mPercent),
			body:     fmt.Sprintf("CPU rolling 15-minute average is %.1f%%.", sample.CPUAvg15mPercent),
		},
		{
			key:      "cpu_avg_12h",
			severity: compareHigh(sample.CPUAvg12hPercent, th.CPUAvg12hWarning, th.CPUAvg12hCritical),
			title:    fmt.Sprintf("CPU 12h average %.1f%%", sample.CPUAvg12hPercent),
			body:     fmt.Sprintf("CPU rolling 12-hour average is %.1f%%.", sample.CPUAvg12hPercent),
		},
		{
			key:      "load_per_core",
			severity: compareHigh(sample.LoadPerCore, th.LoadPerCoreWarning, th.LoadPerCoreCritical),
			title:    fmt.Sprintf("Load per core %.2f", sample.LoadPerCore),
			body:     fmt.Sprintf("15-minute load per core is %.2f.", sample.LoadPerCore),
		},
		{
			key:      "process_count",
			severity: compareHigh(float64(sample.ProcessCount), float64(th.ProcessCountWarning), float64(th.ProcessCountCritical)),
			title:    fmt.Sprintf("Process count %d", sample.ProcessCount),
			body:     fmt.Sprintf("Process count is %d.", sample.ProcessCount),
		},
	}

	// Always return a non-nil slice, even when nothing changed.
	events := make([]Event, 0, len(checks)+len(sample.Sites))
	for _, c := range checks {
		events = append(events, e.metricEvent(c.key, c.severity, c.title, c.body)...)
	}

	// Sites have no warning level: they are either healthy or critical.
	for _, site := range sample.Sites {
		severity := SeverityHealthy
		title := fmt.Sprintf("Site %s reachable", site.Name)
		body := fmt.Sprintf("%s responded with %d in %s.", site.URL, site.StatusCode, site.Latency.Round(time.Millisecond))
		if !site.Healthy {
			severity = SeverityCritical
			title = fmt.Sprintf("Site %s failed", site.Name)
			body = fmt.Sprintf("%s check failed: %s.", site.URL, site.ErrorMessage)
		}
		events = append(events, e.metricEvent("site:"+site.Name, severity, title, body)...)
	}

	sort.Slice(events, func(i, j int) bool { return events[i].Key < events[j].Key })
	return events
}

// metricEvent records severity as the current state of key and returns the
// event to emit for that transition, or nil when nothing should be reported.
//
// Nothing is reported when the severity is unchanged, or when a check is
// healthy on its first observation. A transition back to healthy is reported
// with a "Recovered: " title.
func (e *Evaluator) metricEvent(key string, severity Severity, title string, body string) []Event {
	// Allocate lazily so a zero-value Evaluator does not panic on the
	// assignment below (writing to a nil map panics).
	if e.states == nil {
		e.states = make(map[string]Severity)
	}

	previous := e.states[key]
	if previous != "" && previous == severity {
		return nil
	}
	e.states[key] = severity

	if severity == SeverityHealthy {
		if previous == "" {
			// First observation and already healthy: stay silent.
			return nil
		}
		return []Event{{Key: key, Severity: severity, Title: recoveryTitle(title), Body: body}}
	}
	return []Event{{Key: key, Severity: severity, Title: title, Body: body}}
}

// recoveryTitle prefixes title with "Recovered: ".
func recoveryTitle(title string) string {
	return "Recovered: " + title
}

// compareHigh classifies value for a metric where larger is worse: critical
// when value >= critical, warning when value >= warning, healthy otherwise.
// A threshold that is zero or negative disables that level.
func compareHigh(value float64, warning float64, critical float64) Severity {
	switch {
	case critical > 0 && value >= critical:
		return SeverityCritical
	case warning > 0 && value >= warning:
		return SeverityWarning
	default:
		return SeverityHealthy
	}
}

// compareLow classifies value for a metric where smaller is worse: critical
// when value <= critical, warning when value <= warning, healthy otherwise.
// A threshold that is zero or negative disables that level.
func compareLow(value float64, warning float64, critical float64) Severity {
	switch {
	case critical > 0 && value <= critical:
		return SeverityCritical
	case warning > 0 && value <= warning:
		return SeverityWarning
	default:
		return SeverityHealthy
	}
}

// FormatSummary renders sample as newline-separated text lines covering CPU,
// memory, disk, load, network and process figures. Network rates are divided
// by 1024 and labelled KB/s. When sample.Sites is non-empty a final
// "Sites name=up|down, ..." line is appended.
func FormatSummary(sample metrics.Sample) string {
	lines := []string{
		fmt.Sprintf("CPU now %.1f%% | 15m %.1f%% | 12h %.1f%%", sample.CPUCurrentPercent, sample.CPUAvg15mPercent, sample.CPUAvg12hPercent),
		fmt.Sprintf("Memory %.1f%% | Swap %.1f%%", sample.MemoryUsedPercent, sample.SwapUsedPercent),
		fmt.Sprintf("Disk / %.1f%% used | %.1f GB free | Inodes %.1f%%", sample.RootUsedPercent, sample.RootFreeGB, sample.InodeUsedPercent),
		fmt.Sprintf("Load %.2f / %.2f / %.2f | Per core %.2f", sample.Load1, sample.Load5, sample.Load15, sample.LoadPerCore),
		fmt.Sprintf("Network RX %.1f KB/s | TX %.1f KB/s", sample.RXBytesPerSecond/1024, sample.TXBytesPerSecond/1024),
		fmt.Sprintf("Processes %d | Hosted sites %d", sample.ProcessCount, sample.HostedServiceCount),
	}

	siteStates := make([]string, 0, len(sample.Sites))
	for _, site := range sample.Sites {
		state := "up"
		if !site.Healthy {
			state = "down"
		}
		siteStates = append(siteStates, fmt.Sprintf("%s=%s", site.Name, state))
	}
	if len(siteStates) > 0 {
		lines = append(lines, "Sites "+strings.Join(siteStates, ", "))
	}

	return strings.Join(lines, "\n")
}