init
Some checks failed
Build heartbeat / build (push) Failing after 1m18s

This commit is contained in:
2026-05-03 21:09:59 +02:00
commit 93ae9b66b3
12 changed files with 987 additions and 0 deletions

130
internal/alerts/alerts.go Normal file
View File

@@ -0,0 +1,130 @@
package alerts
import (
"fmt"
"sort"
"strings"
"time"
"heartbeat/internal/config"
"heartbeat/internal/metrics"
)
// Severity classifies the state of a monitored metric or site check.
// The zero value (the empty string) is not one of the named levels.
type Severity string

// Severity levels, from least to most urgent.
const (
	// SeverityHealthy means no threshold has been reached.
	SeverityHealthy = Severity("healthy")
	// SeverityWarning means the warning threshold has been reached.
	SeverityWarning = Severity("warning")
	// SeverityCritical means the critical threshold has been reached or a
	// check has failed.
	SeverityCritical = Severity("critical")
)
// Event is a notification describing the severity now associated with a
// tracked key.
type Event struct {
	// Key identifies what the event is about, e.g. "disk_used" or
	// "site:<name>".
	Key string
	// Severity is the level the key has just moved to.
	Severity Severity
	// Title is a short headline and Body the longer human-readable detail.
	Title, Body string
}
// Evaluator turns successive samples into events by remembering the last
// severity recorded for every key it has seen.
type Evaluator struct {
	// states maps an event key to the severity stored for it most recently;
	// a key that is absent has not been observed yet.
	states map[string]Severity
}
// NewEvaluator returns an Evaluator with an empty state table, i.e. one that
// has not observed any key yet.
func NewEvaluator() *Evaluator {
	evaluator := new(Evaluator)
	evaluator.states = map[string]Severity{}
	return evaluator
}
// Evaluate grades every metric in sample against the thresholds in cfg, grades
// every site check as healthy or critical, and passes each result through
// metricEvent so that only severity changes (or keys that are unhealthy the
// first time they are seen) produce an Event.
//
// The returned slice is sorted by Key and is never nil.
func (e *Evaluator) Evaluate(cfg config.Config, sample metrics.Sample) []Event {
	// reading is a single graded observation waiting to be compared with the
	// previously stored state.
	type reading struct {
		key      string
		severity Severity
		title    string
		body     string
	}

	limits := cfg.Thresholds
	readings := []reading{
		{
			key:      "disk_used",
			severity: compareHigh(sample.RootUsedPercent, limits.DiskUsedPercentWarning, limits.DiskUsedPercentCritical),
			title:    fmt.Sprintf("Root disk usage %.1f%%", sample.RootUsedPercent),
			body:     fmt.Sprintf("Root partition is %.1f%% used with %.1f GB free.", sample.RootUsedPercent, sample.RootFreeGB),
		},
		{
			// Free space is the only metric where a lower value is worse.
			key:      "free_gb",
			severity: compareLow(sample.RootFreeGB, limits.FreeGBWarning, limits.FreeGBCritical),
			title:    fmt.Sprintf("Low disk space %.1f GB", sample.RootFreeGB),
			body:     fmt.Sprintf("Root partition free space is %.1f GB.", sample.RootFreeGB),
		},
		{
			key:      "inode_used",
			severity: compareHigh(sample.InodeUsedPercent, limits.InodeUsedPercentWarning, limits.InodeUsedPercentCritical),
			title:    fmt.Sprintf("Inode usage %.1f%%", sample.InodeUsedPercent),
			body:     fmt.Sprintf("Root partition inode usage is %.1f%%.", sample.InodeUsedPercent),
		},
		{
			key:      "memory_used",
			severity: compareHigh(sample.MemoryUsedPercent, limits.MemoryUsedPercentWarning, limits.MemoryUsedPercentCritical),
			title:    fmt.Sprintf("Memory usage %.1f%%", sample.MemoryUsedPercent),
			body:     fmt.Sprintf("Memory usage is %.1f%%.", sample.MemoryUsedPercent),
		},
		{
			key:      "swap_used",
			severity: compareHigh(sample.SwapUsedPercent, limits.SwapUsedPercentWarning, limits.SwapUsedPercentCritical),
			title:    fmt.Sprintf("Swap usage %.1f%%", sample.SwapUsedPercent),
			body:     fmt.Sprintf("Swap usage is %.1f%%.", sample.SwapUsedPercent),
		},
		{
			key:      "cpu_avg_15m",
			severity: compareHigh(sample.CPUAvg15mPercent, limits.CPUAvg15mWarning, limits.CPUAvg15mCritical),
			title:    fmt.Sprintf("CPU 15m average %.1f%%", sample.CPUAvg15mPercent),
			body:     fmt.Sprintf("CPU rolling 15-minute average is %.1f%%.", sample.CPUAvg15mPercent),
		},
		{
			key:      "cpu_avg_12h",
			severity: compareHigh(sample.CPUAvg12hPercent, limits.CPUAvg12hWarning, limits.CPUAvg12hCritical),
			title:    fmt.Sprintf("CPU 12h average %.1f%%", sample.CPUAvg12hPercent),
			body:     fmt.Sprintf("CPU rolling 12-hour average is %.1f%%.", sample.CPUAvg12hPercent),
		},
		{
			key:      "load_per_core",
			severity: compareHigh(sample.LoadPerCore, limits.LoadPerCoreWarning, limits.LoadPerCoreCritical),
			title:    fmt.Sprintf("Load per core %.2f", sample.LoadPerCore),
			body:     fmt.Sprintf("15-minute load per core is %.2f.", sample.LoadPerCore),
		},
		{
			key:      "process_count",
			severity: compareHigh(float64(sample.ProcessCount), float64(limits.ProcessCountWarning), float64(limits.ProcessCountCritical)),
			title:    fmt.Sprintf("Process count %d", sample.ProcessCount),
			body:     fmt.Sprintf("Process count is %d.", sample.ProcessCount),
		},
	}

	// Site checks have no warning level: a site is either healthy or critical.
	for _, site := range sample.Sites {
		r := reading{key: "site:" + site.Name}
		if site.Healthy {
			r.severity = SeverityHealthy
			r.title = fmt.Sprintf("Site %s reachable", site.Name)
			r.body = fmt.Sprintf("%s responded with %d in %s.", site.URL, site.StatusCode, site.Latency.Round(time.Millisecond))
		} else {
			r.severity = SeverityCritical
			r.title = fmt.Sprintf("Site %s failed", site.Name)
			r.body = fmt.Sprintf("%s check failed: %s.", site.URL, site.ErrorMessage)
		}
		readings = append(readings, r)
	}

	// Feed the readings through the state tracker in a fixed order; it decides
	// which of them are worth reporting.
	events := make([]Event, 0)
	for _, r := range readings {
		events = append(events, e.metricEvent(r.key, r.severity, r.title, r.body)...)
	}

	byKey := func(a, b int) bool { return events[a].Key < events[b].Key }
	sort.Slice(events, byKey)
	return events
}
// metricEvent stores severity as the current state of key and returns the
// event, if any, that the transition warrants:
//
//   - first observation of key: an event only when severity is not healthy;
//   - severity unchanged since the last observation: no event;
//   - change to healthy: an event whose title is passed through recoveryTitle;
//   - any other change: an event carrying title and body as given.
//
// The result holds at most one Event and is nil when nothing should be
// reported.
func (e *Evaluator) metricEvent(key string, severity Severity, title string, body string) []Event {
	// Assigning into a nil map panics, so create the table on demand. This
	// keeps an Evaluator that was not built with NewEvaluator (for example a
	// zero-value struct) usable.
	if e.states == nil {
		e.states = make(map[string]Severity)
	}

	previous := e.states[key]
	// An empty stored severity means the key has not been recorded yet.
	firstSeen := previous == ""
	if !firstSeen && previous == severity {
		return nil
	}
	e.states[key] = severity

	if severity == SeverityHealthy {
		if firstSeen {
			// Starting out healthy is not news.
			return nil
		}
		title = recoveryTitle(title)
	}
	return []Event{{Key: key, Severity: severity, Title: title, Body: body}}
}
// recoveryTitle marks title as describing a recovery by prepending
// "Recovered: " to it.
func recoveryTitle(title string) string {
	const prefix = "Recovered: "
	return prefix + title
}
// compareHigh grades a metric for which larger values are worse. It returns
// SeverityCritical when value is at or above critical, SeverityWarning when it
// is at or above warning, and SeverityHealthy otherwise. A threshold that is
// zero or negative is ignored.
func compareHigh(value, warning, critical float64) Severity {
	// Critical is tested first so it wins when both thresholds are reached.
	if critical > 0 && value >= critical {
		return SeverityCritical
	}
	if warning > 0 && value >= warning {
		return SeverityWarning
	}
	return SeverityHealthy
}
// compareLow grades a metric for which smaller values are worse. It returns
// SeverityCritical when value is at or below critical, SeverityWarning when it
// is at or below warning, and SeverityHealthy otherwise. A threshold that is
// zero or negative is ignored.
func compareLow(value, warning, critical float64) Severity {
	// Critical is tested first so it wins when both thresholds are reached.
	if critical > 0 && value <= critical {
		return SeverityCritical
	}
	if warning > 0 && value <= warning {
		return SeverityWarning
	}
	return SeverityHealthy
}
// FormatSummary renders sample as newline-separated, human-readable text: one
// line each for CPU, memory and swap, root disk, load, network throughput and
// process/hosted-site counts, followed by a "Sites" line listing every site as
// name=up or name=down when the sample contains site checks.
func FormatSummary(sample metrics.Sample) string {
	// Six fixed lines plus the optional site line.
	summary := make([]string, 0, 7)
	summary = append(summary,
		fmt.Sprintf("CPU now %.1f%% | 15m %.1f%% | 12h %.1f%%", sample.CPUCurrentPercent, sample.CPUAvg15mPercent, sample.CPUAvg12hPercent),
		fmt.Sprintf("Memory %.1f%% | Swap %.1f%%", sample.MemoryUsedPercent, sample.SwapUsedPercent),
		fmt.Sprintf("Disk / %.1f%% used | %.1f GB free | Inodes %.1f%%", sample.RootUsedPercent, sample.RootFreeGB, sample.InodeUsedPercent),
		fmt.Sprintf("Load %.2f / %.2f / %.2f | Per core %.2f", sample.Load1, sample.Load5, sample.Load15, sample.LoadPerCore),
		// Byte rates are divided by 1024 to match the KB/s label.
		fmt.Sprintf("Network RX %.1f KB/s | TX %.1f KB/s", sample.RXBytesPerSecond/1024, sample.TXBytesPerSecond/1024),
		fmt.Sprintf("Processes %d | Hosted sites %d", sample.ProcessCount, sample.HostedServiceCount),
	)

	// The site line is left out entirely when there are no site checks.
	if len(sample.Sites) > 0 {
		states := make([]string, len(sample.Sites))
		for i, site := range sample.Sites {
			label := "down"
			if site.Healthy {
				label = "up"
			}
			states[i] = fmt.Sprintf("%s=%s", site.Name, label)
		}
		summary = append(summary, "Sites "+strings.Join(states, ", "))
	}

	return strings.Join(summary, "\n")
}