This commit is contained in:
130
internal/alerts/alerts.go
Normal file
130
internal/alerts/alerts.go
Normal file
@@ -0,0 +1,130 @@
|
||||
package alerts
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"heartbeat/internal/config"
|
||||
"heartbeat/internal/metrics"
|
||||
)
|
||||
|
||||
// Severity classifies the health of a monitored metric or site.
type Severity string

// Severity levels, ordered from least to most urgent.
const (
	// SeverityHealthy marks a value that has not crossed any enabled threshold.
	SeverityHealthy Severity = "healthy"
	// SeverityWarning marks a value that crossed its warning threshold.
	SeverityWarning Severity = "warning"
	// SeverityCritical marks a value that crossed its critical threshold, or a
	// site whose check failed.
	SeverityCritical Severity = "critical"
)
|
||||
|
||||
type Event struct {
|
||||
Key string
|
||||
Severity Severity
|
||||
Title string
|
||||
Body string
|
||||
}
|
||||
|
||||
type Evaluator struct {
|
||||
states map[string]Severity
|
||||
}
|
||||
|
||||
func NewEvaluator() *Evaluator {
|
||||
return &Evaluator{states: make(map[string]Severity)}
|
||||
}
|
||||
|
||||
func (e *Evaluator) Evaluate(cfg config.Config, sample metrics.Sample) []Event {
|
||||
events := []Event{}
|
||||
events = append(events, e.metricEvent("disk_used", compareHigh(sample.RootUsedPercent, cfg.Thresholds.DiskUsedPercentWarning, cfg.Thresholds.DiskUsedPercentCritical), fmt.Sprintf("Root disk usage %.1f%%", sample.RootUsedPercent), fmt.Sprintf("Root partition is %.1f%% used with %.1f GB free.", sample.RootUsedPercent, sample.RootFreeGB))...)
|
||||
events = append(events, e.metricEvent("free_gb", compareLow(sample.RootFreeGB, cfg.Thresholds.FreeGBWarning, cfg.Thresholds.FreeGBCritical), fmt.Sprintf("Low disk space %.1f GB", sample.RootFreeGB), fmt.Sprintf("Root partition free space is %.1f GB.", sample.RootFreeGB))...)
|
||||
events = append(events, e.metricEvent("inode_used", compareHigh(sample.InodeUsedPercent, cfg.Thresholds.InodeUsedPercentWarning, cfg.Thresholds.InodeUsedPercentCritical), fmt.Sprintf("Inode usage %.1f%%", sample.InodeUsedPercent), fmt.Sprintf("Root partition inode usage is %.1f%%.", sample.InodeUsedPercent))...)
|
||||
events = append(events, e.metricEvent("memory_used", compareHigh(sample.MemoryUsedPercent, cfg.Thresholds.MemoryUsedPercentWarning, cfg.Thresholds.MemoryUsedPercentCritical), fmt.Sprintf("Memory usage %.1f%%", sample.MemoryUsedPercent), fmt.Sprintf("Memory usage is %.1f%%.", sample.MemoryUsedPercent))...)
|
||||
events = append(events, e.metricEvent("swap_used", compareHigh(sample.SwapUsedPercent, cfg.Thresholds.SwapUsedPercentWarning, cfg.Thresholds.SwapUsedPercentCritical), fmt.Sprintf("Swap usage %.1f%%", sample.SwapUsedPercent), fmt.Sprintf("Swap usage is %.1f%%.", sample.SwapUsedPercent))...)
|
||||
events = append(events, e.metricEvent("cpu_avg_15m", compareHigh(sample.CPUAvg15mPercent, cfg.Thresholds.CPUAvg15mWarning, cfg.Thresholds.CPUAvg15mCritical), fmt.Sprintf("CPU 15m average %.1f%%", sample.CPUAvg15mPercent), fmt.Sprintf("CPU rolling 15-minute average is %.1f%%.", sample.CPUAvg15mPercent))...)
|
||||
events = append(events, e.metricEvent("cpu_avg_12h", compareHigh(sample.CPUAvg12hPercent, cfg.Thresholds.CPUAvg12hWarning, cfg.Thresholds.CPUAvg12hCritical), fmt.Sprintf("CPU 12h average %.1f%%", sample.CPUAvg12hPercent), fmt.Sprintf("CPU rolling 12-hour average is %.1f%%.", sample.CPUAvg12hPercent))...)
|
||||
events = append(events, e.metricEvent("load_per_core", compareHigh(sample.LoadPerCore, cfg.Thresholds.LoadPerCoreWarning, cfg.Thresholds.LoadPerCoreCritical), fmt.Sprintf("Load per core %.2f", sample.LoadPerCore), fmt.Sprintf("15-minute load per core is %.2f.", sample.LoadPerCore))...)
|
||||
events = append(events, e.metricEvent("process_count", compareHigh(float64(sample.ProcessCount), float64(cfg.Thresholds.ProcessCountWarning), float64(cfg.Thresholds.ProcessCountCritical)), fmt.Sprintf("Process count %d", sample.ProcessCount), fmt.Sprintf("Process count is %d.", sample.ProcessCount))...)
|
||||
for _, site := range sample.Sites {
|
||||
severity := SeverityHealthy
|
||||
if !site.Healthy {
|
||||
severity = SeverityCritical
|
||||
}
|
||||
title := fmt.Sprintf("Site %s reachable", site.Name)
|
||||
body := fmt.Sprintf("%s responded with %d in %s.", site.URL, site.StatusCode, site.Latency.Round(time.Millisecond))
|
||||
if !site.Healthy {
|
||||
title = fmt.Sprintf("Site %s failed", site.Name)
|
||||
body = fmt.Sprintf("%s check failed: %s.", site.URL, site.ErrorMessage)
|
||||
}
|
||||
events = append(events, e.metricEvent("site:"+site.Name, severity, title, body)...)
|
||||
}
|
||||
sort.Slice(events, func(i, j int) bool { return events[i].Key < events[j].Key })
|
||||
return events
|
||||
}
|
||||
|
||||
func (e *Evaluator) metricEvent(key string, severity Severity, title string, body string) []Event {
|
||||
previous := e.states[key]
|
||||
if previous == "" {
|
||||
e.states[key] = severity
|
||||
if severity == SeverityHealthy {
|
||||
return nil
|
||||
}
|
||||
return []Event{{Key: key, Severity: severity, Title: title, Body: body}}
|
||||
}
|
||||
if previous == severity {
|
||||
return nil
|
||||
}
|
||||
e.states[key] = severity
|
||||
if severity == SeverityHealthy {
|
||||
return []Event{{Key: key, Severity: severity, Title: recoveryTitle(title), Body: body}}
|
||||
}
|
||||
return []Event{{Key: key, Severity: severity, Title: title, Body: body}}
|
||||
}
|
||||
|
||||
// recoveryTitle returns title with the "Recovered: " prefix used on events
// that report a return to healthy.
func recoveryTitle(title string) string {
	return "Recovered: " + title
}
|
||||
|
||||
func compareHigh(value float64, warning float64, critical float64) Severity {
|
||||
switch {
|
||||
case critical > 0 && value >= critical:
|
||||
return SeverityCritical
|
||||
case warning > 0 && value >= warning:
|
||||
return SeverityWarning
|
||||
default:
|
||||
return SeverityHealthy
|
||||
}
|
||||
}
|
||||
|
||||
func compareLow(value float64, warning float64, critical float64) Severity {
|
||||
switch {
|
||||
case critical > 0 && value <= critical:
|
||||
return SeverityCritical
|
||||
case warning > 0 && value <= warning:
|
||||
return SeverityWarning
|
||||
default:
|
||||
return SeverityHealthy
|
||||
}
|
||||
}
|
||||
|
||||
func FormatSummary(sample metrics.Sample) string {
|
||||
lines := []string{
|
||||
fmt.Sprintf("CPU now %.1f%% | 15m %.1f%% | 12h %.1f%%", sample.CPUCurrentPercent, sample.CPUAvg15mPercent, sample.CPUAvg12hPercent),
|
||||
fmt.Sprintf("Memory %.1f%% | Swap %.1f%%", sample.MemoryUsedPercent, sample.SwapUsedPercent),
|
||||
fmt.Sprintf("Disk / %.1f%% used | %.1f GB free | Inodes %.1f%%", sample.RootUsedPercent, sample.RootFreeGB, sample.InodeUsedPercent),
|
||||
fmt.Sprintf("Load %.2f / %.2f / %.2f | Per core %.2f", sample.Load1, sample.Load5, sample.Load15, sample.LoadPerCore),
|
||||
fmt.Sprintf("Network RX %.1f KB/s | TX %.1f KB/s", sample.RXBytesPerSecond/1024, sample.TXBytesPerSecond/1024),
|
||||
fmt.Sprintf("Processes %d | Hosted sites %d", sample.ProcessCount, sample.HostedServiceCount),
|
||||
}
|
||||
siteStates := make([]string, 0, len(sample.Sites))
|
||||
for _, site := range sample.Sites {
|
||||
state := "up"
|
||||
if !site.Healthy {
|
||||
state = "down"
|
||||
}
|
||||
siteStates = append(siteStates, fmt.Sprintf("%s=%s", site.Name, state))
|
||||
}
|
||||
if len(siteStates) > 0 {
|
||||
lines = append(lines, "Sites "+strings.Join(siteStates, ", "))
|
||||
}
|
||||
return strings.Join(lines, "\n")
|
||||
}
|
||||
Reference in New Issue
Block a user