131 lines
6.1 KiB
Go
131 lines
6.1 KiB
Go
package alerts
|
|
|
|
import (
|
|
"fmt"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
"heartbeat/internal/config"
|
|
"heartbeat/internal/metrics"
|
|
)
|
|
|
|
// Severity is the alert level attached to a tracked metric or site check.
type Severity string

const (
	// SeverityHealthy is the level reported when no threshold is crossed.
	SeverityHealthy = Severity("healthy")

	// SeverityWarning is the level reported when a warning threshold is
	// crossed but the critical threshold is not.
	SeverityWarning = Severity("warning")

	// SeverityCritical is the level reported when a critical threshold is
	// crossed, and for a site whose check is not healthy.
	SeverityCritical = Severity("critical")
)
|
|
|
|
type Event struct {
|
|
Key string
|
|
Severity Severity
|
|
Title string
|
|
Body string
|
|
}
|
|
|
|
type Evaluator struct {
|
|
states map[string]Severity
|
|
}
|
|
|
|
func NewEvaluator() *Evaluator {
|
|
return &Evaluator{states: make(map[string]Severity)}
|
|
}
|
|
|
|
func (e *Evaluator) Evaluate(cfg config.Config, sample metrics.Sample) []Event {
|
|
events := []Event{}
|
|
events = append(events, e.metricEvent("disk_used", compareHigh(sample.RootUsedPercent, cfg.Thresholds.DiskUsedPercentWarning, cfg.Thresholds.DiskUsedPercentCritical), fmt.Sprintf("Root disk usage %.1f%%", sample.RootUsedPercent), fmt.Sprintf("Root partition is %.1f%% used with %.1f GB free.", sample.RootUsedPercent, sample.RootFreeGB))...)
|
|
events = append(events, e.metricEvent("free_gb", compareLow(sample.RootFreeGB, cfg.Thresholds.FreeGBWarning, cfg.Thresholds.FreeGBCritical), fmt.Sprintf("Low disk space %.1f GB", sample.RootFreeGB), fmt.Sprintf("Root partition free space is %.1f GB.", sample.RootFreeGB))...)
|
|
events = append(events, e.metricEvent("inode_used", compareHigh(sample.InodeUsedPercent, cfg.Thresholds.InodeUsedPercentWarning, cfg.Thresholds.InodeUsedPercentCritical), fmt.Sprintf("Inode usage %.1f%%", sample.InodeUsedPercent), fmt.Sprintf("Root partition inode usage is %.1f%%.", sample.InodeUsedPercent))...)
|
|
events = append(events, e.metricEvent("memory_used", compareHigh(sample.MemoryUsedPercent, cfg.Thresholds.MemoryUsedPercentWarning, cfg.Thresholds.MemoryUsedPercentCritical), fmt.Sprintf("Memory usage %.1f%%", sample.MemoryUsedPercent), fmt.Sprintf("Memory usage is %.1f%%.", sample.MemoryUsedPercent))...)
|
|
events = append(events, e.metricEvent("swap_used", compareHigh(sample.SwapUsedPercent, cfg.Thresholds.SwapUsedPercentWarning, cfg.Thresholds.SwapUsedPercentCritical), fmt.Sprintf("Swap usage %.1f%%", sample.SwapUsedPercent), fmt.Sprintf("Swap usage is %.1f%%.", sample.SwapUsedPercent))...)
|
|
events = append(events, e.metricEvent("cpu_avg_15m", compareHigh(sample.CPUAvg15mPercent, cfg.Thresholds.CPUAvg15mWarning, cfg.Thresholds.CPUAvg15mCritical), fmt.Sprintf("CPU 15m average %.1f%%", sample.CPUAvg15mPercent), fmt.Sprintf("CPU rolling 15-minute average is %.1f%%.", sample.CPUAvg15mPercent))...)
|
|
events = append(events, e.metricEvent("cpu_avg_12h", compareHigh(sample.CPUAvg12hPercent, cfg.Thresholds.CPUAvg12hWarning, cfg.Thresholds.CPUAvg12hCritical), fmt.Sprintf("CPU 12h average %.1f%%", sample.CPUAvg12hPercent), fmt.Sprintf("CPU rolling 12-hour average is %.1f%%.", sample.CPUAvg12hPercent))...)
|
|
events = append(events, e.metricEvent("load_per_core", compareHigh(sample.LoadPerCore, cfg.Thresholds.LoadPerCoreWarning, cfg.Thresholds.LoadPerCoreCritical), fmt.Sprintf("Load per core %.2f", sample.LoadPerCore), fmt.Sprintf("15-minute load per core is %.2f.", sample.LoadPerCore))...)
|
|
events = append(events, e.metricEvent("process_count", compareHigh(float64(sample.ProcessCount), float64(cfg.Thresholds.ProcessCountWarning), float64(cfg.Thresholds.ProcessCountCritical)), fmt.Sprintf("Process count %d", sample.ProcessCount), fmt.Sprintf("Process count is %d.", sample.ProcessCount))...)
|
|
for _, site := range sample.Sites {
|
|
severity := SeverityHealthy
|
|
if !site.Healthy {
|
|
severity = SeverityCritical
|
|
}
|
|
title := fmt.Sprintf("Site %s reachable", site.Name)
|
|
body := fmt.Sprintf("%s responded with %d in %s.", site.URL, site.StatusCode, site.Latency.Round(time.Millisecond))
|
|
if !site.Healthy {
|
|
title = fmt.Sprintf("Site %s failed", site.Name)
|
|
body = fmt.Sprintf("%s check failed: %s.", site.URL, site.ErrorMessage)
|
|
}
|
|
events = append(events, e.metricEvent("site:"+site.Name, severity, title, body)...)
|
|
}
|
|
sort.Slice(events, func(i, j int) bool { return events[i].Key < events[j].Key })
|
|
return events
|
|
}
|
|
|
|
func (e *Evaluator) metricEvent(key string, severity Severity, title string, body string) []Event {
|
|
previous := e.states[key]
|
|
if previous == "" {
|
|
e.states[key] = severity
|
|
if severity == SeverityHealthy {
|
|
return nil
|
|
}
|
|
return []Event{{Key: key, Severity: severity, Title: title, Body: body}}
|
|
}
|
|
if previous == severity {
|
|
return nil
|
|
}
|
|
e.states[key] = severity
|
|
if severity == SeverityHealthy {
|
|
return []Event{{Key: key, Severity: severity, Title: recoveryTitle(title), Body: body}}
|
|
}
|
|
return []Event{{Key: key, Severity: severity, Title: title, Body: body}}
|
|
}
|
|
|
|
// recoveryTitle marks title as a recovery notice by putting "Recovered: " in
// front of it.
func recoveryTitle(title string) string {
	const prefix = "Recovered: "
	return prefix + title
}
|
|
|
|
func compareHigh(value float64, warning float64, critical float64) Severity {
|
|
switch {
|
|
case critical > 0 && value >= critical:
|
|
return SeverityCritical
|
|
case warning > 0 && value >= warning:
|
|
return SeverityWarning
|
|
default:
|
|
return SeverityHealthy
|
|
}
|
|
}
|
|
|
|
func compareLow(value float64, warning float64, critical float64) Severity {
|
|
switch {
|
|
case critical > 0 && value <= critical:
|
|
return SeverityCritical
|
|
case warning > 0 && value <= warning:
|
|
return SeverityWarning
|
|
default:
|
|
return SeverityHealthy
|
|
}
|
|
}
|
|
|
|
func FormatSummary(sample metrics.Sample) string {
|
|
lines := []string{
|
|
fmt.Sprintf("CPU now %.1f%% | 15m %.1f%% | 12h %.1f%%", sample.CPUCurrentPercent, sample.CPUAvg15mPercent, sample.CPUAvg12hPercent),
|
|
fmt.Sprintf("Memory %.1f%% | Swap %.1f%%", sample.MemoryUsedPercent, sample.SwapUsedPercent),
|
|
fmt.Sprintf("Disk / %.1f%% used | %.1f GB free | Inodes %.1f%%", sample.RootUsedPercent, sample.RootFreeGB, sample.InodeUsedPercent),
|
|
fmt.Sprintf("Load %.2f / %.2f / %.2f | Per core %.2f", sample.Load1, sample.Load5, sample.Load15, sample.LoadPerCore),
|
|
fmt.Sprintf("Network RX %.1f KB/s | TX %.1f KB/s", sample.RXBytesPerSecond/1024, sample.TXBytesPerSecond/1024),
|
|
fmt.Sprintf("Processes %d | Hosted sites %d", sample.ProcessCount, sample.HostedServiceCount),
|
|
}
|
|
siteStates := make([]string, 0, len(sample.Sites))
|
|
for _, site := range sample.Sites {
|
|
state := "up"
|
|
if !site.Healthy {
|
|
state = "down"
|
|
}
|
|
siteStates = append(siteStates, fmt.Sprintf("%s=%s", site.Name, state))
|
|
}
|
|
if len(siteStates) > 0 {
|
|
lines = append(lines, "Sites "+strings.Join(siteStates, ", "))
|
|
}
|
|
return strings.Join(lines, "\n")
|
|
}
|