From 93ae9b66b37a9c9b7a747b5e2191e051abc85aa4 Mon Sep 17 00:00:00 2001 From: todor Date: Sun, 3 May 2026 21:09:59 +0200 Subject: [PATCH] init --- .gitea/workflows/build.yml | 36 +++++ .gitignore | 22 +++ cmd/heartbeat/main.go | 34 +++++ config.example.yaml | 33 +++++ go.mod | 19 +++ go.sum | 38 ++++++ heartbeat.service | 14 ++ internal/alerts/alerts.go | 130 ++++++++++++++++++ internal/app/app.go | 82 +++++++++++ internal/config/config.go | 117 ++++++++++++++++ internal/discord/discord.go | 201 +++++++++++++++++++++++++++ internal/metrics/metrics.go | 261 ++++++++++++++++++++++++++++++++++++ 12 files changed, 987 insertions(+) create mode 100644 .gitea/workflows/build.yml create mode 100644 .gitignore create mode 100644 cmd/heartbeat/main.go create mode 100644 config.example.yaml create mode 100644 go.mod create mode 100644 go.sum create mode 100644 heartbeat.service create mode 100644 internal/alerts/alerts.go create mode 100644 internal/app/app.go create mode 100644 internal/config/config.go create mode 100644 internal/discord/discord.go create mode 100644 internal/metrics/metrics.go diff --git a/.gitea/workflows/build.yml b/.gitea/workflows/build.yml new file mode 100644 index 0000000..7716d5e --- /dev/null +++ b/.gitea/workflows/build.yml @@ -0,0 +1,36 @@ +name: Build heartbeat + +on: + push: + branches: [main] + paths: + - '**/*.go' + - 'go.mod' + - 'go.sum' + - '.gitea/workflows/build.yml' + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-24.04 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version: '1.22' + cache: true + + - name: Build binary + run: | + CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -trimpath -ldflags="-s -w" -o heartbeat ./cmd/heartbeat + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: heartbeat-linux-amd64 + path: heartbeat + retention-days: 365 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b20314 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +# built binary +/heartbeat + +# runtime config (contains webhook URL) +/config.yaml + +# Go build cache and test artifacts +*.test +*.out +/vendor/ + +# OS +.DS_Store +Thumbs.db + +# Editor +.vscode/ +.idea/ +*.swp +*.swo +*~ + diff --git a/cmd/heartbeat/main.go b/cmd/heartbeat/main.go new file mode 100644 index 0000000..1ed9136 --- /dev/null +++ b/cmd/heartbeat/main.go @@ -0,0 +1,34 @@ +package main + +import ( + "context" + "flag" + "log" + "os/signal" + "syscall" + + "heartbeat/internal/app" + "heartbeat/internal/config" +) + +func main() { + configPath := flag.String("config", "./config.yaml", "Path to config file") + flag.Parse() + + cfg, err := config.Load(*configPath) + if err != nil { + log.Fatalf("load config: %v", err) + } + + runner, err := app.New(cfg) + if err != nil { + log.Fatalf("initialize app: %v", err) + } + + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer stop() + + if err := runner.Run(ctx); err != nil { + log.Fatalf("run heartbeat: %v", err) + } +} diff --git a/config.example.yaml b/config.example.yaml new file mode 100644 index 0000000..563f49e --- /dev/null +++ b/config.example.yaml @@ -0,0 +1,33 @@ +server_name: s1 +discord_webhook_url: https://discord.com/api/webhooks/replace/me +notify_role_id: "1500565629390819479" + +sample_interval: 1m +summary_interval: 6h +request_timeout: 10s + +thresholds: + disk_used_percent_warning: 85 + disk_used_percent_critical: 93 + free_gb_warning: 20 + free_gb_critical: 10 + 
inode_used_percent_warning: 85 + inode_used_percent_critical: 93 + memory_used_percent_warning: 90 + memory_used_percent_critical: 97 + swap_used_percent_warning: 25 + swap_used_percent_critical: 50 + cpu_avg_15m_warning: 80 + cpu_avg_15m_critical: 95 + cpu_avg_12h_warning: 65 + cpu_avg_12h_critical: 85 + load_per_core_warning: 0.9 + load_per_core_critical: 1.25 + process_count_warning: 350 + process_count_critical: 500 + +sites: + - name: pismen.com + url: https://pismen.com + expected_status: 200 + timeout: 10s diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..e5011c8 --- /dev/null +++ b/go.mod @@ -0,0 +1,19 @@ +module heartbeat + +go 1.22.0 + +require ( + github.com/shirou/gopsutil/v3 v3.24.5 + gopkg.in/yaml.v3 v3.0.1 +) + +require ( + github.com/go-ole/go-ole v1.2.6 // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect + github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect + github.com/shoenig/go-m1cpu v0.1.6 // indirect + github.com/tklauser/go-sysconf v0.3.12 // indirect + github.com/tklauser/numcpus v0.6.1 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect + golang.org/x/sys v0.20.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..75b1601 --- /dev/null +++ b/go.sum @@ -0,0 +1,38 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= +github.com/shirou/gopsutil/v3 v3.24.5 h1:i0t8kL+kQTvpAYToeuiVk3TgDeKOFioZO3Ztz/iZ9pI= +github.com/shirou/gopsutil/v3 v3.24.5/go.mod h1:bsoOS1aStSs9ErQ1WWfxllSeS1K5D+U30r2NfcubMVk= +github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= +github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= +github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= +github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= +github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= +github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= 
+github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/heartbeat.service b/heartbeat.service new file mode 100644 index 0000000..92b9e7f --- /dev/null +++ b/heartbeat.service @@ -0,0 +1,14 @@ +[Unit] +Description=heartbeat server monitor +After=network-online.target +Wants=network-online.target + +[Service] +User=root +WorkingDirectory=/opt/heartbeat +ExecStart=/opt/heartbeat/heartbeat --config /opt/heartbeat/config.yaml +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go new file mode 100644 index 0000000..8779d7c --- /dev/null +++ b/internal/alerts/alerts.go @@ -0,0 +1,130 @@ +package alerts + +import ( + "fmt" + "sort" + "strings" + "time" + + "heartbeat/internal/config" + "heartbeat/internal/metrics" +) + +type Severity string + +const ( + SeverityHealthy Severity = "healthy" + SeverityWarning Severity = "warning" + SeverityCritical Severity = "critical" +) + +type Event struct { + Key string + Severity Severity + Title string + Body string +} + +type Evaluator struct { + states map[string]Severity +} + +func NewEvaluator() *Evaluator { + return &Evaluator{states: make(map[string]Severity)} +} + +func (e *Evaluator) Evaluate(cfg config.Config, sample metrics.Sample) []Event { + events := []Event{} + events = append(events, e.metricEvent("disk_used", compareHigh(sample.RootUsedPercent, cfg.Thresholds.DiskUsedPercentWarning, cfg.Thresholds.DiskUsedPercentCritical), fmt.Sprintf("Root disk usage %.1f%%", sample.RootUsedPercent), fmt.Sprintf("Root partition is %.1f%% used with %.1f GB free.", sample.RootUsedPercent, sample.RootFreeGB))...) + events = append(events, e.metricEvent("free_gb", compareLow(sample.RootFreeGB, cfg.Thresholds.FreeGBWarning, cfg.Thresholds.FreeGBCritical), fmt.Sprintf("Low disk space %.1f GB", sample.RootFreeGB), fmt.Sprintf("Root partition free space is %.1f GB.", sample.RootFreeGB))...) + events = append(events, e.metricEvent("inode_used", compareHigh(sample.InodeUsedPercent, cfg.Thresholds.InodeUsedPercentWarning, cfg.Thresholds.InodeUsedPercentCritical), fmt.Sprintf("Inode usage %.1f%%", sample.InodeUsedPercent), fmt.Sprintf("Root partition inode usage is %.1f%%.", sample.InodeUsedPercent))...) 
+ events = append(events, e.metricEvent("memory_used", compareHigh(sample.MemoryUsedPercent, cfg.Thresholds.MemoryUsedPercentWarning, cfg.Thresholds.MemoryUsedPercentCritical), fmt.Sprintf("Memory usage %.1f%%", sample.MemoryUsedPercent), fmt.Sprintf("Memory usage is %.1f%%.", sample.MemoryUsedPercent))...) + events = append(events, e.metricEvent("swap_used", compareHigh(sample.SwapUsedPercent, cfg.Thresholds.SwapUsedPercentWarning, cfg.Thresholds.SwapUsedPercentCritical), fmt.Sprintf("Swap usage %.1f%%", sample.SwapUsedPercent), fmt.Sprintf("Swap usage is %.1f%%.", sample.SwapUsedPercent))...) + events = append(events, e.metricEvent("cpu_avg_15m", compareHigh(sample.CPUAvg15mPercent, cfg.Thresholds.CPUAvg15mWarning, cfg.Thresholds.CPUAvg15mCritical), fmt.Sprintf("CPU 15m average %.1f%%", sample.CPUAvg15mPercent), fmt.Sprintf("CPU rolling 15-minute average is %.1f%%.", sample.CPUAvg15mPercent))...) + events = append(events, e.metricEvent("cpu_avg_12h", compareHigh(sample.CPUAvg12hPercent, cfg.Thresholds.CPUAvg12hWarning, cfg.Thresholds.CPUAvg12hCritical), fmt.Sprintf("CPU 12h average %.1f%%", sample.CPUAvg12hPercent), fmt.Sprintf("CPU rolling 12-hour average is %.1f%%.", sample.CPUAvg12hPercent))...) + events = append(events, e.metricEvent("load_per_core", compareHigh(sample.LoadPerCore, cfg.Thresholds.LoadPerCoreWarning, cfg.Thresholds.LoadPerCoreCritical), fmt.Sprintf("Load per core %.2f", sample.LoadPerCore), fmt.Sprintf("15-minute load per core is %.2f.", sample.LoadPerCore))...) + events = append(events, e.metricEvent("process_count", compareHigh(float64(sample.ProcessCount), float64(cfg.Thresholds.ProcessCountWarning), float64(cfg.Thresholds.ProcessCountCritical)), fmt.Sprintf("Process count %d", sample.ProcessCount), fmt.Sprintf("Process count is %d.", sample.ProcessCount))...) + for _, site := range sample.Sites { + severity := SeverityHealthy + if !site.Healthy { + severity = SeverityCritical + } + title := fmt.Sprintf("Site %s reachable", site.Name) + body := fmt.Sprintf("%s responded with %d in %s.", site.URL, site.StatusCode, site.Latency.Round(time.Millisecond)) + if !site.Healthy { + title = fmt.Sprintf("Site %s failed", site.Name) + body = fmt.Sprintf("%s check failed: %s.", site.URL, site.ErrorMessage) + } + events = append(events, e.metricEvent("site:"+site.Name, severity, title, body)...) 
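+		// Site checks flow through the same keyed transition logic as the
+		// metric checks above, so a downed site alerts once and then once
+		// more on recovery rather than on every sample.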
+ } + sort.Slice(events, func(i, j int) bool { return events[i].Key < events[j].Key }) + return events +} + +func (e *Evaluator) metricEvent(key string, severity Severity, title string, body string) []Event { + previous := e.states[key] + if previous == "" { + e.states[key] = severity + if severity == SeverityHealthy { + return nil + } + return []Event{{Key: key, Severity: severity, Title: title, Body: body}} + } + if previous == severity { + return nil + } + e.states[key] = severity + if severity == SeverityHealthy { + return []Event{{Key: key, Severity: severity, Title: recoveryTitle(title), Body: body}} + } + return []Event{{Key: key, Severity: severity, Title: title, Body: body}} +} + +func recoveryTitle(title string) string { + return "Recovered: " + title +} + +func compareHigh(value float64, warning float64, critical float64) Severity { + switch { + case critical > 0 && value >= critical: + return SeverityCritical + case warning > 0 && value >= warning: + return SeverityWarning + default: + return SeverityHealthy + } +} + +func compareLow(value float64, warning float64, critical float64) Severity { + switch { + case critical > 0 && value <= critical: + return SeverityCritical + case warning > 0 && value <= warning: + return SeverityWarning + default: + return SeverityHealthy + } +} + +func FormatSummary(sample metrics.Sample) string { + lines := []string{ + fmt.Sprintf("CPU now %.1f%% | 15m %.1f%% | 12h %.1f%%", sample.CPUCurrentPercent, sample.CPUAvg15mPercent, sample.CPUAvg12hPercent), + fmt.Sprintf("Memory %.1f%% | Swap %.1f%%", sample.MemoryUsedPercent, sample.SwapUsedPercent), + fmt.Sprintf("Disk / %.1f%% used | %.1f GB free | Inodes %.1f%%", sample.RootUsedPercent, sample.RootFreeGB, sample.InodeUsedPercent), + fmt.Sprintf("Load %.2f / %.2f / %.2f | Per core %.2f", sample.Load1, sample.Load5, sample.Load15, sample.LoadPerCore), + fmt.Sprintf("Network RX %.1f KB/s | TX %.1f KB/s", sample.RXBytesPerSecond/1024, sample.TXBytesPerSecond/1024), + fmt.Sprintf("Processes %d | Hosted sites %d", sample.ProcessCount, sample.HostedServiceCount), + } + siteStates := make([]string, 0, len(sample.Sites)) + for _, site := range sample.Sites { + state := "up" + if !site.Healthy { + state = "down" + } + siteStates = append(siteStates, fmt.Sprintf("%s=%s", site.Name, state)) + } + if len(siteStates) > 0 { + lines = append(lines, "Sites "+strings.Join(siteStates, ", ")) + } + return strings.Join(lines, "\n") +} diff --git a/internal/app/app.go b/internal/app/app.go new file mode 100644 index 0000000..80be5f6 --- /dev/null +++ b/internal/app/app.go @@ -0,0 +1,82 @@ +package app + +import ( + "context" + "log" + "time" + + "heartbeat/internal/alerts" + "heartbeat/internal/config" + "heartbeat/internal/discord" + "heartbeat/internal/metrics" +) + +type Runner struct { + cfg config.Config + sampler *metrics.Sampler + evaluator *alerts.Evaluator + discord *discord.Client +} + +func New(cfg config.Config) (*Runner, error) { + return &Runner{ + cfg: cfg, + sampler: metrics.NewSampler(cfg.RequestTimeout), + evaluator: alerts.NewEvaluator(), + discord: discord.New(cfg.ServerName, cfg.DiscordWebhookURL, cfg.NotifyRoleID, cfg.RequestTimeout), + }, nil +} + +func (r *Runner) Run(ctx context.Context) error { + if err := r.tick(ctx, true); err != nil { + log.Printf("initial tick failed: %v", err) + } + + sampleTicker := time.NewTicker(r.cfg.SampleInterval) + defer sampleTicker.Stop() + + summaryTicker := time.NewTicker(r.cfg.SummaryInterval) + defer summaryTicker.Stop() + + for { + select { + case 
<-ctx.Done(): + return nil + case <-sampleTicker.C: + if err := r.tick(ctx, false); err != nil { + log.Printf("sample tick failed: %v", err) + } + case <-summaryTicker.C: + if err := r.sendSummary(ctx); err != nil { + log.Printf("summary failed: %v", err) + } + } + } +} + +func (r *Runner) tick(ctx context.Context, sendSummary bool) error { + sample, err := r.sampler.Collect(ctx, r.cfg) + if err != nil { + return err + } + if sendSummary { + if err := r.discord.SendSummary(ctx, sample, r.cfg.SummaryInterval); err != nil { + log.Printf("summary send failed: %v", err) + } + } + for _, event := range r.evaluator.Evaluate(r.cfg, sample) { + if err := r.discord.SendEvent(ctx, sample, event); err != nil { + log.Printf("event send failed for %s: %v", event.Key, err) + } + } + return nil +} + +func (r *Runner) sendSummary(ctx context.Context) error { + sample, err := r.sampler.Collect(ctx, r.cfg) + if err != nil { + return err + } + return r.discord.SendSummary(ctx, sample, r.cfg.SummaryInterval) +} + diff --git a/internal/config/config.go b/internal/config/config.go new file mode 100644 index 0000000..f336440 --- /dev/null +++ b/internal/config/config.go @@ -0,0 +1,117 @@ +package config + +import ( + "fmt" + "os" + "time" + + "gopkg.in/yaml.v3" +) + +type Config struct { + ServerName string `yaml:"server_name"` + DiscordWebhookURL string `yaml:"discord_webhook_url"` + NotifyRoleID string `yaml:"notify_role_id"` + SampleInterval time.Duration `yaml:"sample_interval"` + SummaryInterval time.Duration `yaml:"summary_interval"` + RequestTimeout time.Duration `yaml:"request_timeout"` + Thresholds Thresholds `yaml:"thresholds"` + Sites []Site `yaml:"sites"` +} + +type Thresholds struct { + DiskUsedPercentWarning float64 `yaml:"disk_used_percent_warning"` + DiskUsedPercentCritical float64 `yaml:"disk_used_percent_critical"` + FreeGBWarning float64 `yaml:"free_gb_warning"` + FreeGBCritical float64 `yaml:"free_gb_critical"` + InodeUsedPercentWarning float64 `yaml:"inode_used_percent_warning"` + InodeUsedPercentCritical float64 `yaml:"inode_used_percent_critical"` + MemoryUsedPercentWarning float64 `yaml:"memory_used_percent_warning"` + MemoryUsedPercentCritical float64 `yaml:"memory_used_percent_critical"` + SwapUsedPercentWarning float64 `yaml:"swap_used_percent_warning"` + SwapUsedPercentCritical float64 `yaml:"swap_used_percent_critical"` + CPUAvg15mWarning float64 `yaml:"cpu_avg_15m_warning"` + CPUAvg15mCritical float64 `yaml:"cpu_avg_15m_critical"` + CPUAvg12hWarning float64 `yaml:"cpu_avg_12h_warning"` + CPUAvg12hCritical float64 `yaml:"cpu_avg_12h_critical"` + LoadPerCoreWarning float64 `yaml:"load_per_core_warning"` + LoadPerCoreCritical float64 `yaml:"load_per_core_critical"` + ProcessCountWarning int `yaml:"process_count_warning"` + ProcessCountCritical int `yaml:"process_count_critical"` +} + +type Site struct { + Name string `yaml:"name"` + URL string `yaml:"url"` + ExpectedStatus int `yaml:"expected_status"` + Timeout time.Duration `yaml:"timeout"` +} + +func Load(path string) (Config, error) { + data, err := os.ReadFile(path) + if err != nil { + return Config{}, err + } + + var cfg Config + if err := yaml.Unmarshal(data, &cfg); err != nil { + return Config{}, err + } + + applyDefaults(&cfg) + if err := cfg.Validate(); err != nil { + return Config{}, err + } + + return cfg, nil +} + +func applyDefaults(cfg *Config) { + if cfg.SampleInterval == 0 { + cfg.SampleInterval = time.Minute + } + if cfg.SummaryInterval == 0 { + cfg.SummaryInterval = 6 * time.Hour + } + if cfg.RequestTimeout == 0 { + 
cfg.RequestTimeout = 10 * time.Second + } + for index := range cfg.Sites { + if cfg.Sites[index].ExpectedStatus == 0 { + cfg.Sites[index].ExpectedStatus = 200 + } + if cfg.Sites[index].Timeout == 0 { + cfg.Sites[index].Timeout = cfg.RequestTimeout + } + } + if cfg.Thresholds.ProcessCountWarning == 0 { + cfg.Thresholds.ProcessCountWarning = 350 + } + if cfg.Thresholds.ProcessCountCritical == 0 { + cfg.Thresholds.ProcessCountCritical = 500 + } +} + +func (cfg Config) Validate() error { + if cfg.ServerName == "" { + return fmt.Errorf("server_name is required") + } + if cfg.DiscordWebhookURL == "" { + return fmt.Errorf("discord_webhook_url is required") + } + if cfg.SampleInterval <= 0 { + return fmt.Errorf("sample_interval must be > 0") + } + if cfg.SummaryInterval <= 0 { + return fmt.Errorf("summary_interval must be > 0") + } + for _, site := range cfg.Sites { + if site.Name == "" { + return fmt.Errorf("site name is required") + } + if site.URL == "" { + return fmt.Errorf("site URL is required for %s", site.Name) + } + } + return nil +} diff --git a/internal/discord/discord.go b/internal/discord/discord.go new file mode 100644 index 0000000..9319d33 --- /dev/null +++ b/internal/discord/discord.go @@ -0,0 +1,201 @@ +package discord + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "strings" + "time" + + "heartbeat/internal/alerts" + "heartbeat/internal/metrics" +) + +type Client struct { + webhookURL string + httpClient *http.Client + serverName string + notifyRoleID string +} + +type webhookPayload struct { + Content string `json:"content,omitempty"` + AllowedMentions allowedMentions `json:"allowed_mentions,omitempty"` + Embeds []embed `json:"embeds"` +} + +type allowedMentions struct { + Roles []string `json:"roles,omitempty"` +} + +type embed struct { + Title string `json:"title"` + Description string `json:"description,omitempty"` + Color int `json:"color"` + Timestamp string `json:"timestamp,omitempty"` + Fields []embedField `json:"fields,omitempty"` + Footer *embedFooter `json:"footer,omitempty"` +} + +type embedField struct { + Name string `json:"name"` + Value string `json:"value"` + Inline bool `json:"inline"` +} + +type embedFooter struct { + Text string `json:"text"` +} + +func New(serverName string, webhookURL string, notifyRoleID string, timeout time.Duration) *Client { + return &Client{ + serverName: serverName, + webhookURL: webhookURL, + httpClient: &http.Client{Timeout: timeout}, + notifyRoleID: notifyRoleID, + } +} + +func (c *Client) SendSummary(ctx context.Context, sample metrics.Sample, summaryInterval time.Duration) error { + fields := []embedField{ + {Name: "Server", Value: c.serverName, Inline: true}, + {Name: "Generated", Value: formatSummaryTime(sample.Timestamp), Inline: true}, + {Name: "Uptime", Value: formatUptime(sample.UptimeSeconds), Inline: true}, + {Name: "CPU", Value: fmt.Sprintf("Now: %.1f%%\n15m: %.1f%%\n12h: %.1f%%", sample.CPUCurrentPercent, sample.CPUAvg15mPercent, sample.CPUAvg12hPercent), Inline: true}, + {Name: "Load", Value: fmt.Sprintf("1m: %.2f\n5m: %.2f\n15m/core: %.2f", sample.Load1, sample.Load5, sample.LoadPerCore), Inline: true}, + {Name: "Memory", Value: fmt.Sprintf("RAM: %.1f%%\nSwap: %.1f%%", sample.MemoryUsedPercent, sample.SwapUsedPercent), Inline: true}, + {Name: "Disk /", Value: fmt.Sprintf("Used: %.1f%%\nFree: %.1f GB\nInodes: %.1f%%", sample.RootUsedPercent, sample.RootFreeGB, sample.InodeUsedPercent), Inline: true}, + {Name: "Network", Value: fmt.Sprintf("RX: %s\nTX: %s", formatRate(sample.RXBytesPerSecond), 
formatRate(sample.TXBytesPerSecond)), Inline: true},
+		{Name: "Processes", Value: fmt.Sprintf("Count: %d", sample.ProcessCount), Inline: true},
+		{Name: "Sites", Value: formatSiteStatuses(sample.Sites), Inline: false},
+	}
+	return c.send(ctx, webhookPayload{Embeds: []embed{{
+		Title:       fmt.Sprintf("heartbeat (%s) - %s", formatSummaryIntervalHours(summaryInterval), c.serverName),
+		Description: "Scheduled server health snapshot.",
+		Color:       0x2D9CDB,
+		Fields:      fields,
+		Footer:      &embedFooter{Text: formatSummaryFooter(c.serverName)},
+	}}})
+}
+
+func (c *Client) SendEvent(ctx context.Context, sample metrics.Sample, event alerts.Event) error {
+	fields := []embedField{
+		{Name: "Server", Value: c.serverName, Inline: true},
+		{Name: "Severity", Value: string(event.Severity), Inline: true},
+		{Name: "Timestamp", Value: formatEventTime(sample.Timestamp), Inline: true},
+	}
+	payload := webhookPayload{Embeds: []embed{{
+		Title:       event.Title,
+		Description: event.Body,
+		Color:       colorForSeverity(event.Severity),
+		Timestamp:   sample.Timestamp.Format(time.RFC3339),
+		Fields:      fields,
+	}}}
+	if c.notifyRoleID != "" && (event.Severity == alerts.SeverityWarning || event.Severity == alerts.SeverityCritical) {
+		payload.Content = fmt.Sprintf("<@&%s>", c.notifyRoleID)
+		payload.AllowedMentions = allowedMentions{Roles: []string{c.notifyRoleID}}
+	}
+	return c.send(ctx, payload)
+}
+
+func formatUptime(totalSeconds uint64) string {
+	duration := time.Duration(totalSeconds) * time.Second
+	days := duration / (24 * time.Hour)
+	duration -= days * 24 * time.Hour
+	hours := duration / time.Hour
+	duration -= hours * time.Hour
+	minutes := duration / time.Minute
+	if days > 0 {
+		return fmt.Sprintf("%dd %dh %dm", days, hours, minutes)
+	}
+	return fmt.Sprintf("%dh %dm", hours, minutes)
+}
+
+func formatSiteStatuses(sites []metrics.SiteStatus) string {
+	if len(sites) == 0 {
+		return "No site checks configured"
+	}
+	parts := make([]string, 0, len(sites))
+	for _, site := range sites {
+		status := "UP"
+		detail := fmt.Sprintf("%d in %s", site.StatusCode, site.Latency.Round(time.Millisecond))
+		if !site.Healthy {
+			status = "DOWN"
+			detail = site.ErrorMessage
+		}
+		parts = append(parts, fmt.Sprintf("%s: %s (%s)", site.Name, status, detail))
+	}
+	return strings.Join(parts, "\n")
+}
+
+// formatSummaryTime renders a Discord timestamp tag, which the client
+// displays in the viewer's local time zone.
+func formatSummaryTime(timestamp time.Time) string {
+	unixSeconds := timestamp.Unix()
+	return fmt.Sprintf("<t:%d:f>", unixSeconds)
+}
+
+// formatEventTime renders an absolute Discord timestamp followed by a
+// relative one ("... - 3 minutes ago").
+func formatEventTime(timestamp time.Time) string {
+	unixSeconds := timestamp.Unix()
+	return fmt.Sprintf("<t:%d:f> - <t:%d:R>", unixSeconds, unixSeconds)
+}
+
+func formatSummaryFooter(serverName string) string {
+	return fmt.Sprintf("heartbeat - %s", serverName)
+}
+
+func formatSummaryIntervalHours(interval time.Duration) string {
+	if interval%time.Hour == 0 {
+		return fmt.Sprintf("%dh", interval/time.Hour)
+	}
+	return fmt.Sprintf("%.2fh", interval.Hours())
+}
+
+func formatRate(bytesPerSecond float64) string {
+	if bytesPerSecond < 0 {
+		return "n/a"
+	}
+	if bytesPerSecond < 1024 {
+		return fmt.Sprintf("%.0f B/s", bytesPerSecond)
+	}
+	kib := bytesPerSecond / 1024
+	if kib < 1024 {
+		return fmt.Sprintf("%.2f KB/s", kib)
+	}
+	mib := kib / 1024
+	return fmt.Sprintf("%.2f MB/s", mib)
+}
+
+func (c *Client) send(ctx context.Context, payload webhookPayload) error {
+	buffer := &bytes.Buffer{}
+	if err := json.NewEncoder(buffer).Encode(payload); err != nil {
+		return err
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.webhookURL, buffer)
+	if err != nil {
+		return err
+	}
+	
req.Header.Set("Content-Type", "application/json") + + resp, err := c.httpClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("discord webhook returned %s", resp.Status) + } + return nil +} + +func colorForSeverity(severity alerts.Severity) int { + switch severity { + case alerts.SeverityCritical: + return 0xE74C3C + case alerts.SeverityWarning: + return 0xF39C12 + default: + return 0x27AE60 + } +} diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go new file mode 100644 index 0000000..5d8da19 --- /dev/null +++ b/internal/metrics/metrics.go @@ -0,0 +1,261 @@ +package metrics + +import ( + "context" + "fmt" + "net" + "net/http" + "strings" + "time" + + "heartbeat/internal/config" + + "github.com/shirou/gopsutil/v3/cpu" + "github.com/shirou/gopsutil/v3/disk" + "github.com/shirou/gopsutil/v3/host" + "github.com/shirou/gopsutil/v3/load" + "github.com/shirou/gopsutil/v3/mem" + gnet "github.com/shirou/gopsutil/v3/net" + "github.com/shirou/gopsutil/v3/process" +) + +const gib = 1024 * 1024 * 1024 + +type Sample struct { + Timestamp time.Time + RootUsedPercent float64 + RootFreeGB float64 + InodeUsedPercent float64 + CPUCurrentPercent float64 + CPUAvg15mPercent float64 + CPUAvg12hPercent float64 + MemoryUsedPercent float64 + SwapUsedPercent float64 + Load1 float64 + Load5 float64 + Load15 float64 + LoadPerCore float64 + RXBytesPerSecond float64 + TXBytesPerSecond float64 + ProcessCount int + HostedServiceCount int + UptimeSeconds uint64 + Sites []SiteStatus +} + +type SiteStatus struct { + Name string + URL string + Healthy bool + StatusCode int + Latency time.Duration + ErrorMessage string + ExpectedStatus int +} + +type Sampler struct { + httpClient *http.Client + prevNet netIO + history []historyPoint + historyCap int +} + +type netIO struct { + timestamp time.Time + rx uint64 + tx uint64 +} + +type historyPoint struct { + timestamp time.Time + cpu float64 +} + +func NewSampler(timeout time.Duration) *Sampler { + transport := &http.Transport{ + Proxy: http.ProxyFromEnvironment, + DialContext: (&net.Dialer{ + Timeout: timeout, + }).DialContext, + TLSHandshakeTimeout: timeout, + } + + return &Sampler{ + httpClient: &http.Client{Timeout: timeout, Transport: transport}, + historyCap: 12 * 60, + } +} + +func (s *Sampler) Collect(ctx context.Context, cfg config.Config) (Sample, error) { + now := time.Now().UTC() + + rootUsage, err := disk.UsageWithContext(ctx, "/") + if err != nil { + return Sample{}, fmt.Errorf("root usage: %w", err) + } + + cpuPercents, err := cpu.PercentWithContext(ctx, time.Second, false) + if err != nil { + return Sample{}, fmt.Errorf("cpu percent: %w", err) + } + + virtualMemory, err := mem.VirtualMemoryWithContext(ctx) + if err != nil { + return Sample{}, fmt.Errorf("memory: %w", err) + } + + swapMemory, err := mem.SwapMemoryWithContext(ctx) + if err != nil { + return Sample{}, fmt.Errorf("swap: %w", err) + } + + avg, err := load.AvgWithContext(ctx) + if err != nil { + return Sample{}, fmt.Errorf("load average: %w", err) + } + + hostInfo, err := host.InfoWithContext(ctx) + if err != nil { + return Sample{}, fmt.Errorf("host info: %w", err) + } + + processes, err := process.ProcessesWithContext(ctx) + if err != nil { + return Sample{}, fmt.Errorf("process list: %w", err) + } + + rxRate, txRate := s.networkRates(ctx, now) + sites := s.checkSites(ctx, cfg.Sites) + + currentCPU := 0.0 + if len(cpuPercents) > 0 { + currentCPU = cpuPercents[0] + } + + 
s.appendHistory(now, currentCPU) + coreCount, err := cpu.CountsWithContext(ctx, true) + if err != nil || coreCount == 0 { + coreCount = 1 + } + + return Sample{ + Timestamp: now, + RootUsedPercent: rootUsage.UsedPercent, + RootFreeGB: float64(rootUsage.Free) / gib, + InodeUsedPercent: inodeUsedPercent(rootUsage), + CPUCurrentPercent: currentCPU, + CPUAvg15mPercent: s.averageCPU(15 * time.Minute), + CPUAvg12hPercent: s.averageCPU(12 * time.Hour), + MemoryUsedPercent: virtualMemory.UsedPercent, + SwapUsedPercent: swapMemory.UsedPercent, + Load1: avg.Load1, + Load5: avg.Load5, + Load15: avg.Load15, + LoadPerCore: avg.Load15 / float64(coreCount), + RXBytesPerSecond: rxRate, + TXBytesPerSecond: txRate, + ProcessCount: len(processes), + HostedServiceCount: len(cfg.Sites), + UptimeSeconds: hostInfo.Uptime, + Sites: sites, + }, nil +} + +func (s *Sampler) appendHistory(timestamp time.Time, cpuPercent float64) { + s.history = append(s.history, historyPoint{timestamp: timestamp, cpu: cpuPercent}) + if len(s.history) > s.historyCap { + s.history = s.history[len(s.history)-s.historyCap:] + } +} + +func (s *Sampler) averageCPU(window time.Duration) float64 { + if len(s.history) == 0 { + return 0 + } + cutoff := time.Now().UTC().Add(-window) + total := 0.0 + count := 0.0 + for _, point := range s.history { + if point.timestamp.Before(cutoff) { + continue + } + total += point.cpu + count++ + } + if count == 0 { + return 0 + } + return total / count +} + +func (s *Sampler) networkRates(ctx context.Context, now time.Time) (float64, float64) { + stats, err := gnet.IOCountersWithContext(ctx, false) + if err != nil || len(stats) == 0 { + return 0, 0 + } + current := netIO{timestamp: now, rx: stats[0].BytesRecv, tx: stats[0].BytesSent} + if s.prevNet.timestamp.IsZero() { + s.prevNet = current + return 0, 0 + } + seconds := current.timestamp.Sub(s.prevNet.timestamp).Seconds() + if seconds <= 0 { + return 0, 0 + } + rxRate := float64(current.rx-s.prevNet.rx) / seconds + txRate := float64(current.tx-s.prevNet.tx) / seconds + s.prevNet = current + return rxRate, txRate +} + +func (s *Sampler) checkSites(ctx context.Context, sites []config.Site) []SiteStatus { + results := make([]SiteStatus, 0, len(sites)) + for _, site := range sites { + results = append(results, s.checkSite(ctx, site)) + } + return results +} + +func (s *Sampler) checkSite(parent context.Context, site config.Site) SiteStatus { + ctx, cancel := context.WithTimeout(parent, site.Timeout) + defer cancel() + + start := time.Now() + req, err := http.NewRequestWithContext(ctx, http.MethodGet, site.URL, nil) + if err != nil { + return SiteStatus{Name: site.Name, URL: site.URL, ExpectedStatus: site.ExpectedStatus, ErrorMessage: err.Error()} + } + + resp, err := s.httpClient.Do(req) + if err != nil { + return SiteStatus{Name: site.Name, URL: site.URL, ExpectedStatus: site.ExpectedStatus, ErrorMessage: normalizeError(err)} + } + defer resp.Body.Close() + + status := SiteStatus{ + Name: site.Name, + URL: site.URL, + Healthy: resp.StatusCode == site.ExpectedStatus, + StatusCode: resp.StatusCode, + Latency: time.Since(start), + ExpectedStatus: site.ExpectedStatus, + } + if !status.Healthy { + status.ErrorMessage = fmt.Sprintf("expected %d, got %d", site.ExpectedStatus, resp.StatusCode) + } + return status +} + +func normalizeError(err error) string { + message := err.Error() + message = strings.TrimPrefix(message, "Get ") + return message +} + +func inodeUsedPercent(stat *disk.UsageStat) float64 { + if stat.InodesTotal == 0 { + return 0 + } + used := 
stat.InodesTotal - stat.InodesFree + return float64(used) / float64(stat.InodesTotal) * 100 +}
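
Reviewer note, not part of the patch: a minimal sketch of how the alert
hysteresis in internal/alerts behaves. The Evaluator keeps the last severity
per check key, so a sustained breach produces one event on the transition and
one on recovery instead of repeating every sample. Assuming the packages
introduced above:

	package main

	import (
		"fmt"

		"heartbeat/internal/alerts"
		"heartbeat/internal/config"
		"heartbeat/internal/metrics"
	)

	func main() {
		// Only the memory thresholds are set; zero-valued thresholds are
		// treated as disabled by compareHigh/compareLow.
		cfg := config.Config{Thresholds: config.Thresholds{
			MemoryUsedPercentWarning:  90,
			MemoryUsedPercentCritical: 97,
		}}
		eval := alerts.NewEvaluator()

		for _, used := range []float64{50, 92, 92, 50} {
			events := eval.Evaluate(cfg, metrics.Sample{MemoryUsedPercent: used})
			fmt.Printf("mem=%.0f%% -> %d event(s)\n", used, len(events))
		}
		// Prints 0, 1 (warning fired), 0 (no repeat), 1 (recovery).
	}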