init
Some checks failed
Build heartbeat / build (push) Failing after 1m18s

This commit is contained in:
2026-05-03 21:09:59 +02:00
commit 93ae9b66b3
12 changed files with 987 additions and 0 deletions

View File

@@ -0,0 +1,36 @@
name: Build heartbeat
on:
push:
branches: [main]
paths:
- '**/*.go'
- 'go.mod'
- 'go.sum'
- '.gitea/workflows/build.yml'
workflow_dispatch:
jobs:
build:
runs-on: ubuntu-24.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Go
uses: actions/setup-go@v5
with:
go-version: '1.22'
cache: true
- name: Build binary
run: |
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -trimpath -ldflags="-s -w" -o heartbeat ./cmd/heartbeat
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: heartbeat-linux-amd64
path: heartbeat
retention-days: 365

22
.gitignore vendored Normal file
View File

@@ -0,0 +1,22 @@
# built binary
/heartbeat
# runtime config (contains webhook URL)
/config.yaml
# Go build cache and test artifacts
*.test
*.out
/vendor/
# OS
.DS_Store
Thumbs.db
# Editor
.vscode/
.idea/
*.swp
*.swo
*~

34
cmd/heartbeat/main.go Normal file
View File

@@ -0,0 +1,34 @@
package main
import (
"context"
"flag"
"log"
"os/signal"
"syscall"
"heartbeat/internal/app"
"heartbeat/internal/config"
)
// main is the heartbeat entry point: it reads the -config flag, loads the
// configuration, builds the application runner and runs it until the process
// receives SIGINT or SIGTERM. Any failure along the way terminates the
// process through log.Fatalf.
func main() {
	cfgFile := flag.String("config", "./config.yaml", "Path to config file")
	flag.Parse()

	settings, loadErr := config.Load(*cfgFile)
	if loadErr != nil {
		log.Fatalf("load config: %v", loadErr)
	}

	heartbeat, initErr := app.New(settings)
	if initErr != nil {
		log.Fatalf("initialize app: %v", initErr)
	}

	// The context is cancelled as soon as one of the listed signals arrives.
	sigCtx, release := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer release()

	runErr := heartbeat.Run(sigCtx)
	if runErr != nil {
		log.Fatalf("run heartbeat: %v", runErr)
	}
}

33
config.example.yaml Normal file
View File

@@ -0,0 +1,33 @@
server_name: s1
discord_webhook_url: https://discord.com/api/webhooks/replace/me
notify_role_id: "1500565629390819479"
sample_interval: 1m
summary_interval: 6h
request_timeout: 10s
thresholds:
disk_used_percent_warning: 85
disk_used_percent_critical: 93
free_gb_warning: 20
free_gb_critical: 10
inode_used_percent_warning: 85
inode_used_percent_critical: 93
memory_used_percent_warning: 90
memory_used_percent_critical: 97
swap_used_percent_warning: 25
swap_used_percent_critical: 50
cpu_avg_15m_warning: 80
cpu_avg_15m_critical: 95
cpu_avg_12h_warning: 65
cpu_avg_12h_critical: 85
load_per_core_warning: 0.9
load_per_core_critical: 1.25
process_count_warning: 350
process_count_critical: 500
sites:
- name: pismen.com
url: https://pismen.com
expected_status: 200
timeout: 10s

19
go.mod Normal file
View File

@@ -0,0 +1,19 @@
module heartbeat
go 1.22.0
require (
github.com/shirou/gopsutil/v3 v3.24.5
gopkg.in/yaml.v3 v3.0.1
)
require (
github.com/go-ole/go-ole v1.2.6 // indirect
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
github.com/shoenig/go-m1cpu v0.1.6 // indirect
github.com/tklauser/go-sysconf v0.3.12 // indirect
github.com/tklauser/numcpus v0.6.1 // indirect
github.com/yusufpapurcu/wmi v1.2.4 // indirect
golang.org/x/sys v0.20.0 // indirect
)

38
go.sum Normal file
View File

@@ -0,0 +1,38 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw=
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
github.com/shirou/gopsutil/v3 v3.24.5 h1:i0t8kL+kQTvpAYToeuiVk3TgDeKOFioZO3Ztz/iZ9pI=
github.com/shirou/gopsutil/v3 v3.24.5/go.mod h1:bsoOS1aStSs9ErQ1WWfxllSeS1K5D+U30r2NfcubMVk=
github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM=
github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ=
github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU=
github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU=
github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI=
github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk=
github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY=
github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

14
heartbeat.service Normal file
View File

@@ -0,0 +1,14 @@
[Unit]
Description=heartbeat server monitor
After=network-online.target
Wants=network-online.target
[Service]
User=root
WorkingDirectory=/opt/heartbeat
ExecStart=/opt/heartbeat/heartbeat --config /opt/heartbeat/config.yaml
Restart=always
RestartSec=5
[Install]
WantedBy=multi-user.target

130
internal/alerts/alerts.go Normal file
View File

@@ -0,0 +1,130 @@
package alerts
import (
"fmt"
"sort"
"strings"
"time"
"heartbeat/internal/config"
"heartbeat/internal/metrics"
)
// Severity is the health level assigned to a metric or site check.
type Severity string
// Known severity levels. The string values are what gets stored in the
// evaluator state and shown in notifications.
const (
SeverityHealthy Severity = "healthy"
SeverityWarning Severity = "warning"
SeverityCritical Severity = "critical"
)
// Event describes one severity change that should be reported.
type Event struct {
// Key identifies the metric or site the event belongs to, e.g. "disk_used"
// or "site:<name>".
Key string
// Severity is the new severity for Key.
Severity Severity
// Title is the short headline of the notification.
Title string
// Body is the longer human-readable description.
Body string
}
// Evaluator turns samples into events and remembers the last severity seen
// for every key so that only changes are reported.
type Evaluator struct {
// states maps an event key to the severity recorded on the previous evaluation.
states map[string]Severity
}
// NewEvaluator returns an Evaluator with an empty, ready-to-use state map.
func NewEvaluator() *Evaluator {
	tracker := new(Evaluator)
	tracker.states = map[string]Severity{}
	return tracker
}
// Evaluate compares every metric in sample against the thresholds in cfg,
// checks every site result, and returns the events produced by severity
// changes since the previous call. The returned slice is never nil and is
// sorted by event key.
func (e *Evaluator) Evaluate(cfg config.Config, sample metrics.Sample) []Event {
	limits := cfg.Thresholds

	// reading is one evaluated check waiting to be diffed against stored state.
	type reading struct {
		key      string
		severity Severity
		title    string
		body     string
	}

	readings := []reading{
		{
			key:      "disk_used",
			severity: compareHigh(sample.RootUsedPercent, limits.DiskUsedPercentWarning, limits.DiskUsedPercentCritical),
			title:    fmt.Sprintf("Root disk usage %.1f%%", sample.RootUsedPercent),
			body:     fmt.Sprintf("Root partition is %.1f%% used with %.1f GB free.", sample.RootUsedPercent, sample.RootFreeGB),
		},
		{
			key:      "free_gb",
			severity: compareLow(sample.RootFreeGB, limits.FreeGBWarning, limits.FreeGBCritical),
			title:    fmt.Sprintf("Low disk space %.1f GB", sample.RootFreeGB),
			body:     fmt.Sprintf("Root partition free space is %.1f GB.", sample.RootFreeGB),
		},
		{
			key:      "inode_used",
			severity: compareHigh(sample.InodeUsedPercent, limits.InodeUsedPercentWarning, limits.InodeUsedPercentCritical),
			title:    fmt.Sprintf("Inode usage %.1f%%", sample.InodeUsedPercent),
			body:     fmt.Sprintf("Root partition inode usage is %.1f%%.", sample.InodeUsedPercent),
		},
		{
			key:      "memory_used",
			severity: compareHigh(sample.MemoryUsedPercent, limits.MemoryUsedPercentWarning, limits.MemoryUsedPercentCritical),
			title:    fmt.Sprintf("Memory usage %.1f%%", sample.MemoryUsedPercent),
			body:     fmt.Sprintf("Memory usage is %.1f%%.", sample.MemoryUsedPercent),
		},
		{
			key:      "swap_used",
			severity: compareHigh(sample.SwapUsedPercent, limits.SwapUsedPercentWarning, limits.SwapUsedPercentCritical),
			title:    fmt.Sprintf("Swap usage %.1f%%", sample.SwapUsedPercent),
			body:     fmt.Sprintf("Swap usage is %.1f%%.", sample.SwapUsedPercent),
		},
		{
			key:      "cpu_avg_15m",
			severity: compareHigh(sample.CPUAvg15mPercent, limits.CPUAvg15mWarning, limits.CPUAvg15mCritical),
			title:    fmt.Sprintf("CPU 15m average %.1f%%", sample.CPUAvg15mPercent),
			body:     fmt.Sprintf("CPU rolling 15-minute average is %.1f%%.", sample.CPUAvg15mPercent),
		},
		{
			key:      "cpu_avg_12h",
			severity: compareHigh(sample.CPUAvg12hPercent, limits.CPUAvg12hWarning, limits.CPUAvg12hCritical),
			title:    fmt.Sprintf("CPU 12h average %.1f%%", sample.CPUAvg12hPercent),
			body:     fmt.Sprintf("CPU rolling 12-hour average is %.1f%%.", sample.CPUAvg12hPercent),
		},
		{
			key:      "load_per_core",
			severity: compareHigh(sample.LoadPerCore, limits.LoadPerCoreWarning, limits.LoadPerCoreCritical),
			title:    fmt.Sprintf("Load per core %.2f", sample.LoadPerCore),
			body:     fmt.Sprintf("15-minute load per core is %.2f.", sample.LoadPerCore),
		},
		{
			key:      "process_count",
			severity: compareHigh(float64(sample.ProcessCount), float64(limits.ProcessCountWarning), float64(limits.ProcessCountCritical)),
			title:    fmt.Sprintf("Process count %d", sample.ProcessCount),
			body:     fmt.Sprintf("Process count is %d.", sample.ProcessCount),
		},
	}

	// A site is either healthy or critical; there is no warning level for sites.
	for _, site := range sample.Sites {
		entry := reading{key: "site:" + site.Name}
		if site.Healthy {
			entry.severity = SeverityHealthy
			entry.title = fmt.Sprintf("Site %s reachable", site.Name)
			entry.body = fmt.Sprintf("%s responded with %d in %s.", site.URL, site.StatusCode, site.Latency.Round(time.Millisecond))
		} else {
			entry.severity = SeverityCritical
			entry.title = fmt.Sprintf("Site %s failed", site.Name)
			entry.body = fmt.Sprintf("%s check failed: %s.", site.URL, site.ErrorMessage)
		}
		readings = append(readings, entry)
	}

	events := []Event{}
	for _, entry := range readings {
		events = append(events, e.metricEvent(entry.key, entry.severity, entry.title, entry.body)...)
	}
	sort.Slice(events, func(a, b int) bool { return events[a].Key < events[b].Key })
	return events
}
// metricEvent records severity as the current state for key and returns the
// event to emit, if any:
//   - nothing when the severity is unchanged,
//   - nothing when a key is seen for the first time and is healthy,
//   - a "Recovered: ..." event when a previously unhealthy key becomes healthy,
//   - a plain event in every other case.
func (e *Evaluator) metricEvent(key string, severity Severity, title string, body string) []Event {
	last := e.states[key]
	firstSeen := last == ""
	if !firstSeen && last == severity {
		return nil
	}
	e.states[key] = severity

	event := Event{Key: key, Severity: severity, Title: title, Body: body}
	if severity == SeverityHealthy {
		if firstSeen {
			// Healthy on first sight is not worth a notification.
			return nil
		}
		event.Title = recoveryTitle(title)
	}
	return []Event{event}
}
// recoveryTitle prefixes title with the marker used for recovery events.
func recoveryTitle(title string) string {
	const prefix = "Recovered: "
	return prefix + title
}
// compareHigh classifies value for metrics where larger is worse. A threshold
// that is not positive is treated as disabled. The critical threshold is
// checked before the warning threshold.
func compareHigh(value float64, warning float64, critical float64) Severity {
	if critical > 0 && value >= critical {
		return SeverityCritical
	}
	if warning > 0 && value >= warning {
		return SeverityWarning
	}
	return SeverityHealthy
}
// compareLow classifies value for metrics where smaller is worse. A threshold
// that is not positive is treated as disabled. The critical threshold is
// checked before the warning threshold.
func compareLow(value float64, warning float64, critical float64) Severity {
	if critical > 0 && value <= critical {
		return SeverityCritical
	}
	if warning > 0 && value <= warning {
		return SeverityWarning
	}
	return SeverityHealthy
}
// FormatSummary renders sample as a plain-text, newline-separated report.
// A final "Sites ..." line listing name=up/down pairs is appended only when
// the sample contains site results.
func FormatSummary(sample metrics.Sample) string {
	report := make([]string, 0, 7)
	report = append(report, fmt.Sprintf("CPU now %.1f%% | 15m %.1f%% | 12h %.1f%%", sample.CPUCurrentPercent, sample.CPUAvg15mPercent, sample.CPUAvg12hPercent))
	report = append(report, fmt.Sprintf("Memory %.1f%% | Swap %.1f%%", sample.MemoryUsedPercent, sample.SwapUsedPercent))
	report = append(report, fmt.Sprintf("Disk / %.1f%% used | %.1f GB free | Inodes %.1f%%", sample.RootUsedPercent, sample.RootFreeGB, sample.InodeUsedPercent))
	report = append(report, fmt.Sprintf("Load %.2f / %.2f / %.2f | Per core %.2f", sample.Load1, sample.Load5, sample.Load15, sample.LoadPerCore))
	report = append(report, fmt.Sprintf("Network RX %.1f KB/s | TX %.1f KB/s", sample.RXBytesPerSecond/1024, sample.TXBytesPerSecond/1024))
	report = append(report, fmt.Sprintf("Processes %d | Hosted sites %d", sample.ProcessCount, sample.HostedServiceCount))

	if len(sample.Sites) > 0 {
		states := make([]string, len(sample.Sites))
		for i := range sample.Sites {
			label := "down"
			if sample.Sites[i].Healthy {
				label = "up"
			}
			states[i] = fmt.Sprintf("%s=%s", sample.Sites[i].Name, label)
		}
		report = append(report, "Sites "+strings.Join(states, ", "))
	}
	return strings.Join(report, "\n")
}

82
internal/app/app.go Normal file
View File

@@ -0,0 +1,82 @@
package app
import (
"context"
"log"
"time"
"heartbeat/internal/alerts"
"heartbeat/internal/config"
"heartbeat/internal/discord"
"heartbeat/internal/metrics"
)
// Runner wires together sampling, alert evaluation and Discord delivery.
type Runner struct {
// cfg is the configuration the runner was created with.
cfg config.Config
// sampler collects one metrics.Sample per call.
sampler *metrics.Sampler
// evaluator turns samples into alert events.
evaluator *alerts.Evaluator
// discord delivers summaries and events to the webhook.
discord *discord.Client
}
// New builds a Runner from cfg. The sampler and the Discord client both use
// cfg.RequestTimeout. The returned error is currently always nil.
func New(cfg config.Config) (*Runner, error) {
	runner := new(Runner)
	runner.cfg = cfg
	runner.sampler = metrics.NewSampler(cfg.RequestTimeout)
	runner.evaluator = alerts.NewEvaluator()
	runner.discord = discord.New(cfg.ServerName, cfg.DiscordWebhookURL, cfg.NotifyRoleID, cfg.RequestTimeout)
	return runner, nil
}
// Run performs one immediate tick (which also sends a summary) and then
// loops: every SampleInterval it samples and evaluates alerts, every
// SummaryInterval it sends a summary. Failures are logged and never stop the
// loop. Run returns nil once ctx is cancelled.
func (r *Runner) Run(ctx context.Context) error {
	if startErr := r.tick(ctx, true); startErr != nil {
		log.Printf("initial tick failed: %v", startErr)
	}

	samples := time.NewTicker(r.cfg.SampleInterval)
	defer samples.Stop()
	summaries := time.NewTicker(r.cfg.SummaryInterval)
	defer summaries.Stop()

	cancelled := ctx.Done()
	for {
		select {
		case <-cancelled:
			return nil
		case <-samples.C:
			tickErr := r.tick(ctx, false)
			if tickErr != nil {
				log.Printf("sample tick failed: %v", tickErr)
			}
		case <-summaries.C:
			summaryErr := r.sendSummary(ctx)
			if summaryErr != nil {
				log.Printf("summary failed: %v", summaryErr)
			}
		}
	}
}
// tick collects one sample, optionally posts a summary of it, and posts every
// alert event the evaluator produces for it. Only a failed collection is
// returned as an error; delivery failures are logged and skipped.
func (r *Runner) tick(ctx context.Context, sendSummary bool) error {
	snapshot, collectErr := r.sampler.Collect(ctx, r.cfg)
	if collectErr != nil {
		return collectErr
	}

	if sendSummary {
		sendErr := r.discord.SendSummary(ctx, snapshot, r.cfg.SummaryInterval)
		if sendErr != nil {
			log.Printf("summary send failed: %v", sendErr)
		}
	}

	pending := r.evaluator.Evaluate(r.cfg, snapshot)
	for i := range pending {
		if sendErr := r.discord.SendEvent(ctx, snapshot, pending[i]); sendErr != nil {
			log.Printf("event send failed for %s: %v", pending[i].Key, sendErr)
		}
	}
	return nil
}
// sendSummary collects a fresh sample and posts it as a summary. It returns
// the collection error, or otherwise the delivery result.
func (r *Runner) sendSummary(ctx context.Context) error {
	snapshot, collectErr := r.sampler.Collect(ctx, r.cfg)
	if collectErr == nil {
		return r.discord.SendSummary(ctx, snapshot, r.cfg.SummaryInterval)
	}
	return collectErr
}

117
internal/config/config.go Normal file
View File

@@ -0,0 +1,117 @@
package config
import (
"fmt"
"os"
"time"
"gopkg.in/yaml.v3"
)
// Config is the root of the YAML configuration file.
type Config struct {
// ServerName labels notifications; required by Validate.
ServerName string `yaml:"server_name"`
// DiscordWebhookURL is the webhook notifications are posted to; required by Validate.
DiscordWebhookURL string `yaml:"discord_webhook_url"`
// NotifyRoleID, when non-empty, is the role mentioned on warning and critical events.
NotifyRoleID string `yaml:"notify_role_id"`
// SampleInterval is the period between samples; applyDefaults sets 1 minute when zero.
SampleInterval time.Duration `yaml:"sample_interval"`
// SummaryInterval is the period between summaries; applyDefaults sets 6 hours when zero.
SummaryInterval time.Duration `yaml:"summary_interval"`
// RequestTimeout is the HTTP timeout; applyDefaults sets 10 seconds when zero.
RequestTimeout time.Duration `yaml:"request_timeout"`
// Thresholds holds the warning/critical limits per metric.
Thresholds Thresholds `yaml:"thresholds"`
// Sites lists the HTTP endpoints to check.
Sites []Site `yaml:"sites"`
}
// Thresholds holds one warning and one critical limit per monitored metric.
// Only the process-count limits receive defaults in applyDefaults (350/500);
// every other limit stays at its YAML value, zero when omitted.
type Thresholds struct {
// Root filesystem usage, in percent.
DiskUsedPercentWarning float64 `yaml:"disk_used_percent_warning"`
DiskUsedPercentCritical float64 `yaml:"disk_used_percent_critical"`
// Root filesystem free space, in GB.
FreeGBWarning float64 `yaml:"free_gb_warning"`
FreeGBCritical float64 `yaml:"free_gb_critical"`
// Root filesystem inode usage, in percent.
InodeUsedPercentWarning float64 `yaml:"inode_used_percent_warning"`
InodeUsedPercentCritical float64 `yaml:"inode_used_percent_critical"`
// Memory usage, in percent.
MemoryUsedPercentWarning float64 `yaml:"memory_used_percent_warning"`
MemoryUsedPercentCritical float64 `yaml:"memory_used_percent_critical"`
// Swap usage, in percent.
SwapUsedPercentWarning float64 `yaml:"swap_used_percent_warning"`
SwapUsedPercentCritical float64 `yaml:"swap_used_percent_critical"`
// Rolling 15-minute CPU average, in percent.
CPUAvg15mWarning float64 `yaml:"cpu_avg_15m_warning"`
CPUAvg15mCritical float64 `yaml:"cpu_avg_15m_critical"`
// Rolling 12-hour CPU average, in percent.
CPUAvg12hWarning float64 `yaml:"cpu_avg_12h_warning"`
CPUAvg12hCritical float64 `yaml:"cpu_avg_12h_critical"`
// Load average divided by core count.
LoadPerCoreWarning float64 `yaml:"load_per_core_warning"`
LoadPerCoreCritical float64 `yaml:"load_per_core_critical"`
// Number of processes.
ProcessCountWarning int `yaml:"process_count_warning"`
ProcessCountCritical int `yaml:"process_count_critical"`
}
// Site describes one HTTP endpoint to check.
type Site struct {
// Name identifies the site in notifications; required by Validate.
Name string `yaml:"name"`
// URL is the address requested by the check; required by Validate.
URL string `yaml:"url"`
// ExpectedStatus is the HTTP status that counts as healthy; applyDefaults sets 200 when zero.
ExpectedStatus int `yaml:"expected_status"`
// Timeout bounds a single check; applyDefaults sets Config.RequestTimeout when zero.
Timeout time.Duration `yaml:"timeout"`
}
// Load reads the YAML file at path, fills in defaults for omitted settings
// and validates the result.
//
// On failure it returns a zero Config and an error that wraps the underlying
// cause (inspectable with errors.Is / errors.As) and says which stage failed:
// reading, parsing or validating.
func Load(path string) (Config, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		// os.ReadFile errors already carry the path.
		return Config{}, fmt.Errorf("reading file: %w", err)
	}

	var cfg Config
	if err := yaml.Unmarshal(data, &cfg); err != nil {
		return Config{}, fmt.Errorf("parsing %q: %w", path, err)
	}

	// Defaults must be applied before validation: Validate rejects zero intervals.
	applyDefaults(&cfg)
	if err := cfg.Validate(); err != nil {
		return Config{}, fmt.Errorf("validating %q: %w", path, err)
	}
	return cfg, nil
}
// applyDefaults replaces zero-valued settings in cfg with built-in defaults:
// 1m sample interval, 6h summary interval, 10s request timeout, HTTP 200 and
// the request timeout for each site, and 350/500 for the process-count
// warning/critical thresholds.
func applyDefaults(cfg *Config) {
	orDuration := func(target *time.Duration, fallback time.Duration) {
		if *target == 0 {
			*target = fallback
		}
	}
	orInt := func(target *int, fallback int) {
		if *target == 0 {
			*target = fallback
		}
	}

	orDuration(&cfg.SampleInterval, time.Minute)
	orDuration(&cfg.SummaryInterval, 6*time.Hour)
	orDuration(&cfg.RequestTimeout, 10*time.Second)

	// Sites inherit the (already defaulted) global request timeout.
	for i := range cfg.Sites {
		site := &cfg.Sites[i]
		orInt(&site.ExpectedStatus, 200)
		orDuration(&site.Timeout, cfg.RequestTimeout)
	}

	orInt(&cfg.Thresholds.ProcessCountWarning, 350)
	orInt(&cfg.Thresholds.ProcessCountCritical, 500)
}
// Validate reports the first problem found in cfg, or nil when cfg is usable.
//
// It requires a server name and webhook URL, positive sample/summary
// intervals and request timeout, and for every site a name, a URL, a positive
// timeout and a name that no other site uses. Validate expects defaults to
// have been applied already (see Load).
func (cfg Config) Validate() error {
	if cfg.ServerName == "" {
		return fmt.Errorf("server_name is required")
	}
	if cfg.DiscordWebhookURL == "" {
		return fmt.Errorf("discord_webhook_url is required")
	}
	if cfg.SampleInterval <= 0 {
		return fmt.Errorf("sample_interval must be > 0")
	}
	if cfg.SummaryInterval <= 0 {
		return fmt.Errorf("summary_interval must be > 0")
	}
	// A non-positive timeout makes every HTTP request fail immediately.
	if cfg.RequestTimeout <= 0 {
		return fmt.Errorf("request_timeout must be > 0")
	}

	seen := make(map[string]struct{}, len(cfg.Sites))
	for _, site := range cfg.Sites {
		if site.Name == "" {
			return fmt.Errorf("site name is required")
		}
		if site.URL == "" {
			return fmt.Errorf("site URL is required for %s", site.Name)
		}
		// A non-positive timeout would mark the site as down on every check.
		if site.Timeout <= 0 {
			return fmt.Errorf("site timeout must be > 0 for %s", site.Name)
		}
		// Alert state is keyed by site name, so two sites sharing a name
		// would overwrite each other's state and alternate alerts.
		if _, dup := seen[site.Name]; dup {
			return fmt.Errorf("duplicate site name %q", site.Name)
		}
		seen[site.Name] = struct{}{}
	}
	return nil
}

201
internal/discord/discord.go Normal file
View File

@@ -0,0 +1,201 @@
package discord
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"strings"
"time"
"heartbeat/internal/alerts"
"heartbeat/internal/metrics"
)
// Client posts heartbeat notifications to a Discord webhook.
type Client struct {
// webhookURL is the endpoint every payload is POSTed to.
webhookURL string
// httpClient performs the requests with the timeout given to New.
httpClient *http.Client
// serverName labels every notification.
serverName string
// notifyRoleID, when non-empty, is mentioned on warning and critical events.
notifyRoleID string
}
// webhookPayload is the JSON body sent to the webhook.
type webhookPayload struct {
// Content is the plain message text; this package only uses it for the role mention.
Content string `json:"content,omitempty"`
// NOTE(review): encoding/json does not treat a struct value as empty, so
// omitempty likely has no effect here and "allowed_mentions" is always
// serialized — confirm whether a pointer field was intended.
AllowedMentions allowedMentions `json:"allowed_mentions,omitempty"`
// Embeds carries the rich message content.
Embeds []embed `json:"embeds"`
}
// allowedMentions lists which mentions in Content may actually notify.
type allowedMentions struct {
// Roles holds the role IDs that may be pinged.
Roles []string `json:"roles,omitempty"`
}
// embed is a single rich-content block of a webhook message.
type embed struct {
Title string `json:"title"`
Description string `json:"description,omitempty"`
// Color is the accent colour as a 0xRRGGBB integer.
Color int `json:"color"`
// Timestamp is an RFC 3339 time string; omitted when empty.
Timestamp string `json:"timestamp,omitempty"`
Fields []embedField `json:"fields,omitempty"`
// Footer is optional; nil omits it from the JSON.
Footer *embedFooter `json:"footer,omitempty"`
}
// embedField is one name/value pair shown inside an embed.
type embedField struct {
Name string `json:"name"`
Value string `json:"value"`
// Inline is always serialized, including when false.
Inline bool `json:"inline"`
}
// embedFooter is the footer line of an embed.
type embedFooter struct {
Text string `json:"text"`
}
// New returns a Client that posts to webhookURL, labels messages with
// serverName, mentions notifyRoleID on alerts (when non-empty) and limits
// every request to timeout.
func New(serverName string, webhookURL string, notifyRoleID string, timeout time.Duration) *Client {
	client := new(Client)
	client.serverName = serverName
	client.webhookURL = webhookURL
	client.notifyRoleID = notifyRoleID
	client.httpClient = &http.Client{Timeout: timeout}
	return client
}
// SendSummary posts a single embed describing sample: server, generation
// time, uptime, CPU, load, memory, disk, network, process count and site
// states. summaryInterval only appears in the embed title. It returns the
// result of the webhook request.
func (c *Client) SendSummary(ctx context.Context, sample metrics.Sample, summaryInterval time.Duration) error {
	// column builds an inline field; every field except "Sites" is inline.
	column := func(name string, value string) embedField {
		return embedField{Name: name, Value: value, Inline: true}
	}

	snapshot := embed{
		Title:       fmt.Sprintf("heartbeat (%s) - %s", formatSummaryIntervalHours(summaryInterval), c.serverName),
		Description: "Scheduled server health snapshot.",
		Color:       0x2D9CDB,
		Footer:      &embedFooter{Text: formatSummaryFooter(c.serverName)},
	}
	snapshot.Fields = []embedField{
		column("Server", c.serverName),
		column("Generated", formatSummaryTime(sample.Timestamp)),
		column("Uptime", formatUptime(sample.UptimeSeconds)),
		column("CPU", fmt.Sprintf("Now: %.1f%%\n15m: %.1f%%\n12h: %.1f%%", sample.CPUCurrentPercent, sample.CPUAvg15mPercent, sample.CPUAvg12hPercent)),
		column("Load", fmt.Sprintf("1m: %.2f\n5m: %.2f\n15m/core: %.2f", sample.Load1, sample.Load5, sample.LoadPerCore)),
		column("Memory", fmt.Sprintf("RAM: %.1f%%\nSwap: %.1f%%", sample.MemoryUsedPercent, sample.SwapUsedPercent)),
		column("Disk /", fmt.Sprintf("Used: %.1f%%\nFree: %.1f GB\nInodes: %.1f%%", sample.RootUsedPercent, sample.RootFreeGB, sample.InodeUsedPercent)),
		column("Network", fmt.Sprintf("RX: %s\nTX: %s", formatRate(sample.RXBytesPerSecond), formatRate(sample.TXBytesPerSecond))),
		column("Processes", fmt.Sprintf("Count: %d", sample.ProcessCount)),
		{Name: "Sites", Value: formatSiteStatuses(sample.Sites), Inline: false},
	}

	return c.send(ctx, webhookPayload{Embeds: []embed{snapshot}})
}
// SendEvent posts event as a single embed coloured by severity and stamped
// with the sample time. Warning and critical events additionally mention the
// configured role, when one is set. It returns the result of the webhook
// request.
func (c *Client) SendEvent(ctx context.Context, sample metrics.Sample, event alerts.Event) error {
	alert := embed{
		Title:       event.Title,
		Description: event.Body,
		Color:       colorForSeverity(event.Severity),
		Timestamp:   sample.Timestamp.Format(time.RFC3339),
		Fields: []embedField{
			{Name: "Server", Value: c.serverName, Inline: true},
			{Name: "Severity", Value: string(event.Severity), Inline: true},
			{Name: "Timestamp", Value: formatEventTime(sample.Timestamp), Inline: true},
		},
	}
	message := webhookPayload{Embeds: []embed{alert}}

	// Recoveries (healthy) never ping anyone.
	unhealthy := event.Severity == alerts.SeverityWarning || event.Severity == alerts.SeverityCritical
	if unhealthy && c.notifyRoleID != "" {
		message.Content = fmt.Sprintf("<@&%s>", c.notifyRoleID)
		message.AllowedMentions = allowedMentions{Roles: []string{c.notifyRoleID}}
	}
	return c.send(ctx, message)
}
// formatUptime renders an uptime given in seconds as "Xd Yh Zm", or as
// "Yh Zm" when it is shorter than one day. Seconds are dropped.
func formatUptime(totalSeconds uint64) string {
	const day = 24 * time.Hour

	remaining := time.Duration(totalSeconds) * time.Second
	days := remaining / day
	remaining %= day
	hours := remaining / time.Hour
	minutes := (remaining % time.Hour) / time.Minute

	if days <= 0 {
		return fmt.Sprintf("%dh %dm", hours, minutes)
	}
	return fmt.Sprintf("%dd %dh %dm", days, hours, minutes)
}
// formatSiteStatuses renders one line per site: "name: UP (code in latency)"
// for healthy sites and "name: DOWN (error message)" otherwise. It returns a
// fixed placeholder when sites is empty.
func formatSiteStatuses(sites []metrics.SiteStatus) string {
	if len(sites) == 0 {
		return "No site checks configured"
	}

	lines := make([]string, len(sites))
	for i := range sites {
		current := sites[i]
		if current.Healthy {
			reply := fmt.Sprintf("%d in %s", current.StatusCode, current.Latency.Round(time.Millisecond))
			lines[i] = fmt.Sprintf("%s: %s (%s)", current.Name, "UP", reply)
			continue
		}
		lines[i] = fmt.Sprintf("%s: %s (%s)", current.Name, "DOWN", current.ErrorMessage)
	}
	return strings.Join(lines, "\n")
}
// formatSummaryTime renders timestamp as a "<t:UNIX:f>" tag built from its
// Unix time in seconds.
func formatSummaryTime(timestamp time.Time) string {
	tag := fmt.Sprintf("<t:%d:f>", timestamp.Unix())
	return tag
}
// formatEventTime renders timestamp as two tags built from its Unix time in
// seconds: "<t:UNIX:f> - <t:UNIX:R>".
func formatEventTime(timestamp time.Time) string {
	epoch := timestamp.Unix()
	tags := fmt.Sprintf("<t:%d:f> - <t:%d:R>", epoch, epoch)
	return tags
}
// formatSummaryFooter returns the footer text for summary embeds:
// "heartbeat - <serverName>".
func formatSummaryFooter(serverName string) string {
	footer := fmt.Sprintf("heartbeat - %s", serverName)
	return footer
}
// formatSummaryIntervalHours renders interval in hours: as a whole number
// ("6h") when it is an exact multiple of an hour, otherwise with two decimals
// ("1.50h").
func formatSummaryIntervalHours(interval time.Duration) string {
	wholeHours := interval / time.Hour
	if leftover := interval % time.Hour; leftover != 0 {
		return fmt.Sprintf("%.2fh", interval.Hours())
	}
	return fmt.Sprintf("%dh", wholeHours)
}
// formatRate renders a byte rate using the largest fitting unit among B/s,
// KB/s and MB/s, each step being a factor of 1024. Negative rates are
// rendered as "n/a".
func formatRate(bytesPerSecond float64) string {
	switch {
	case bytesPerSecond < 0:
		return "n/a"
	case bytesPerSecond < 1024:
		return fmt.Sprintf("%.0f B/s", bytesPerSecond)
	case bytesPerSecond/1024 < 1024:
		return fmt.Sprintf("%.2f KB/s", bytesPerSecond/1024)
	default:
		return fmt.Sprintf("%.2f MB/s", bytesPerSecond/1024/1024)
	}
}
// send JSON-encodes payload and POSTs it to the webhook URL.
//
// It returns an error when encoding fails, when the request cannot be built
// or performed, or when the response status is outside the 2xx range.
// Errors never contain the webhook URL: the URL embeds the webhook's secret
// token, and callers log these errors.
func (c *Client) send(ctx context.Context, payload webhookPayload) error {
	buffer := &bytes.Buffer{}
	if err := json.NewEncoder(buffer).Encode(payload); err != nil {
		return fmt.Errorf("encode discord payload: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.webhookURL, buffer)
	if err != nil {
		return fmt.Errorf("build discord webhook request: %w", withoutRequestURL(err))
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("discord webhook request: %w", withoutRequestURL(err))
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("discord webhook returned %s", resp.Status)
	}
	return nil
}

// withoutRequestURL removes the outermost wrapper from err when it has one.
// net/http reports request failures (and URL parse failures) as *url.Error,
// whose message includes the complete request URL; unwrapping one level keeps
// the underlying cause while dropping the URL. Errors that wrap nothing are
// returned unchanged.
func withoutRequestURL(err error) error {
	wrapper, ok := err.(interface{ Unwrap() error })
	if !ok {
		return err
	}
	if cause := wrapper.Unwrap(); cause != nil {
		return cause
	}
	return err
}
// colorForSeverity maps a severity to the embed colour: one value for
// critical, one for warning, and a third for everything else.
func colorForSeverity(severity alerts.Severity) int {
	if severity == alerts.SeverityCritical {
		return 0xE74C3C
	}
	if severity == alerts.SeverityWarning {
		return 0xF39C12
	}
	return 0x27AE60
}

261
internal/metrics/metrics.go Normal file
View File

@@ -0,0 +1,261 @@
package metrics
import (
"context"
"fmt"
"net"
"net/http"
"strings"
"time"
"heartbeat/internal/config"
"github.com/shirou/gopsutil/v3/cpu"
"github.com/shirou/gopsutil/v3/disk"
"github.com/shirou/gopsutil/v3/host"
"github.com/shirou/gopsutil/v3/load"
"github.com/shirou/gopsutil/v3/mem"
gnet "github.com/shirou/gopsutil/v3/net"
"github.com/shirou/gopsutil/v3/process"
)
// gib is the number of bytes in one GiB; it converts free-space byte counts
// into the RootFreeGB value.
const gib = 1024 * 1024 * 1024
// Sample is one snapshot of host metrics and site check results.
type Sample struct {
// Timestamp is the UTC time at which collection started.
Timestamp time.Time
// Root filesystem ("/") usage; RootFreeGB is free bytes divided by 1024^3.
RootUsedPercent float64
RootFreeGB float64
InodeUsedPercent float64
// CPU usage: the value measured for this sample and rolling averages over
// the sampler's history.
CPUCurrentPercent float64
CPUAvg15mPercent float64
CPUAvg12hPercent float64
MemoryUsedPercent float64
SwapUsedPercent float64
// Load averages; LoadPerCore is Load15 divided by the core count.
Load1 float64
Load5 float64
Load15 float64
LoadPerCore float64
// Network throughput in bytes per second since the previous sample.
RXBytesPerSecond float64
TXBytesPerSecond float64
ProcessCount int
// HostedServiceCount is the number of configured sites.
HostedServiceCount int
UptimeSeconds uint64
// Sites holds one result per configured site, in configuration order.
Sites []SiteStatus
}
// SiteStatus is the outcome of one HTTP check against a configured site.
type SiteStatus struct {
Name string
URL string
// Healthy is true when the response status equals ExpectedStatus.
Healthy bool
// StatusCode and Latency are only set when a response was received.
StatusCode int
Latency time.Duration
// ErrorMessage explains why the site is not healthy; empty when healthy.
ErrorMessage string
ExpectedStatus int
}
// Sampler collects samples and keeps the state needed between collections.
// NOTE(review): its methods mutate prevNet and history without any locking —
// presumably it is only used from a single goroutine; confirm.
type Sampler struct {
// httpClient performs the site checks.
httpClient *http.Client
// prevNet holds the network counters of the previous collection.
prevNet netIO
// history holds the most recent CPU readings, oldest first.
history []historyPoint
// historyCap is the maximum number of readings kept in history.
historyCap int
}
// netIO is a reading of the cumulative network byte counters at a point in time.
type netIO struct {
timestamp time.Time
// rx and tx are cumulative bytes received and sent.
rx uint64
tx uint64
}
// historyPoint is one timestamped CPU usage reading, in percent.
type historyPoint struct {
timestamp time.Time
cpu float64
}
// NewSampler returns a Sampler whose HTTP client applies timeout to dialing,
// to the TLS handshake and to each request as a whole, and honours proxy
// settings from the environment. Its CPU history holds at most 720 readings.
// NOTE(review): 720 readings span 12 hours only at one reading per minute —
// confirm against the configured sample interval.
func NewSampler(timeout time.Duration) *Sampler {
	dialer := &net.Dialer{Timeout: timeout}
	client := &http.Client{
		Timeout: timeout,
		Transport: &http.Transport{
			Proxy:               http.ProxyFromEnvironment,
			DialContext:         dialer.DialContext,
			TLSHandshakeTimeout: timeout,
		},
	}

	sampler := new(Sampler)
	sampler.httpClient = client
	sampler.historyCap = 12 * 60
	return sampler
}
// Collect gathers one Sample: root filesystem usage, CPU usage (measured over
// a one-second interval), memory, swap, load averages, host uptime, process
// count, network throughput since the previous call, and the result of every
// site check in cfg.Sites. The CPU reading is appended to the history before
// the rolling averages are computed.
//
// A failure to read a host metric aborts the collection and is returned with
// a prefix naming the metric. Network and site failures do not: they show up
// as zero rates and unhealthy site results.
func (s *Sampler) Collect(ctx context.Context, cfg config.Config) (Sample, error) {
	takenAt := time.Now().UTC()

	rootFS, err := disk.UsageWithContext(ctx, "/")
	if err != nil {
		return Sample{}, fmt.Errorf("root usage: %w", err)
	}
	busy, err := cpu.PercentWithContext(ctx, time.Second, false)
	if err != nil {
		return Sample{}, fmt.Errorf("cpu percent: %w", err)
	}
	ram, err := mem.VirtualMemoryWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("memory: %w", err)
	}
	swap, err := mem.SwapMemoryWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("swap: %w", err)
	}
	loadAvg, err := load.AvgWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("load average: %w", err)
	}
	info, err := host.InfoWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("host info: %w", err)
	}
	procs, err := process.ProcessesWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("process list: %w", err)
	}

	rx, tx := s.networkRates(ctx, takenAt)
	siteResults := s.checkSites(ctx, cfg.Sites)

	var cpuNow float64
	if len(busy) > 0 {
		cpuNow = busy[0]
	}
	s.appendHistory(takenAt, cpuNow)

	// Fall back to one core so the per-core load is always defined.
	cores, err := cpu.CountsWithContext(ctx, true)
	if err != nil || cores == 0 {
		cores = 1
	}

	result := Sample{Timestamp: takenAt, Sites: siteResults}
	result.RootUsedPercent = rootFS.UsedPercent
	result.RootFreeGB = float64(rootFS.Free) / gib
	result.InodeUsedPercent = inodeUsedPercent(rootFS)
	result.CPUCurrentPercent = cpuNow
	result.CPUAvg15mPercent = s.averageCPU(15 * time.Minute)
	result.CPUAvg12hPercent = s.averageCPU(12 * time.Hour)
	result.MemoryUsedPercent = ram.UsedPercent
	result.SwapUsedPercent = swap.UsedPercent
	result.Load1 = loadAvg.Load1
	result.Load5 = loadAvg.Load5
	result.Load15 = loadAvg.Load15
	result.LoadPerCore = loadAvg.Load15 / float64(cores)
	result.RXBytesPerSecond = rx
	result.TXBytesPerSecond = tx
	result.ProcessCount = len(procs)
	result.HostedServiceCount = len(cfg.Sites)
	result.UptimeSeconds = info.Uptime
	return result, nil
}
// appendHistory records a CPU reading and drops the oldest readings so that
// at most historyCap remain.
func (s *Sampler) appendHistory(timestamp time.Time, cpuPercent float64) {
	reading := historyPoint{timestamp: timestamp, cpu: cpuPercent}
	s.history = append(s.history, reading)
	if excess := len(s.history) - s.historyCap; excess > 0 {
		s.history = s.history[excess:]
	}
}
// averageCPU returns the mean of the CPU readings recorded within the last
// window, measured back from the current time. It returns 0 when no reading
// falls inside the window.
func (s *Sampler) averageCPU(window time.Duration) float64 {
	if len(s.history) == 0 {
		return 0
	}

	oldest := time.Now().UTC().Add(-window)
	var sum float64
	var used int
	for i := range s.history {
		if s.history[i].timestamp.Before(oldest) {
			continue
		}
		sum += s.history[i].cpu
		used++
	}
	if used == 0 {
		return 0
	}
	return sum / float64(used)
}
// networkRates returns the receive and transmit throughput, in bytes per
// second, between the previous call and now, using the aggregated interface
// counters.
//
// It returns 0, 0 when the counters cannot be read, on the first call (which
// only records a baseline) and when no time has elapsed since the baseline.
// A direction whose counter went backwards — the counters were reset since
// the baseline — reports 0 for that interval instead of the huge value an
// unsigned subtraction would produce.
func (s *Sampler) networkRates(ctx context.Context, now time.Time) (float64, float64) {
	stats, err := gnet.IOCountersWithContext(ctx, false)
	if err != nil || len(stats) == 0 {
		return 0, 0
	}

	current := netIO{timestamp: now, rx: stats[0].BytesRecv, tx: stats[0].BytesSent}
	previous := s.prevNet
	if previous.timestamp.IsZero() {
		s.prevNet = current
		return 0, 0
	}

	seconds := current.timestamp.Sub(previous.timestamp).Seconds()
	if seconds <= 0 {
		// Keep the old baseline; the next call measures a longer interval.
		return 0, 0
	}
	s.prevNet = current

	// rate guards against uint64 underflow when a counter has been reset.
	rate := func(latest uint64, earlier uint64) float64 {
		if latest < earlier {
			return 0
		}
		return float64(latest-earlier) / seconds
	}
	return rate(current.rx, previous.rx), rate(current.tx, previous.tx)
}
// checkSites checks every site one after another and returns the results in
// the same order as sites.
func (s *Sampler) checkSites(ctx context.Context, sites []config.Site) []SiteStatus {
	statuses := make([]SiteStatus, len(sites))
	for i := range sites {
		statuses[i] = s.checkSite(ctx, sites[i])
	}
	return statuses
}
// checkSite issues one GET request to site.URL, bounded by site.Timeout, and
// reports the outcome. The site is healthy only when the response status
// equals site.ExpectedStatus. When the request cannot be built or performed,
// the result carries the error text and no status code or latency.
func (s *Sampler) checkSite(parent context.Context, site config.Site) SiteStatus {
	result := SiteStatus{Name: site.Name, URL: site.URL, ExpectedStatus: site.ExpectedStatus}

	ctx, cancel := context.WithTimeout(parent, site.Timeout)
	defer cancel()

	began := time.Now()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, site.URL, nil)
	if err != nil {
		result.ErrorMessage = err.Error()
		return result
	}

	resp, err := s.httpClient.Do(req)
	if err != nil {
		result.ErrorMessage = normalizeError(err)
		return result
	}
	defer resp.Body.Close()

	result.StatusCode = resp.StatusCode
	result.Latency = time.Since(began)
	if resp.StatusCode == site.ExpectedStatus {
		result.Healthy = true
		return result
	}
	result.ErrorMessage = fmt.Sprintf("expected %d, got %d", site.ExpectedStatus, resp.StatusCode)
	return result
}
// normalizeError returns err's message with a leading "Get " removed, if
// present.
func normalizeError(err error) string {
	return strings.TrimPrefix(err.Error(), "Get ")
}
// inodeUsedPercent returns the share of inodes in use, in percent, computed
// from the total and free inode counts. It returns 0 when the filesystem
// reports no inodes at all.
func inodeUsedPercent(stat *disk.UsageStat) float64 {
	total := stat.InodesTotal
	if total == 0 {
		return 0
	}
	inUse := total - stat.InodesFree
	return float64(inUse) / float64(total) * 100
}