This commit is contained in:
36
.gitea/workflows/build.yml
Normal file
36
.gitea/workflows/build.yml
Normal file
@@ -0,0 +1,36 @@
|
||||
name: Build heartbeat
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- '**/*.go'
|
||||
- 'go.mod'
|
||||
- 'go.sum'
|
||||
- '.gitea/workflows/build.yml'
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.22'
|
||||
cache: true
|
||||
|
||||
- name: Build binary
|
||||
run: |
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -trimpath -ldflags="-s -w" -o heartbeat ./cmd/heartbeat
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: heartbeat-linux-amd64
|
||||
path: heartbeat
|
||||
retention-days: 365
|
||||
22
.gitignore
vendored
Normal file
22
.gitignore
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
# built binary
|
||||
/heartbeat
|
||||
|
||||
# runtime config (contains webhook URL)
|
||||
/config.yaml
|
||||
|
||||
# Go build cache and test artifacts
|
||||
*.test
|
||||
*.out
|
||||
/vendor/
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Editor
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
34
cmd/heartbeat/main.go
Normal file
34
cmd/heartbeat/main.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"log"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
|
||||
"heartbeat/internal/app"
|
||||
"heartbeat/internal/config"
|
||||
)
|
||||
|
||||
func main() {
|
||||
configPath := flag.String("config", "./config.yaml", "Path to config file")
|
||||
flag.Parse()
|
||||
|
||||
cfg, err := config.Load(*configPath)
|
||||
if err != nil {
|
||||
log.Fatalf("load config: %v", err)
|
||||
}
|
||||
|
||||
runner, err := app.New(cfg)
|
||||
if err != nil {
|
||||
log.Fatalf("initialize app: %v", err)
|
||||
}
|
||||
|
||||
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||
defer stop()
|
||||
|
||||
if err := runner.Run(ctx); err != nil {
|
||||
log.Fatalf("run heartbeat: %v", err)
|
||||
}
|
||||
}
|
||||
33
config.example.yaml
Normal file
33
config.example.yaml
Normal file
@@ -0,0 +1,33 @@
|
||||
server_name: s1
|
||||
discord_webhook_url: https://discord.com/api/webhooks/replace/me
|
||||
notify_role_id: "1500565629390819479"
|
||||
|
||||
sample_interval: 1m
|
||||
summary_interval: 6h
|
||||
request_timeout: 10s
|
||||
|
||||
thresholds:
|
||||
disk_used_percent_warning: 85
|
||||
disk_used_percent_critical: 93
|
||||
free_gb_warning: 20
|
||||
free_gb_critical: 10
|
||||
inode_used_percent_warning: 85
|
||||
inode_used_percent_critical: 93
|
||||
memory_used_percent_warning: 90
|
||||
memory_used_percent_critical: 97
|
||||
swap_used_percent_warning: 25
|
||||
swap_used_percent_critical: 50
|
||||
cpu_avg_15m_warning: 80
|
||||
cpu_avg_15m_critical: 95
|
||||
cpu_avg_12h_warning: 65
|
||||
cpu_avg_12h_critical: 85
|
||||
load_per_core_warning: 0.9
|
||||
load_per_core_critical: 1.25
|
||||
process_count_warning: 350
|
||||
process_count_critical: 500
|
||||
|
||||
sites:
|
||||
- name: pismen.com
|
||||
url: https://pismen.com
|
||||
expected_status: 200
|
||||
timeout: 10s
|
||||
19
go.mod
Normal file
19
go.mod
Normal file
@@ -0,0 +1,19 @@
|
||||
module heartbeat
|
||||
|
||||
go 1.22.0
|
||||
|
||||
require (
|
||||
github.com/shirou/gopsutil/v3 v3.24.5
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/go-ole/go-ole v1.2.6 // indirect
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
|
||||
github.com/shoenig/go-m1cpu v0.1.6 // indirect
|
||||
github.com/tklauser/go-sysconf v0.3.12 // indirect
|
||||
github.com/tklauser/numcpus v0.6.1 // indirect
|
||||
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
||||
golang.org/x/sys v0.20.0 // indirect
|
||||
)
|
||||
38
go.sum
Normal file
38
go.sum
Normal file
@@ -0,0 +1,38 @@
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
|
||||
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
|
||||
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw=
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
|
||||
github.com/shirou/gopsutil/v3 v3.24.5 h1:i0t8kL+kQTvpAYToeuiVk3TgDeKOFioZO3Ztz/iZ9pI=
|
||||
github.com/shirou/gopsutil/v3 v3.24.5/go.mod h1:bsoOS1aStSs9ErQ1WWfxllSeS1K5D+U30r2NfcubMVk=
|
||||
github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM=
|
||||
github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ=
|
||||
github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU=
|
||||
github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU=
|
||||
github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI=
|
||||
github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk=
|
||||
github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY=
|
||||
github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
|
||||
github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
|
||||
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
|
||||
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
14
heartbeat.service
Normal file
14
heartbeat.service
Normal file
@@ -0,0 +1,14 @@
|
||||
[Unit]
|
||||
Description=heartbeat server monitor
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
User=root
|
||||
WorkingDirectory=/opt/heartbeat
|
||||
ExecStart=/opt/heartbeat/heartbeat --config /opt/heartbeat/config.yaml
|
||||
Restart=always
|
||||
RestartSec=5
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
130
internal/alerts/alerts.go
Normal file
130
internal/alerts/alerts.go
Normal file
@@ -0,0 +1,130 @@
|
||||
package alerts
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"heartbeat/internal/config"
|
||||
"heartbeat/internal/metrics"
|
||||
)
|
||||
|
||||
// Severity is the health level assigned to a single metric or site check.
type Severity string

// Known severity levels. The zero value ("") is not one of them; the
// evaluator uses it to recognise a key it has not seen before.
const (
	SeverityHealthy  = Severity("healthy")
	SeverityWarning  = Severity("warning")
	SeverityCritical = Severity("critical")
)
|
||||
|
||||
type Event struct {
|
||||
Key string
|
||||
Severity Severity
|
||||
Title string
|
||||
Body string
|
||||
}
|
||||
|
||||
type Evaluator struct {
|
||||
states map[string]Severity
|
||||
}
|
||||
|
||||
func NewEvaluator() *Evaluator {
|
||||
return &Evaluator{states: make(map[string]Severity)}
|
||||
}
|
||||
|
||||
func (e *Evaluator) Evaluate(cfg config.Config, sample metrics.Sample) []Event {
|
||||
events := []Event{}
|
||||
events = append(events, e.metricEvent("disk_used", compareHigh(sample.RootUsedPercent, cfg.Thresholds.DiskUsedPercentWarning, cfg.Thresholds.DiskUsedPercentCritical), fmt.Sprintf("Root disk usage %.1f%%", sample.RootUsedPercent), fmt.Sprintf("Root partition is %.1f%% used with %.1f GB free.", sample.RootUsedPercent, sample.RootFreeGB))...)
|
||||
events = append(events, e.metricEvent("free_gb", compareLow(sample.RootFreeGB, cfg.Thresholds.FreeGBWarning, cfg.Thresholds.FreeGBCritical), fmt.Sprintf("Low disk space %.1f GB", sample.RootFreeGB), fmt.Sprintf("Root partition free space is %.1f GB.", sample.RootFreeGB))...)
|
||||
events = append(events, e.metricEvent("inode_used", compareHigh(sample.InodeUsedPercent, cfg.Thresholds.InodeUsedPercentWarning, cfg.Thresholds.InodeUsedPercentCritical), fmt.Sprintf("Inode usage %.1f%%", sample.InodeUsedPercent), fmt.Sprintf("Root partition inode usage is %.1f%%.", sample.InodeUsedPercent))...)
|
||||
events = append(events, e.metricEvent("memory_used", compareHigh(sample.MemoryUsedPercent, cfg.Thresholds.MemoryUsedPercentWarning, cfg.Thresholds.MemoryUsedPercentCritical), fmt.Sprintf("Memory usage %.1f%%", sample.MemoryUsedPercent), fmt.Sprintf("Memory usage is %.1f%%.", sample.MemoryUsedPercent))...)
|
||||
events = append(events, e.metricEvent("swap_used", compareHigh(sample.SwapUsedPercent, cfg.Thresholds.SwapUsedPercentWarning, cfg.Thresholds.SwapUsedPercentCritical), fmt.Sprintf("Swap usage %.1f%%", sample.SwapUsedPercent), fmt.Sprintf("Swap usage is %.1f%%.", sample.SwapUsedPercent))...)
|
||||
events = append(events, e.metricEvent("cpu_avg_15m", compareHigh(sample.CPUAvg15mPercent, cfg.Thresholds.CPUAvg15mWarning, cfg.Thresholds.CPUAvg15mCritical), fmt.Sprintf("CPU 15m average %.1f%%", sample.CPUAvg15mPercent), fmt.Sprintf("CPU rolling 15-minute average is %.1f%%.", sample.CPUAvg15mPercent))...)
|
||||
events = append(events, e.metricEvent("cpu_avg_12h", compareHigh(sample.CPUAvg12hPercent, cfg.Thresholds.CPUAvg12hWarning, cfg.Thresholds.CPUAvg12hCritical), fmt.Sprintf("CPU 12h average %.1f%%", sample.CPUAvg12hPercent), fmt.Sprintf("CPU rolling 12-hour average is %.1f%%.", sample.CPUAvg12hPercent))...)
|
||||
events = append(events, e.metricEvent("load_per_core", compareHigh(sample.LoadPerCore, cfg.Thresholds.LoadPerCoreWarning, cfg.Thresholds.LoadPerCoreCritical), fmt.Sprintf("Load per core %.2f", sample.LoadPerCore), fmt.Sprintf("15-minute load per core is %.2f.", sample.LoadPerCore))...)
|
||||
events = append(events, e.metricEvent("process_count", compareHigh(float64(sample.ProcessCount), float64(cfg.Thresholds.ProcessCountWarning), float64(cfg.Thresholds.ProcessCountCritical)), fmt.Sprintf("Process count %d", sample.ProcessCount), fmt.Sprintf("Process count is %d.", sample.ProcessCount))...)
|
||||
for _, site := range sample.Sites {
|
||||
severity := SeverityHealthy
|
||||
if !site.Healthy {
|
||||
severity = SeverityCritical
|
||||
}
|
||||
title := fmt.Sprintf("Site %s reachable", site.Name)
|
||||
body := fmt.Sprintf("%s responded with %d in %s.", site.URL, site.StatusCode, site.Latency.Round(time.Millisecond))
|
||||
if !site.Healthy {
|
||||
title = fmt.Sprintf("Site %s failed", site.Name)
|
||||
body = fmt.Sprintf("%s check failed: %s.", site.URL, site.ErrorMessage)
|
||||
}
|
||||
events = append(events, e.metricEvent("site:"+site.Name, severity, title, body)...)
|
||||
}
|
||||
sort.Slice(events, func(i, j int) bool { return events[i].Key < events[j].Key })
|
||||
return events
|
||||
}
|
||||
|
||||
func (e *Evaluator) metricEvent(key string, severity Severity, title string, body string) []Event {
|
||||
previous := e.states[key]
|
||||
if previous == "" {
|
||||
e.states[key] = severity
|
||||
if severity == SeverityHealthy {
|
||||
return nil
|
||||
}
|
||||
return []Event{{Key: key, Severity: severity, Title: title, Body: body}}
|
||||
}
|
||||
if previous == severity {
|
||||
return nil
|
||||
}
|
||||
e.states[key] = severity
|
||||
if severity == SeverityHealthy {
|
||||
return []Event{{Key: key, Severity: severity, Title: recoveryTitle(title), Body: body}}
|
||||
}
|
||||
return []Event{{Key: key, Severity: severity, Title: title, Body: body}}
|
||||
}
|
||||
|
||||
// recoveryTitle marks title as a recovery notice by prefixing it.
func recoveryTitle(title string) string {
	const prefix = "Recovered: "
	return prefix + title
}
|
||||
|
||||
func compareHigh(value float64, warning float64, critical float64) Severity {
|
||||
switch {
|
||||
case critical > 0 && value >= critical:
|
||||
return SeverityCritical
|
||||
case warning > 0 && value >= warning:
|
||||
return SeverityWarning
|
||||
default:
|
||||
return SeverityHealthy
|
||||
}
|
||||
}
|
||||
|
||||
func compareLow(value float64, warning float64, critical float64) Severity {
|
||||
switch {
|
||||
case critical > 0 && value <= critical:
|
||||
return SeverityCritical
|
||||
case warning > 0 && value <= warning:
|
||||
return SeverityWarning
|
||||
default:
|
||||
return SeverityHealthy
|
||||
}
|
||||
}
|
||||
|
||||
func FormatSummary(sample metrics.Sample) string {
|
||||
lines := []string{
|
||||
fmt.Sprintf("CPU now %.1f%% | 15m %.1f%% | 12h %.1f%%", sample.CPUCurrentPercent, sample.CPUAvg15mPercent, sample.CPUAvg12hPercent),
|
||||
fmt.Sprintf("Memory %.1f%% | Swap %.1f%%", sample.MemoryUsedPercent, sample.SwapUsedPercent),
|
||||
fmt.Sprintf("Disk / %.1f%% used | %.1f GB free | Inodes %.1f%%", sample.RootUsedPercent, sample.RootFreeGB, sample.InodeUsedPercent),
|
||||
fmt.Sprintf("Load %.2f / %.2f / %.2f | Per core %.2f", sample.Load1, sample.Load5, sample.Load15, sample.LoadPerCore),
|
||||
fmt.Sprintf("Network RX %.1f KB/s | TX %.1f KB/s", sample.RXBytesPerSecond/1024, sample.TXBytesPerSecond/1024),
|
||||
fmt.Sprintf("Processes %d | Hosted sites %d", sample.ProcessCount, sample.HostedServiceCount),
|
||||
}
|
||||
siteStates := make([]string, 0, len(sample.Sites))
|
||||
for _, site := range sample.Sites {
|
||||
state := "up"
|
||||
if !site.Healthy {
|
||||
state = "down"
|
||||
}
|
||||
siteStates = append(siteStates, fmt.Sprintf("%s=%s", site.Name, state))
|
||||
}
|
||||
if len(siteStates) > 0 {
|
||||
lines = append(lines, "Sites "+strings.Join(siteStates, ", "))
|
||||
}
|
||||
return strings.Join(lines, "\n")
|
||||
}
|
||||
82
internal/app/app.go
Normal file
82
internal/app/app.go
Normal file
@@ -0,0 +1,82 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"heartbeat/internal/alerts"
|
||||
"heartbeat/internal/config"
|
||||
"heartbeat/internal/discord"
|
||||
"heartbeat/internal/metrics"
|
||||
)
|
||||
|
||||
type Runner struct {
|
||||
cfg config.Config
|
||||
sampler *metrics.Sampler
|
||||
evaluator *alerts.Evaluator
|
||||
discord *discord.Client
|
||||
}
|
||||
|
||||
func New(cfg config.Config) (*Runner, error) {
|
||||
return &Runner{
|
||||
cfg: cfg,
|
||||
sampler: metrics.NewSampler(cfg.RequestTimeout),
|
||||
evaluator: alerts.NewEvaluator(),
|
||||
discord: discord.New(cfg.ServerName, cfg.DiscordWebhookURL, cfg.NotifyRoleID, cfg.RequestTimeout),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (r *Runner) Run(ctx context.Context) error {
|
||||
if err := r.tick(ctx, true); err != nil {
|
||||
log.Printf("initial tick failed: %v", err)
|
||||
}
|
||||
|
||||
sampleTicker := time.NewTicker(r.cfg.SampleInterval)
|
||||
defer sampleTicker.Stop()
|
||||
|
||||
summaryTicker := time.NewTicker(r.cfg.SummaryInterval)
|
||||
defer summaryTicker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil
|
||||
case <-sampleTicker.C:
|
||||
if err := r.tick(ctx, false); err != nil {
|
||||
log.Printf("sample tick failed: %v", err)
|
||||
}
|
||||
case <-summaryTicker.C:
|
||||
if err := r.sendSummary(ctx); err != nil {
|
||||
log.Printf("summary failed: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Runner) tick(ctx context.Context, sendSummary bool) error {
|
||||
sample, err := r.sampler.Collect(ctx, r.cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if sendSummary {
|
||||
if err := r.discord.SendSummary(ctx, sample, r.cfg.SummaryInterval); err != nil {
|
||||
log.Printf("summary send failed: %v", err)
|
||||
}
|
||||
}
|
||||
for _, event := range r.evaluator.Evaluate(r.cfg, sample) {
|
||||
if err := r.discord.SendEvent(ctx, sample, event); err != nil {
|
||||
log.Printf("event send failed for %s: %v", event.Key, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *Runner) sendSummary(ctx context.Context) error {
|
||||
sample, err := r.sampler.Collect(ctx, r.cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return r.discord.SendSummary(ctx, sample, r.cfg.SummaryInterval)
|
||||
}
|
||||
|
||||
117
internal/config/config.go
Normal file
117
internal/config/config.go
Normal file
@@ -0,0 +1,117 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
type Config struct {
|
||||
ServerName string `yaml:"server_name"`
|
||||
DiscordWebhookURL string `yaml:"discord_webhook_url"`
|
||||
NotifyRoleID string `yaml:"notify_role_id"`
|
||||
SampleInterval time.Duration `yaml:"sample_interval"`
|
||||
SummaryInterval time.Duration `yaml:"summary_interval"`
|
||||
RequestTimeout time.Duration `yaml:"request_timeout"`
|
||||
Thresholds Thresholds `yaml:"thresholds"`
|
||||
Sites []Site `yaml:"sites"`
|
||||
}
|
||||
|
||||
// Thresholds holds a warning and a critical limit for every monitored host
// metric. Each pair is listed warning first, critical second.
type Thresholds struct {
	// Root filesystem usage, in percent.
	DiskUsedPercentWarning  float64 `yaml:"disk_used_percent_warning"`
	DiskUsedPercentCritical float64 `yaml:"disk_used_percent_critical"`

	// Root filesystem free space, in GB; lower values are worse.
	FreeGBWarning  float64 `yaml:"free_gb_warning"`
	FreeGBCritical float64 `yaml:"free_gb_critical"`

	// Root filesystem inode usage, in percent.
	InodeUsedPercentWarning  float64 `yaml:"inode_used_percent_warning"`
	InodeUsedPercentCritical float64 `yaml:"inode_used_percent_critical"`

	// Memory and swap usage, in percent.
	MemoryUsedPercentWarning  float64 `yaml:"memory_used_percent_warning"`
	MemoryUsedPercentCritical float64 `yaml:"memory_used_percent_critical"`
	SwapUsedPercentWarning    float64 `yaml:"swap_used_percent_warning"`
	SwapUsedPercentCritical   float64 `yaml:"swap_used_percent_critical"`

	// Rolling CPU averages, in percent.
	CPUAvg15mWarning  float64 `yaml:"cpu_avg_15m_warning"`
	CPUAvg15mCritical float64 `yaml:"cpu_avg_15m_critical"`
	CPUAvg12hWarning  float64 `yaml:"cpu_avg_12h_warning"`
	CPUAvg12hCritical float64 `yaml:"cpu_avg_12h_critical"`

	// Load average divided by the core count.
	LoadPerCoreWarning  float64 `yaml:"load_per_core_warning"`
	LoadPerCoreCritical float64 `yaml:"load_per_core_critical"`

	// Number of running processes.
	ProcessCountWarning  int `yaml:"process_count_warning"`
	ProcessCountCritical int `yaml:"process_count_critical"`
}
|
||||
|
||||
// Site describes one HTTP endpoint to check.
type Site struct {
	// Name identifies the site in alerts and summaries; required.
	Name string `yaml:"name"`
	// URL is the address to request; required.
	URL string `yaml:"url"`
	// ExpectedStatus is the HTTP status that counts as healthy
	// (defaulted to 200 when left at zero).
	ExpectedStatus int `yaml:"expected_status"`
	// Timeout bounds the check (defaulted to the global request timeout
	// when left at zero).
	Timeout time.Duration `yaml:"timeout"`
}
|
||||
|
||||
func Load(path string) (Config, error) {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
var cfg Config
|
||||
if err := yaml.Unmarshal(data, &cfg); err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
applyDefaults(&cfg)
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
func applyDefaults(cfg *Config) {
|
||||
if cfg.SampleInterval == 0 {
|
||||
cfg.SampleInterval = time.Minute
|
||||
}
|
||||
if cfg.SummaryInterval == 0 {
|
||||
cfg.SummaryInterval = 6 * time.Hour
|
||||
}
|
||||
if cfg.RequestTimeout == 0 {
|
||||
cfg.RequestTimeout = 10 * time.Second
|
||||
}
|
||||
for index := range cfg.Sites {
|
||||
if cfg.Sites[index].ExpectedStatus == 0 {
|
||||
cfg.Sites[index].ExpectedStatus = 200
|
||||
}
|
||||
if cfg.Sites[index].Timeout == 0 {
|
||||
cfg.Sites[index].Timeout = cfg.RequestTimeout
|
||||
}
|
||||
}
|
||||
if cfg.Thresholds.ProcessCountWarning == 0 {
|
||||
cfg.Thresholds.ProcessCountWarning = 350
|
||||
}
|
||||
if cfg.Thresholds.ProcessCountCritical == 0 {
|
||||
cfg.Thresholds.ProcessCountCritical = 500
|
||||
}
|
||||
}
|
||||
|
||||
func (cfg Config) Validate() error {
|
||||
if cfg.ServerName == "" {
|
||||
return fmt.Errorf("server_name is required")
|
||||
}
|
||||
if cfg.DiscordWebhookURL == "" {
|
||||
return fmt.Errorf("discord_webhook_url is required")
|
||||
}
|
||||
if cfg.SampleInterval <= 0 {
|
||||
return fmt.Errorf("sample_interval must be > 0")
|
||||
}
|
||||
if cfg.SummaryInterval <= 0 {
|
||||
return fmt.Errorf("summary_interval must be > 0")
|
||||
}
|
||||
for _, site := range cfg.Sites {
|
||||
if site.Name == "" {
|
||||
return fmt.Errorf("site name is required")
|
||||
}
|
||||
if site.URL == "" {
|
||||
return fmt.Errorf("site URL is required for %s", site.Name)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
201
internal/discord/discord.go
Normal file
201
internal/discord/discord.go
Normal file
@@ -0,0 +1,201 @@
|
||||
package discord
|
||||
|
||||
import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	"heartbeat/internal/alerts"
	"heartbeat/internal/metrics"
)
|
||||
|
||||
// Client posts heartbeat messages to a single Discord webhook.
type Client struct {
	// webhookURL is the full webhook endpoint messages are POSTed to.
	webhookURL string
	// httpClient performs the requests; New gives it the request timeout.
	httpClient *http.Client
	// serverName labels every message.
	serverName string
	// notifyRoleID, when non-empty, is mentioned on warning and critical events.
	notifyRoleID string
}
|
||||
|
||||
// webhookPayload is the JSON body sent to the webhook.
type webhookPayload struct {
	// Content is the plain message text; used here only for role mentions.
	Content string `json:"content,omitempty"`
	// AllowedMentions lists what Content may ping.
	// NOTE(review): encoding/json never treats a struct value as empty, so
	// omitempty has no effect here and the key is always emitted.
	AllowedMentions allowedMentions `json:"allowed_mentions,omitempty"`
	// Embeds carries the formatted message cards.
	Embeds []embed `json:"embeds"`
}

// allowedMentions restricts which mentions in the content actually notify.
type allowedMentions struct {
	// Roles holds the IDs of the roles that may be mentioned.
	Roles []string `json:"roles,omitempty"`
}

// embed is one message card in a webhook payload.
type embed struct {
	Title       string `json:"title"`
	Description string `json:"description,omitempty"`
	// Color is the accent colour as a 24-bit RGB integer.
	Color int `json:"color"`
	// Timestamp is an RFC 3339 time string.
	Timestamp string       `json:"timestamp,omitempty"`
	Fields    []embedField `json:"fields,omitempty"`
	Footer    *embedFooter `json:"footer,omitempty"`
}

// embedField is one name/value cell inside an embed.
type embedField struct {
	Name  string `json:"name"`
	Value string `json:"value"`
	// Inline is always emitted, including when false.
	Inline bool `json:"inline"`
}

// embedFooter is the small text line at the bottom of an embed.
type embedFooter struct {
	Text string `json:"text"`
}
|
||||
|
||||
func New(serverName string, webhookURL string, notifyRoleID string, timeout time.Duration) *Client {
|
||||
return &Client{
|
||||
serverName: serverName,
|
||||
webhookURL: webhookURL,
|
||||
httpClient: &http.Client{Timeout: timeout},
|
||||
notifyRoleID: notifyRoleID,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Client) SendSummary(ctx context.Context, sample metrics.Sample, summaryInterval time.Duration) error {
|
||||
fields := []embedField{
|
||||
{Name: "Server", Value: c.serverName, Inline: true},
|
||||
{Name: "Generated", Value: formatSummaryTime(sample.Timestamp), Inline: true},
|
||||
{Name: "Uptime", Value: formatUptime(sample.UptimeSeconds), Inline: true},
|
||||
{Name: "CPU", Value: fmt.Sprintf("Now: %.1f%%\n15m: %.1f%%\n12h: %.1f%%", sample.CPUCurrentPercent, sample.CPUAvg15mPercent, sample.CPUAvg12hPercent), Inline: true},
|
||||
{Name: "Load", Value: fmt.Sprintf("1m: %.2f\n5m: %.2f\n15m/core: %.2f", sample.Load1, sample.Load5, sample.LoadPerCore), Inline: true},
|
||||
{Name: "Memory", Value: fmt.Sprintf("RAM: %.1f%%\nSwap: %.1f%%", sample.MemoryUsedPercent, sample.SwapUsedPercent), Inline: true},
|
||||
{Name: "Disk /", Value: fmt.Sprintf("Used: %.1f%%\nFree: %.1f GB\nInodes: %.1f%%", sample.RootUsedPercent, sample.RootFreeGB, sample.InodeUsedPercent), Inline: true},
|
||||
{Name: "Network", Value: fmt.Sprintf("RX: %s\nTX: %s", formatRate(sample.RXBytesPerSecond), formatRate(sample.TXBytesPerSecond)), Inline: true},
|
||||
{Name: "Processes", Value: fmt.Sprintf("Count: %d", sample.ProcessCount), Inline: true},
|
||||
{Name: "Sites", Value: formatSiteStatuses(sample.Sites), Inline: false},
|
||||
}
|
||||
return c.send(ctx, webhookPayload{Embeds: []embed{{
|
||||
Title: fmt.Sprintf("heartbeat (%s) - %s", formatSummaryIntervalHours(summaryInterval), c.serverName),
|
||||
Description: "Scheduled server health snapshot.",
|
||||
Color: 0x2D9CDB,
|
||||
Fields: fields,
|
||||
Footer: &embedFooter{Text: formatSummaryFooter(c.serverName)},
|
||||
}}})
|
||||
}
|
||||
|
||||
func (c *Client) SendEvent(ctx context.Context, sample metrics.Sample, event alerts.Event) error {
|
||||
fields := []embedField{
|
||||
{Name: "Server", Value: c.serverName, Inline: true},
|
||||
{Name: "Severity", Value: string(event.Severity), Inline: true},
|
||||
{Name: "Timestamp", Value: formatEventTime(sample.Timestamp), Inline: true},
|
||||
}
|
||||
payload := webhookPayload{Embeds: []embed{{
|
||||
Title: event.Title,
|
||||
Description: event.Body,
|
||||
Color: colorForSeverity(event.Severity),
|
||||
Timestamp: sample.Timestamp.Format(time.RFC3339),
|
||||
Fields: fields,
|
||||
}}}
|
||||
if c.notifyRoleID != "" && (event.Severity == alerts.SeverityWarning || event.Severity == alerts.SeverityCritical) {
|
||||
payload.Content = fmt.Sprintf("<@&%s>", c.notifyRoleID)
|
||||
payload.AllowedMentions = allowedMentions{Roles: []string{c.notifyRoleID}}
|
||||
}
|
||||
return c.send(ctx, payload)
|
||||
}
|
||||
|
||||
// formatUptime renders an uptime given in seconds as "Xd Yh Zm", or as
// "Yh Zm" when it is shorter than a day. Leftover seconds are dropped.
func formatUptime(totalSeconds uint64) string {
	uptime := time.Duration(totalSeconds) * time.Second

	days := int64(uptime / (24 * time.Hour))
	hours := int64(uptime/time.Hour) % 24
	minutes := int64(uptime/time.Minute) % 60

	if days <= 0 {
		return fmt.Sprintf("%dh %dm", hours, minutes)
	}
	return fmt.Sprintf("%dd %dh %dm", days, hours, minutes)
}
|
||||
|
||||
func formatSiteStatuses(sites []metrics.SiteStatus) string {
|
||||
if len(sites) == 0 {
|
||||
return "No site checks configured"
|
||||
}
|
||||
parts := make([]string, 0, len(sites))
|
||||
for _, site := range sites {
|
||||
status := "UP"
|
||||
detail := fmt.Sprintf("%d in %s", site.StatusCode, site.Latency.Round(time.Millisecond))
|
||||
if !site.Healthy {
|
||||
status = "DOWN"
|
||||
detail = site.ErrorMessage
|
||||
}
|
||||
parts = append(parts, fmt.Sprintf("%s: %s (%s)", site.Name, status, detail))
|
||||
}
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
// formatSummaryTime renders timestamp as a Discord "<t:UNIX:f>" timestamp tag.
func formatSummaryTime(timestamp time.Time) string {
	return fmt.Sprintf("<t:%d:f>", timestamp.Unix())
}
|
||||
|
||||
// formatEventTime renders timestamp as two Discord timestamp tags joined by
// " - ": the ":f" form followed by the ":R" form.
func formatEventTime(timestamp time.Time) string {
	epoch := timestamp.Unix()
	return fmt.Sprintf("<t:%d:f> - <t:%d:R>", epoch, epoch)
}
|
||||
|
||||
// formatSummaryFooter builds the footer text shown under a summary embed.
func formatSummaryFooter(serverName string) string {
	return "heartbeat - " + serverName
}
|
||||
|
||||
// formatSummaryIntervalHours renders interval in hours: as an integer ("6h")
// when it is a whole number of hours, otherwise with two decimals ("1.50h").
func formatSummaryIntervalHours(interval time.Duration) string {
	wholeHours := interval / time.Hour
	if wholeHours*time.Hour != interval {
		return fmt.Sprintf("%.2fh", interval.Hours())
	}
	return fmt.Sprintf("%dh", wholeHours)
}
|
||||
|
||||
// formatRate renders a throughput in bytes per second using the largest
// fitting unit (B/s, KB/s or MB/s, in steps of 1024). Negative rates are
// rendered as "n/a".
func formatRate(bytesPerSecond float64) string {
	const step = 1024

	switch {
	case bytesPerSecond < 0:
		return "n/a"
	case bytesPerSecond < step:
		return fmt.Sprintf("%.0f B/s", bytesPerSecond)
	case bytesPerSecond/step < step:
		return fmt.Sprintf("%.2f KB/s", bytesPerSecond/step)
	default:
		return fmt.Sprintf("%.2f MB/s", bytesPerSecond/step/step)
	}
}
|
||||
|
||||
func (c *Client) send(ctx context.Context, payload webhookPayload) error {
|
||||
buffer := &bytes.Buffer{}
|
||||
if err := json.NewEncoder(buffer).Encode(payload); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.webhookURL, buffer)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := c.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return fmt.Errorf("discord webhook returned %s", resp.Status)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func colorForSeverity(severity alerts.Severity) int {
|
||||
switch severity {
|
||||
case alerts.SeverityCritical:
|
||||
return 0xE74C3C
|
||||
case alerts.SeverityWarning:
|
||||
return 0xF39C12
|
||||
default:
|
||||
return 0x27AE60
|
||||
}
|
||||
}
|
||||
261
internal/metrics/metrics.go
Normal file
261
internal/metrics/metrics.go
Normal file
@@ -0,0 +1,261 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"heartbeat/internal/config"
|
||||
|
||||
"github.com/shirou/gopsutil/v3/cpu"
|
||||
"github.com/shirou/gopsutil/v3/disk"
|
||||
"github.com/shirou/gopsutil/v3/host"
|
||||
"github.com/shirou/gopsutil/v3/load"
|
||||
"github.com/shirou/gopsutil/v3/mem"
|
||||
gnet "github.com/shirou/gopsutil/v3/net"
|
||||
"github.com/shirou/gopsutil/v3/process"
|
||||
)
|
||||
|
||||
// gib is the number of bytes in one gibibyte (2^30).
const gib = 1 << 30
|
||||
|
||||
type Sample struct {
|
||||
Timestamp time.Time
|
||||
RootUsedPercent float64
|
||||
RootFreeGB float64
|
||||
InodeUsedPercent float64
|
||||
CPUCurrentPercent float64
|
||||
CPUAvg15mPercent float64
|
||||
CPUAvg12hPercent float64
|
||||
MemoryUsedPercent float64
|
||||
SwapUsedPercent float64
|
||||
Load1 float64
|
||||
Load5 float64
|
||||
Load15 float64
|
||||
LoadPerCore float64
|
||||
RXBytesPerSecond float64
|
||||
TXBytesPerSecond float64
|
||||
ProcessCount int
|
||||
HostedServiceCount int
|
||||
UptimeSeconds uint64
|
||||
Sites []SiteStatus
|
||||
}
|
||||
|
||||
// SiteStatus is the outcome of checking one configured site.
type SiteStatus struct {
	// Name and URL identify the site that was checked.
	Name string
	URL  string
	// Healthy reports whether the check passed.
	Healthy bool
	// StatusCode and Latency describe the response that was received.
	StatusCode int
	Latency    time.Duration
	// ErrorMessage explains the failure; it is shown when Healthy is false.
	ErrorMessage string
	// ExpectedStatus is the status code the check was looking for.
	ExpectedStatus int
}
|
||||
|
||||
type Sampler struct {
|
||||
httpClient *http.Client
|
||||
prevNet netIO
|
||||
history []historyPoint
|
||||
historyCap int
|
||||
}
|
||||
|
||||
// netIO is a reading of the network byte counters at a point in time.
type netIO struct {
	timestamp time.Time
	rx        uint64 // bytes received
	tx        uint64 // bytes sent
}

// historyPoint is one CPU usage reading kept for the rolling averages.
type historyPoint struct {
	timestamp time.Time
	cpu       float64 // CPU usage in percent
}
|
||||
|
||||
func NewSampler(timeout time.Duration) *Sampler {
|
||||
transport := &http.Transport{
|
||||
Proxy: http.ProxyFromEnvironment,
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: timeout,
|
||||
}).DialContext,
|
||||
TLSHandshakeTimeout: timeout,
|
||||
}
|
||||
|
||||
return &Sampler{
|
||||
httpClient: &http.Client{Timeout: timeout, Transport: transport},
|
||||
historyCap: 12 * 60,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Sampler) Collect(ctx context.Context, cfg config.Config) (Sample, error) {
|
||||
now := time.Now().UTC()
|
||||
|
||||
rootUsage, err := disk.UsageWithContext(ctx, "/")
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("root usage: %w", err)
|
||||
}
|
||||
|
||||
cpuPercents, err := cpu.PercentWithContext(ctx, time.Second, false)
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("cpu percent: %w", err)
|
||||
}
|
||||
|
||||
virtualMemory, err := mem.VirtualMemoryWithContext(ctx)
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("memory: %w", err)
|
||||
}
|
||||
|
||||
swapMemory, err := mem.SwapMemoryWithContext(ctx)
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("swap: %w", err)
|
||||
}
|
||||
|
||||
avg, err := load.AvgWithContext(ctx)
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("load average: %w", err)
|
||||
}
|
||||
|
||||
hostInfo, err := host.InfoWithContext(ctx)
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("host info: %w", err)
|
||||
}
|
||||
|
||||
processes, err := process.ProcessesWithContext(ctx)
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("process list: %w", err)
|
||||
}
|
||||
|
||||
rxRate, txRate := s.networkRates(ctx, now)
|
||||
sites := s.checkSites(ctx, cfg.Sites)
|
||||
|
||||
currentCPU := 0.0
|
||||
if len(cpuPercents) > 0 {
|
||||
currentCPU = cpuPercents[0]
|
||||
}
|
||||
|
||||
s.appendHistory(now, currentCPU)
|
||||
coreCount, err := cpu.CountsWithContext(ctx, true)
|
||||
if err != nil || coreCount == 0 {
|
||||
coreCount = 1
|
||||
}
|
||||
|
||||
return Sample{
|
||||
Timestamp: now,
|
||||
RootUsedPercent: rootUsage.UsedPercent,
|
||||
RootFreeGB: float64(rootUsage.Free) / gib,
|
||||
InodeUsedPercent: inodeUsedPercent(rootUsage),
|
||||
CPUCurrentPercent: currentCPU,
|
||||
CPUAvg15mPercent: s.averageCPU(15 * time.Minute),
|
||||
CPUAvg12hPercent: s.averageCPU(12 * time.Hour),
|
||||
MemoryUsedPercent: virtualMemory.UsedPercent,
|
||||
SwapUsedPercent: swapMemory.UsedPercent,
|
||||
Load1: avg.Load1,
|
||||
Load5: avg.Load5,
|
||||
Load15: avg.Load15,
|
||||
LoadPerCore: avg.Load15 / float64(coreCount),
|
||||
RXBytesPerSecond: rxRate,
|
||||
TXBytesPerSecond: txRate,
|
||||
ProcessCount: len(processes),
|
||||
HostedServiceCount: len(cfg.Sites),
|
||||
UptimeSeconds: hostInfo.Uptime,
|
||||
Sites: sites,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *Sampler) appendHistory(timestamp time.Time, cpuPercent float64) {
|
||||
s.history = append(s.history, historyPoint{timestamp: timestamp, cpu: cpuPercent})
|
||||
if len(s.history) > s.historyCap {
|
||||
s.history = s.history[len(s.history)-s.historyCap:]
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Sampler) averageCPU(window time.Duration) float64 {
|
||||
if len(s.history) == 0 {
|
||||
return 0
|
||||
}
|
||||
cutoff := time.Now().UTC().Add(-window)
|
||||
total := 0.0
|
||||
count := 0.0
|
||||
for _, point := range s.history {
|
||||
if point.timestamp.Before(cutoff) {
|
||||
continue
|
||||
}
|
||||
total += point.cpu
|
||||
count++
|
||||
}
|
||||
if count == 0 {
|
||||
return 0
|
||||
}
|
||||
return total / count
|
||||
}
|
||||
|
||||
func (s *Sampler) networkRates(ctx context.Context, now time.Time) (float64, float64) {
|
||||
stats, err := gnet.IOCountersWithContext(ctx, false)
|
||||
if err != nil || len(stats) == 0 {
|
||||
return 0, 0
|
||||
}
|
||||
current := netIO{timestamp: now, rx: stats[0].BytesRecv, tx: stats[0].BytesSent}
|
||||
if s.prevNet.timestamp.IsZero() {
|
||||
s.prevNet = current
|
||||
return 0, 0
|
||||
}
|
||||
seconds := current.timestamp.Sub(s.prevNet.timestamp).Seconds()
|
||||
if seconds <= 0 {
|
||||
return 0, 0
|
||||
}
|
||||
rxRate := float64(current.rx-s.prevNet.rx) / seconds
|
||||
txRate := float64(current.tx-s.prevNet.tx) / seconds
|
||||
s.prevNet = current
|
||||
return rxRate, txRate
|
||||
}
|
||||
|
||||
func (s *Sampler) checkSites(ctx context.Context, sites []config.Site) []SiteStatus {
|
||||
results := make([]SiteStatus, 0, len(sites))
|
||||
for _, site := range sites {
|
||||
results = append(results, s.checkSite(ctx, site))
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
||||
func (s *Sampler) checkSite(parent context.Context, site config.Site) SiteStatus {
|
||||
ctx, cancel := context.WithTimeout(parent, site.Timeout)
|
||||
defer cancel()
|
||||
|
||||
start := time.Now()
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, site.URL, nil)
|
||||
if err != nil {
|
||||
return SiteStatus{Name: site.Name, URL: site.URL, ExpectedStatus: site.ExpectedStatus, ErrorMessage: err.Error()}
|
||||
}
|
||||
|
||||
resp, err := s.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return SiteStatus{Name: site.Name, URL: site.URL, ExpectedStatus: site.ExpectedStatus, ErrorMessage: normalizeError(err)}
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
status := SiteStatus{
|
||||
Name: site.Name,
|
||||
URL: site.URL,
|
||||
Healthy: resp.StatusCode == site.ExpectedStatus,
|
||||
StatusCode: resp.StatusCode,
|
||||
Latency: time.Since(start),
|
||||
ExpectedStatus: site.ExpectedStatus,
|
||||
}
|
||||
if !status.Healthy {
|
||||
status.ErrorMessage = fmt.Sprintf("expected %d, got %d", site.ExpectedStatus, resp.StatusCode)
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
// normalizeError returns err's message with a leading "Get " removed, if
// present; any other message is returned unchanged.
func normalizeError(err error) string {
	return strings.TrimPrefix(err.Error(), "Get ")
}
|
||||
|
||||
func inodeUsedPercent(stat *disk.UsageStat) float64 {
|
||||
if stat.InodesTotal == 0 {
|
||||
return 0
|
||||
}
|
||||
used := stat.InodesTotal - stat.InodesFree
|
||||
return float64(used) / float64(stat.InodesTotal) * 100
|
||||
}
|
||||
Reference in New Issue
Block a user