init
Some checks failed
Build heartbeat / build (push) Failing after 1m18s

This commit is contained in:
2026-05-03 21:09:59 +02:00
commit 93ae9b66b3
12 changed files with 987 additions and 0 deletions

130
internal/alerts/alerts.go Normal file
View File

@@ -0,0 +1,130 @@
package alerts
import (
"fmt"
"sort"
"strings"
"time"
"heartbeat/internal/config"
"heartbeat/internal/metrics"
)
// Severity is the health level assigned to a metric or a site check.
type Severity string
// Severity levels returned by compareHigh/compareLow and used for site checks.
const (
SeverityHealthy Severity = "healthy"
SeverityWarning Severity = "warning"
SeverityCritical Severity = "critical"
)
// Event describes a single alert state change that should be reported.
type Event struct {
// Key identifies the metric or site, e.g. "disk_used" or "site:<name>".
Key string
// Severity is the state the key has just entered.
Severity Severity
// Title is the short headline; recoveries carry a "Recovered: " prefix.
Title string
// Body is the longer human-readable description of the measurement.
Body string
}
// Evaluator remembers the last severity seen for every key so that only
// state changes are turned into events.
// NOTE(review): the map is accessed without locking — presumably the
// evaluator is only used from a single goroutine; confirm against callers.
type Evaluator struct {
// states maps an event key to the severity recorded by the previous evaluation.
states map[string]Severity
}
// NewEvaluator returns an Evaluator that has not recorded any state yet.
func NewEvaluator() *Evaluator {
	evaluator := new(Evaluator)
	evaluator.states = map[string]Severity{}
	return evaluator
}
// Evaluate grades every metric in sample against the thresholds in cfg, grades
// every site check as healthy or critical, and returns one Event for each key
// whose severity changed since the previous call (see metricEvent).
//
// The returned slice is sorted by Key and is empty, not nil, when nothing
// changed.
func (e *Evaluator) Evaluate(cfg config.Config, sample metrics.Sample) []Event {
	limits := cfg.Thresholds

	// Threshold-driven metrics, listed in the order they are evaluated.
	checks := []struct {
		key      string
		severity Severity
		title    string
		body     string
	}{
		{
			key:      "disk_used",
			severity: compareHigh(sample.RootUsedPercent, limits.DiskUsedPercentWarning, limits.DiskUsedPercentCritical),
			title:    fmt.Sprintf("Root disk usage %.1f%%", sample.RootUsedPercent),
			body:     fmt.Sprintf("Root partition is %.1f%% used with %.1f GB free.", sample.RootUsedPercent, sample.RootFreeGB),
		},
		{
			// Free space is the only metric where lower values are worse.
			key:      "free_gb",
			severity: compareLow(sample.RootFreeGB, limits.FreeGBWarning, limits.FreeGBCritical),
			title:    fmt.Sprintf("Low disk space %.1f GB", sample.RootFreeGB),
			body:     fmt.Sprintf("Root partition free space is %.1f GB.", sample.RootFreeGB),
		},
		{
			key:      "inode_used",
			severity: compareHigh(sample.InodeUsedPercent, limits.InodeUsedPercentWarning, limits.InodeUsedPercentCritical),
			title:    fmt.Sprintf("Inode usage %.1f%%", sample.InodeUsedPercent),
			body:     fmt.Sprintf("Root partition inode usage is %.1f%%.", sample.InodeUsedPercent),
		},
		{
			key:      "memory_used",
			severity: compareHigh(sample.MemoryUsedPercent, limits.MemoryUsedPercentWarning, limits.MemoryUsedPercentCritical),
			title:    fmt.Sprintf("Memory usage %.1f%%", sample.MemoryUsedPercent),
			body:     fmt.Sprintf("Memory usage is %.1f%%.", sample.MemoryUsedPercent),
		},
		{
			key:      "swap_used",
			severity: compareHigh(sample.SwapUsedPercent, limits.SwapUsedPercentWarning, limits.SwapUsedPercentCritical),
			title:    fmt.Sprintf("Swap usage %.1f%%", sample.SwapUsedPercent),
			body:     fmt.Sprintf("Swap usage is %.1f%%.", sample.SwapUsedPercent),
		},
		{
			key:      "cpu_avg_15m",
			severity: compareHigh(sample.CPUAvg15mPercent, limits.CPUAvg15mWarning, limits.CPUAvg15mCritical),
			title:    fmt.Sprintf("CPU 15m average %.1f%%", sample.CPUAvg15mPercent),
			body:     fmt.Sprintf("CPU rolling 15-minute average is %.1f%%.", sample.CPUAvg15mPercent),
		},
		{
			key:      "cpu_avg_12h",
			severity: compareHigh(sample.CPUAvg12hPercent, limits.CPUAvg12hWarning, limits.CPUAvg12hCritical),
			title:    fmt.Sprintf("CPU 12h average %.1f%%", sample.CPUAvg12hPercent),
			body:     fmt.Sprintf("CPU rolling 12-hour average is %.1f%%.", sample.CPUAvg12hPercent),
		},
		{
			key:      "load_per_core",
			severity: compareHigh(sample.LoadPerCore, limits.LoadPerCoreWarning, limits.LoadPerCoreCritical),
			title:    fmt.Sprintf("Load per core %.2f", sample.LoadPerCore),
			body:     fmt.Sprintf("15-minute load per core is %.2f.", sample.LoadPerCore),
		},
		{
			key:      "process_count",
			severity: compareHigh(float64(sample.ProcessCount), float64(limits.ProcessCountWarning), float64(limits.ProcessCountCritical)),
			title:    fmt.Sprintf("Process count %d", sample.ProcessCount),
			body:     fmt.Sprintf("Process count is %d.", sample.ProcessCount),
		},
	}

	events := []Event{}
	for _, check := range checks {
		events = append(events, e.metricEvent(check.key, check.severity, check.title, check.body)...)
	}

	// Site checks have no warning level: a site is either healthy or critical.
	for _, site := range sample.Sites {
		var (
			severity Severity
			title    string
			body     string
		)
		if site.Healthy {
			severity = SeverityHealthy
			title = fmt.Sprintf("Site %s reachable", site.Name)
			body = fmt.Sprintf("%s responded with %d in %s.", site.URL, site.StatusCode, site.Latency.Round(time.Millisecond))
		} else {
			severity = SeverityCritical
			title = fmt.Sprintf("Site %s failed", site.Name)
			body = fmt.Sprintf("%s check failed: %s.", site.URL, site.ErrorMessage)
		}
		events = append(events, e.metricEvent("site:"+site.Name, severity, title, body)...)
	}

	sort.Slice(events, func(left, right int) bool {
		return events[left].Key < events[right].Key
	})
	return events
}
// metricEvent records severity as the current state of key and returns the
// event to emit for that transition, or nil when nothing should be reported.
//
// Nothing is reported when the severity is unchanged, or when a key is seen
// for the first time and is already healthy. A return to healthy from any
// other recorded state is reported with a recovery title.
func (e *Evaluator) metricEvent(key string, severity Severity, title string, body string) []Event {
	last := e.states[key]
	firstSeen := last == ""

	// Same state as before: no write to the map and no event.
	if !firstSeen && last == severity {
		return nil
	}
	e.states[key] = severity

	if severity == SeverityHealthy {
		if firstSeen {
			// A healthy first observation is the baseline, not news.
			return nil
		}
		title = recoveryTitle(title)
	}
	return []Event{{Key: key, Severity: severity, Title: title, Body: body}}
}
// recoveryTitle marks title as describing a return to the healthy state.
func recoveryTitle(title string) string {
	const prefix = "Recovered: "
	return prefix + title
}
// compareHigh grades a metric for which larger values are worse.
//
// The critical limit is checked before the warning limit, and a limit that is
// not greater than zero is ignored. The value is graded healthy when neither
// active limit has been reached.
func compareHigh(value float64, warning float64, critical float64) Severity {
	if critical > 0 && value >= critical {
		return SeverityCritical
	}
	if warning > 0 && value >= warning {
		return SeverityWarning
	}
	return SeverityHealthy
}
// compareLow grades a metric for which smaller values are worse.
//
// The critical limit is checked before the warning limit, and a limit that is
// not greater than zero is ignored. The value is graded healthy when it is
// above every active limit.
func compareLow(value float64, warning float64, critical float64) Severity {
	if critical > 0 && value <= critical {
		return SeverityCritical
	}
	if warning > 0 && value <= warning {
		return SeverityWarning
	}
	return SeverityHealthy
}
// FormatSummary renders sample as a newline-separated plain-text report.
//
// It always emits one line each for CPU, memory, disk, load, network and
// process counts. A final "Sites" line listing every site as name=up or
// name=down is added only when the sample contains site checks. Network rates
// are divided by 1024 before being printed.
func FormatSummary(sample metrics.Sample) string {
	lines := make([]string, 0, 7)
	lines = append(lines, fmt.Sprintf("CPU now %.1f%% | 15m %.1f%% | 12h %.1f%%", sample.CPUCurrentPercent, sample.CPUAvg15mPercent, sample.CPUAvg12hPercent))
	lines = append(lines, fmt.Sprintf("Memory %.1f%% | Swap %.1f%%", sample.MemoryUsedPercent, sample.SwapUsedPercent))
	lines = append(lines, fmt.Sprintf("Disk / %.1f%% used | %.1f GB free | Inodes %.1f%%", sample.RootUsedPercent, sample.RootFreeGB, sample.InodeUsedPercent))
	lines = append(lines, fmt.Sprintf("Load %.2f / %.2f / %.2f | Per core %.2f", sample.Load1, sample.Load5, sample.Load15, sample.LoadPerCore))
	lines = append(lines, fmt.Sprintf("Network RX %.1f KB/s | TX %.1f KB/s", sample.RXBytesPerSecond/1024, sample.TXBytesPerSecond/1024))
	lines = append(lines, fmt.Sprintf("Processes %d | Hosted sites %d", sample.ProcessCount, sample.HostedServiceCount))

	if len(sample.Sites) > 0 {
		states := make([]string, len(sample.Sites))
		for i, site := range sample.Sites {
			label := "down"
			if site.Healthy {
				label = "up"
			}
			states[i] = fmt.Sprintf("%s=%s", site.Name, label)
		}
		lines = append(lines, "Sites "+strings.Join(states, ", "))
	}

	return strings.Join(lines, "\n")
}

82
internal/app/app.go Normal file
View File

@@ -0,0 +1,82 @@
package app
import (
"context"
"log"
"time"
"heartbeat/internal/alerts"
"heartbeat/internal/config"
"heartbeat/internal/discord"
"heartbeat/internal/metrics"
)
// Runner ties together metric sampling, alert evaluation and Discord delivery.
type Runner struct {
// cfg is the configuration handed to New; it supplies intervals, thresholds and sites.
cfg config.Config
// sampler collects one metrics.Sample per call to Collect.
sampler *metrics.Sampler
// evaluator turns samples into alert events on state changes.
evaluator *alerts.Evaluator
// discord delivers summaries and events to the configured webhook.
discord *discord.Client
}
// New builds a Runner from cfg, creating the sampler, the alert evaluator and
// the Discord client. The returned error is currently always nil.
func New(cfg config.Config) (*Runner, error) {
	runner := &Runner{cfg: cfg}
	runner.sampler = metrics.NewSampler(cfg.RequestTimeout)
	runner.evaluator = alerts.NewEvaluator()
	runner.discord = discord.New(cfg.ServerName, cfg.DiscordWebhookURL, cfg.NotifyRoleID, cfg.RequestTimeout)
	return runner, nil
}
// Run executes the monitoring loop until ctx is cancelled.
//
// It first performs one tick that also sends a summary, then samples every
// cfg.SampleInterval and sends a summary every cfg.SummaryInterval. Failures
// are logged and never stop the loop; Run returns nil once ctx is done.
func (r *Runner) Run(ctx context.Context) error {
	// Startup pass: report a summary straight away instead of waiting a full interval.
	if err := r.tick(ctx, true); err != nil {
		log.Printf("initial tick failed: %v", err)
	}

	samples := time.NewTicker(r.cfg.SampleInterval)
	defer samples.Stop()
	summaries := time.NewTicker(r.cfg.SummaryInterval)
	defer summaries.Stop()

	done := ctx.Done()
	for {
		select {
		case <-done:
			return nil
		case <-samples.C:
			err := r.tick(ctx, false)
			if err == nil {
				continue
			}
			log.Printf("sample tick failed: %v", err)
		case <-summaries.C:
			err := r.sendSummary(ctx)
			if err == nil {
				continue
			}
			log.Printf("summary failed: %v", err)
		}
	}
}
// tick collects one sample, optionally sends it as a summary, and delivers
// every alert event the evaluator produces for it.
//
// Only a collection failure is returned. Delivery failures are logged so that
// one failed webhook call does not suppress the remaining events.
func (r *Runner) tick(ctx context.Context, sendSummary bool) error {
	sample, err := r.sampler.Collect(ctx, r.cfg)
	if err != nil {
		return err
	}

	if sendSummary {
		sendErr := r.discord.SendSummary(ctx, sample, r.cfg.SummaryInterval)
		if sendErr != nil {
			log.Printf("summary send failed: %v", sendErr)
		}
	}

	events := r.evaluator.Evaluate(r.cfg, sample)
	for i := range events {
		if sendErr := r.discord.SendEvent(ctx, sample, events[i]); sendErr != nil {
			log.Printf("event send failed for %s: %v", events[i].Key, sendErr)
		}
	}
	return nil
}
// sendSummary collects a fresh sample and posts it as a summary. The sample is
// not passed through the alert evaluator. It returns the collection error, or
// otherwise the result of the webhook call.
func (r *Runner) sendSummary(ctx context.Context) error {
	snapshot, collectErr := r.sampler.Collect(ctx, r.cfg)
	if collectErr != nil {
		return collectErr
	}
	interval := r.cfg.SummaryInterval
	return r.discord.SendSummary(ctx, snapshot, interval)
}

117
internal/config/config.go Normal file
View File

@@ -0,0 +1,117 @@
package config
import (
"fmt"
"os"
"time"
"gopkg.in/yaml.v3"
)
// Config is the application configuration decoded from the YAML file.
type Config struct {
// ServerName labels every notification; Validate requires it.
ServerName string `yaml:"server_name"`
// DiscordWebhookURL is the webhook notifications are posted to; Validate requires it.
DiscordWebhookURL string `yaml:"discord_webhook_url"`
// NotifyRoleID is optional; it is handed to the Discord client as the role to mention.
NotifyRoleID string `yaml:"notify_role_id"`
// SampleInterval is the time between metric samples; applyDefaults uses one minute when unset.
SampleInterval time.Duration `yaml:"sample_interval"`
// SummaryInterval is the time between summaries; applyDefaults uses six hours when unset.
SummaryInterval time.Duration `yaml:"summary_interval"`
// RequestTimeout bounds HTTP requests; applyDefaults uses ten seconds when unset.
RequestTimeout time.Duration `yaml:"request_timeout"`
// Thresholds holds the warning and critical limits per metric.
Thresholds Thresholds `yaml:"thresholds"`
// Sites lists the HTTP endpoints to check on every sample.
Sites []Site `yaml:"sites"`
}
// Thresholds holds the warning and critical limit for each monitored metric.
// The alert comparisons in this source ignore a limit that is not greater
// than zero, so leaving a float limit unset disables that level.
type Thresholds struct {
// Root filesystem usage in percent; higher is worse.
DiskUsedPercentWarning float64 `yaml:"disk_used_percent_warning"`
DiskUsedPercentCritical float64 `yaml:"disk_used_percent_critical"`
// Root filesystem free space in GB; lower is worse.
FreeGBWarning float64 `yaml:"free_gb_warning"`
FreeGBCritical float64 `yaml:"free_gb_critical"`
// Root filesystem inode usage in percent; higher is worse.
InodeUsedPercentWarning float64 `yaml:"inode_used_percent_warning"`
InodeUsedPercentCritical float64 `yaml:"inode_used_percent_critical"`
// Memory usage in percent; higher is worse.
MemoryUsedPercentWarning float64 `yaml:"memory_used_percent_warning"`
MemoryUsedPercentCritical float64 `yaml:"memory_used_percent_critical"`
// Swap usage in percent; higher is worse.
SwapUsedPercentWarning float64 `yaml:"swap_used_percent_warning"`
SwapUsedPercentCritical float64 `yaml:"swap_used_percent_critical"`
// Rolling 15-minute CPU average in percent; higher is worse.
CPUAvg15mWarning float64 `yaml:"cpu_avg_15m_warning"`
CPUAvg15mCritical float64 `yaml:"cpu_avg_15m_critical"`
// Rolling 12-hour CPU average in percent; higher is worse.
CPUAvg12hWarning float64 `yaml:"cpu_avg_12h_warning"`
CPUAvg12hCritical float64 `yaml:"cpu_avg_12h_critical"`
// 15-minute load average divided by the core count; higher is worse.
LoadPerCoreWarning float64 `yaml:"load_per_core_warning"`
LoadPerCoreCritical float64 `yaml:"load_per_core_critical"`
// Number of processes; applyDefaults replaces zero with 350 (warning) and
// 500 (critical), so these two limits cannot be disabled by leaving them unset.
ProcessCountWarning int `yaml:"process_count_warning"`
ProcessCountCritical int `yaml:"process_count_critical"`
}
// Site describes one HTTP endpoint whose availability is checked.
type Site struct {
// Name identifies the site in notifications; Validate requires it.
Name string `yaml:"name"`
// URL is the address requested by the check; Validate requires it.
URL string `yaml:"url"`
// ExpectedStatus is the status code that counts as healthy; applyDefaults uses 200 when unset.
ExpectedStatus int `yaml:"expected_status"`
// Timeout bounds a single check; applyDefaults falls back to Config.RequestTimeout when unset.
Timeout time.Duration `yaml:"timeout"`
}
// Load reads the YAML configuration file at path, fills in defaults for unset
// fields and validates the result.
//
// On failure it returns the zero Config and an error that says which stage
// failed. The underlying error is wrapped with %w, so errors.Is and errors.As
// still see it (for example os.ErrNotExist for a missing file).
func Load(path string) (Config, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		// The os error already names the path, so only the stage is added.
		return Config{}, fmt.Errorf("read config: %w", err)
	}

	var cfg Config
	if err := yaml.Unmarshal(data, &cfg); err != nil {
		return Config{}, fmt.Errorf("parse config %q: %w", path, err)
	}

	// Defaults must be applied before validation so unset intervals pass the > 0 checks.
	applyDefaults(&cfg)
	if err := cfg.Validate(); err != nil {
		return Config{}, fmt.Errorf("invalid config %q: %w", path, err)
	}
	return cfg, nil
}
// applyDefaults fills in every setting that was left at its zero value.
//
// Defaults: sample interval one minute, summary interval six hours, request
// timeout ten seconds, expected site status 200, site timeout equal to the
// (already defaulted) request timeout, process-count warning 350 and critical
// 500.
func applyDefaults(cfg *Config) {
	// orDefault assigns fallback only when the duration is still unset.
	orDefault := func(field *time.Duration, fallback time.Duration) {
		if *field == 0 {
			*field = fallback
		}
	}

	orDefault(&cfg.SampleInterval, time.Minute)
	orDefault(&cfg.SummaryInterval, 6*time.Hour)
	orDefault(&cfg.RequestTimeout, 10*time.Second)

	// Sites come after RequestTimeout because they inherit its final value.
	for i := range cfg.Sites {
		site := &cfg.Sites[i]
		if site.ExpectedStatus == 0 {
			site.ExpectedStatus = 200
		}
		orDefault(&site.Timeout, cfg.RequestTimeout)
	}

	limits := &cfg.Thresholds
	if limits.ProcessCountWarning == 0 {
		limits.ProcessCountWarning = 350
	}
	if limits.ProcessCountCritical == 0 {
		limits.ProcessCountCritical = 500
	}
}
// Validate reports the first problem that would stop the service from working
// with cfg, or nil when the configuration is usable.
//
// It requires a server name and webhook URL, positive sample, summary and
// request intervals, and for every site a name, a URL and a positive timeout.
// Site names must be unique. Validate expects defaults to have been applied
// already, as Load does.
func (cfg Config) Validate() error {
	if cfg.ServerName == "" {
		return fmt.Errorf("server_name is required")
	}
	if cfg.DiscordWebhookURL == "" {
		return fmt.Errorf("discord_webhook_url is required")
	}
	if cfg.SampleInterval <= 0 {
		return fmt.Errorf("sample_interval must be > 0")
	}
	if cfg.SummaryInterval <= 0 {
		return fmt.Errorf("summary_interval must be > 0")
	}
	// A non-positive value would disable the HTTP client timeout and become
	// the default timeout of every site check.
	if cfg.RequestTimeout <= 0 {
		return fmt.Errorf("request_timeout must be > 0")
	}

	seen := make(map[string]struct{}, len(cfg.Sites))
	for _, site := range cfg.Sites {
		if site.Name == "" {
			return fmt.Errorf("site name is required")
		}
		if site.URL == "" {
			return fmt.Errorf("site URL is required for %s", site.Name)
		}
		// The check derives its deadline from this value; a non-positive
		// timeout expires immediately and reports the site as down forever.
		if site.Timeout <= 0 {
			return fmt.Errorf("site timeout must be > 0 for %s", site.Name)
		}
		// Alert state is keyed by site name, so two sites sharing a name
		// would overwrite each other's state on every sample.
		if _, duplicate := seen[site.Name]; duplicate {
			return fmt.Errorf("duplicate site name %s", site.Name)
		}
		seen[site.Name] = struct{}{}
	}
	return nil
}

201
internal/discord/discord.go Normal file
View File

@@ -0,0 +1,201 @@
package discord
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"strings"
"time"
"heartbeat/internal/alerts"
"heartbeat/internal/metrics"
)
// Client posts heartbeat summaries and alert events to a Discord webhook.
type Client struct {
// webhookURL is the endpoint every payload is POSTed to.
webhookURL string
// httpClient performs the requests; New gives it the configured timeout.
httpClient *http.Client
// serverName labels embeds so the reporting host can be identified.
serverName string
// notifyRoleID, when non-empty, is the role mentioned on warning and critical events.
notifyRoleID string
}
// webhookPayload is the JSON body posted to the Discord webhook.
type webhookPayload struct {
// Content is the plain message text; SendEvent uses it for the role mention.
Content string `json:"content,omitempty"`
// AllowedMentions lists what the message is permitted to ping.
// NOTE(review): encoding/json never treats a struct value as empty, so
// omitempty has no effect here and "allowed_mentions":{} is sent even when
// no role is set — confirm the webhook accepts that as intended.
AllowedMentions allowedMentions `json:"allowed_mentions,omitempty"`
// Embeds holds the rich-content cards; without omitempty it is always serialized.
Embeds []embed `json:"embeds"`
}
// allowedMentions restricts which mentions in Content may notify.
type allowedMentions struct {
// Roles lists the role IDs that may be pinged; omitted when empty.
Roles []string `json:"roles,omitempty"`
}
// embed is one rich-content card inside a webhook payload.
type embed struct {
// Title is the card headline.
Title string `json:"title"`
// Description is the body text shown under the title.
Description string `json:"description,omitempty"`
// Color is the accent colour, given in this file as hex literals such as 0x2D9CDB.
Color int `json:"color"`
// Timestamp is optional; SendEvent fills it with an RFC 3339 time.
Timestamp string `json:"timestamp,omitempty"`
// Fields are the name/value cells of the card.
Fields []embedField `json:"fields,omitempty"`
// Footer is optional; nil omits it from the JSON.
Footer *embedFooter `json:"footer,omitempty"`
}
// embedField is one name/value cell of an embed.
type embedField struct {
Name string `json:"name"`
Value string `json:"value"`
// Inline is always serialized (no omitempty), including when false.
Inline bool `json:"inline"`
}
// embedFooter is the footer text of an embed.
type embedFooter struct {
Text string `json:"text"`
}
// New returns a Client that posts to webhookURL, labels its messages with
// serverName and, when notifyRoleID is non-empty, mentions that role on
// warning and critical events. timeout is applied to every HTTP request.
func New(serverName string, webhookURL string, notifyRoleID string, timeout time.Duration) *Client {
	client := new(Client)
	client.serverName = serverName
	client.webhookURL = webhookURL
	client.notifyRoleID = notifyRoleID
	client.httpClient = &http.Client{Timeout: timeout}
	return client
}
// SendSummary posts a single embed with a snapshot of sample: server, time,
// uptime, CPU, load, memory, disk, network, process count and site states.
// summaryInterval only appears in the title. The error from the webhook call
// is returned.
func (c *Client) SendSummary(ctx context.Context, sample metrics.Sample, summaryInterval time.Duration) error {
	// column builds an inline field, i.e. one rendered side by side with its neighbours.
	column := func(name string, value string) embedField {
		return embedField{Name: name, Value: value, Inline: true}
	}

	fields := make([]embedField, 0, 10)
	fields = append(fields,
		column("Server", c.serverName),
		column("Generated", formatSummaryTime(sample.Timestamp)),
		column("Uptime", formatUptime(sample.UptimeSeconds)),
		column("CPU", fmt.Sprintf("Now: %.1f%%\n15m: %.1f%%\n12h: %.1f%%", sample.CPUCurrentPercent, sample.CPUAvg15mPercent, sample.CPUAvg12hPercent)),
		column("Load", fmt.Sprintf("1m: %.2f\n5m: %.2f\n15m/core: %.2f", sample.Load1, sample.Load5, sample.LoadPerCore)),
		column("Memory", fmt.Sprintf("RAM: %.1f%%\nSwap: %.1f%%", sample.MemoryUsedPercent, sample.SwapUsedPercent)),
		column("Disk /", fmt.Sprintf("Used: %.1f%%\nFree: %.1f GB\nInodes: %.1f%%", sample.RootUsedPercent, sample.RootFreeGB, sample.InodeUsedPercent)),
		column("Network", fmt.Sprintf("RX: %s\nTX: %s", formatRate(sample.RXBytesPerSecond), formatRate(sample.TXBytesPerSecond))),
		column("Processes", fmt.Sprintf("Count: %d", sample.ProcessCount)),
		// The site list gets a full-width row of its own.
		embedField{Name: "Sites", Value: formatSiteStatuses(sample.Sites), Inline: false},
	)

	card := embed{
		Title:       fmt.Sprintf("heartbeat (%s) - %s", formatSummaryIntervalHours(summaryInterval), c.serverName),
		Description: "Scheduled server health snapshot.",
		Color:       0x2D9CDB,
		Fields:      fields,
		Footer:      &embedFooter{Text: formatSummaryFooter(c.serverName)},
	}
	return c.send(ctx, webhookPayload{Embeds: []embed{card}})
}
// SendEvent posts event as a single embed coloured by its severity and stamped
// with the sample time. Warning and critical events also mention the
// configured role when one is set. The error from the webhook call is
// returned.
func (c *Client) SendEvent(ctx context.Context, sample metrics.Sample, event alerts.Event) error {
	card := embed{
		Title:       event.Title,
		Description: event.Body,
		Color:       colorForSeverity(event.Severity),
		Timestamp:   sample.Timestamp.Format(time.RFC3339),
	}
	card.Fields = []embedField{
		{Name: "Server", Value: c.serverName, Inline: true},
		{Name: "Severity", Value: string(event.Severity), Inline: true},
		{Name: "Timestamp", Value: formatEventTime(sample.Timestamp), Inline: true},
	}
	payload := webhookPayload{Embeds: []embed{card}}

	// Recoveries are informational, so only degraded states ping the role.
	switch event.Severity {
	case alerts.SeverityWarning, alerts.SeverityCritical:
		if c.notifyRoleID != "" {
			payload.Content = fmt.Sprintf("<@&%s>", c.notifyRoleID)
			payload.AllowedMentions = allowedMentions{Roles: []string{c.notifyRoleID}}
		}
	}
	return c.send(ctx, payload)
}
// formatUptime renders an uptime in seconds as "<d>d <h>h <m>m", leaving out
// the day part when the uptime is shorter than a day. Seconds are dropped.
func formatUptime(totalSeconds uint64) string {
	const day = 24 * time.Hour

	remaining := time.Duration(totalSeconds) * time.Second
	days := remaining / day
	remaining %= day
	hours := remaining / time.Hour
	minutes := (remaining % time.Hour) / time.Minute

	if days <= 0 {
		return fmt.Sprintf("%dh %dm", hours, minutes)
	}
	return fmt.Sprintf("%dd %dh %dm", days, hours, minutes)
}
// formatSiteStatuses renders one line per site: "<name>: UP (<code> in
// <latency>)" for healthy sites and "<name>: DOWN (<error>)" otherwise. A
// placeholder sentence is returned when no sites are configured.
func formatSiteStatuses(sites []metrics.SiteStatus) string {
	if len(sites) == 0 {
		return "No site checks configured"
	}

	rows := make([]string, len(sites))
	for i, site := range sites {
		state, detail := "DOWN", site.ErrorMessage
		if site.Healthy {
			state = "UP"
			detail = fmt.Sprintf("%d in %s", site.StatusCode, site.Latency.Round(time.Millisecond))
		}
		rows[i] = fmt.Sprintf("%s: %s (%s)", site.Name, state, detail)
	}
	return strings.Join(rows, "\n")
}
// formatSummaryTime renders timestamp as a "<t:UNIX:f>" tag built from its
// Unix seconds.
func formatSummaryTime(timestamp time.Time) string {
	return fmt.Sprintf("<t:%d:f>", timestamp.Unix())
}
// formatEventTime renders timestamp as two tags built from its Unix seconds,
// "<t:UNIX:f> - <t:UNIX:R>".
func formatEventTime(timestamp time.Time) string {
	const layout = "<t:%d:f> - <t:%d:R>"
	epoch := timestamp.Unix()
	return fmt.Sprintf(layout, epoch, epoch)
}
// formatSummaryFooter returns the footer text for summary embeds,
// "heartbeat - <serverName>".
func formatSummaryFooter(serverName string) string {
	const layout = "heartbeat - %s"
	return fmt.Sprintf(layout, serverName)
}
// formatSummaryIntervalHours renders interval in hours: as an integer ("6h")
// when it is a whole number of hours, otherwise with two decimals ("1.50h").
func formatSummaryIntervalHours(interval time.Duration) string {
	if remainder := interval % time.Hour; remainder != 0 {
		return fmt.Sprintf("%.2fh", interval.Hours())
	}
	wholeHours := interval / time.Hour
	return fmt.Sprintf("%dh", wholeHours)
}
// formatRate renders a throughput in bytes per second using the largest
// fitting unit: whole B/s below 1024, then KB/s and MB/s with two decimals,
// each step dividing by 1024. Negative rates are shown as "n/a".
func formatRate(bytesPerSecond float64) string {
	const step = 1024

	switch {
	case bytesPerSecond < 0:
		return "n/a"
	case bytesPerSecond < step:
		return fmt.Sprintf("%.0f B/s", bytesPerSecond)
	case bytesPerSecond/step < step:
		return fmt.Sprintf("%.2f KB/s", bytesPerSecond/step)
	default:
		return fmt.Sprintf("%.2f MB/s", bytesPerSecond/step/step)
	}
}
// send JSON-encodes payload and POSTs it to the webhook URL. It returns the
// encoding, request-construction or transport error, or an error naming the
// response status when that status is outside the 2xx range.
func (c *Client) send(ctx context.Context, payload webhookPayload) error {
	var body bytes.Buffer
	if err := json.NewEncoder(&body).Encode(payload); err != nil {
		return err
	}

	request, err := http.NewRequestWithContext(ctx, http.MethodPost, c.webhookURL, &body)
	if err != nil {
		return err
	}
	request.Header.Set("Content-Type", "application/json")

	response, err := c.httpClient.Do(request)
	if err != nil {
		return err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code >= 200 && code < 300 {
		return nil
	}
	return fmt.Errorf("discord webhook returned %s", response.Status)
}
// colorForSeverity returns the embed colour for severity: 0xE74C3C for
// critical, 0xF39C12 for warning and 0x27AE60 for anything else.
func colorForSeverity(severity alerts.Severity) int {
	if severity == alerts.SeverityCritical {
		return 0xE74C3C
	}
	if severity == alerts.SeverityWarning {
		return 0xF39C12
	}
	return 0x27AE60
}

261
internal/metrics/metrics.go Normal file
View File

@@ -0,0 +1,261 @@
package metrics
import (
"context"
"fmt"
"net"
"net/http"
"strings"
"time"
"heartbeat/internal/config"
"github.com/shirou/gopsutil/v3/cpu"
"github.com/shirou/gopsutil/v3/disk"
"github.com/shirou/gopsutil/v3/host"
"github.com/shirou/gopsutil/v3/load"
"github.com/shirou/gopsutil/v3/mem"
gnet "github.com/shirou/gopsutil/v3/net"
"github.com/shirou/gopsutil/v3/process"
)
// gib is the number of bytes in one gibibyte (2^30); Collect divides the free
// byte count by it to obtain Sample.RootFreeGB.
const gib = 1024 * 1024 * 1024
// Sample is one snapshot of host metrics and site checks produced by Collect.
type Sample struct {
// Timestamp is the UTC time at which collection started.
Timestamp time.Time
// RootUsedPercent is the used share of the "/" filesystem.
RootUsedPercent float64
// RootFreeGB is the free space on "/" in units of 2^30 bytes.
RootFreeGB float64
// InodeUsedPercent is the used share of inodes on "/"; 0 when the total is reported as 0.
InodeUsedPercent float64
// CPUCurrentPercent is the CPU usage measured during this collection.
CPUCurrentPercent float64
// CPUAvg15mPercent and CPUAvg12hPercent are averages over the sampler's
// retained history points that fall inside the respective window.
CPUAvg15mPercent float64
CPUAvg12hPercent float64
// MemoryUsedPercent and SwapUsedPercent are the used shares of RAM and swap.
MemoryUsedPercent float64
SwapUsedPercent float64
// Load1, Load5 and Load15 are the system load averages.
Load1 float64
Load5 float64
Load15 float64
// LoadPerCore is Load15 divided by the CPU count (1 when the count is unavailable).
LoadPerCore float64
// RXBytesPerSecond and TXBytesPerSecond are receive/transmit rates derived
// from the change in byte counters since the previous collection.
RXBytesPerSecond float64
TXBytesPerSecond float64
// ProcessCount is the number of processes listed at collection time.
ProcessCount int
// HostedServiceCount is the number of configured sites.
HostedServiceCount int
// UptimeSeconds is the host uptime in seconds.
UptimeSeconds uint64
// Sites holds one result per configured site, in configuration order.
Sites []SiteStatus
}
// SiteStatus is the outcome of a single site check.
type SiteStatus struct {
// Name and URL are copied from the site configuration.
Name string
URL string
// Healthy is true only when a response arrived with the expected status code.
Healthy bool
// StatusCode is the received HTTP status; it stays 0 when the request failed.
StatusCode int
// Latency is the time from starting the check to receiving the response
// headers; it stays 0 when the request failed.
Latency time.Duration
// ErrorMessage explains an unhealthy result and is empty for healthy ones.
ErrorMessage string
// ExpectedStatus is the status code the site was configured to return.
ExpectedStatus int
}
// Sampler collects metric samples and keeps the state needed between
// collections: the previous network counters and a bounded CPU history.
// NOTE(review): the state is modified by Collect without locking —
// presumably Collect is never called concurrently; confirm against callers.
type Sampler struct {
// httpClient is used for the site checks.
httpClient *http.Client
// prevNet is the network counter reading from the previous collection;
// its zero timestamp marks "no reading yet".
prevNet netIO
// history holds the most recent CPU readings, oldest first.
history []historyPoint
// historyCap is the maximum number of readings kept; NewSampler sets 12*60.
historyCap int
}
// netIO is one reading of the cumulative network byte counters.
type netIO struct {
// timestamp is when the reading was taken.
timestamp time.Time
// rx and tx are the bytes received and sent as reported at that time.
rx uint64
tx uint64
}
// historyPoint is one CPU usage reading kept for the rolling averages.
type historyPoint struct {
// timestamp is when the reading was taken.
timestamp time.Time
// cpu is the CPU usage in percent at that time.
cpu float64
}
// NewSampler returns a Sampler whose site-check HTTP client applies timeout to
// dialling, to the TLS handshake and to the request as a whole, and honours
// the proxy settings from the environment. The CPU history is capped at 12*60
// readings.
func NewSampler(timeout time.Duration) *Sampler {
	dialer := &net.Dialer{Timeout: timeout}
	client := &http.Client{
		Timeout: timeout,
		Transport: &http.Transport{
			Proxy:               http.ProxyFromEnvironment,
			DialContext:         dialer.DialContext,
			TLSHandshakeTimeout: timeout,
		},
	}

	sampler := new(Sampler)
	sampler.httpClient = client
	sampler.historyCap = 12 * 60
	return sampler
}
// Collect gathers one Sample: disk, CPU, memory, swap, load, host and process
// information, network rates and the result of every configured site check.
//
// Any failing host metric aborts the collection with a wrapped error and the
// zero Sample. Network rates and the CPU core count are best-effort and fall
// back to 0 and 1 respectively. Each successful call appends the current CPU
// reading to the history used for the rolling averages.
func (s *Sampler) Collect(ctx context.Context, cfg config.Config) (Sample, error) {
	collectedAt := time.Now().UTC()

	rootStat, err := disk.UsageWithContext(ctx, "/")
	if err != nil {
		return Sample{}, fmt.Errorf("root usage: %w", err)
	}
	// NOTE(review): presumably this measures over the one-second interval
	// passed here and so delays every collection by that long — confirm.
	percents, err := cpu.PercentWithContext(ctx, time.Second, false)
	if err != nil {
		return Sample{}, fmt.Errorf("cpu percent: %w", err)
	}
	memStat, err := mem.VirtualMemoryWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("memory: %w", err)
	}
	swapStat, err := mem.SwapMemoryWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("swap: %w", err)
	}
	loadStat, err := load.AvgWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("load average: %w", err)
	}
	hostStat, err := host.InfoWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("host info: %w", err)
	}
	procs, err := process.ProcessesWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("process list: %w", err)
	}

	rx, tx := s.networkRates(ctx, collectedAt)
	siteResults := s.checkSites(ctx, cfg.Sites)

	var cpuNow float64
	if len(percents) > 0 {
		cpuNow = percents[0]
	}
	// Record the reading before computing the averages so they include it.
	s.appendHistory(collectedAt, cpuNow)

	cores, err := cpu.CountsWithContext(ctx, true)
	if err != nil || cores == 0 {
		// Avoid dividing by zero; an unknown core count is treated as one core.
		cores = 1
	}

	var sample Sample
	sample.Timestamp = collectedAt
	sample.RootUsedPercent = rootStat.UsedPercent
	sample.RootFreeGB = float64(rootStat.Free) / gib
	sample.InodeUsedPercent = inodeUsedPercent(rootStat)
	sample.CPUCurrentPercent = cpuNow
	sample.CPUAvg15mPercent = s.averageCPU(15 * time.Minute)
	sample.CPUAvg12hPercent = s.averageCPU(12 * time.Hour)
	sample.MemoryUsedPercent = memStat.UsedPercent
	sample.SwapUsedPercent = swapStat.UsedPercent
	sample.Load1 = loadStat.Load1
	sample.Load5 = loadStat.Load5
	sample.Load15 = loadStat.Load15
	sample.LoadPerCore = loadStat.Load15 / float64(cores)
	sample.RXBytesPerSecond = rx
	sample.TXBytesPerSecond = tx
	sample.ProcessCount = len(procs)
	sample.HostedServiceCount = len(cfg.Sites)
	sample.UptimeSeconds = hostStat.Uptime
	sample.Sites = siteResults
	return sample, nil
}
// appendHistory records a CPU reading and drops the oldest readings so that at
// most historyCap of them are kept.
func (s *Sampler) appendHistory(timestamp time.Time, cpuPercent float64) {
	point := historyPoint{timestamp: timestamp, cpu: cpuPercent}
	s.history = append(s.history, point)

	overflow := len(s.history) - s.historyCap
	if overflow > 0 {
		s.history = s.history[overflow:]
	}
}
// averageCPU returns the mean of the recorded CPU readings that are no older
// than window, measured back from the current time. It returns 0 when no
// reading falls inside the window.
func (s *Sampler) averageCPU(window time.Duration) float64 {
	oldest := time.Now().UTC().Add(-window)

	var sum float64
	samples := 0
	for i := range s.history {
		point := s.history[i]
		if !point.timestamp.Before(oldest) {
			sum += point.cpu
			samples++
		}
	}

	if samples == 0 {
		return 0
	}
	return sum / float64(samples)
}
// networkRates returns the receive and transmit rates in bytes per second
// since the previous call, based on the first entry of the aggregated I/O
// counters. now is recorded as the time of the new reading.
//
// It returns 0, 0 when the counters cannot be read, on the first successful
// reading (nothing to compare against yet), and when no time has elapsed
// since the previous reading. A direction whose counter went backwards
// (reset or wrap) reports 0 for that interval instead of a bogus huge rate.
func (s *Sampler) networkRates(ctx context.Context, now time.Time) (float64, float64) {
	stats, err := gnet.IOCountersWithContext(ctx, false)
	if err != nil || len(stats) == 0 {
		return 0, 0
	}

	current := netIO{timestamp: now, rx: stats[0].BytesRecv, tx: stats[0].BytesSent}
	previous := s.prevNet
	// Every successful reading becomes the new baseline. In particular a
	// reading taken after the wall clock stepped backwards must replace the
	// old one, otherwise rates would stay at zero until the clock caught up.
	s.prevNet = current

	if previous.timestamp.IsZero() {
		return 0, 0
	}
	seconds := current.timestamp.Sub(previous.timestamp).Seconds()
	if seconds <= 0 {
		return 0, 0
	}
	return counterRate(previous.rx, current.rx, seconds), counterRate(previous.tx, current.tx, seconds)
}

// counterRate converts the growth of a cumulative counter over seconds into a
// per-second rate. It returns 0 when the counter decreased, because the
// unsigned difference would otherwise wrap around.
func counterRate(previous uint64, current uint64, seconds float64) float64 {
	if current < previous {
		return 0
	}
	return float64(current-previous) / seconds
}
// checkSites checks every site one after another and returns the results in
// the same order as sites. The result is empty, not nil, when there are no
// sites.
func (s *Sampler) checkSites(ctx context.Context, sites []config.Site) []SiteStatus {
	statuses := make([]SiteStatus, len(sites))
	for i := range sites {
		statuses[i] = s.checkSite(ctx, sites[i])
	}
	return statuses
}
// checkSite issues a GET request to site.URL, bounded by site.Timeout, and
// reports the site healthy only when the response status equals
// site.ExpectedStatus.
//
// Request construction and transport failures give an unhealthy status that
// carries the error text and leaves StatusCode and Latency at zero. An
// unexpected status code gives an unhealthy status that describes the
// mismatch.
func (s *Sampler) checkSite(parent context.Context, site config.Site) SiteStatus {
	ctx, cancel := context.WithTimeout(parent, site.Timeout)
	defer cancel()

	// Fields shared by every outcome.
	status := SiteStatus{Name: site.Name, URL: site.URL, ExpectedStatus: site.ExpectedStatus}

	startedAt := time.Now()
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, site.URL, nil)
	if err != nil {
		status.ErrorMessage = err.Error()
		return status
	}

	response, err := s.httpClient.Do(request)
	if err != nil {
		status.ErrorMessage = normalizeError(err)
		return status
	}
	defer response.Body.Close()

	status.StatusCode = response.StatusCode
	status.Latency = time.Since(startedAt)
	status.Healthy = response.StatusCode == site.ExpectedStatus
	if !status.Healthy {
		status.ErrorMessage = fmt.Sprintf("expected %d, got %d", site.ExpectedStatus, response.StatusCode)
	}
	return status
}
// normalizeError returns the text of err with a leading "Get " removed, if
// present.
func normalizeError(err error) string {
	return strings.TrimPrefix(err.Error(), "Get ")
}
// inodeUsedPercent returns the share of inodes in use, in percent, computed
// from the total and free counts in stat. It returns 0 when the total is 0.
func inodeUsedPercent(stat *disk.UsageStat) float64 {
	total := stat.InodesTotal
	if total == 0 {
		return 0
	}
	return float64(total-stat.InodesFree) / float64(total) * 100
}