init
Some checks failed
Build heartbeat / build (push) Failing after 1m18s

This commit is contained in:
2026-05-03 21:09:59 +02:00
commit 93ae9b66b3
12 changed files with 987 additions and 0 deletions

261
internal/metrics/metrics.go Normal file
View File

@@ -0,0 +1,261 @@
package metrics
import (
"context"
"fmt"
"net"
"net/http"
"strings"
"time"
"heartbeat/internal/config"
"github.com/shirou/gopsutil/v3/cpu"
"github.com/shirou/gopsutil/v3/disk"
"github.com/shirou/gopsutil/v3/host"
"github.com/shirou/gopsutil/v3/load"
"github.com/shirou/gopsutil/v3/mem"
gnet "github.com/shirou/gopsutil/v3/net"
"github.com/shirou/gopsutil/v3/process"
)
// gib is the number of bytes in one gibibyte (2^30). Collect divides the free
// byte count of the root filesystem by it to fill Sample.RootFreeGB.
const gib = 1024 * 1024 * 1024
// Sample is one point-in-time snapshot of host metrics plus the results of the
// configured site checks, as assembled by Sampler.Collect.
type Sample struct {
// Timestamp is the UTC time taken at the start of the collection run.
Timestamp time.Time
// RootUsedPercent, RootFreeGB and InodeUsedPercent describe the filesystem
// mounted at "/". RootFreeGB is the free byte count divided by 2^30.
RootUsedPercent float64
RootFreeGB float64
InodeUsedPercent float64
// CPUCurrentPercent is the CPU percentage measured during this run; the two
// averages are computed over the samples the Sampler still has in memory that
// fall inside the 15-minute and 12-hour windows respectively.
CPUCurrentPercent float64
CPUAvg15mPercent float64
CPUAvg12hPercent float64
MemoryUsedPercent float64
SwapUsedPercent float64
// Load1, Load5 and Load15 are the system load averages.
Load1 float64
Load5 float64
Load15 float64
// LoadPerCore is Load15 divided by the CPU count (falling back to 1 when the
// count is unavailable or zero).
LoadPerCore float64
// RXBytesPerSecond and TXBytesPerSecond are network throughput rates derived
// from the change in byte counters since the previous collection.
RXBytesPerSecond float64
TXBytesPerSecond float64
ProcessCount int
// HostedServiceCount is the number of sites in the configuration passed to
// Collect.
HostedServiceCount int
UptimeSeconds uint64
// Sites holds one status per configured site, in configuration order.
Sites []SiteStatus
}
// SiteStatus is the outcome of a single HTTP check against one configured site.
type SiteStatus struct {
// Name, URL and ExpectedStatus are copied from the site's configuration.
Name string
URL string
// Healthy is true only when a response was received and its status code
// equals ExpectedStatus.
Healthy bool
// StatusCode and Latency are only set when a response was received; they stay
// zero when the request could not be built or failed in transit.
StatusCode int
Latency time.Duration
// ErrorMessage is empty for a healthy site; otherwise it holds the request
// error text or an "expected X, got Y" status mismatch description.
ErrorMessage string
ExpectedStatus int
}
// Sampler collects Samples and keeps the state needed between collections:
// the previous network counters and a bounded history of CPU readings.
// NOTE(review): no synchronization is visible in this file — presumably a
// Sampler is meant to be used from a single goroutine; confirm against callers.
type Sampler struct {
// httpClient is the shared client used for site checks.
httpClient *http.Client
// prevNet is the network counter reading from the previous collection; its
// zero timestamp means no baseline has been recorded yet.
prevNet netIO
// history holds recent CPU readings, oldest first, trimmed to historyCap.
history []historyPoint
// historyCap is the maximum number of history points retained; NewSampler
// sets it to 12 * 60.
historyCap int
}
// netIO is a timestamped reading of cumulative received (rx) and sent (tx)
// byte counters, used to derive throughput between two collections.
type netIO struct {
timestamp time.Time
rx uint64
tx uint64
}
// historyPoint is one CPU percentage reading together with the time it was
// recorded; a slice of these backs the rolling CPU averages.
type historyPoint struct {
timestamp time.Time
cpu float64
}
// NewSampler returns a Sampler whose HTTP client applies timeout to dialing,
// to the TLS handshake and to each request as a whole. Proxy settings are taken
// from the environment. The CPU history is capped at 12 * 60 points.
func NewSampler(timeout time.Duration) *Sampler {
	dialer := &net.Dialer{Timeout: timeout}

	client := &http.Client{
		Timeout: timeout,
		Transport: &http.Transport{
			Proxy:               http.ProxyFromEnvironment,
			DialContext:         dialer.DialContext,
			TLSHandshakeTimeout: timeout,
		},
	}

	sampler := new(Sampler)
	sampler.httpClient = client
	sampler.historyCap = 12 * 60
	return sampler
}
// Collect gathers one Sample: root filesystem usage, CPU, memory, swap, load
// averages, host uptime, process count, network throughput and the status of
// every site in cfg.Sites.
//
// Any failure reading disk, CPU, memory, swap, load, host or process
// information aborts the collection and returns a zero Sample with a wrapped
// error. Network rates and site checks never fail the collection; they report
// zeros or per-site error messages instead.
//
// The CPU reading is requested with a one-second interval, and each successful
// call appends that reading to the Sampler's history, which feeds the 15-minute
// and 12-hour averages.
func (s *Sampler) Collect(ctx context.Context, cfg config.Config) (Sample, error) {
	var none Sample
	takenAt := time.Now().UTC()

	rootFS, err := disk.UsageWithContext(ctx, "/")
	if err != nil {
		return none, fmt.Errorf("root usage: %w", err)
	}

	busy, err := cpu.PercentWithContext(ctx, time.Second, false)
	if err != nil {
		return none, fmt.Errorf("cpu percent: %w", err)
	}

	ram, err := mem.VirtualMemoryWithContext(ctx)
	if err != nil {
		return none, fmt.Errorf("memory: %w", err)
	}

	swap, err := mem.SwapMemoryWithContext(ctx)
	if err != nil {
		return none, fmt.Errorf("swap: %w", err)
	}

	loadAvg, err := load.AvgWithContext(ctx)
	if err != nil {
		return none, fmt.Errorf("load average: %w", err)
	}

	info, err := host.InfoWithContext(ctx)
	if err != nil {
		return none, fmt.Errorf("host info: %w", err)
	}

	procs, err := process.ProcessesWithContext(ctx)
	if err != nil {
		return none, fmt.Errorf("process list: %w", err)
	}

	rx, tx := s.networkRates(ctx, takenAt)
	siteStatuses := s.checkSites(ctx, cfg.Sites)

	// An empty result is treated as 0% rather than as an error.
	var cpuNow float64
	if len(busy) > 0 {
		cpuNow = busy[0]
	}
	s.appendHistory(takenAt, cpuNow)

	// Fall back to a single core so the per-core load never divides by zero.
	cores := 1
	if n, countErr := cpu.CountsWithContext(ctx, true); countErr == nil && n != 0 {
		cores = n
	}

	sample := Sample{
		Timestamp:          takenAt,
		RootUsedPercent:    rootFS.UsedPercent,
		RootFreeGB:         float64(rootFS.Free) / gib,
		InodeUsedPercent:   inodeUsedPercent(rootFS),
		CPUCurrentPercent:  cpuNow,
		CPUAvg15mPercent:   s.averageCPU(15 * time.Minute),
		CPUAvg12hPercent:   s.averageCPU(12 * time.Hour),
		MemoryUsedPercent:  ram.UsedPercent,
		SwapUsedPercent:    swap.UsedPercent,
		Load1:              loadAvg.Load1,
		Load5:              loadAvg.Load5,
		Load15:             loadAvg.Load15,
		LoadPerCore:        loadAvg.Load15 / float64(cores),
		RXBytesPerSecond:   rx,
		TXBytesPerSecond:   tx,
		ProcessCount:       len(procs),
		HostedServiceCount: len(cfg.Sites),
		UptimeSeconds:      info.Uptime,
		Sites:              siteStatuses,
	}
	return sample, nil
}
// appendHistory records one CPU reading and then drops the oldest entries so
// that at most historyCap points remain.
func (s *Sampler) appendHistory(timestamp time.Time, cpuPercent float64) {
	point := historyPoint{timestamp: timestamp, cpu: cpuPercent}
	s.history = append(s.history, point)

	overflow := len(s.history) - s.historyCap
	if overflow <= 0 {
		return
	}
	s.history = s.history[overflow:]
}
// averageCPU returns the arithmetic mean of the recorded CPU readings whose
// timestamp is not older than window, measured back from the current time.
// It returns 0 when no reading falls inside the window.
func (s *Sampler) averageCPU(window time.Duration) float64 {
	oldest := time.Now().UTC().Add(-window)

	var (
		sum     float64
		matched int
	)
	for i := range s.history {
		if entry := s.history[i]; !entry.timestamp.Before(oldest) {
			sum += entry.cpu
			matched++
		}
	}

	if matched == 0 {
		return 0
	}
	return sum / float64(matched)
}
// networkRates returns the receive and transmit throughput, in bytes per
// second, between the previous call and now. It reads the first entry returned
// by gnet.IOCountersWithContext (requested with pernic=false).
//
// It returns 0, 0 when the counters cannot be read, on the first successful
// call (there is no baseline yet), or when the timestamp did not advance since
// the previous reading. A direction whose counter is lower than the previous
// reading (counter reset or wrap) reports 0 for that interval instead of the
// huge value an unsigned subtraction would produce.
func (s *Sampler) networkRates(ctx context.Context, now time.Time) (float64, float64) {
	stats, err := gnet.IOCountersWithContext(ctx, false)
	if err != nil || len(stats) == 0 {
		return 0, 0
	}

	current := netIO{timestamp: now, rx: stats[0].BytesRecv, tx: stats[0].BytesSent}
	previous := s.prevNet
	// Always move the baseline forward. Keeping a stale baseline after a
	// non-advancing timestamp would pin the rates at zero until the clock
	// caught up with it again.
	s.prevNet = current

	if previous.timestamp.IsZero() {
		return 0, 0
	}
	seconds := current.timestamp.Sub(previous.timestamp).Seconds()
	if seconds <= 0 {
		return 0, 0
	}

	// rate guards the unsigned subtraction: a counter that went backwards
	// would otherwise wrap around to a value near 2^64.
	rate := func(cur, prev uint64) float64 {
		if cur < prev {
			return 0
		}
		return float64(cur-prev) / seconds
	}
	return rate(current.rx, previous.rx), rate(current.tx, previous.tx)
}
// checkSites probes every site one after another and returns their statuses in
// the same order as the input. The result is always non-nil, even when sites
// is empty.
func (s *Sampler) checkSites(ctx context.Context, sites []config.Site) []SiteStatus {
	statuses := make([]SiteStatus, len(sites))
	for i := range sites {
		statuses[i] = s.checkSite(ctx, sites[i])
	}
	return statuses
}
// checkSite performs a single HTTP GET against site.URL and reports whether the
// response status code equals site.ExpectedStatus.
//
// The request is bounded by site.Timeout when that value is positive. A zero or
// negative timeout is not passed to context.WithTimeout, because that would
// produce an already-expired context and make every probe fail immediately; in
// that case only the parent context and the shared client's own timeout apply.
//
// When the request cannot be built or fails in transit, the returned status is
// unhealthy, carries the error text, and leaves StatusCode and Latency at zero.
// Latency covers the span from just before the request is built until the
// client returns the response; the body is closed without being read.
func (s *Sampler) checkSite(parent context.Context, site config.Site) SiteStatus {
	ctx := parent
	if site.Timeout > 0 {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(parent, site.Timeout)
		defer cancel()
	}

	// Fields shared by every outcome are filled in once.
	status := SiteStatus{
		Name:           site.Name,
		URL:            site.URL,
		ExpectedStatus: site.ExpectedStatus,
	}

	start := time.Now()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, site.URL, nil)
	if err != nil {
		status.ErrorMessage = err.Error()
		return status
	}

	resp, err := s.httpClient.Do(req)
	if err != nil {
		status.ErrorMessage = normalizeError(err)
		return status
	}
	defer resp.Body.Close()

	status.StatusCode = resp.StatusCode
	status.Latency = time.Since(start)
	status.Healthy = resp.StatusCode == site.ExpectedStatus
	if !status.Healthy {
		status.ErrorMessage = fmt.Sprintf("expected %d, got %d", site.ExpectedStatus, resp.StatusCode)
	}
	return status
}
// normalizeError returns err's message with a single leading "Get " removed,
// which shortens the text of errors produced by failed GET requests. Messages
// that do not start with "Get " are returned unchanged.
func normalizeError(err error) string {
	return strings.TrimPrefix(err.Error(), "Get ")
}
// inodeUsedPercent returns the share of inodes in use, as a percentage of
// stat.InodesTotal, computed from the total and free inode counts. It returns 0
// when the filesystem reports no inodes, so the division is always defined.
func inodeUsedPercent(stat *disk.UsageStat) float64 {
	total := stat.InodesTotal
	if total == 0 {
		return 0
	}
	inUse := total - stat.InodesFree
	return float64(inUse) / float64(total) * 100
}