262 lines
6.5 KiB
Go
262 lines
6.5 KiB
Go
package metrics
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"net"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"heartbeat/internal/config"
|
|
|
|
"github.com/shirou/gopsutil/v3/cpu"
|
|
"github.com/shirou/gopsutil/v3/disk"
|
|
"github.com/shirou/gopsutil/v3/host"
|
|
"github.com/shirou/gopsutil/v3/load"
|
|
"github.com/shirou/gopsutil/v3/mem"
|
|
gnet "github.com/shirou/gopsutil/v3/net"
|
|
"github.com/shirou/gopsutil/v3/process"
|
|
)
|
|
|
|
// gib is the number of bytes in one gibibyte (2^30). Collect divides the free
// byte count of the root filesystem by it to produce RootFreeGB.
const gib = 1 << 30
|
|
|
|
type Sample struct {
|
|
Timestamp time.Time
|
|
RootUsedPercent float64
|
|
RootFreeGB float64
|
|
InodeUsedPercent float64
|
|
CPUCurrentPercent float64
|
|
CPUAvg15mPercent float64
|
|
CPUAvg12hPercent float64
|
|
MemoryUsedPercent float64
|
|
SwapUsedPercent float64
|
|
Load1 float64
|
|
Load5 float64
|
|
Load15 float64
|
|
LoadPerCore float64
|
|
RXBytesPerSecond float64
|
|
TXBytesPerSecond float64
|
|
ProcessCount int
|
|
HostedServiceCount int
|
|
UptimeSeconds uint64
|
|
Sites []SiteStatus
|
|
}
|
|
|
|
// SiteStatus is the outcome of a single HTTP GET check against one
// configured site.
type SiteStatus struct {
	// Name and URL are copied from the site's configuration.
	Name string
	URL  string

	// Healthy is true only when a response was received and its status code
	// equals ExpectedStatus. StatusCode and Latency are filled in only when a
	// response was received; Latency is measured from just before the request
	// is built until the client returns the response.
	Healthy    bool
	StatusCode int
	Latency    time.Duration

	// ErrorMessage is empty for a healthy site; otherwise it carries either
	// the request error text or an "expected X, got Y" status mismatch.
	ErrorMessage string

	// ExpectedStatus is the status code the configuration asks for.
	ExpectedStatus int
}
|
|
|
|
type Sampler struct {
|
|
httpClient *http.Client
|
|
prevNet netIO
|
|
history []historyPoint
|
|
historyCap int
|
|
}
|
|
|
|
// netIO is one reading of the cumulative network byte counters, kept so the
// next reading can be turned into a per-second rate.
type netIO struct {
	// timestamp is the time attributed to the reading; the zero value means
	// no reading has been stored yet.
	timestamp time.Time

	// rx and tx are the cumulative bytes received and sent.
	rx uint64
	tx uint64
}
|
|
|
|
// historyPoint is a single CPU utilization reading retained for computing
// windowed averages.
type historyPoint struct {
	// timestamp is when the reading was attributed.
	timestamp time.Time

	// cpu is the CPU utilization percentage of that reading.
	cpu float64
}
|
|
|
|
func NewSampler(timeout time.Duration) *Sampler {
|
|
transport := &http.Transport{
|
|
Proxy: http.ProxyFromEnvironment,
|
|
DialContext: (&net.Dialer{
|
|
Timeout: timeout,
|
|
}).DialContext,
|
|
TLSHandshakeTimeout: timeout,
|
|
}
|
|
|
|
return &Sampler{
|
|
httpClient: &http.Client{Timeout: timeout, Transport: transport},
|
|
historyCap: 12 * 60,
|
|
}
|
|
}
|
|
|
|
func (s *Sampler) Collect(ctx context.Context, cfg config.Config) (Sample, error) {
|
|
now := time.Now().UTC()
|
|
|
|
rootUsage, err := disk.UsageWithContext(ctx, "/")
|
|
if err != nil {
|
|
return Sample{}, fmt.Errorf("root usage: %w", err)
|
|
}
|
|
|
|
cpuPercents, err := cpu.PercentWithContext(ctx, time.Second, false)
|
|
if err != nil {
|
|
return Sample{}, fmt.Errorf("cpu percent: %w", err)
|
|
}
|
|
|
|
virtualMemory, err := mem.VirtualMemoryWithContext(ctx)
|
|
if err != nil {
|
|
return Sample{}, fmt.Errorf("memory: %w", err)
|
|
}
|
|
|
|
swapMemory, err := mem.SwapMemoryWithContext(ctx)
|
|
if err != nil {
|
|
return Sample{}, fmt.Errorf("swap: %w", err)
|
|
}
|
|
|
|
avg, err := load.AvgWithContext(ctx)
|
|
if err != nil {
|
|
return Sample{}, fmt.Errorf("load average: %w", err)
|
|
}
|
|
|
|
hostInfo, err := host.InfoWithContext(ctx)
|
|
if err != nil {
|
|
return Sample{}, fmt.Errorf("host info: %w", err)
|
|
}
|
|
|
|
processes, err := process.ProcessesWithContext(ctx)
|
|
if err != nil {
|
|
return Sample{}, fmt.Errorf("process list: %w", err)
|
|
}
|
|
|
|
rxRate, txRate := s.networkRates(ctx, now)
|
|
sites := s.checkSites(ctx, cfg.Sites)
|
|
|
|
currentCPU := 0.0
|
|
if len(cpuPercents) > 0 {
|
|
currentCPU = cpuPercents[0]
|
|
}
|
|
|
|
s.appendHistory(now, currentCPU)
|
|
coreCount, err := cpu.CountsWithContext(ctx, true)
|
|
if err != nil || coreCount == 0 {
|
|
coreCount = 1
|
|
}
|
|
|
|
return Sample{
|
|
Timestamp: now,
|
|
RootUsedPercent: rootUsage.UsedPercent,
|
|
RootFreeGB: float64(rootUsage.Free) / gib,
|
|
InodeUsedPercent: inodeUsedPercent(rootUsage),
|
|
CPUCurrentPercent: currentCPU,
|
|
CPUAvg15mPercent: s.averageCPU(15 * time.Minute),
|
|
CPUAvg12hPercent: s.averageCPU(12 * time.Hour),
|
|
MemoryUsedPercent: virtualMemory.UsedPercent,
|
|
SwapUsedPercent: swapMemory.UsedPercent,
|
|
Load1: avg.Load1,
|
|
Load5: avg.Load5,
|
|
Load15: avg.Load15,
|
|
LoadPerCore: avg.Load15 / float64(coreCount),
|
|
RXBytesPerSecond: rxRate,
|
|
TXBytesPerSecond: txRate,
|
|
ProcessCount: len(processes),
|
|
HostedServiceCount: len(cfg.Sites),
|
|
UptimeSeconds: hostInfo.Uptime,
|
|
Sites: sites,
|
|
}, nil
|
|
}
|
|
|
|
func (s *Sampler) appendHistory(timestamp time.Time, cpuPercent float64) {
|
|
s.history = append(s.history, historyPoint{timestamp: timestamp, cpu: cpuPercent})
|
|
if len(s.history) > s.historyCap {
|
|
s.history = s.history[len(s.history)-s.historyCap:]
|
|
}
|
|
}
|
|
|
|
func (s *Sampler) averageCPU(window time.Duration) float64 {
|
|
if len(s.history) == 0 {
|
|
return 0
|
|
}
|
|
cutoff := time.Now().UTC().Add(-window)
|
|
total := 0.0
|
|
count := 0.0
|
|
for _, point := range s.history {
|
|
if point.timestamp.Before(cutoff) {
|
|
continue
|
|
}
|
|
total += point.cpu
|
|
count++
|
|
}
|
|
if count == 0 {
|
|
return 0
|
|
}
|
|
return total / count
|
|
}
|
|
|
|
func (s *Sampler) networkRates(ctx context.Context, now time.Time) (float64, float64) {
|
|
stats, err := gnet.IOCountersWithContext(ctx, false)
|
|
if err != nil || len(stats) == 0 {
|
|
return 0, 0
|
|
}
|
|
current := netIO{timestamp: now, rx: stats[0].BytesRecv, tx: stats[0].BytesSent}
|
|
if s.prevNet.timestamp.IsZero() {
|
|
s.prevNet = current
|
|
return 0, 0
|
|
}
|
|
seconds := current.timestamp.Sub(s.prevNet.timestamp).Seconds()
|
|
if seconds <= 0 {
|
|
return 0, 0
|
|
}
|
|
rxRate := float64(current.rx-s.prevNet.rx) / seconds
|
|
txRate := float64(current.tx-s.prevNet.tx) / seconds
|
|
s.prevNet = current
|
|
return rxRate, txRate
|
|
}
|
|
|
|
func (s *Sampler) checkSites(ctx context.Context, sites []config.Site) []SiteStatus {
|
|
results := make([]SiteStatus, 0, len(sites))
|
|
for _, site := range sites {
|
|
results = append(results, s.checkSite(ctx, site))
|
|
}
|
|
return results
|
|
}
|
|
|
|
func (s *Sampler) checkSite(parent context.Context, site config.Site) SiteStatus {
|
|
ctx, cancel := context.WithTimeout(parent, site.Timeout)
|
|
defer cancel()
|
|
|
|
start := time.Now()
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, site.URL, nil)
|
|
if err != nil {
|
|
return SiteStatus{Name: site.Name, URL: site.URL, ExpectedStatus: site.ExpectedStatus, ErrorMessage: err.Error()}
|
|
}
|
|
|
|
resp, err := s.httpClient.Do(req)
|
|
if err != nil {
|
|
return SiteStatus{Name: site.Name, URL: site.URL, ExpectedStatus: site.ExpectedStatus, ErrorMessage: normalizeError(err)}
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
status := SiteStatus{
|
|
Name: site.Name,
|
|
URL: site.URL,
|
|
Healthy: resp.StatusCode == site.ExpectedStatus,
|
|
StatusCode: resp.StatusCode,
|
|
Latency: time.Since(start),
|
|
ExpectedStatus: site.ExpectedStatus,
|
|
}
|
|
if !status.Healthy {
|
|
status.ErrorMessage = fmt.Sprintf("expected %d, got %d", site.ExpectedStatus, resp.StatusCode)
|
|
}
|
|
return status
|
|
}
|
|
|
|
// normalizeError returns err's message with a leading "Get " removed, if
// present. Errors returned by the HTTP client for a GET request typically
// begin with `Get "<url>": `, so this leaves the quoted URL and the cause.
// err must be non-nil.
func normalizeError(err error) string {
	return strings.TrimPrefix(err.Error(), "Get ")
}
|
|
|
|
func inodeUsedPercent(stat *disk.UsageStat) float64 {
|
|
if stat.InodesTotal == 0 {
|
|
return 0
|
|
}
|
|
used := stat.InodesTotal - stat.InodesFree
|
|
return float64(used) / float64(stat.InodesTotal) * 100
|
|
}
|