This commit is contained in:
261
internal/metrics/metrics.go
Normal file
261
internal/metrics/metrics.go
Normal file
@@ -0,0 +1,261 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"heartbeat/internal/config"
|
||||
|
||||
"github.com/shirou/gopsutil/v3/cpu"
|
||||
"github.com/shirou/gopsutil/v3/disk"
|
||||
"github.com/shirou/gopsutil/v3/host"
|
||||
"github.com/shirou/gopsutil/v3/load"
|
||||
"github.com/shirou/gopsutil/v3/mem"
|
||||
gnet "github.com/shirou/gopsutil/v3/net"
|
||||
"github.com/shirou/gopsutil/v3/process"
|
||||
)
|
||||
|
||||
const gib = 1024 * 1024 * 1024
|
||||
|
||||
type Sample struct {
|
||||
Timestamp time.Time
|
||||
RootUsedPercent float64
|
||||
RootFreeGB float64
|
||||
InodeUsedPercent float64
|
||||
CPUCurrentPercent float64
|
||||
CPUAvg15mPercent float64
|
||||
CPUAvg12hPercent float64
|
||||
MemoryUsedPercent float64
|
||||
SwapUsedPercent float64
|
||||
Load1 float64
|
||||
Load5 float64
|
||||
Load15 float64
|
||||
LoadPerCore float64
|
||||
RXBytesPerSecond float64
|
||||
TXBytesPerSecond float64
|
||||
ProcessCount int
|
||||
HostedServiceCount int
|
||||
UptimeSeconds uint64
|
||||
Sites []SiteStatus
|
||||
}
|
||||
|
||||
type SiteStatus struct {
|
||||
Name string
|
||||
URL string
|
||||
Healthy bool
|
||||
StatusCode int
|
||||
Latency time.Duration
|
||||
ErrorMessage string
|
||||
ExpectedStatus int
|
||||
}
|
||||
|
||||
type Sampler struct {
|
||||
httpClient *http.Client
|
||||
prevNet netIO
|
||||
history []historyPoint
|
||||
historyCap int
|
||||
}
|
||||
|
||||
type netIO struct {
|
||||
timestamp time.Time
|
||||
rx uint64
|
||||
tx uint64
|
||||
}
|
||||
|
||||
type historyPoint struct {
|
||||
timestamp time.Time
|
||||
cpu float64
|
||||
}
|
||||
|
||||
func NewSampler(timeout time.Duration) *Sampler {
|
||||
transport := &http.Transport{
|
||||
Proxy: http.ProxyFromEnvironment,
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: timeout,
|
||||
}).DialContext,
|
||||
TLSHandshakeTimeout: timeout,
|
||||
}
|
||||
|
||||
return &Sampler{
|
||||
httpClient: &http.Client{Timeout: timeout, Transport: transport},
|
||||
historyCap: 12 * 60,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Sampler) Collect(ctx context.Context, cfg config.Config) (Sample, error) {
|
||||
now := time.Now().UTC()
|
||||
|
||||
rootUsage, err := disk.UsageWithContext(ctx, "/")
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("root usage: %w", err)
|
||||
}
|
||||
|
||||
cpuPercents, err := cpu.PercentWithContext(ctx, time.Second, false)
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("cpu percent: %w", err)
|
||||
}
|
||||
|
||||
virtualMemory, err := mem.VirtualMemoryWithContext(ctx)
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("memory: %w", err)
|
||||
}
|
||||
|
||||
swapMemory, err := mem.SwapMemoryWithContext(ctx)
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("swap: %w", err)
|
||||
}
|
||||
|
||||
avg, err := load.AvgWithContext(ctx)
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("load average: %w", err)
|
||||
}
|
||||
|
||||
hostInfo, err := host.InfoWithContext(ctx)
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("host info: %w", err)
|
||||
}
|
||||
|
||||
processes, err := process.ProcessesWithContext(ctx)
|
||||
if err != nil {
|
||||
return Sample{}, fmt.Errorf("process list: %w", err)
|
||||
}
|
||||
|
||||
rxRate, txRate := s.networkRates(ctx, now)
|
||||
sites := s.checkSites(ctx, cfg.Sites)
|
||||
|
||||
currentCPU := 0.0
|
||||
if len(cpuPercents) > 0 {
|
||||
currentCPU = cpuPercents[0]
|
||||
}
|
||||
|
||||
s.appendHistory(now, currentCPU)
|
||||
coreCount, err := cpu.CountsWithContext(ctx, true)
|
||||
if err != nil || coreCount == 0 {
|
||||
coreCount = 1
|
||||
}
|
||||
|
||||
return Sample{
|
||||
Timestamp: now,
|
||||
RootUsedPercent: rootUsage.UsedPercent,
|
||||
RootFreeGB: float64(rootUsage.Free) / gib,
|
||||
InodeUsedPercent: inodeUsedPercent(rootUsage),
|
||||
CPUCurrentPercent: currentCPU,
|
||||
CPUAvg15mPercent: s.averageCPU(15 * time.Minute),
|
||||
CPUAvg12hPercent: s.averageCPU(12 * time.Hour),
|
||||
MemoryUsedPercent: virtualMemory.UsedPercent,
|
||||
SwapUsedPercent: swapMemory.UsedPercent,
|
||||
Load1: avg.Load1,
|
||||
Load5: avg.Load5,
|
||||
Load15: avg.Load15,
|
||||
LoadPerCore: avg.Load15 / float64(coreCount),
|
||||
RXBytesPerSecond: rxRate,
|
||||
TXBytesPerSecond: txRate,
|
||||
ProcessCount: len(processes),
|
||||
HostedServiceCount: len(cfg.Sites),
|
||||
UptimeSeconds: hostInfo.Uptime,
|
||||
Sites: sites,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *Sampler) appendHistory(timestamp time.Time, cpuPercent float64) {
|
||||
s.history = append(s.history, historyPoint{timestamp: timestamp, cpu: cpuPercent})
|
||||
if len(s.history) > s.historyCap {
|
||||
s.history = s.history[len(s.history)-s.historyCap:]
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Sampler) averageCPU(window time.Duration) float64 {
|
||||
if len(s.history) == 0 {
|
||||
return 0
|
||||
}
|
||||
cutoff := time.Now().UTC().Add(-window)
|
||||
total := 0.0
|
||||
count := 0.0
|
||||
for _, point := range s.history {
|
||||
if point.timestamp.Before(cutoff) {
|
||||
continue
|
||||
}
|
||||
total += point.cpu
|
||||
count++
|
||||
}
|
||||
if count == 0 {
|
||||
return 0
|
||||
}
|
||||
return total / count
|
||||
}
|
||||
|
||||
func (s *Sampler) networkRates(ctx context.Context, now time.Time) (float64, float64) {
|
||||
stats, err := gnet.IOCountersWithContext(ctx, false)
|
||||
if err != nil || len(stats) == 0 {
|
||||
return 0, 0
|
||||
}
|
||||
current := netIO{timestamp: now, rx: stats[0].BytesRecv, tx: stats[0].BytesSent}
|
||||
if s.prevNet.timestamp.IsZero() {
|
||||
s.prevNet = current
|
||||
return 0, 0
|
||||
}
|
||||
seconds := current.timestamp.Sub(s.prevNet.timestamp).Seconds()
|
||||
if seconds <= 0 {
|
||||
return 0, 0
|
||||
}
|
||||
rxRate := float64(current.rx-s.prevNet.rx) / seconds
|
||||
txRate := float64(current.tx-s.prevNet.tx) / seconds
|
||||
s.prevNet = current
|
||||
return rxRate, txRate
|
||||
}
|
||||
|
||||
func (s *Sampler) checkSites(ctx context.Context, sites []config.Site) []SiteStatus {
|
||||
results := make([]SiteStatus, 0, len(sites))
|
||||
for _, site := range sites {
|
||||
results = append(results, s.checkSite(ctx, site))
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
||||
func (s *Sampler) checkSite(parent context.Context, site config.Site) SiteStatus {
|
||||
ctx, cancel := context.WithTimeout(parent, site.Timeout)
|
||||
defer cancel()
|
||||
|
||||
start := time.Now()
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, site.URL, nil)
|
||||
if err != nil {
|
||||
return SiteStatus{Name: site.Name, URL: site.URL, ExpectedStatus: site.ExpectedStatus, ErrorMessage: err.Error()}
|
||||
}
|
||||
|
||||
resp, err := s.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return SiteStatus{Name: site.Name, URL: site.URL, ExpectedStatus: site.ExpectedStatus, ErrorMessage: normalizeError(err)}
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
status := SiteStatus{
|
||||
Name: site.Name,
|
||||
URL: site.URL,
|
||||
Healthy: resp.StatusCode == site.ExpectedStatus,
|
||||
StatusCode: resp.StatusCode,
|
||||
Latency: time.Since(start),
|
||||
ExpectedStatus: site.ExpectedStatus,
|
||||
}
|
||||
if !status.Healthy {
|
||||
status.ErrorMessage = fmt.Sprintf("expected %d, got %d", site.ExpectedStatus, resp.StatusCode)
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
func normalizeError(err error) string {
|
||||
message := err.Error()
|
||||
message = strings.TrimPrefix(message, "Get ")
|
||||
return message
|
||||
}
|
||||
|
||||
func inodeUsedPercent(stat *disk.UsageStat) float64 {
|
||||
if stat.InodesTotal == 0 {
|
||||
return 0
|
||||
}
|
||||
used := stat.InodesTotal - stat.InodesFree
|
||||
return float64(used) / float64(stat.InodesTotal) * 100
|
||||
}
|
||||
Reference in New Issue
Block a user