// Package metrics collects host resource metrics (disk, CPU, memory, load,
// network, processes) and HTTP health-check results for configured sites.
package metrics

import (
	"context"
	"fmt"
	"net"
	"net/http"
	"strings"
	"time"

	"github.com/shirou/gopsutil/v3/cpu"
	"github.com/shirou/gopsutil/v3/disk"
	"github.com/shirou/gopsutil/v3/host"
	"github.com/shirou/gopsutil/v3/load"
	"github.com/shirou/gopsutil/v3/mem"
	gnet "github.com/shirou/gopsutil/v3/net"
	"github.com/shirou/gopsutil/v3/process"

	"heartbeat/internal/config"
)

// gib is the number of bytes in one gibibyte (2^30).
const gib = 1024 * 1024 * 1024

// Sample is one snapshot of host metrics and site health produced by
// Sampler.Collect.
type Sample struct {
	// Timestamp is the UTC time taken at the start of Collect.
	Timestamp time.Time

	// Usage of the filesystem mounted at "/". RootFreeGB is the free byte
	// count divided by 2^30.
	RootUsedPercent  float64
	RootFreeGB       float64
	InodeUsedPercent float64

	// CPUCurrentPercent is the utilisation measured during this collection;
	// the two averages are computed from the Sampler's in-memory history.
	CPUCurrentPercent float64
	CPUAvg15mPercent  float64
	CPUAvg12hPercent  float64

	MemoryUsedPercent float64
	SwapUsedPercent   float64

	// Load averages as reported by gopsutil. LoadPerCore is Load15 divided
	// by the logical core count.
	Load1       float64
	Load5       float64
	Load15      float64
	LoadPerCore float64

	// Network throughput since the previous successful collection; both are
	// zero on the first collection.
	RXBytesPerSecond float64
	TXBytesPerSecond float64

	// ProcessCount is the number of processes listed by gopsutil;
	// HostedServiceCount is the number of configured sites.
	ProcessCount       int
	HostedServiceCount int
	UptimeSeconds      uint64

	// Sites holds one entry per configured site, in configuration order.
	Sites []SiteStatus
}

// SiteStatus is the outcome of a single HTTP health check.
//
// Healthy is true only when a response was received and its status code
// equals ExpectedStatus. StatusCode and Latency are populated only when a
// response was received; ErrorMessage is non-empty whenever Healthy is false.
type SiteStatus struct {
	Name           string
	URL            string
	Healthy        bool
	StatusCode     int
	Latency        time.Duration
	ErrorMessage   string
	ExpectedStatus int
}

// Sampler collects metrics and keeps the small amount of state needed between
// collections: the previous network counters and a bounded CPU history.
//
// The methods mutate that state without any locking.
type Sampler struct {
	httpClient *http.Client
	prevNet    netIO
	history    []historyPoint
	historyCap int
}

// netIO is a reading of the cumulative network byte counters at a point in
// time.
type netIO struct {
	timestamp time.Time
	rx        uint64
	tx        uint64
}

// historyPoint is one CPU utilisation reading kept for rolling averages.
type historyPoint struct {
	timestamp time.Time
	cpu       float64
}

// NewSampler returns a Sampler whose HTTP client applies timeout to dialing,
// to the TLS handshake, and to each request as a whole.
func NewSampler(timeout time.Duration) *Sampler {
	transport := &http.Transport{
		Proxy:               http.ProxyFromEnvironment,
		DialContext:         (&net.Dialer{Timeout: timeout}).DialContext,
		TLSHandshakeTimeout: timeout,
	}
	return &Sampler{
		httpClient: &http.Client{Timeout: timeout, Transport: transport},
		// 720 points. NOTE(review): this spans 12 hours only if Collect runs
		// about once a minute or less often — confirm the caller's interval.
		historyCap: 12 * 60,
	}
}

// Collect gathers one Sample of host metrics and checks every site in
// cfg.Sites.
//
// A failure reading disk, CPU, memory, swap, load, host info, or the process
// list aborts the collection and returns a wrapped error with a zero Sample.
// Network rates and site checks are best-effort and never fail the
// collection. CPU utilisation is requested from gopsutil with a one-second
// interval.
func (s *Sampler) Collect(ctx context.Context, cfg config.Config) (Sample, error) {
	now := time.Now().UTC()

	rootUsage, err := disk.UsageWithContext(ctx, "/")
	if err != nil {
		return Sample{}, fmt.Errorf("root usage: %w", err)
	}
	cpuPercents, err := cpu.PercentWithContext(ctx, time.Second, false)
	if err != nil {
		return Sample{}, fmt.Errorf("cpu percent: %w", err)
	}
	virtualMemory, err := mem.VirtualMemoryWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("memory: %w", err)
	}
	swapMemory, err := mem.SwapMemoryWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("swap: %w", err)
	}
	avg, err := load.AvgWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("load average: %w", err)
	}
	hostInfo, err := host.InfoWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("host info: %w", err)
	}
	processes, err := process.ProcessesWithContext(ctx)
	if err != nil {
		return Sample{}, fmt.Errorf("process list: %w", err)
	}

	rxRate, txRate := s.networkRates(ctx, now)
	sites := s.checkSites(ctx, cfg.Sites)

	currentCPU := 0.0
	if len(cpuPercents) > 0 {
		currentCPU = cpuPercents[0]
	}
	// Record the reading before computing averages so they include it.
	s.appendHistory(now, currentCPU)

	// Fall back to a single core so LoadPerCore never divides by zero.
	coreCount, err := cpu.CountsWithContext(ctx, true)
	if err != nil || coreCount <= 0 {
		coreCount = 1
	}

	return Sample{
		Timestamp:          now,
		RootUsedPercent:    rootUsage.UsedPercent,
		RootFreeGB:         float64(rootUsage.Free) / gib,
		InodeUsedPercent:   inodeUsedPercent(rootUsage),
		CPUCurrentPercent:  currentCPU,
		CPUAvg15mPercent:   s.averageCPU(15 * time.Minute),
		CPUAvg12hPercent:   s.averageCPU(12 * time.Hour),
		MemoryUsedPercent:  virtualMemory.UsedPercent,
		SwapUsedPercent:    swapMemory.UsedPercent,
		Load1:              avg.Load1,
		Load5:              avg.Load5,
		Load15:             avg.Load15,
		LoadPerCore:        avg.Load15 / float64(coreCount),
		RXBytesPerSecond:   rxRate,
		TXBytesPerSecond:   txRate,
		ProcessCount:       len(processes),
		HostedServiceCount: len(cfg.Sites),
		UptimeSeconds:      hostInfo.Uptime,
		Sites:              sites,
	}, nil
}

// appendHistory records a CPU reading and drops the oldest readings once more
// than historyCap are held.
func (s *Sampler) appendHistory(timestamp time.Time, cpuPercent float64) {
	s.history = append(s.history, historyPoint{timestamp: timestamp, cpu: cpuPercent})
	if len(s.history) > s.historyCap {
		s.history = s.history[len(s.history)-s.historyCap:]
	}
}

// averageCPU returns the mean of the recorded CPU readings whose timestamps
// are not older than window, measured back from the current time. It returns
// 0 when no reading falls inside the window.
func (s *Sampler) averageCPU(window time.Duration) float64 {
	if len(s.history) == 0 {
		return 0
	}
	cutoff := time.Now().UTC().Add(-window)
	total := 0.0
	count := 0.0
	for _, point := range s.history {
		if point.timestamp.Before(cutoff) {
			continue
		}
		total += point.cpu
		count++
	}
	if count == 0 {
		return 0
	}
	return total / count
}

// networkRates returns the receive and transmit rates in bytes per second
// since the previous successful reading, using the first entry returned by
// gopsutil's IOCounters.
//
// It returns (0, 0) when the counters cannot be read, on the first reading,
// and when no time has elapsed since the stored baseline. Whenever a reading
// succeeds it becomes the new baseline, including when the elapsed time is
// not positive: timestamps produced via Time.UTC carry no monotonic clock
// reading, so a backwards wall-clock step would otherwise leave a stale
// baseline in place and report zero until the clock caught up.
func (s *Sampler) networkRates(ctx context.Context, now time.Time) (float64, float64) {
	stats, err := gnet.IOCountersWithContext(ctx, false)
	if err != nil || len(stats) == 0 {
		return 0, 0
	}

	current := netIO{timestamp: now, rx: stats[0].BytesRecv, tx: stats[0].BytesSent}
	previous := s.prevNet
	s.prevNet = current

	if previous.timestamp.IsZero() {
		return 0, 0
	}
	seconds := current.timestamp.Sub(previous.timestamp).Seconds()
	if seconds <= 0 {
		return 0, 0
	}
	return counterRate(previous.rx, current.rx, seconds), counterRate(previous.tx, current.tx, seconds)
}

// counterRate converts two readings of a cumulative counter into a per-second
// rate. A counter that moved backwards (reset or wrapped) yields 0 rather
// than the huge value unsigned subtraction would otherwise produce.
func counterRate(previous, current uint64, seconds float64) float64 {
	if current < previous {
		return 0
	}
	return float64(current-previous) / seconds
}

// checkSites checks each site in turn and returns the results in the same
// order as sites.
func (s *Sampler) checkSites(ctx context.Context, sites []config.Site) []SiteStatus {
	results := make([]SiteStatus, 0, len(sites))
	for _, site := range sites {
		results = append(results, s.checkSite(ctx, site))
	}
	return results
}

// checkSite issues a GET request to site.URL and reports whether the response
// status equals site.ExpectedStatus.
//
// The request is bounded by site.Timeout when that is positive. A zero or
// negative timeout would create an already-expired context and fail every
// check, so in that case the parent context is used and the HTTP client's
// own timeout still applies.
func (s *Sampler) checkSite(parent context.Context, site config.Site) SiteStatus {
	ctx := parent
	if site.Timeout > 0 {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(parent, site.Timeout)
		defer cancel()
	}

	status := SiteStatus{Name: site.Name, URL: site.URL, ExpectedStatus: site.ExpectedStatus}

	start := time.Now()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, site.URL, nil)
	if err != nil {
		status.ErrorMessage = err.Error()
		return status
	}
	resp, err := s.httpClient.Do(req)
	if err != nil {
		status.ErrorMessage = normalizeError(err)
		return status
	}
	defer resp.Body.Close()

	// Latency is measured up to the point the response is returned by Do;
	// the body is not read.
	status.Latency = time.Since(start)
	status.StatusCode = resp.StatusCode
	status.Healthy = resp.StatusCode == site.ExpectedStatus
	if !status.Healthy {
		status.ErrorMessage = fmt.Sprintf("expected %d, got %d", site.ExpectedStatus, resp.StatusCode)
	}
	return status
}

// normalizeError returns err's message with a leading "Get " removed, which
// is the operation prefix net/http puts on errors from GET requests.
func normalizeError(err error) string {
	return strings.TrimPrefix(err.Error(), "Get ")
}

// inodeUsedPercent returns the percentage of inodes in use. It returns 0 when
// stat is nil, when the filesystem reports no inodes, or when the reported
// free count is not below the total (which would otherwise underflow the
// unsigned subtraction).
func inodeUsedPercent(stat *disk.UsageStat) float64 {
	if stat == nil || stat.InodesTotal == 0 {
		return 0
	}
	if stat.InodesFree >= stat.InodesTotal {
		return 0
	}
	used := stat.InodesTotal - stat.InodesFree
	return float64(used) / float64(stat.InodesTotal) * 100
}