diff --git a/src/internal/collector/collector.go b/src/internal/collector/collector.go index f0f5cfe..b7325a0 100644 --- a/src/internal/collector/collector.go +++ b/src/internal/collector/collector.go @@ -1,6 +1,7 @@ package collector import ( + "fmt" "log/slog" "os" "os/exec" @@ -51,6 +52,18 @@ type PVECollector struct { descDiskSize *prometheus.Desc descStorageSize *prometheus.Desc descStorageFree *prometheus.Desc + + // IO counter descriptors (counters). + descIOReadCount *prometheus.Desc + descIOReadBytes *prometheus.Desc + descIOReadChars *prometheus.Desc + descIOWriteCount *prometheus.Desc + descIOWriteBytes *prometheus.Desc + descIOWriteChars *prometheus.Desc + + // Operational metrics. + descScrapeDuration *prometheus.Desc + descBuildInfo *prometheus.Desc } type poolData struct { @@ -119,18 +132,28 @@ func NewWithDeps(cfg config.Config, proc procfs.ProcReader, sys sysfs.SysReader, fileReader: fr, prefix: p, - descCPU: prometheus.NewDesc(p+"_kvm_cpu", "KVM CPU time", []string{"id", "mode"}, nil), + descCPU: prometheus.NewDesc(p+"_kvm_cpu_seconds_total", "KVM CPU time", []string{"id", "mode"}, nil), descVcores: prometheus.NewDesc(p+"_kvm_vcores", "vCores allocated", []string{"id"}, nil), - descMaxmem: prometheus.NewDesc(p+"_kvm_maxmem", "Maximum memory bytes", []string{"id"}, nil), + descMaxmem: prometheus.NewDesc(p+"_kvm_maxmem_bytes", "Maximum memory bytes", []string{"id"}, nil), descMemPct: prometheus.NewDesc(p+"_kvm_memory_percent", "Memory percent of host", []string{"id"}, nil), descMemExt: prometheus.NewDesc(p+"_kvm_memory_extended", "Extended memory info", []string{"id", "type"}, nil), descThreads: prometheus.NewDesc(p+"_kvm_threads", "Threads used", []string{"id"}, nil), - descCtxSwitches: prometheus.NewDesc(p+"_kvm_ctx_switches", "Context switches", []string{"id", "type"}, nil), + descCtxSwitches: prometheus.NewDesc(p+"_kvm_ctx_switches_total", "Context switches", []string{"id", "type"}, nil), descNicInfo: prometheus.NewDesc(p+"_kvm_nic", "NIC info", []string{"id", "ifname", "netdev", "queues", "type", "model", "macaddr"}, nil), descNicQueues: prometheus.NewDesc(p+"_kvm_nic_queues", "NIC queue count", []string{"id", "ifname"}, nil), - descDiskSize: prometheus.NewDesc(p+"_kvm_disk_size", "Disk size bytes", []string{"id", "disk_name"}, nil), - descStorageSize: prometheus.NewDesc(p+"_node_storage_size", "Storage total size", []string{"name", "type"}, nil), - descStorageFree: prometheus.NewDesc(p+"_node_storage_free", "Storage free space", []string{"name", "type"}, nil), + descDiskSize: prometheus.NewDesc(p+"_kvm_disk_size_bytes", "Disk size bytes", []string{"id", "disk_name"}, nil), + descStorageSize: prometheus.NewDesc(p+"_node_storage_size_bytes", "Storage total size", []string{"name", "type"}, nil), + descStorageFree: prometheus.NewDesc(p+"_node_storage_free_bytes", "Storage free space", []string{"name", "type"}, nil), + + descIOReadCount: prometheus.NewDesc(p+"_kvm_io_read_count_total", "Read system calls by KVM process", []string{"id"}, nil), + descIOReadBytes: prometheus.NewDesc(p+"_kvm_io_read_bytes_total", "Bytes read from disk by KVM process", []string{"id"}, nil), + descIOReadChars: prometheus.NewDesc(p+"_kvm_io_read_chars_total", "Bytes read including buffers by KVM process", []string{"id"}, nil), + descIOWriteCount: prometheus.NewDesc(p+"_kvm_io_write_count_total", "Write system calls by KVM process", []string{"id"}, nil), + descIOWriteBytes: prometheus.NewDesc(p+"_kvm_io_write_bytes_total", "Bytes written to disk by KVM process", []string{"id"}, nil), + descIOWriteChars: prometheus.NewDesc(p+"_kvm_io_write_chars_total", "Bytes written including buffers by KVM process", []string{"id"}, nil), + + descScrapeDuration: prometheus.NewDesc(p+"_scrape_duration_seconds", "Duration of metrics collection", nil, nil), + descBuildInfo: prometheus.NewDesc(p+"_exporter_build_info", "Build information", []string{"version"}, nil), } c.poolCache = cache.NewMtimeCache[poolData]("/etc/pve/user.cfg", fileMtime) c.storageCache = cache.NewMtimeCache[[]pveconfig.StorageEntry]("/etc/pve/storage.cfg", fileMtime) @@ -143,12 +166,17 @@ func (c *PVECollector) Describe(ch chan<- *prometheus.Desc) { } func (c *PVECollector) Collect(ch chan<- prometheus.Metric) { + start := time.Now() + if c.cfg.CollectRunningVMs { c.collectVMs(ch) } if c.cfg.CollectStorage { c.collectStorage(ch) } + + ch <- prometheus.MustNewConstMetric(c.descScrapeDuration, prometheus.GaugeValue, time.Since(start).Seconds()) + ch <- prometheus.MustNewConstMetric(c.descBuildInfo, prometheus.GaugeValue, 1, c.cfg.Version) } func (c *PVECollector) collectVMs(ch chan<- prometheus.Metric) { @@ -203,7 +231,7 @@ func (c *PVECollector) collectVMMetrics(ch chan<- prometheus.Metric, proc procfs {"system", cpu.System}, {"iowait", cpu.IOWait}, } { - ch <- prometheus.MustNewConstMetric(c.descCPU, prometheus.GaugeValue, m.val, id, m.mode) + ch <- prometheus.MustNewConstMetric(c.descCPU, prometheus.CounterValue, m.val, id, m.mode) } } @@ -229,28 +257,18 @@ func (c *PVECollector) collectVMMetrics(ch chan<- prometheus.Metric, proc procfs ch <- prometheus.MustNewConstMetric(c.descThreads, prometheus.GaugeValue, float64(status.Threads), id) // Context switches - ch <- prometheus.MustNewConstMetric(c.descCtxSwitches, prometheus.GaugeValue, float64(status.CtxSwitches.Voluntary), id, "voluntary") - ch <- prometheus.MustNewConstMetric(c.descCtxSwitches, prometheus.GaugeValue, float64(status.CtxSwitches.Involuntary), id, "involuntary") + ch <- prometheus.MustNewConstMetric(c.descCtxSwitches, prometheus.CounterValue, float64(status.CtxSwitches.Voluntary), id, "voluntary") + ch <- prometheus.MustNewConstMetric(c.descCtxSwitches, prometheus.CounterValue, float64(status.CtxSwitches.Involuntary), id, "involuntary") } // IO counters if io, err := c.proc.GetIOCounters(proc.PID); err == nil { - for _, m := range []struct { - name string - val uint64 - }{ - {"kvm_io_read_count", io.ReadSyscalls}, - {"kvm_io_read_bytes", io.ReadBytes}, - {"kvm_io_read_chars", io.ReadChars}, - {"kvm_io_write_count", io.WriteSyscalls}, - {"kvm_io_write_bytes", io.WriteBytes}, - {"kvm_io_write_chars", io.WriteChars}, - } { - ch <- prometheus.MustNewConstMetric( - prometheus.NewDesc(c.prefix+"_"+m.name, "", []string{"id"}, nil), - prometheus.GaugeValue, float64(m.val), id, - ) - } + ch <- prometheus.MustNewConstMetric(c.descIOReadCount, prometheus.CounterValue, float64(io.ReadSyscalls), id) + ch <- prometheus.MustNewConstMetric(c.descIOReadBytes, prometheus.CounterValue, float64(io.ReadBytes), id) + ch <- prometheus.MustNewConstMetric(c.descIOReadChars, prometheus.CounterValue, float64(io.ReadChars), id) + ch <- prometheus.MustNewConstMetric(c.descIOWriteCount, prometheus.CounterValue, float64(io.WriteSyscalls), id) + ch <- prometheus.MustNewConstMetric(c.descIOWriteBytes, prometheus.CounterValue, float64(io.WriteBytes), id) + ch <- prometheus.MustNewConstMetric(c.descIOWriteChars, prometheus.CounterValue, float64(io.WriteChars), id) } // VM info metric @@ -295,8 +313,8 @@ func (c *PVECollector) collectNICMetrics(ch chan<- prometheus.Metric, proc procf } for statName, val := range stats { ch <- prometheus.MustNewConstMetric( - prometheus.NewDesc(c.prefix+"_kvm_nic_"+statName, "", []string{"id", "ifname"}, nil), - prometheus.GaugeValue, float64(val), id, nic.Ifname, + prometheus.NewDesc(c.prefix+"_kvm_nic_"+statName+"_total", fmt.Sprintf("NIC statistic %s", statName), []string{"id", "ifname"}, nil), + prometheus.CounterValue, float64(val), id, nic.Ifname, ) } } diff --git a/src/internal/collector/collector_test.go b/src/internal/collector/collector_test.go index 9d3cddf..39fef70 100644 --- a/src/internal/collector/collector_test.go +++ b/src/internal/collector/collector_test.go @@ -87,6 +87,17 @@ func (m *mockFileReader) ReadFile(path string) (string, error) { return m.files[path], nil } +// metricValue extracts the numeric value from a dto.Metric, whether it is a Gauge or Counter. +func metricValue(m *dto.Metric) float64 { + if m.Gauge != nil { + return m.Gauge.GetValue() + } + if m.Counter != nil { + return m.Counter.GetValue() + } + return 0 +} + // collectMetrics collects all metrics from a collector into a map keyed by metric name. func collectMetrics(c prometheus.Collector) map[string][]*dto.Metric { ch := make(chan prometheus.Metric, 200) @@ -182,54 +193,60 @@ func TestCollector_BasicVMMetrics(t *testing.T) { c := NewWithDeps(cfg, proc, sys, qm, &mockStatFS{}, &mockCmdRunner{}, fr) metrics := collectMetrics(c) - // Check CPU metrics - cpuMetrics := metrics["pve_kvm_cpu"] + // Check CPU metrics (counter) + cpuMetrics := metrics["pve_kvm_cpu_seconds_total"] if len(cpuMetrics) != 3 { t.Fatalf("expected 3 cpu metrics, got %d", len(cpuMetrics)) } m := findMetricWithLabels(cpuMetrics, map[string]string{"mode": "user"}) - if m == nil || m.Gauge.GetValue() != 5.0 { + if m == nil || metricValue(m) != 5.0 { t.Errorf("cpu user = %v", m) } m = findMetricWithLabels(cpuMetrics, map[string]string{"mode": "system"}) - if m == nil || m.Gauge.GetValue() != 2.0 { + if m == nil || metricValue(m) != 2.0 { t.Errorf("cpu system = %v", m) } m = findMetricWithLabels(cpuMetrics, map[string]string{"mode": "iowait"}) - if m == nil || m.Gauge.GetValue() != 0.5 { + if m == nil || metricValue(m) != 0.5 { t.Errorf("cpu iowait = %v", m) } // Check vcores vcoreMetrics := metrics["pve_kvm_vcores"] - if len(vcoreMetrics) != 1 || vcoreMetrics[0].Gauge.GetValue() != 4 { + if len(vcoreMetrics) != 1 || metricValue(vcoreMetrics[0]) != 4 { t.Errorf("vcores = %v", vcoreMetrics) } // Check threads threadMetrics := metrics["pve_kvm_threads"] - if len(threadMetrics) != 1 || threadMetrics[0].Gauge.GetValue() != 50 { + if len(threadMetrics) != 1 || metricValue(threadMetrics[0]) != 50 { t.Errorf("threads = %v", threadMetrics) } // Check memory percent memPctMetrics := metrics["pve_kvm_memory_percent"] - if len(memPctMetrics) != 1 || memPctMetrics[0].Gauge.GetValue() != 25.5 { + if len(memPctMetrics) != 1 || metricValue(memPctMetrics[0]) != 25.5 { t.Errorf("memory_percent = %v", memPctMetrics) } - // Check IO - if m := metrics["pve_kvm_io_read_count"]; len(m) != 1 || m[0].Gauge.GetValue() != 10 { - t.Errorf("io_read_count = %v", m) - } - if m := metrics["pve_kvm_io_write_bytes"]; len(m) != 1 || m[0].Gauge.GetValue() != 1000 { - t.Errorf("io_write_bytes = %v", m) + // Check maxmem (renamed with _bytes) + maxmemMetrics := metrics["pve_kvm_maxmem_bytes"] + if len(maxmemMetrics) != 1 || metricValue(maxmemMetrics[0]) != float64(4194304*1024) { + t.Errorf("maxmem_bytes = %v", maxmemMetrics) } - // Check context switches - csMetrics := metrics["pve_kvm_ctx_switches"] + // Check IO (counters, renamed with _total) + if m := metrics["pve_kvm_io_read_count_total"]; len(m) != 1 || metricValue(m[0]) != 10 { + t.Errorf("io_read_count_total = %v", m) + } + if m := metrics["pve_kvm_io_write_bytes_total"]; len(m) != 1 || metricValue(m[0]) != 1000 { + t.Errorf("io_write_bytes_total = %v", m) + } + + // Check context switches (counter, renamed with _total) + csMetrics := metrics["pve_kvm_ctx_switches_total"] if len(csMetrics) != 2 { - t.Fatalf("expected 2 ctx_switches metrics, got %d", len(csMetrics)) + t.Fatalf("expected 2 ctx_switches_total metrics, got %d", len(csMetrics)) } // Check VM info metric @@ -241,6 +258,16 @@ func TestCollector_BasicVMMetrics(t *testing.T) { if m == nil { t.Error("kvm info metric not found with expected labels") } + + // Check scrape duration exists + if sd := metrics["pve_scrape_duration_seconds"]; len(sd) != 1 { + t.Errorf("expected 1 scrape_duration_seconds, got %d", len(sd)) + } + + // Check build info exists + if bi := metrics["pve_exporter_build_info"]; len(bi) != 1 { + t.Errorf("expected 1 build_info, got %d", len(bi)) + } } func TestCollector_StorageMetrics(t *testing.T) { @@ -267,16 +294,16 @@ func TestCollector_StorageMetrics(t *testing.T) { metrics := collectMetrics(c) - // Check storage size - sizeMetrics := metrics["pve_node_storage_size"] - if len(sizeMetrics) != 1 || sizeMetrics[0].Gauge.GetValue() != 1e9 { - t.Errorf("storage_size = %v", sizeMetrics) + // Check storage size (renamed with _bytes) + sizeMetrics := metrics["pve_node_storage_size_bytes"] + if len(sizeMetrics) != 1 || metricValue(sizeMetrics[0]) != 1e9 { + t.Errorf("storage_size_bytes = %v", sizeMetrics) } - // Check storage free - freeMetrics := metrics["pve_node_storage_free"] - if len(freeMetrics) != 1 || freeMetrics[0].Gauge.GetValue() != 5e8 { - t.Errorf("storage_free = %v", freeMetrics) + // Check storage free (renamed with _bytes) + freeMetrics := metrics["pve_node_storage_free_bytes"] + if len(freeMetrics) != 1 || metricValue(freeMetrics[0]) != 5e8 { + t.Errorf("storage_free_bytes = %v", freeMetrics) } // Check storage info @@ -326,14 +353,14 @@ func TestCollector_NICMetrics(t *testing.T) { t.Fatalf("expected 1 nic info, got %d", len(nicInfo)) } - // NIC stats - rxBytes := metrics["pve_kvm_nic_rx_bytes"] - if len(rxBytes) != 1 || rxBytes[0].Gauge.GetValue() != 1000 { - t.Errorf("rx_bytes = %v", rxBytes) + // NIC stats (counter, renamed with _total) + rxBytes := metrics["pve_kvm_nic_rx_bytes_total"] + if len(rxBytes) != 1 || metricValue(rxBytes[0]) != 1000 { + t.Errorf("rx_bytes_total = %v", rxBytes) } - txBytes := metrics["pve_kvm_nic_tx_bytes"] - if len(txBytes) != 1 || txBytes[0].Gauge.GetValue() != 2000 { - t.Errorf("tx_bytes = %v", txBytes) + txBytes := metrics["pve_kvm_nic_tx_bytes_total"] + if len(txBytes) != 1 || metricValue(txBytes[0]) != 2000 { + t.Errorf("tx_bytes_total = %v", txBytes) } } @@ -409,8 +436,43 @@ func TestCollector_ProcessDiscoveryError(t *testing.T) { metrics := collectMetrics(c) - // No VM metrics should be emitted - if len(metrics) != 0 { - t.Errorf("expected 0 metrics on discovery error, got %d metric names", len(metrics)) + // No VM metrics should be emitted, but scrape_duration + build_info are always present + expectedNames := map[string]bool{ + "pve_scrape_duration_seconds": true, + "pve_exporter_build_info": true, + } + for name := range metrics { + if !expectedNames[name] { + t.Errorf("unexpected metric %q on discovery error", name) + } + } + if len(metrics) != 2 { + t.Errorf("expected 2 metrics (scrape_duration + build_info) on discovery error, got %d", len(metrics)) + } +} + +func TestCollector_BuildInfo(t *testing.T) { + cfg := config.Config{ + CollectRunningVMs: false, + CollectStorage: false, + MetricsPrefix: "pve", + Version: "1.2.3", + } + + c := NewWithDeps(cfg, &mockProcReader{}, &mockSysReader{}, &mockQMMonitor{responses: map[string]string{}}, + &mockStatFS{}, &mockCmdRunner{}, &mockFileReader{files: map[string]string{}}) + + metrics := collectMetrics(c) + + bi := metrics["pve_exporter_build_info"] + if len(bi) != 1 { + t.Fatalf("expected 1 build_info metric, got %d", len(bi)) + } + if metricValue(bi[0]) != 1 { + t.Errorf("build_info value = %v, want 1", metricValue(bi[0])) + } + m := findMetricWithLabels(bi, map[string]string{"version": "1.2.3"}) + if m == nil { + t.Error("build_info missing version label") } } diff --git a/src/internal/config/config.go b/src/internal/config/config.go index 2b79064..f774733 100644 --- a/src/internal/config/config.go +++ b/src/internal/config/config.go @@ -17,6 +17,7 @@ type Config struct { QMRand time.Duration QMMonitorDeferClose bool ShowVersion bool + Version string } func Parse() Config { diff --git a/src/main.go b/src/main.go index 63a754c..f10b000 100644 --- a/src/main.go +++ b/src/main.go @@ -21,6 +21,7 @@ var version string func main() { cfg := config.Parse() + cfg.Version = version if cfg.ShowVersion { fmt.Println(version)