follow Prometheus best practices: metric renames, counter types, build_info, scrape duration

This commit is contained in:
illustris
2026-03-12 13:21:50 +05:30
parent f332a2f6ac
commit 00404095b9
4 changed files with 144 additions and 62 deletions

View File

@@ -1,6 +1,7 @@
package collector package collector
import ( import (
"fmt"
"log/slog" "log/slog"
"os" "os"
"os/exec" "os/exec"
@@ -51,6 +52,18 @@ type PVECollector struct {
descDiskSize *prometheus.Desc descDiskSize *prometheus.Desc
descStorageSize *prometheus.Desc descStorageSize *prometheus.Desc
descStorageFree *prometheus.Desc descStorageFree *prometheus.Desc
// IO counter descriptors (counters).
descIOReadCount *prometheus.Desc
descIOReadBytes *prometheus.Desc
descIOReadChars *prometheus.Desc
descIOWriteCount *prometheus.Desc
descIOWriteBytes *prometheus.Desc
descIOWriteChars *prometheus.Desc
// Operational metrics.
descScrapeDuration *prometheus.Desc
descBuildInfo *prometheus.Desc
} }
type poolData struct { type poolData struct {
@@ -119,18 +132,28 @@ func NewWithDeps(cfg config.Config, proc procfs.ProcReader, sys sysfs.SysReader,
fileReader: fr, fileReader: fr,
prefix: p, prefix: p,
descCPU: prometheus.NewDesc(p+"_kvm_cpu", "KVM CPU time", []string{"id", "mode"}, nil), descCPU: prometheus.NewDesc(p+"_kvm_cpu_seconds_total", "KVM CPU time", []string{"id", "mode"}, nil),
descVcores: prometheus.NewDesc(p+"_kvm_vcores", "vCores allocated", []string{"id"}, nil), descVcores: prometheus.NewDesc(p+"_kvm_vcores", "vCores allocated", []string{"id"}, nil),
descMaxmem: prometheus.NewDesc(p+"_kvm_maxmem", "Maximum memory bytes", []string{"id"}, nil), descMaxmem: prometheus.NewDesc(p+"_kvm_maxmem_bytes", "Maximum memory bytes", []string{"id"}, nil),
descMemPct: prometheus.NewDesc(p+"_kvm_memory_percent", "Memory percent of host", []string{"id"}, nil), descMemPct: prometheus.NewDesc(p+"_kvm_memory_percent", "Memory percent of host", []string{"id"}, nil),
descMemExt: prometheus.NewDesc(p+"_kvm_memory_extended", "Extended memory info", []string{"id", "type"}, nil), descMemExt: prometheus.NewDesc(p+"_kvm_memory_extended", "Extended memory info", []string{"id", "type"}, nil),
descThreads: prometheus.NewDesc(p+"_kvm_threads", "Threads used", []string{"id"}, nil), descThreads: prometheus.NewDesc(p+"_kvm_threads", "Threads used", []string{"id"}, nil),
descCtxSwitches: prometheus.NewDesc(p+"_kvm_ctx_switches", "Context switches", []string{"id", "type"}, nil), descCtxSwitches: prometheus.NewDesc(p+"_kvm_ctx_switches_total", "Context switches", []string{"id", "type"}, nil),
descNicInfo: prometheus.NewDesc(p+"_kvm_nic", "NIC info", []string{"id", "ifname", "netdev", "queues", "type", "model", "macaddr"}, nil), descNicInfo: prometheus.NewDesc(p+"_kvm_nic", "NIC info", []string{"id", "ifname", "netdev", "queues", "type", "model", "macaddr"}, nil),
descNicQueues: prometheus.NewDesc(p+"_kvm_nic_queues", "NIC queue count", []string{"id", "ifname"}, nil), descNicQueues: prometheus.NewDesc(p+"_kvm_nic_queues", "NIC queue count", []string{"id", "ifname"}, nil),
descDiskSize: prometheus.NewDesc(p+"_kvm_disk_size", "Disk size bytes", []string{"id", "disk_name"}, nil), descDiskSize: prometheus.NewDesc(p+"_kvm_disk_size_bytes", "Disk size bytes", []string{"id", "disk_name"}, nil),
descStorageSize: prometheus.NewDesc(p+"_node_storage_size", "Storage total size", []string{"name", "type"}, nil), descStorageSize: prometheus.NewDesc(p+"_node_storage_size_bytes", "Storage total size", []string{"name", "type"}, nil),
descStorageFree: prometheus.NewDesc(p+"_node_storage_free", "Storage free space", []string{"name", "type"}, nil), descStorageFree: prometheus.NewDesc(p+"_node_storage_free_bytes", "Storage free space", []string{"name", "type"}, nil),
descIOReadCount: prometheus.NewDesc(p+"_kvm_io_read_count_total", "Read system calls by KVM process", []string{"id"}, nil),
descIOReadBytes: prometheus.NewDesc(p+"_kvm_io_read_bytes_total", "Bytes read from disk by KVM process", []string{"id"}, nil),
descIOReadChars: prometheus.NewDesc(p+"_kvm_io_read_chars_total", "Bytes read including buffers by KVM process", []string{"id"}, nil),
descIOWriteCount: prometheus.NewDesc(p+"_kvm_io_write_count_total", "Write system calls by KVM process", []string{"id"}, nil),
descIOWriteBytes: prometheus.NewDesc(p+"_kvm_io_write_bytes_total", "Bytes written to disk by KVM process", []string{"id"}, nil),
descIOWriteChars: prometheus.NewDesc(p+"_kvm_io_write_chars_total", "Bytes written including buffers by KVM process", []string{"id"}, nil),
descScrapeDuration: prometheus.NewDesc(p+"_scrape_duration_seconds", "Duration of metrics collection", nil, nil),
descBuildInfo: prometheus.NewDesc(p+"_exporter_build_info", "Build information", []string{"version"}, nil),
} }
c.poolCache = cache.NewMtimeCache[poolData]("/etc/pve/user.cfg", fileMtime) c.poolCache = cache.NewMtimeCache[poolData]("/etc/pve/user.cfg", fileMtime)
c.storageCache = cache.NewMtimeCache[[]pveconfig.StorageEntry]("/etc/pve/storage.cfg", fileMtime) c.storageCache = cache.NewMtimeCache[[]pveconfig.StorageEntry]("/etc/pve/storage.cfg", fileMtime)
@@ -143,12 +166,17 @@ func (c *PVECollector) Describe(ch chan<- *prometheus.Desc) {
} }
func (c *PVECollector) Collect(ch chan<- prometheus.Metric) { func (c *PVECollector) Collect(ch chan<- prometheus.Metric) {
start := time.Now()
if c.cfg.CollectRunningVMs { if c.cfg.CollectRunningVMs {
c.collectVMs(ch) c.collectVMs(ch)
} }
if c.cfg.CollectStorage { if c.cfg.CollectStorage {
c.collectStorage(ch) c.collectStorage(ch)
} }
ch <- prometheus.MustNewConstMetric(c.descScrapeDuration, prometheus.GaugeValue, time.Since(start).Seconds())
ch <- prometheus.MustNewConstMetric(c.descBuildInfo, prometheus.GaugeValue, 1, c.cfg.Version)
} }
func (c *PVECollector) collectVMs(ch chan<- prometheus.Metric) { func (c *PVECollector) collectVMs(ch chan<- prometheus.Metric) {
@@ -203,7 +231,7 @@ func (c *PVECollector) collectVMMetrics(ch chan<- prometheus.Metric, proc procfs
{"system", cpu.System}, {"system", cpu.System},
{"iowait", cpu.IOWait}, {"iowait", cpu.IOWait},
} { } {
ch <- prometheus.MustNewConstMetric(c.descCPU, prometheus.GaugeValue, m.val, id, m.mode) ch <- prometheus.MustNewConstMetric(c.descCPU, prometheus.CounterValue, m.val, id, m.mode)
} }
} }
@@ -229,28 +257,18 @@ func (c *PVECollector) collectVMMetrics(ch chan<- prometheus.Metric, proc procfs
ch <- prometheus.MustNewConstMetric(c.descThreads, prometheus.GaugeValue, float64(status.Threads), id) ch <- prometheus.MustNewConstMetric(c.descThreads, prometheus.GaugeValue, float64(status.Threads), id)
// Context switches // Context switches
ch <- prometheus.MustNewConstMetric(c.descCtxSwitches, prometheus.GaugeValue, float64(status.CtxSwitches.Voluntary), id, "voluntary") ch <- prometheus.MustNewConstMetric(c.descCtxSwitches, prometheus.CounterValue, float64(status.CtxSwitches.Voluntary), id, "voluntary")
ch <- prometheus.MustNewConstMetric(c.descCtxSwitches, prometheus.GaugeValue, float64(status.CtxSwitches.Involuntary), id, "involuntary") ch <- prometheus.MustNewConstMetric(c.descCtxSwitches, prometheus.CounterValue, float64(status.CtxSwitches.Involuntary), id, "involuntary")
} }
// IO counters // IO counters
if io, err := c.proc.GetIOCounters(proc.PID); err == nil { if io, err := c.proc.GetIOCounters(proc.PID); err == nil {
for _, m := range []struct { ch <- prometheus.MustNewConstMetric(c.descIOReadCount, prometheus.CounterValue, float64(io.ReadSyscalls), id)
name string ch <- prometheus.MustNewConstMetric(c.descIOReadBytes, prometheus.CounterValue, float64(io.ReadBytes), id)
val uint64 ch <- prometheus.MustNewConstMetric(c.descIOReadChars, prometheus.CounterValue, float64(io.ReadChars), id)
}{ ch <- prometheus.MustNewConstMetric(c.descIOWriteCount, prometheus.CounterValue, float64(io.WriteSyscalls), id)
{"kvm_io_read_count", io.ReadSyscalls}, ch <- prometheus.MustNewConstMetric(c.descIOWriteBytes, prometheus.CounterValue, float64(io.WriteBytes), id)
{"kvm_io_read_bytes", io.ReadBytes}, ch <- prometheus.MustNewConstMetric(c.descIOWriteChars, prometheus.CounterValue, float64(io.WriteChars), id)
{"kvm_io_read_chars", io.ReadChars},
{"kvm_io_write_count", io.WriteSyscalls},
{"kvm_io_write_bytes", io.WriteBytes},
{"kvm_io_write_chars", io.WriteChars},
} {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(c.prefix+"_"+m.name, "", []string{"id"}, nil),
prometheus.GaugeValue, float64(m.val), id,
)
}
} }
// VM info metric // VM info metric
@@ -295,8 +313,8 @@ func (c *PVECollector) collectNICMetrics(ch chan<- prometheus.Metric, proc procf
} }
for statName, val := range stats { for statName, val := range stats {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(c.prefix+"_kvm_nic_"+statName, "", []string{"id", "ifname"}, nil), prometheus.NewDesc(c.prefix+"_kvm_nic_"+statName+"_total", fmt.Sprintf("NIC statistic %s", statName), []string{"id", "ifname"}, nil),
prometheus.GaugeValue, float64(val), id, nic.Ifname, prometheus.CounterValue, float64(val), id, nic.Ifname,
) )
} }
} }

View File

@@ -87,6 +87,17 @@ func (m *mockFileReader) ReadFile(path string) (string, error) {
return m.files[path], nil return m.files[path], nil
} }
// metricValue returns the sample value carried by m. The Gauge field is
// consulted first, then the Counter field; when neither is populated the
// result is 0.
func metricValue(m *dto.Metric) float64 {
	switch {
	case m.Gauge != nil:
		return m.Gauge.GetValue()
	case m.Counter != nil:
		return m.Counter.GetValue()
	default:
		return 0
	}
}
// collectMetrics collects all metrics from a collector into a map keyed by metric name. // collectMetrics collects all metrics from a collector into a map keyed by metric name.
func collectMetrics(c prometheus.Collector) map[string][]*dto.Metric { func collectMetrics(c prometheus.Collector) map[string][]*dto.Metric {
ch := make(chan prometheus.Metric, 200) ch := make(chan prometheus.Metric, 200)
@@ -182,54 +193,60 @@ func TestCollector_BasicVMMetrics(t *testing.T) {
c := NewWithDeps(cfg, proc, sys, qm, &mockStatFS{}, &mockCmdRunner{}, fr) c := NewWithDeps(cfg, proc, sys, qm, &mockStatFS{}, &mockCmdRunner{}, fr)
metrics := collectMetrics(c) metrics := collectMetrics(c)
// Check CPU metrics // Check CPU metrics (counter)
cpuMetrics := metrics["pve_kvm_cpu"] cpuMetrics := metrics["pve_kvm_cpu_seconds_total"]
if len(cpuMetrics) != 3 { if len(cpuMetrics) != 3 {
t.Fatalf("expected 3 cpu metrics, got %d", len(cpuMetrics)) t.Fatalf("expected 3 cpu metrics, got %d", len(cpuMetrics))
} }
m := findMetricWithLabels(cpuMetrics, map[string]string{"mode": "user"}) m := findMetricWithLabels(cpuMetrics, map[string]string{"mode": "user"})
if m == nil || m.Gauge.GetValue() != 5.0 { if m == nil || metricValue(m) != 5.0 {
t.Errorf("cpu user = %v", m) t.Errorf("cpu user = %v", m)
} }
m = findMetricWithLabels(cpuMetrics, map[string]string{"mode": "system"}) m = findMetricWithLabels(cpuMetrics, map[string]string{"mode": "system"})
if m == nil || m.Gauge.GetValue() != 2.0 { if m == nil || metricValue(m) != 2.0 {
t.Errorf("cpu system = %v", m) t.Errorf("cpu system = %v", m)
} }
m = findMetricWithLabels(cpuMetrics, map[string]string{"mode": "iowait"}) m = findMetricWithLabels(cpuMetrics, map[string]string{"mode": "iowait"})
if m == nil || m.Gauge.GetValue() != 0.5 { if m == nil || metricValue(m) != 0.5 {
t.Errorf("cpu iowait = %v", m) t.Errorf("cpu iowait = %v", m)
} }
// Check vcores // Check vcores
vcoreMetrics := metrics["pve_kvm_vcores"] vcoreMetrics := metrics["pve_kvm_vcores"]
if len(vcoreMetrics) != 1 || vcoreMetrics[0].Gauge.GetValue() != 4 { if len(vcoreMetrics) != 1 || metricValue(vcoreMetrics[0]) != 4 {
t.Errorf("vcores = %v", vcoreMetrics) t.Errorf("vcores = %v", vcoreMetrics)
} }
// Check threads // Check threads
threadMetrics := metrics["pve_kvm_threads"] threadMetrics := metrics["pve_kvm_threads"]
if len(threadMetrics) != 1 || threadMetrics[0].Gauge.GetValue() != 50 { if len(threadMetrics) != 1 || metricValue(threadMetrics[0]) != 50 {
t.Errorf("threads = %v", threadMetrics) t.Errorf("threads = %v", threadMetrics)
} }
// Check memory percent // Check memory percent
memPctMetrics := metrics["pve_kvm_memory_percent"] memPctMetrics := metrics["pve_kvm_memory_percent"]
if len(memPctMetrics) != 1 || memPctMetrics[0].Gauge.GetValue() != 25.5 { if len(memPctMetrics) != 1 || metricValue(memPctMetrics[0]) != 25.5 {
t.Errorf("memory_percent = %v", memPctMetrics) t.Errorf("memory_percent = %v", memPctMetrics)
} }
// Check IO // Check maxmem (renamed with _bytes)
if m := metrics["pve_kvm_io_read_count"]; len(m) != 1 || m[0].Gauge.GetValue() != 10 { maxmemMetrics := metrics["pve_kvm_maxmem_bytes"]
t.Errorf("io_read_count = %v", m) if len(maxmemMetrics) != 1 || metricValue(maxmemMetrics[0]) != float64(4194304*1024) {
} t.Errorf("maxmem_bytes = %v", maxmemMetrics)
if m := metrics["pve_kvm_io_write_bytes"]; len(m) != 1 || m[0].Gauge.GetValue() != 1000 {
t.Errorf("io_write_bytes = %v", m)
} }
// Check context switches // Check IO (counters, renamed with _total)
csMetrics := metrics["pve_kvm_ctx_switches"] if m := metrics["pve_kvm_io_read_count_total"]; len(m) != 1 || metricValue(m[0]) != 10 {
t.Errorf("io_read_count_total = %v", m)
}
if m := metrics["pve_kvm_io_write_bytes_total"]; len(m) != 1 || metricValue(m[0]) != 1000 {
t.Errorf("io_write_bytes_total = %v", m)
}
// Check context switches (counter, renamed with _total)
csMetrics := metrics["pve_kvm_ctx_switches_total"]
if len(csMetrics) != 2 { if len(csMetrics) != 2 {
t.Fatalf("expected 2 ctx_switches metrics, got %d", len(csMetrics)) t.Fatalf("expected 2 ctx_switches_total metrics, got %d", len(csMetrics))
} }
// Check VM info metric // Check VM info metric
@@ -241,6 +258,16 @@ func TestCollector_BasicVMMetrics(t *testing.T) {
if m == nil { if m == nil {
t.Error("kvm info metric not found with expected labels") t.Error("kvm info metric not found with expected labels")
} }
// Check scrape duration exists
if sd := metrics["pve_scrape_duration_seconds"]; len(sd) != 1 {
t.Errorf("expected 1 scrape_duration_seconds, got %d", len(sd))
}
// Check build info exists
if bi := metrics["pve_exporter_build_info"]; len(bi) != 1 {
t.Errorf("expected 1 build_info, got %d", len(bi))
}
} }
func TestCollector_StorageMetrics(t *testing.T) { func TestCollector_StorageMetrics(t *testing.T) {
@@ -267,16 +294,16 @@ func TestCollector_StorageMetrics(t *testing.T) {
metrics := collectMetrics(c) metrics := collectMetrics(c)
// Check storage size // Check storage size (renamed with _bytes)
sizeMetrics := metrics["pve_node_storage_size"] sizeMetrics := metrics["pve_node_storage_size_bytes"]
if len(sizeMetrics) != 1 || sizeMetrics[0].Gauge.GetValue() != 1e9 { if len(sizeMetrics) != 1 || metricValue(sizeMetrics[0]) != 1e9 {
t.Errorf("storage_size = %v", sizeMetrics) t.Errorf("storage_size_bytes = %v", sizeMetrics)
} }
// Check storage free // Check storage free (renamed with _bytes)
freeMetrics := metrics["pve_node_storage_free"] freeMetrics := metrics["pve_node_storage_free_bytes"]
if len(freeMetrics) != 1 || freeMetrics[0].Gauge.GetValue() != 5e8 { if len(freeMetrics) != 1 || metricValue(freeMetrics[0]) != 5e8 {
t.Errorf("storage_free = %v", freeMetrics) t.Errorf("storage_free_bytes = %v", freeMetrics)
} }
// Check storage info // Check storage info
@@ -326,14 +353,14 @@ func TestCollector_NICMetrics(t *testing.T) {
t.Fatalf("expected 1 nic info, got %d", len(nicInfo)) t.Fatalf("expected 1 nic info, got %d", len(nicInfo))
} }
// NIC stats // NIC stats (counter, renamed with _total)
rxBytes := metrics["pve_kvm_nic_rx_bytes"] rxBytes := metrics["pve_kvm_nic_rx_bytes_total"]
if len(rxBytes) != 1 || rxBytes[0].Gauge.GetValue() != 1000 { if len(rxBytes) != 1 || metricValue(rxBytes[0]) != 1000 {
t.Errorf("rx_bytes = %v", rxBytes) t.Errorf("rx_bytes_total = %v", rxBytes)
} }
txBytes := metrics["pve_kvm_nic_tx_bytes"] txBytes := metrics["pve_kvm_nic_tx_bytes_total"]
if len(txBytes) != 1 || txBytes[0].Gauge.GetValue() != 2000 { if len(txBytes) != 1 || metricValue(txBytes[0]) != 2000 {
t.Errorf("tx_bytes = %v", txBytes) t.Errorf("tx_bytes_total = %v", txBytes)
} }
} }
@@ -409,8 +436,43 @@ func TestCollector_ProcessDiscoveryError(t *testing.T) {
metrics := collectMetrics(c) metrics := collectMetrics(c)
// No VM metrics should be emitted // No VM metrics should be emitted, but scrape_duration + build_info are always present
if len(metrics) != 0 { expectedNames := map[string]bool{
t.Errorf("expected 0 metrics on discovery error, got %d metric names", len(metrics)) "pve_scrape_duration_seconds": true,
"pve_exporter_build_info": true,
}
for name := range metrics {
if !expectedNames[name] {
t.Errorf("unexpected metric %q on discovery error", name)
}
}
if len(metrics) != 2 {
t.Errorf("expected 2 metrics (scrape_duration + build_info) on discovery error, got %d", len(metrics))
}
}
func TestCollector_BuildInfo(t *testing.T) {
cfg := config.Config{
CollectRunningVMs: false,
CollectStorage: false,
MetricsPrefix: "pve",
Version: "1.2.3",
}
c := NewWithDeps(cfg, &mockProcReader{}, &mockSysReader{}, &mockQMMonitor{responses: map[string]string{}},
&mockStatFS{}, &mockCmdRunner{}, &mockFileReader{files: map[string]string{}})
metrics := collectMetrics(c)
bi := metrics["pve_exporter_build_info"]
if len(bi) != 1 {
t.Fatalf("expected 1 build_info metric, got %d", len(bi))
}
if metricValue(bi[0]) != 1 {
t.Errorf("build_info value = %v, want 1", metricValue(bi[0]))
}
m := findMetricWithLabels(bi, map[string]string{"version": "1.2.3"})
if m == nil {
t.Error("build_info missing version label")
} }
} }

View File

@@ -17,6 +17,7 @@ type Config struct {
QMRand time.Duration QMRand time.Duration
QMMonitorDeferClose bool QMMonitorDeferClose bool
ShowVersion bool ShowVersion bool
Version string
} }
func Parse() Config { func Parse() Config {

View File

@@ -21,6 +21,7 @@ var version string
func main() { func main() {
cfg := config.Parse() cfg := config.Parse()
cfg.Version = version
if cfg.ShowVersion { if cfg.ShowVersion {
fmt.Println(version) fmt.Println(version)