diff --git a/src/internal/collector/collector.go b/src/internal/collector/collector.go index f4394ff..cf373b4 100644 --- a/src/internal/collector/collector.go +++ b/src/internal/collector/collector.go @@ -5,6 +5,7 @@ import ( "log/slog" "os" "os/exec" + "path/filepath" "slices" "strconv" "strings" @@ -15,6 +16,7 @@ import ( "pve_local_exporter/internal/cache" "pve_local_exporter/internal/config" + "pve_local_exporter/internal/logging" "pve_local_exporter/internal/procfs" "pve_local_exporter/internal/pveconfig" "pve_local_exporter/internal/qmmonitor" @@ -191,9 +193,11 @@ func (c *PVECollector) collectVMs(ch chan<- prometheus.Metric) { slog.Error("discover QEMU processes", "err", err) return } + logging.Trace("collectVMs", "vm_count", len(procs)) // Load pool info vmPoolMap, pools := c.getPoolInfo() + logging.Trace("pool info loaded", "vm_pool_map_size", len(vmPoolMap), "pools_count", len(pools)) for _, proc := range procs { c.collectVMMetrics(ch, proc, vmPoolMap, pools) @@ -299,8 +303,10 @@ func (c *PVECollector) collectNICMetrics(ch chan<- prometheus.Metric, proc procf slog.Error("qm info network", "vmid", id, "err", err) return } + logging.Trace("qm info network response", "vmid", id, "raw_len", len(raw)) nics := qmmonitor.ParseNetworkInfo(raw) + logging.Trace("parsed NICs", "vmid", id, "nic_count", len(nics)) for _, nic := range nics { // NIC info metric ch <- prometheus.MustNewConstMetric(c.descNicInfo, prometheus.GaugeValue, 1, @@ -334,14 +340,16 @@ func (c *PVECollector) collectDiskMetrics(ch chan<- prometheus.Metric, proc proc slog.Error("qm info block", "vmid", id, "err", err) return } + logging.Trace("qm info block response", "vmid", id, "raw_len", len(raw)) disks := qmmonitor.ParseBlockInfo(raw) + logging.Trace("parsed disks", "vmid", id, "disk_count", len(disks)) for diskName, disk := range disks { // Try to get device symlink target for zvol/rbd/lvm if disk.DiskType == "zvol" || disk.DiskType == "rbd" || disk.DiskType == "lvm" { target, err := sysfs.GetDeviceSymlinkTarget(disk.DiskPath) if err == nil { - disk.Labels["device"] = target + disk.Labels["device"] = filepath.Base(target) } else { slog.Debug("resolve device symlink", "path", disk.DiskPath, "err", err) // Retry with cache invalidation @@ -365,6 +373,9 @@ func (c *PVECollector) collectDiskMetrics(ch chan<- prometheus.Metric, proc proc } } + logging.Trace("disk metric", "vmid", id, "disk", diskName, "type", disk.DiskType, + "path", disk.DiskPath, "size", diskSize, "device", disk.Labels["device"]) + if diskSize > 0 { ch <- prometheus.MustNewConstMetric(c.descDiskSize, prometheus.GaugeValue, float64(diskSize), id, diskName) } @@ -392,6 +403,7 @@ func (c *PVECollector) collectDiskMetrics(ch chan<- prometheus.Metric, proc proc func (c *PVECollector) collectStorage(ch chan<- prometheus.Metric) { entries := c.getStorageEntries() + logging.Trace("collectStorage", "entries_count", len(entries)) // Compute superset of property keys across all entries keySet := make(map[string]struct{}) @@ -410,6 +422,7 @@ func (c *PVECollector) collectStorage(ch chan<- prometheus.Metric) { for _, entry := range entries { storageType := entry.Properties["type"] storageName := entry.Properties["name"] + logging.Trace("storage entry", "name", storageName, "type", storageType) // Info metric with consistent labels vals := make([]string, len(allKeys)) @@ -438,7 +451,7 @@ func (c *PVECollector) collectStorage(ch chan<- prometheus.Metric) { poolName := strings.Split(pool, "/")[0] out, runErr := c.cmdRunner.Run("zpool", "list", "-p", poolName) if runErr != nil { - slog.Error("zpool list", "pool", poolName, "err", runErr) + slog.Warn("zpool list", "pool", poolName, "err", runErr) continue } size, err = storage.GetZPoolSize(out) diff --git a/src/internal/logging/logging.go b/src/internal/logging/logging.go new file mode 100644 index 0000000..20f6a93 --- /dev/null +++ b/src/internal/logging/logging.go @@ -0,0 +1,14 @@ +package logging + +import ( + "context" + "log/slog" +) + +// LevelTrace is a custom log level below Debug for very detailed diagnostic output. +const LevelTrace = slog.Level(-8) + +// Trace logs a message at TRACE level using the default logger. +func Trace(msg string, args ...any) { + slog.Default().Log(context.Background(), LevelTrace, msg, args...) +} diff --git a/src/internal/procfs/procfs.go b/src/internal/procfs/procfs.go index 73b54c3..e801b84 100644 --- a/src/internal/procfs/procfs.go +++ b/src/internal/procfs/procfs.go @@ -2,10 +2,13 @@ package procfs import ( "fmt" + "log/slog" "os" "path/filepath" "strconv" "strings" + + "pve_local_exporter/internal/logging" ) const clkTck = 100 // sysconf(_SC_CLK_TCK) on Linux @@ -83,6 +86,7 @@ func (r *RealProcReader) DiscoverQEMUProcesses() ([]QEMUProcess, error) { return nil, err } + numericPIDs := 0 var procs []QEMUProcess for _, e := range entries { if !e.IsDir() { @@ -92,26 +96,32 @@ func (r *RealProcReader) DiscoverQEMUProcesses() ([]QEMUProcess, error) { if err != nil { continue } + numericPIDs++ exe, err := os.Readlink(filepath.Join(r.ProcPath, e.Name(), "exe")) if err != nil { + logging.Trace("proc readlink failed", "pid", pid, "err", err) continue } if exe != "/usr/bin/qemu-system-x86_64" { + logging.Trace("proc exe skip", "pid", pid, "exe", exe) continue } cmdlineBytes, err := os.ReadFile(filepath.Join(r.ProcPath, e.Name(), "cmdline")) if err != nil { + logging.Trace("proc cmdline read failed", "pid", pid, "err", err) continue } cmdline := ParseCmdline(cmdlineBytes) vmid := FlagValue(cmdline, "-id") if vmid == "" { + logging.Trace("proc no -id flag", "pid", pid) continue } if !r.VMConfigExists(vmid) { + logging.Trace("proc no config", "pid", pid, "vmid", vmid) continue } @@ -123,8 +133,14 @@ func (r *RealProcReader) DiscoverQEMUProcesses() ([]QEMUProcess, error) { } proc.Vcores = ParseVcores(cmdline) proc.MaxMem = ParseMem(cmdline) + logging.Trace("proc discovered VM", "pid", pid, "vmid", vmid, "name", proc.Name) procs = append(procs, proc) } + + logging.Trace("proc scan complete", "numeric_pids", numericPIDs, "qemu_count", len(procs)) + if len(procs) == 0 { + slog.Warn("no QEMU processes discovered", "numeric_pids", numericPIDs, "proc_path", r.ProcPath, "pve_cfg_path", r.PVECfgPath) + } return procs, nil } diff --git a/src/internal/pveconfig/storage.go b/src/internal/pveconfig/storage.go index 2e4e5a9..7365a19 100644 --- a/src/internal/pveconfig/storage.go +++ b/src/internal/pveconfig/storage.go @@ -3,6 +3,8 @@ package pveconfig import ( "regexp" "strings" + + "pve_local_exporter/internal/logging" ) // StorageEntry holds a parsed storage definition from storage.cfg. @@ -47,6 +49,7 @@ func ParseStorageConfig(data string) []StorageEntry { "name": SanitizeKey(sectionName), }, } + logging.Trace("storage.cfg section", "type", sectionType, "name", sectionName) continue } @@ -66,5 +69,6 @@ func ParseStorageConfig(data string) []StorageEntry { result = append(result, *current) } + logging.Trace("ParseStorageConfig complete", "entries", len(result)) return result } diff --git a/src/internal/qmmonitor/block.go b/src/internal/qmmonitor/block.go index fa51f3a..ff13a13 100644 --- a/src/internal/qmmonitor/block.go +++ b/src/internal/qmmonitor/block.go @@ -6,6 +6,8 @@ import ( "log/slog" "regexp" "strings" + + "pve_local_exporter/internal/logging" ) // DiskInfo holds parsed block device info from "info block". @@ -44,6 +46,7 @@ func ParseBlockInfo(raw string) map[string]DiskInfo { match := blockHeaderRe.FindStringSubmatch(strings.TrimSpace(lines[0])) if match == nil { + logging.Trace("block header no match", "line", lines[0]) continue } @@ -69,8 +72,10 @@ func ParseBlockInfo(raw string) map[string]DiskInfo { if strings.HasPrefix(diskPath, "json:") { resolved, err := HandleJSONPath(diskPath) if err != nil { + logging.Trace("block json path error", "disk", diskName, "err", err) continue } + logging.Trace("block json resolved", "disk", diskName, "resolved", resolved) diskPath = resolved } @@ -88,6 +93,7 @@ func ParseBlockInfo(raw string) map[string]DiskInfo { // Detect disk type from path classifyDisk(&info) + logging.Trace("block classified", "disk", diskName, "type", info.DiskType, "path", diskPath) // Parse additional info from remaining lines for _, line := range lines[1:] { @@ -95,6 +101,13 @@ func ParseBlockInfo(raw string) map[string]DiskInfo { if strings.HasPrefix(line, "Attached to:") { // Extract device ID, e.g. "Attached to: /machine/peripheral/virtio0/virtio-backend" val := strings.TrimSpace(strings.TrimPrefix(line, "Attached to:")) + // Extract short device name from QOM path + if strings.Contains(val, "/") { + qomParts := strings.Split(val, "/") + if len(qomParts) > 3 { + val = qomParts[3] + } + } info.Labels["attached_to"] = val } else if strings.HasPrefix(line, "Cache mode:") { val := strings.TrimSpace(strings.TrimPrefix(line, "Cache mode:")) @@ -105,9 +118,11 @@ func ParseBlockInfo(raw string) map[string]DiskInfo { } } + logging.Trace("block parsed", "disk", diskName, "labels", info.Labels) result[diskName] = info } + logging.Trace("ParseBlockInfo complete", "disk_count", len(result)) if len(result) == 0 && raw != "" { slog.Debug("ParseBlockInfo found no disks", "rawLen", len(raw)) } diff --git a/src/internal/qmmonitor/block_test.go b/src/internal/qmmonitor/block_test.go index 7cc259a..e45961c 100644 --- a/src/internal/qmmonitor/block_test.go +++ b/src/internal/qmmonitor/block_test.go @@ -218,6 +218,39 @@ func TestParseBlockInfo_DetectZeroesUnmap(t *testing.T) { } } +func TestParseBlockInfo_AttachedToVirtio(t *testing.T) { + raw := `drive-scsi0 (#block100): /dev/zvol/rpool/data/vm-100-disk-0 (raw, read-write) + Attached to: /machine/peripheral/virtio0/virtio-backend +` + disks := ParseBlockInfo(raw) + d := disks["scsi0"] + if d.Labels["attached_to"] != "virtio0" { + t.Errorf("attached_to = %q, want %q", d.Labels["attached_to"], "virtio0") + } +} + +func TestParseBlockInfo_AttachedToVirtioScsi(t *testing.T) { + raw := `drive-scsi0 (#block100): /dev/zvol/rpool/data/vm-100-disk-0 (raw, read-write) + Attached to: /machine/peripheral/virtioscsi0/virtio-backend +` + disks := ParseBlockInfo(raw) + d := disks["scsi0"] + if d.Labels["attached_to"] != "virtioscsi0" { + t.Errorf("attached_to = %q, want %q", d.Labels["attached_to"], "virtioscsi0") + } +} + +func TestParseBlockInfo_AttachedToBare(t *testing.T) { + raw := `drive-ide2 (#block100): /path/to/disk.iso (raw, read-only) + Attached to: ide2 +` + disks := ParseBlockInfo(raw) + d := disks["ide2"] + if d.Labels["attached_to"] != "ide2" { + t.Errorf("attached_to = %q, want %q", d.Labels["attached_to"], "ide2") + } +} + func TestParseBlockInfo_MultiDisk(t *testing.T) { raw := `drive-scsi0 (#block100): /dev/zvol/rpool/data/vm-100-disk-0 (raw, read-write) Attached to: /machine/peripheral/virtioscsi0/virtio-backend diff --git a/src/internal/qmmonitor/network.go b/src/internal/qmmonitor/network.go index 69342d7..b052ba6 100644 --- a/src/internal/qmmonitor/network.go +++ b/src/internal/qmmonitor/network.go @@ -3,6 +3,8 @@ package qmmonitor import ( "strconv" "strings" + + "pve_local_exporter/internal/logging" ) // NICInfo holds parsed network interface info from "info network". @@ -64,14 +66,17 @@ func ParseNetworkInfo(raw string) []NICInfo { var result []NICInfo for netdev, cfg := range nicsMap { idx, _ := strconv.Atoi(cfg["index"]) - result = append(result, NICInfo{ + nic := NICInfo{ Netdev: netdev, Queues: idx + 1, Type: cfg["type"], Model: cfg["model"], Macaddr: cfg["macaddr"], Ifname: cfg["ifname"], - }) + } + logging.Trace("parsed NIC", "netdev", netdev, "ifname", nic.Ifname, "queues", nic.Queues, "model", nic.Model) + result = append(result, nic) } + logging.Trace("ParseNetworkInfo complete", "nic_count", len(result)) return result } diff --git a/src/internal/qmmonitor/qmmonitor.go b/src/internal/qmmonitor/qmmonitor.go index a43c94b..29ce5cd 100644 --- a/src/internal/qmmonitor/qmmonitor.go +++ b/src/internal/qmmonitor/qmmonitor.go @@ -14,6 +14,7 @@ import ( "github.com/creack/pty" "pve_local_exporter/internal/cache" + "pve_local_exporter/internal/logging" ) var errTimeout = errors.New("timeout waiting for qm monitor") @@ -75,6 +76,7 @@ func (m *RealQMMonitor) execQMMonitor(vmid, cmd string) (string, error) { slog.Debug("qm monitor exec", "vmid", vmid, "cmd", cmd) start := time.Now() + logging.Trace("qm pty spawn start", "vmid", vmid) qmCmd := exec.Command("qm", "monitor", vmid) qmCmd.Env = append(os.Environ(), "TERM=dumb") @@ -82,6 +84,7 @@ func (m *RealQMMonitor) execQMMonitor(vmid, cmd string) (string, error) { if err != nil { return "", fmt.Errorf("start qm monitor: %w", err) } + logging.Trace("qm pty spawn success", "vmid", vmid, "pid", qmCmd.Process.Pid) reader := bufio.NewReader(ptmx) @@ -93,8 +96,10 @@ func (m *RealQMMonitor) execQMMonitor(vmid, cmd string) (string, error) { m.killOrDefer(qmCmd, ptmx) return "", fmt.Errorf("initial prompt: %w", err) } + logging.Trace("qm initial prompt received", "vmid", vmid) // Send command + logging.Trace("qm send command", "vmid", vmid, "cmd", cmd) fmt.Fprintf(ptmx, "%s\n", cmd) // Read response until next "qm>" prompt @@ -105,6 +110,7 @@ func (m *RealQMMonitor) execQMMonitor(vmid, cmd string) (string, error) { m.killOrDefer(qmCmd, ptmx) return "", fmt.Errorf("read response: %w", err) } + logging.Trace("qm raw response", "vmid", vmid, "raw_len", len(raw)) response := parseQMResponse(raw) diff --git a/src/internal/storage/storage.go b/src/internal/storage/storage.go index 2a84507..5c0f992 100644 --- a/src/internal/storage/storage.go +++ b/src/internal/storage/storage.go @@ -5,6 +5,8 @@ import ( "strconv" "strings" "syscall" + + "pve_local_exporter/internal/logging" ) // StorageSize holds the total and free bytes of a storage pool. @@ -51,6 +53,8 @@ func GetZPoolSize(output string) (StorageSize, error) { return StorageSize{}, fmt.Errorf("not enough fields in zpool output: %q", lines[1]) } + logging.Trace("zpool fields", "name", fields[0], "size", fields[1], "alloc", fields[2], "free", fields[3]) + total, err := strconv.ParseInt(fields[1], 10, 64) if err != nil { return StorageSize{}, fmt.Errorf("parse total: %w", err) @@ -60,5 +64,6 @@ func GetZPoolSize(output string) (StorageSize, error) { return StorageSize{}, fmt.Errorf("parse free: %w", err) } + logging.Trace("zpool parsed", "total", total, "free", free) return StorageSize{Total: total, Free: free}, nil } diff --git a/src/internal/sysfs/sysfs.go b/src/internal/sysfs/sysfs.go index bb98ec8..cc1a352 100644 --- a/src/internal/sysfs/sysfs.go +++ b/src/internal/sysfs/sysfs.go @@ -6,6 +6,8 @@ import ( "path/filepath" "strconv" "strings" + + "pve_local_exporter/internal/logging" ) // SysReader abstracts /sys access for testability. @@ -46,6 +48,7 @@ func (r *RealSysReader) ReadInterfaceStats(ifname string) (map[string]int64, err } stats[e.Name()] = val } + logging.Trace("interface stats", "ifname", ifname, "stat_count", len(stats)) return stats, nil } @@ -58,6 +61,7 @@ func (r *RealSysReader) GetBlockDeviceSize(devPath string) (int64, error) { if err != nil { return 0, fmt.Errorf("resolve symlink %s: %w", devPath, err) } + logging.Trace("block device resolved", "path", devPath, "resolved", resolved) // Extract device name from /dev/XXX devName := filepath.Base(resolved) @@ -84,5 +88,6 @@ func GetDeviceSymlinkTarget(devPath string) (string, error) { if err != nil { return "", err } + logging.Trace("device symlink resolved", "path", devPath, "target", resolved) return resolved, nil } diff --git a/src/main.go b/src/main.go index f10b000..8e91f8b 100644 --- a/src/main.go +++ b/src/main.go @@ -15,6 +15,7 @@ import ( "pve_local_exporter/internal/collector" "pve_local_exporter/internal/config" + "pve_local_exporter/internal/logging" ) var version string @@ -30,6 +31,8 @@ func main() { level := slog.LevelInfo switch strings.ToUpper(cfg.LogLevel) { + case "TRACE": + level = logging.LevelTrace case "DEBUG": level = slog.LevelDebug case "WARNING", "WARN": @@ -37,7 +40,18 @@ func main() { case "ERROR", "CRITICAL": level = slog.LevelError } - slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: level}))) + slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ + Level: level, + ReplaceAttr: func(groups []string, a slog.Attr) slog.Attr { + if a.Key == slog.LevelKey { + lvl := a.Value.Any().(slog.Level) + if lvl == logging.LevelTrace { + a.Value = slog.StringValue("TRACE") + } + } + return a + }, + }))) reg := prometheus.NewRegistry() c := collector.New(cfg) @@ -60,7 +74,7 @@ func main() { server.Close() }() - log.Printf("listening on %s", addr) + slog.Info("listening", "addr", addr) if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { log.Fatal(err) }