Compare commits
25 Commits
2083479945
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bd955e8067
|
||
|
|
b1e9b1e0b5
|
||
|
|
4ac1ba1f24
|
||
|
|
b57db23a35
|
||
|
|
ebcc08cc8f
|
||
|
|
1d06e1c180
|
||
|
|
4cc5a1f207
|
||
|
|
8207792bf7
|
||
|
|
066753ebc7
|
||
|
|
46bd7d67d2
|
||
|
|
7923d425a5 | ||
|
|
14db1fa68c
|
||
|
|
9745364a72
|
||
|
|
2eb85eed75
|
||
|
|
07c07a6b7d
|
||
|
|
80eadbdddb
|
||
|
|
1eb1bf9053
|
||
|
|
4416fbb6fe
|
||
|
|
2b4be475b6
|
||
|
|
66b09d0fa5
|
||
|
|
ce6ecf63ff
|
||
|
|
bd2b2c2a6e
|
||
|
|
8e42f6306c
|
||
|
|
ffcdcff16e
|
||
|
|
173369ff8e
|
21
LICENSE
Normal file
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 Harikrishnan R
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
40
flake.lock
generated
40
flake.lock
generated
@@ -1,18 +1,13 @@
|
||||
{
|
||||
"nodes": {
|
||||
"debBundler": {
|
||||
"inputs": {
|
||||
"home-manager": "home-manager",
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1699154900,
|
||||
"narHash": "sha256-y+PK9ToYcAyY86EoM7Iam7gC++rCuAGndlnPTEzd3EA=",
|
||||
"lastModified": 1746317543,
|
||||
"narHash": "sha256-1Xph5g1Lazzkc9XuY1nOkG5Fn7+lmSdldAC91boDawY=",
|
||||
"owner": "illustris",
|
||||
"repo": "flake",
|
||||
"rev": "a56221a54571b0e4326af29cf75b4cec081b8de7",
|
||||
"rev": "e86bd104d76d22b2ba36fede405e7bff290ef489",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -21,34 +16,13 @@
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"home-manager": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"debBundler",
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1699025595,
|
||||
"narHash": "sha256-e+o4PoSu2Z6Ww8y/AVUmMU200rNZoRK+p2opQ7Db8Rg=",
|
||||
"owner": "nix-community",
|
||||
"repo": "home-manager",
|
||||
"rev": "8765d4e38aa0be53cdeee26f7386173e6c65618d",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-community",
|
||||
"repo": "home-manager",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1699099776,
|
||||
"narHash": "sha256-X09iKJ27mGsGambGfkKzqvw5esP1L/Rf8H3u3fCqIiU=",
|
||||
"lastModified": 1746663147,
|
||||
"narHash": "sha256-Ua0drDHawlzNqJnclTJGf87dBmaO/tn7iZ+TCkTRpRc=",
|
||||
"owner": "nixos",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "85f1ba3e51676fa8cc604a3d863d729026a6b8eb",
|
||||
"rev": "dda3dcd3fe03e991015e9a74b22d35950f264a54",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -5,7 +5,7 @@ rec {
|
||||
nixpkgs.url = github:nixos/nixpkgs/nixos-unstable;
|
||||
debBundler = {
|
||||
url = github:illustris/flake;
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
flake = false;
|
||||
};
|
||||
};
|
||||
|
||||
@@ -14,7 +14,7 @@ rec {
|
||||
packages.x86_64-linux = with nixpkgs.legacyPackages.x86_64-linux; rec {
|
||||
pvemon = python3Packages.buildPythonApplication {
|
||||
pname = "pvemon";
|
||||
version = "1.1.1";
|
||||
version = "1.3.3";
|
||||
src = ./src;
|
||||
propagatedBuildInputs = with python3Packages; [
|
||||
pexpect
|
||||
@@ -28,7 +28,7 @@ rec {
|
||||
};
|
||||
};
|
||||
default = pvemon;
|
||||
deb = debBundler.bundlers.x86_64-linux.deb default;
|
||||
deb = (import "${debBundler}/bundlers/deb" { inherit pkgs; }) default;
|
||||
updateRelease = writeScriptBin "update-release" (builtins.readFile ./utils/update-release.sh);
|
||||
};
|
||||
|
||||
|
||||
@@ -30,11 +30,20 @@ def ttl_cache_with_randomness(max_ttl, randomness_factor):
|
||||
result = func(*args, **kwargs)
|
||||
cache[key] = (result, time.time())
|
||||
return result
|
||||
|
||||
def invalidate_cache(*args, **kwargs):
|
||||
key = str(args) + str(kwargs)
|
||||
if key in cache:
|
||||
del cache[key]
|
||||
|
||||
# Attach the invalidation function to the wrapper
|
||||
wrapper.invalidate_cache = invalidate_cache
|
||||
|
||||
return wrapper
|
||||
return decorator
|
||||
|
||||
@ttl_cache_with_randomness(qm_max_ttl, qm_rand)
|
||||
def qm_term_cmd(vm_id, cmd, timeout=global_qm_timeout):
|
||||
def qm_term_cmd(vm_id, cmd, timeout=global_qm_timeout): # TODO: ignore cmd timeout in cache key
|
||||
global deferred_closing
|
||||
child = pexpect.spawn(f'qm monitor {vm_id}')
|
||||
try:
|
||||
|
||||
@@ -19,11 +19,22 @@ from concurrent.futures import ThreadPoolExecutor
|
||||
from threading import Lock
|
||||
|
||||
import pvecommon
|
||||
import pvestorage
|
||||
import qmblock
|
||||
|
||||
import builtins
|
||||
|
||||
# Cache for pool data
|
||||
pool_cache = {
|
||||
'last_mtime': 0,
|
||||
'vm_pool_map': {},
|
||||
'pools': {}
|
||||
}
|
||||
|
||||
DEFAULT_PORT = 9116
|
||||
DEFAULT_INTERVAL = 10
|
||||
DEFAULT_PREFIX = "pve"
|
||||
DEFAULT_HOST = "0.0.0.0"
|
||||
|
||||
gauge_settings = [
|
||||
('kvm_cpu', 'CPU time for VM', ['id', 'mode']),
|
||||
@@ -41,6 +52,8 @@ gauge_settings = [
|
||||
('kvm_io_write_chars', 'Number of bytes written including buffers', ['id']),
|
||||
|
||||
('kvm_nic_queues', 'Number of queues in multiqueue config', ['id', 'ifname']),
|
||||
|
||||
('kvm_disk_size', 'Size of virtual disk', ['id', 'disk_name']),
|
||||
]
|
||||
|
||||
label_flags = [ "-id", "-name", "-cpu" ]
|
||||
@@ -51,13 +64,30 @@ info_settings = [
|
||||
|
||||
flag_to_label_value = lambda args, match: next((args[i+1] for i, x in enumerate(args[:-1]) if x == match), "unknown").split(",")[0]
|
||||
|
||||
def parse_mem(cmdline):
|
||||
ret = flag_to_label_value(cmdline, "-m")
|
||||
# lazy way to detect NUMA
|
||||
# the token after -m might look something like 'size=1024,slots=255,maxmem=4194304M'
|
||||
if ret.isnumeric():
|
||||
return int(ret)*1024
|
||||
|
||||
# probably using NUMA
|
||||
ret = 0
|
||||
for arg in cmdline:
|
||||
if "memory-backend-ram" in arg:
|
||||
assert(arg[-1]=='M')
|
||||
ret += 1024*int(arg.split("=")[-1][:-1])
|
||||
return ret
|
||||
|
||||
def create_or_get_gauge(metric_name, labels, dynamic_gauges, gauge_lock):
|
||||
logging.debug(f"create_or_get_gauge({metric_name=}, labels={str(labels)}")
|
||||
with gauge_lock:
|
||||
if metric_name not in dynamic_gauges:
|
||||
dynamic_gauges[metric_name] = GaugeMetricFamily(f"{prefix}_{metric_name}", f'{metric_name} for KVM process', labels=labels)
|
||||
return dynamic_gauges[metric_name]
|
||||
|
||||
def create_or_get_info(info_name, labels, dynamic_infos, info_lock):
|
||||
logging.debug(f"create_or_get_info({info_name=}, labels={str(labels)}")
|
||||
with info_lock:
|
||||
if (info_name,str(labels)) not in dynamic_infos:
|
||||
dynamic_infos[(info_name,str(labels))] = InfoMetricFamily(f"{prefix}_{info_name}", f'{info_name} for {str(labels)}', labels=labels)
|
||||
@@ -113,6 +143,67 @@ def read_interface_stats(ifname):
|
||||
pass
|
||||
return stats
|
||||
|
||||
def get_pool_info():
|
||||
"""
|
||||
Read pool information from /etc/pve/user.cfg, caching based on file modification time.
|
||||
Returns a tuple of (vm_to_pool_map, pool_info) where:
|
||||
- vm_to_pool_map maps VM IDs to their pool names
|
||||
- pool_info contains details about each pool (levels, etc.)
|
||||
"""
|
||||
pool_cfg_path = '/etc/pve/user.cfg'
|
||||
|
||||
try:
|
||||
# Check modification time
|
||||
current_mtime = os.path.getmtime(pool_cfg_path)
|
||||
|
||||
# If file hasn't changed, return cached data
|
||||
if current_mtime <= pool_cache['last_mtime'] and pool_cache['vm_pool_map']:
|
||||
return pool_cache['vm_pool_map'], pool_cache['pools']
|
||||
|
||||
# File has changed or first run, parse it
|
||||
logging.debug(f"Reading pool configuration from {pool_cfg_path}")
|
||||
|
||||
vm_pool_map = {}
|
||||
pools = {}
|
||||
|
||||
with open(pool_cfg_path, 'r') as f:
|
||||
for line in f:
|
||||
if line.startswith('pool:'):
|
||||
parts = line.strip().split(':')
|
||||
if len(parts) < 3:
|
||||
continue
|
||||
|
||||
pool_name = parts[1]
|
||||
vm_list = parts[3] if len(parts) > 3 else ''
|
||||
|
||||
# Store pool info
|
||||
pool_parts = pool_name.split('/')
|
||||
pool_level_count = len(pool_parts)
|
||||
|
||||
pools[pool_name] = {
|
||||
'level_count': pool_level_count,
|
||||
'level1': pool_parts[0] if pool_level_count > 0 else '',
|
||||
'level2': pool_parts[1] if pool_level_count > 1 else '',
|
||||
'level3': pool_parts[2] if pool_level_count > 2 else ''
|
||||
}
|
||||
|
||||
# Map VMs to this pool
|
||||
if vm_list:
|
||||
for vm_id in vm_list.split(','):
|
||||
if vm_id.strip():
|
||||
vm_pool_map[vm_id.strip()] = pool_name
|
||||
|
||||
# Update cache
|
||||
pool_cache['last_mtime'] = current_mtime
|
||||
pool_cache['vm_pool_map'] = vm_pool_map
|
||||
pool_cache['pools'] = pools
|
||||
|
||||
return vm_pool_map, pools
|
||||
|
||||
except (FileNotFoundError, PermissionError) as e:
|
||||
logging.warning(f"Could not read pool configuration: {e}")
|
||||
return {}, {}
|
||||
|
||||
def collect_kvm_metrics():
|
||||
logging.debug("collect_kvm_metrics() called")
|
||||
gauge_dict = {}
|
||||
@@ -132,26 +223,55 @@ def collect_kvm_metrics():
|
||||
for proc in psutil.process_iter(['pid', 'name', 'exe', 'cmdline', 'cpu_percent', 'memory_percent', 'num_threads']):
|
||||
try:
|
||||
if proc.info['exe'] == '/usr/bin/qemu-system-x86_64':
|
||||
vmid = flag_to_label_value(proc.info['cmdline'], "-id")
|
||||
# Check if VM definition exists. If it is missing, qm commands will fail.
|
||||
# VM configs are typically missing when a VM is migrating in.
|
||||
# The config file is moved after the drives and memory are synced.
|
||||
if not os.path.exists(f'/etc/pve/qemu-server/{vmid}.conf'):
|
||||
continue
|
||||
procs.append(
|
||||
(
|
||||
proc,
|
||||
proc.info['cmdline'],
|
||||
flag_to_label_value(proc.info['cmdline'], "-id")
|
||||
vmid
|
||||
)
|
||||
)
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
|
||||
continue
|
||||
|
||||
# Get VM to pool mapping
|
||||
vm_pool_map, pools = get_pool_info()
|
||||
|
||||
for proc, cmdline, id in procs:
|
||||
# Extract vm labels from cmdline
|
||||
info_label_dict = {get_label_name(l): flag_to_label_value(cmdline,l) for l in label_flags}
|
||||
info_label_dict['pid']=str(proc.pid)
|
||||
info_label_dict['pid'] = str(proc.pid)
|
||||
logging.debug(f"got PID: {proc.pid}")
|
||||
|
||||
# Add pool information if available
|
||||
if id in vm_pool_map:
|
||||
pool_name = vm_pool_map[id]
|
||||
pool_info = pools[pool_name]
|
||||
|
||||
info_label_dict['pool'] = pool_name
|
||||
info_label_dict['pool_levels'] = str(pool_info['level_count'])
|
||||
info_label_dict['pool1'] = pool_info['level1']
|
||||
info_label_dict['pool2'] = pool_info['level2']
|
||||
info_label_dict['pool3'] = pool_info['level3']
|
||||
logging.debug(f"VM {id} belongs to pool {pool_name}")
|
||||
else:
|
||||
# VM not in any pool
|
||||
info_label_dict['pool'] = ''
|
||||
info_label_dict['pool_levels'] = '0'
|
||||
info_label_dict['pool1'] = ''
|
||||
info_label_dict['pool2'] = ''
|
||||
info_label_dict['pool3'] = ''
|
||||
|
||||
info_dict["kvm"].add_metric([], info_label_dict)
|
||||
|
||||
d = {
|
||||
"kvm_vcores": flag_to_label_value(cmdline,"-smp"),
|
||||
"kvm_maxmem": int(flag_to_label_value(cmdline,"-m"))*1024,
|
||||
"kvm_maxmem": parse_mem(cmdline),
|
||||
"kvm_memory_percent": proc.info['memory_percent'],
|
||||
"kvm_threads": proc.info['num_threads'],
|
||||
}
|
||||
@@ -200,6 +320,11 @@ def collect_kvm_metrics():
|
||||
disk_labels = {"id": id, "disk_name": disk_name}
|
||||
prom_disk_info = create_or_get_info("kvm_disk", disk_labels.keys(), dynamic_infos, info_lock)
|
||||
prom_disk_info.add_metric(disk_labels.values(), disk_info)
|
||||
disk_size = qmblock.get_disk_size(disk_info["disk_path"], disk_info["disk_type"])
|
||||
if disk_size == None and disk_info["disk_type"] != "qcow2":
|
||||
logging.debug(f"collect_kvm_metrics: failed to get disk size for {disk_info=}")
|
||||
else:
|
||||
gauge_dict["kvm_disk_size"].add_metric([id, disk_name], qmblock.get_disk_size(disk_info["disk_path"], disk_info["disk_type"]))
|
||||
|
||||
list(executor.map(map_netstat_proc, [ proc[2] for proc in procs ]))
|
||||
list(executor.map(map_disk_proc, [ proc[2] for proc in procs ]))
|
||||
@@ -222,12 +347,17 @@ class PVECollector(object):
|
||||
if cli_args.collect_running_vms.lower() == 'true':
|
||||
for x in collect_kvm_metrics():
|
||||
yield x
|
||||
if cli_args.collect_storage.lower() == 'true':
|
||||
for x in pvestorage.collect_storage_metrics():
|
||||
yield x
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='PVE metrics exporter for Prometheus')
|
||||
parser.add_argument('--port', type=int, default=DEFAULT_PORT, help='Port for the exporter to listen on')
|
||||
parser.add_argument('--host', type=str, default=DEFAULT_HOST, help='Host address to bind the exporter to')
|
||||
parser.add_argument('--interval', type=int, default=DEFAULT_INTERVAL, help='THIS OPTION DOES NOTHING')
|
||||
parser.add_argument('--collect-running-vms', type=str, default='true', help='Enable or disable collecting running VMs metric (true/false)')
|
||||
parser.add_argument('--collect-storage', type=str, default='true', help='Enable or disable collecting storage info (true/false)')
|
||||
parser.add_argument('--metrics-prefix', type=str, default=DEFAULT_PREFIX, help='<prefix>_ will be prepended to each metric name')
|
||||
parser.add_argument('--loglevel', type=str, default='INFO', help='Set log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)')
|
||||
parser.add_argument('--profile', type=str, default='false', help='collect metrics once, and print profiling stats')
|
||||
@@ -236,23 +366,22 @@ def main():
|
||||
parser.add_argument('--qm-rand', type=int, default=60, help='randomize qm monitor cache expiry')
|
||||
parser.add_argument('--qm-monitor-defer-close', type=str, default="true", help='defer and retry closing unresponsive qm monitor sessions')
|
||||
|
||||
args = parser.parse_args()
|
||||
global cli_args
|
||||
cli_args = args
|
||||
# hack to access cli_args across modules
|
||||
builtins.cli_args = parser.parse_args()
|
||||
|
||||
loglevel = getattr(logging, args.loglevel.upper(), None)
|
||||
loglevel = getattr(logging, cli_args.loglevel.upper(), None)
|
||||
if not isinstance(loglevel, int):
|
||||
raise ValueError(f'Invalid log level: {args.loglevel}')
|
||||
raise ValueError(f'Invalid log level: {cli_args.loglevel}')
|
||||
logging.basicConfig(level=loglevel,format='%(asctime)s: %(message)s')
|
||||
|
||||
global prefix
|
||||
prefix = args.metrics_prefix
|
||||
pvecommon.global_qm_timeout = args.qm_terminal_timeout
|
||||
pvecommon.qm_max_ttl = args.qm_max_ttl
|
||||
pvecommon.qm_rand = args.qm_rand
|
||||
pvecommon.qm_monitor_defer_close = args.qm_monitor_defer_close
|
||||
prefix = cli_args.metrics_prefix
|
||||
pvecommon.global_qm_timeout = cli_args.qm_terminal_timeout
|
||||
pvecommon.qm_max_ttl = cli_args.qm_max_ttl
|
||||
pvecommon.qm_rand = cli_args.qm_rand
|
||||
pvecommon.qm_monitor_defer_close = cli_args.qm_monitor_defer_close
|
||||
|
||||
if args.profile.lower() == 'true':
|
||||
if cli_args.profile.lower() == 'true':
|
||||
profiler = cProfile.Profile()
|
||||
profiler.enable()
|
||||
collect_kvm_metrics()
|
||||
@@ -261,7 +390,7 @@ def main():
|
||||
return
|
||||
else:
|
||||
REGISTRY.register(PVECollector())
|
||||
start_http_server(args.port)
|
||||
start_http_server(cli_args.port, addr=cli_args.host)
|
||||
|
||||
while True:
|
||||
time.sleep(100)
|
||||
|
||||
183
src/pvestorage/__init__.py
Normal file
183
src/pvestorage/__init__.py
Normal file
@@ -0,0 +1,183 @@
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
import pprint
|
||||
|
||||
from prometheus_client.core import InfoMetricFamily, GaugeMetricFamily, CounterMetricFamily, REGISTRY
|
||||
|
||||
gauge_settings = [
|
||||
('node_storage_size', 'Size of the storage pool. This number is inaccurate for ZFS.', ['name', 'type']),
|
||||
('node_storage_free', 'Free space on the storage pool', ['name', 'type'])
|
||||
]
|
||||
|
||||
info_settings = [
|
||||
('node_storage', 'information for each PVE storage'),
|
||||
]
|
||||
|
||||
# Sanitize the key to match Prometheus label requirements
|
||||
# Replace any character that is not a letter, digit, or underscore with an underscore
|
||||
sanitize_key = lambda key: re.sub(r"[^a-zA-Z0-9_]", "_", key)
|
||||
|
||||
_cached_storage_data = None
|
||||
_cached_mtime = None
|
||||
|
||||
def parse_storage_cfg(file_path='/etc/pve/storage.cfg'):
|
||||
logging.debug(f"parse_storage_cfg({file_path=}) called")
|
||||
global _cached_storage_data, _cached_mtime
|
||||
|
||||
# Check if file exists
|
||||
if not os.path.exists(file_path):
|
||||
raise FileNotFoundError(f"The file {file_path} does not exist.")
|
||||
|
||||
# Get the file's modification time
|
||||
current_mtime = os.path.getmtime(file_path)
|
||||
|
||||
# If the data is already cached and the file hasn't changed, return the cached data
|
||||
if _cached_storage_data is not None and _cached_mtime == current_mtime:
|
||||
logging.debug("parse_storage_cfg: returning cached data")
|
||||
return _cached_storage_data
|
||||
|
||||
logging.debug("parse_storage_cfg: file modified, dropping cache")
|
||||
|
||||
# Initialize list to store storages
|
||||
storage_list = []
|
||||
current_storage = None
|
||||
|
||||
with open(file_path, 'r') as file:
|
||||
for line in file:
|
||||
line = line.strip()
|
||||
|
||||
if not line or line.startswith("#"):
|
||||
# Ignore empty lines or comments
|
||||
continue
|
||||
|
||||
if ":" in line:
|
||||
# If we were processing a previous section, append it to the list
|
||||
if current_storage:
|
||||
storage_list.append(current_storage)
|
||||
|
||||
# Start a new storage definition
|
||||
section_type, section_name = line.split(":", 1)
|
||||
current_storage = {
|
||||
'type': sanitize_key(section_type.strip()), # Sanitize the section type
|
||||
'name': sanitize_key(section_name.strip()), # Sanitize the section name
|
||||
}
|
||||
else:
|
||||
# Parse key-value pairs within the current storage
|
||||
if current_storage:
|
||||
parts = line.split(None, 1)
|
||||
key = parts[0].strip()
|
||||
sanitized_key = sanitize_key(key)
|
||||
if len(parts) > 1:
|
||||
# Regular key-value pair
|
||||
current_storage[sanitized_key] = parts[1].strip()
|
||||
else:
|
||||
# Key with no value, set it to True
|
||||
current_storage[sanitized_key] = True
|
||||
|
||||
# Append the last storage section to the list if any
|
||||
if current_storage:
|
||||
storage_list.append(current_storage)
|
||||
|
||||
# Update the cache
|
||||
_cached_storage_data = storage_list
|
||||
_cached_mtime = current_mtime
|
||||
|
||||
return storage_list
|
||||
|
||||
def get_storage_size(storage):
|
||||
try:
|
||||
if storage["type"] == "zfspool":
|
||||
if "pool" not in storage:
|
||||
logging.debug(f"ZFS pool {storage['name']} has no pool name configured")
|
||||
return None
|
||||
|
||||
# Extract the pool name (could be in format like rpool/data)
|
||||
pool_name = storage["pool"].split("/")[0]
|
||||
|
||||
# Use zpool command to get accurate size information
|
||||
import subprocess
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["zpool", "list", pool_name, "-p"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True
|
||||
)
|
||||
|
||||
# Parse the output
|
||||
lines = result.stdout.strip().split("\n")
|
||||
if len(lines) < 2:
|
||||
logging.warn(f"Unexpected zpool list output format for {pool_name}")
|
||||
return None
|
||||
|
||||
# Extract values from the second line (the data line)
|
||||
values = lines[1].split()
|
||||
if len(values) < 4:
|
||||
logging.warn(f"Insufficient data in zpool list output for {pool_name}")
|
||||
return None
|
||||
|
||||
# Values are: NAME SIZE ALLOC FREE ...
|
||||
# We need the SIZE and FREE values (index 1 and 3)
|
||||
total_size = int(values[1])
|
||||
free_space = int(values[3])
|
||||
|
||||
return {
|
||||
"total": total_size,
|
||||
"free": free_space
|
||||
}
|
||||
except (subprocess.SubprocessError, ValueError, IndexError) as e:
|
||||
logging.warn(f"Error running zpool list for {pool_name}: {e}")
|
||||
return None
|
||||
|
||||
elif storage["type"] in ["dir", "nfs", "cephfs"]:
|
||||
# For non-ZFS storage, use statvfs
|
||||
path = storage["path"]
|
||||
stats = os.statvfs(path)
|
||||
total_size = stats.f_frsize * stats.f_blocks
|
||||
free_space = stats.f_frsize * stats.f_bavail
|
||||
return {
|
||||
"total": total_size,
|
||||
"free": free_space
|
||||
}
|
||||
|
||||
# TODO: handle lvmthin
|
||||
# could parse /etc/lvm/backup/<vg-name> to collect this data
|
||||
# TODO: handle rbd
|
||||
|
||||
except Exception as e:
|
||||
logging.warn(f"get_storage_size: unknown error, {storage=}, error: {e}")
|
||||
|
||||
# Return None if the case is not handled
|
||||
return None
|
||||
|
||||
def collect_storage_metrics():
|
||||
logging.debug("collect_storage_metrics() called")
|
||||
gauge_dict = {}
|
||||
info_dict = {}
|
||||
prefix = cli_args.metrics_prefix
|
||||
for name, description, labels in gauge_settings:
|
||||
gauge_dict[name] = GaugeMetricFamily(f"{prefix}_{name}", description, labels=labels)
|
||||
|
||||
for name, description in info_settings:
|
||||
info_dict[name] = InfoMetricFamily(f"{prefix}_{name}", description)
|
||||
|
||||
storage_pools = parse_storage_cfg()
|
||||
for storage in storage_pools:
|
||||
# Convert any non-string values to strings for InfoMetricFamily
|
||||
storage_info = {}
|
||||
for key, value in storage.items():
|
||||
storage_info[key] = str(value) if not isinstance(value, str) else value
|
||||
|
||||
info_dict["node_storage"].add_metric([], storage_info)
|
||||
size = get_storage_size(storage)
|
||||
if size != None:
|
||||
gauge_dict["node_storage_size"].add_metric([storage["name"], storage["type"]], size["total"])
|
||||
gauge_dict["node_storage_free"].add_metric([storage["name"], storage["type"]], size["free"])
|
||||
|
||||
for v in info_dict.values():
|
||||
yield v
|
||||
for v in gauge_dict.values():
|
||||
yield v
|
||||
|
||||
logging.debug("collect_storage_metrics() return")
|
||||
@@ -2,9 +2,12 @@ import pexpect
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
import stat
|
||||
|
||||
import pvecommon
|
||||
|
||||
extract_disk_info_max_retries = 1
|
||||
|
||||
def get_device(disk_path):
|
||||
try:
|
||||
return os.readlink(disk_path).split('/')[-1]
|
||||
@@ -26,7 +29,7 @@ def handle_json_path(path):
|
||||
raise ValueError('No host_device driver found or filename is missing')
|
||||
return filename
|
||||
|
||||
def extract_disk_info_from_monitor(vm_id):
|
||||
def extract_disk_info_from_monitor(vm_id, retries = 0):
|
||||
raw_output = pvecommon.qm_term_cmd(vm_id, 'info block')
|
||||
disks_map = {}
|
||||
disks = [x.strip() for x in raw_output.split("drive-")[1:]]
|
||||
@@ -64,6 +67,8 @@ def extract_disk_info_from_monitor(vm_id):
|
||||
disks_map[disk_name]["disk_type"] = "rbd"
|
||||
rbd_parts = disk_path.split('/')
|
||||
disks_map[disk_name]["cluster_id"] = rbd_parts[-3]
|
||||
disks_map[disk_name]["pool"] = rbd_parts[-2]
|
||||
# Keeping for backwards compatibility
|
||||
disks_map[disk_name]["pool_name"] = rbd_parts[-2]
|
||||
disks_map[disk_name]["vol_name"] = rbd_parts[-1]
|
||||
disks_map[disk_name]["device"] = get_device(disk_path)
|
||||
@@ -73,6 +78,11 @@ def extract_disk_info_from_monitor(vm_id):
|
||||
disks_map[disk_name]["vg_name"] = vg_name
|
||||
disks_map[disk_name]["vol_name"] = vol_name
|
||||
disks_map[disk_name]["device"] = get_device(disk_path)
|
||||
# At this point, if disks_map[disk_name]["device"] exists and is None, the cache might be stale
|
||||
# Flush the cache for this VMID and try again
|
||||
if "device" in disks_map[disk_name] and disks_map[disk_name]["device"] == None and retries < extract_disk_info_max_retries:
|
||||
pvecommon.qm_term_cmd.invalidate_cache(vm_id, 'info block')
|
||||
return extract_disk_info_from_monitor(vm_id, retries+1)
|
||||
for line in data[1:-1]:
|
||||
if "Attached to" in line:
|
||||
attached_to = line.split(":")[-1].strip()
|
||||
@@ -87,6 +97,24 @@ def extract_disk_info_from_monitor(vm_id):
|
||||
disks_map[disk_name]["detect_zeroes"] = "on"
|
||||
return disks_map
|
||||
|
||||
def get_disk_size(disk_path, disk_type):
|
||||
if stat.S_ISBLK(os.stat(disk_path).st_mode):
|
||||
disk_name = os.path.basename(os.path.realpath(disk_path))
|
||||
size_file_path = f"/sys/block/{disk_name}/size"
|
||||
sector_size_file_path = f"/sys/block/{disk_name}/queue/hw_sector_size"
|
||||
|
||||
with open(size_file_path, 'r') as f:
|
||||
sectors = int(f.read().strip())
|
||||
|
||||
with open(sector_size_file_path, 'r') as sector_size_file:
|
||||
sector_size = int(sector_size_file.read().strip())
|
||||
|
||||
size_in_bytes = sectors * sector_size
|
||||
else:
|
||||
size_in_bytes = os.path.getsize(disk_path)
|
||||
|
||||
return size_in_bytes
|
||||
|
||||
if __name__ == "__main__":
|
||||
import json
|
||||
import sys
|
||||
|
||||
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
||||
|
||||
setup(
|
||||
name='pvemon',
|
||||
version = "1.1.1",
|
||||
version = "1.3.3",
|
||||
packages=find_packages(),
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
|
||||
Reference in New Issue
Block a user