From 23aa370cbf3a49a90455d4d46e5e81454a7c14e8 Mon Sep 17 00:00:00 2001 From: Daniel Langbein Date: Sun, 8 Oct 2023 18:41:02 +0200 Subject: [PATCH] feat: log drive temperature with psutil --- src/de/p1st/monitor/cfg/loggers.py | 9 +++ src/de/p1st/monitor/data/yodaNas.ini | 28 ++----- src/de/p1st/monitor/data/yodaTux.ini | 14 +--- src/de/p1st/monitor/loggers/drive.py | 11 ++- src/de/p1st/monitor/loggers/drive_temp.py | 98 +++++++++++++++++++++++ src/de/p1st/monitor/loggers/temp.py | 2 +- 6 files changed, 129 insertions(+), 33 deletions(-) create mode 100644 src/de/p1st/monitor/loggers/drive_temp.py diff --git a/src/de/p1st/monitor/cfg/loggers.py b/src/de/p1st/monitor/cfg/loggers.py index f903314..126738e 100644 --- a/src/de/p1st/monitor/cfg/loggers.py +++ b/src/de/p1st/monitor/cfg/loggers.py @@ -8,6 +8,7 @@ from de.p1st.monitor.cfg.singleton import get_cfg from de.p1st.monitor.logger_ex import LoggerArgEx from de.p1st.monitor.loggers.cpu import CPULogger1, CPULogger5, CPULogger15 from de.p1st.monitor.loggers.drive import DriveLogger +from de.p1st.monitor.loggers.drive_temp import DriveTempLogger from de.p1st.monitor.loggers.filesystem import FilesystemLogger from de.p1st.monitor.loggers.memory import MemoryLogger from de.p1st.monitor.loggers.network import NetworkLogger @@ -86,6 +87,13 @@ def get_loggers() -> tuple[list[Logger], list[LoggerArgEx]]: warn_data_range = int(cfg_.get('warn_data_range', '1')) return DriveLogger(uuid, id_, device, warn_if_above, warn_threshold, warn_data_range) + def drive_temp(cfg_: configparser.SectionProxy) -> Logger: + type_ = cfg_.get('type', None) + warn_if_above = int(cfg_['warn_if_above']) if 'warn_if_above' in cfg_ else None + warn_threshold = int(cfg_.get('warn_threshold', '1')) + warn_data_range = int(cfg_.get('warn_data_range', '1')) + return DriveTempLogger(type_, warn_if_above, warn_threshold, warn_data_range) + def memory(cfg_: configparser.SectionProxy) -> Logger: warn_if_above = float(cfg_.get('warn_if_above', '1.0')) 
warn_threshold = int(cfg_.get('warn_threshold', '1')) @@ -107,6 +115,7 @@ def get_loggers() -> tuple[list[Logger], list[LoggerArgEx]]: 'network': net, 'filesystem': filesystem, 'drive': drive, + 'drive_temp': drive_temp, 'memory': memory, 'swap': swap, } diff --git a/src/de/p1st/monitor/data/yodaNas.ini b/src/de/p1st/monitor/data/yodaNas.ini index 3c590b5..9a28417 100644 --- a/src/de/p1st/monitor/data/yodaNas.ini +++ b/src/de/p1st/monitor/data/yodaNas.ini @@ -53,27 +53,15 @@ warn_if_above = 60 [network.1] network_interface = enp0s31f6 -[drive.1] -; NVMe 256GB2 -; TODO NVMe 49 warn, 55 limit -id = nvme-WDC_PC_SN520_SDAPNUW-256G-1002_183873801941 +[drive_temp.1] +; NVMe drives +; TODO: 49 warn, 55 limit +type = nvme warn_if_above = 50 -[drive.2] -; HDD 12TB1 -; TODO HDD 39 warn, 45 limit -id = ata-TOSHIBA_MG07ACA12TE_X1E0A0WKF95G -warn_if_above = 40 -[drive.3] -; HDD 3TB1 -id = ata-WDC_WD30EFRX-68EUZN0_WD-WCC4N1173157 -warn_if_above = 40 -[drive.4] -; HDD 3TB2 -id = ata-WDC_WD30EFRX-68EUZN0_WD-WMC4N0564095 -warn_if_above = 40 -[drive.5] -; HDD 4TB1 -id = ata-WDC_WD40EFRX-68N32N0_WD-WCC7K0CPF0N1 +[drive_temp.2] +; HDD drives +; TODO: 39 warn, 45 limit +type = drivetemp warn_if_above = 40 ; [sensor_script.1] diff --git a/src/de/p1st/monitor/data/yodaTux.ini b/src/de/p1st/monitor/data/yodaTux.ini index a6a525b..d7cc841 100644 --- a/src/de/p1st/monitor/data/yodaTux.ini +++ b/src/de/p1st/monitor/data/yodaTux.ini @@ -69,20 +69,14 @@ unmounted_ok = true warn_if_above = 0.1 -[drive.1] -; Either `uuid`, `id` or `device` must be given. -; -; `uuid` as in /dev/disk/by-uuid/* -;uuid = ea7099e3-320d-4eb3-a4c3-9910a9af817b -; `id` as in /dev/disk/by-id/* -; id = nvme-XPG_GAMMIX_S50_Lite_2K462L2JN9KG -; device as in /dev/* -device = /dev/nvme0n1 - +[drive_temp.1] +; Either `nvme` (for NVMe drives) or `drivetemp` (for HDDs) +type = nvme ; Warn if temperature is above this value. ; Unit: °C warn_if_above = 25 + [sensor_script.1] ; The command will be executed. 
; It has to return a float (or int) and exit code 0 on success. diff --git a/src/de/p1st/monitor/loggers/drive.py b/src/de/p1st/monitor/loggers/drive.py index 32d2da0..6f6b4f2 100644 --- a/src/de/p1st/monitor/loggers/drive.py +++ b/src/de/p1st/monitor/loggers/drive.py @@ -135,17 +135,24 @@ class DriveLogger(Logger): @classmethod def get_temp_from_device(cls, device: Path) -> int: """ + Use `smartctl` to get HDD/SSD temperature. + + As reading SMART data wakes up standby HDD drives, we skip them. + :param device: Partition path, e.g. `/dev/sda` - :return: Temperature in celsius + :return: Temperature in Celsius """ + # -n standby: Don't spin-up an HDD if it is in standby mode. # -j: JSON output. # -a: Print all SMART device information. # For NVMe, this is equivalent to: '-H -i -c -A -l error -l selftest'. # -H: Print health status. # -A: Prints only the vendor specific SMART Attributes. - returncode, stdout, stderr = execute_capture(['smartctl', '-j', '-A', f'{device}']) + returncode, stdout, stderr = execute_capture(['smartctl', '-n', 'standby', '-j', '-A', f'{device}']) + if returncode == 2 and 'Device is in STANDBY mode' in stdout: + raise LoggerReadEx(f'Could not read drive temperature as it is in standby mode: {device}') if returncode != 0: raise LoggerReadEx(f'smartctl failed with returncode {returncode}\nstdout: {stdout}\nstderr: {stderr}') j = json.loads(stdout)
diff --git a/src/de/p1st/monitor/loggers/drive_temp.py b/src/de/p1st/monitor/loggers/drive_temp.py
new file mode 100644
index 0000000..597f114
--- /dev/null
+++ b/src/de/p1st/monitor/loggers/drive_temp.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import math
+from typing import Literal, Optional
+from pathlib import Path
+
+import psutil
+
+from de.p1st.monitor import datetime_util
+from de.p1st.monitor.logger import Logger
+from de.p1st.monitor.logger_ex import LoggerReadEx
+from de.p1st.monitor.warn_data import WarnData
+
+
+class DriveTempLogger(Logger):
+    """
+    Log the lowest and highest temperature of all drives of one sensor type.
+    """
+
+    def __init__(self,
+                 type_: Literal['drivetemp', 'nvme'],
+                 warn_if_above: Optional[int] = None,
+                 warn_threshold: int = 1,
+                 warn_data_range: int = 1,
+                 ):
+        """
+        :param type_: HDD -> drivetemp, NVMe -> nvme
+        :param warn_if_above: Warn if the temperature is above this value. Unit: °C
+        :param warn_threshold: Passed on to `Logger` unchanged.
+        :param warn_data_range: Passed on to `Logger` unchanged.
+        """
+
+        # The critical limit is 10 °C above the warn limit. `warn_if_above` is optional,
+        # so skip the addition when it is `None` instead of failing with a TypeError.
+        # NOTE(review): assumes `Logger` accepts `None` as critical limit - confirm.
+        critical_if_above = None if warn_if_above is None else warn_if_above + 10
+        super().__init__(warn_threshold,
+                         warn_data_range,
+                         warn_if_above,
+                         critical_if_above
+                         )
+
+        self.type = type_
+
+    def get_warn_data(self, data: list[any]) -> WarnData:
+        """
+        :param data: One row as returned by `read_data()`.
+        :return: Warn data built from the timestamp and the maximum temperature of the row.
+        """
+        min_temp = data[1]
+        max_temp = data[2]
+        message = f'Temperature of drive type {self.type} is in range {min_temp}:{max_temp}'
+        return WarnData(data[0], max_temp, message)
+
+    def read_data(self) -> list[any]:
+        """
+        :return: `[current time, minimum temperature, maximum temperature]`
+        """
+        min_temp, max_temp = self.get_drive_temp()
+        return [
+            datetime_util.now(),
+            min_temp,
+            max_temp
+        ]
+
+    def data_schema(self) -> list[str]:
+        """
+        :return: One entry per element of the rows returned by `read_data()`.
+        """
+        return ['datetime#Date', 'float#MinTemperature', 'float#MaxTemperature']
+
+    def get_log_file(self) -> Path:
+        """
+        :return: Path `drive-temp_<type>.csv` below `get_log_dir()`.
+        """
+        return self.get_log_dir() / f'drive-temp_{self.type}.csv'
+
+    #
+    # HELPERS
+    #
+
+    def get_drive_temp(self) -> tuple[float, float]:
+        """
+        Use `psutil` Python library to get HDD/SSD temperature.
+        https://psutil.readthedocs.io/en/latest/index.html#psutil.sensors_temperatures
+
+        Not sure if this changed the results:
+            sudo modprobe drivetemp
+
+        Example output:
+        {
+            'nvme': [
+                shwtemp(label='Composite', current=37.85, high=81.85, critical=85.85)
+            ],
+            'pch_skylake': [...],
+            'coretemp': [...],
+            'drivetemp': [
+                shwtemp(label='', current=23.0, high=65.0, critical=85.0),
+                shwtemp(label='', current=25.0, high=55.0, critical=70.0),
+                shwtemp(label='', current=24.0, high=60.0, critical=85.0),
+                shwtemp(label='', current=22.0, high=60.0, critical=85.0)]
+        }
+
+        Problem: If one has multiple drives attached, they can't be distinguished.
+        https://github.com/giampaolo/psutil/issues/1902
+
+        Therefore, we currently accumulate the maximum and minimum values of all drives of the same type.
+
+        :return: `(min_temp, max_temp)` in Celsius over all entries of sensor `self.type`
+        :raises LoggerReadEx: If sensor `self.type` is not found or has no entries
+        """
+        data = psutil.sensors_temperatures(fahrenheit=False)
+        if self.type not in data:
+            raise LoggerReadEx(f'Sensor {self.type} not found')
+        if not data[self.type]:
+            raise LoggerReadEx(f'Sensor {self.type} has no entries')
+
+        # Seed with the identity elements of min()/max() so the first reading replaces both.
+        min_temp, max_temp = math.inf, -math.inf
+        for sensor in data[self.type]:
+            min_temp = min(min_temp, sensor.current)
+            max_temp = max(max_temp, sensor.current)
+
+        return min_temp, max_temp
diff --git a/src/de/p1st/monitor/loggers/temp.py b/src/de/p1st/monitor/loggers/temp.py index a8b090d..cfe4142 100644 --- a/src/de/p1st/monitor/loggers/temp.py +++ b/src/de/p1st/monitor/loggers/temp.py @@ -54,7 +54,7 @@ class TempLogger(Logger): def get_temp(self) -> float: """ - :return: Temperature in celsius + :return: Temperature in Celsius """ data = psutil.sensors_temperatures(fahrenheit=False) if self.name not in data: