feat: log drive temperature with psutil

This commit is contained in:
Daniel Langbein 2023-10-08 18:41:02 +02:00
parent be10017165
commit 23aa370cbf
Signed by: langfingaz
GPG Key ID: 6C47C753F0823002
6 changed files with 129 additions and 33 deletions

View File

@ -8,6 +8,7 @@ from de.p1st.monitor.cfg.singleton import get_cfg
from de.p1st.monitor.logger_ex import LoggerArgEx from de.p1st.monitor.logger_ex import LoggerArgEx
from de.p1st.monitor.loggers.cpu import CPULogger1, CPULogger5, CPULogger15 from de.p1st.monitor.loggers.cpu import CPULogger1, CPULogger5, CPULogger15
from de.p1st.monitor.loggers.drive import DriveLogger from de.p1st.monitor.loggers.drive import DriveLogger
from de.p1st.monitor.loggers.drive_temp import DriveTempLogger
from de.p1st.monitor.loggers.filesystem import FilesystemLogger from de.p1st.monitor.loggers.filesystem import FilesystemLogger
from de.p1st.monitor.loggers.memory import MemoryLogger from de.p1st.monitor.loggers.memory import MemoryLogger
from de.p1st.monitor.loggers.network import NetworkLogger from de.p1st.monitor.loggers.network import NetworkLogger
@ -86,6 +87,13 @@ def get_loggers() -> tuple[list[Logger], list[LoggerArgEx]]:
warn_data_range = int(cfg_.get('warn_data_range', '1')) warn_data_range = int(cfg_.get('warn_data_range', '1'))
return DriveLogger(uuid, id_, device, warn_if_above, warn_threshold, warn_data_range) return DriveLogger(uuid, id_, device, warn_if_above, warn_threshold, warn_data_range)
def drive_temp(cfg_: configparser.SectionProxy) -> Logger:
    """
    Create a `DriveTempLogger` from one `[drive_temp.*]` config section.

    Keys read from the section:
    `type` (passed on as `None` if absent),
    `warn_if_above` (parsed as int; `None` if absent),
    `warn_threshold` and `warn_data_range` (parsed as int; both default to 1).

    :param cfg_: The config section to read from.
    :return: The configured logger.
    """
    sensor_type = cfg_.get('type', None)
    # The temperature limit is optional; only parse it if it was configured.
    if 'warn_if_above' in cfg_:
        limit = int(cfg_['warn_if_above'])
    else:
        limit = None
    threshold = int(cfg_.get('warn_threshold', '1'))
    data_range = int(cfg_.get('warn_data_range', '1'))
    return DriveTempLogger(sensor_type, limit, threshold, data_range)
def memory(cfg_: configparser.SectionProxy) -> Logger: def memory(cfg_: configparser.SectionProxy) -> Logger:
warn_if_above = float(cfg_.get('warn_if_above', '1.0')) warn_if_above = float(cfg_.get('warn_if_above', '1.0'))
warn_threshold = int(cfg_.get('warn_threshold', '1')) warn_threshold = int(cfg_.get('warn_threshold', '1'))
@ -107,6 +115,7 @@ def get_loggers() -> tuple[list[Logger], list[LoggerArgEx]]:
'network': net, 'network': net,
'filesystem': filesystem, 'filesystem': filesystem,
'drive': drive, 'drive': drive,
'drive_temp': drive_temp,
'memory': memory, 'memory': memory,
'swap': swap, 'swap': swap,
} }

View File

@ -53,27 +53,15 @@ warn_if_above = 60
[network.1] [network.1]
network_interface = enp0s31f6 network_interface = enp0s31f6
[drive.1] [drive_temp.1]
; NVMe 256GB2 ; NVMe drives
; TODO NVMe 49 warn, 55 limit ; TODO: 49 warn, 55 limit
id = nvme-WDC_PC_SN520_SDAPNUW-256G-1002_183873801941 type = nvme
warn_if_above = 50 warn_if_above = 50
[drive.2] [drive_temp.2]
; HDD 12TB1 ; HDD drives
; TODO HDD 39 warn, 45 limit ; TODO: 39 warn, 45 limit
id = ata-TOSHIBA_MG07ACA12TE_X1E0A0WKF95G type = drivetemp
warn_if_above = 40
[drive.3]
; HDD 3TB1
id = ata-WDC_WD30EFRX-68EUZN0_WD-WCC4N1173157
warn_if_above = 40
[drive.4]
; HDD 3TB2
id = ata-WDC_WD30EFRX-68EUZN0_WD-WMC4N0564095
warn_if_above = 40
[drive.5]
; HDD 4TB1
id = ata-WDC_WD40EFRX-68N32N0_WD-WCC7K0CPF0N1
warn_if_above = 40 warn_if_above = 40
; [sensor_script.1] ; [sensor_script.1]

View File

@ -69,20 +69,14 @@ unmounted_ok = true
warn_if_above = 0.1 warn_if_above = 0.1
[drive.1] [drive_temp.1]
; Either `uuid`, `id` or `device` must be given. ; Either `nvme` (for NVMe drives) or `drivetemp` (for HDDs)
; type = nvme
; `uuid` as in /dev/disk/by-uuid/*
;uuid = ea7099e3-320d-4eb3-a4c3-9910a9af817b
; `id` as in /dev/disk/by-id/*
; id = nvme-XPG_GAMMIX_S50_Lite_2K462L2JN9KG
; device as in /dev/*
device = /dev/nvme0n1
; Warn if temperature is above this value. ; Warn if temperature is above this value.
; Unit: °C ; Unit: °C
warn_if_above = 25 warn_if_above = 25
[sensor_script.1] [sensor_script.1]
; The command will be executed. ; The command will be executed.
; It has to return a float (or int) and exit code 0 on success. ; It has to return a float (or int) and exit code 0 on success.

View File

@ -135,17 +135,24 @@ class DriveLogger(Logger):
@classmethod @classmethod
def get_temp_from_device(cls, device: Path) -> int: def get_temp_from_device(cls, device: Path) -> int:
""" """
Use `smartctl` to get HDD/SSD temperature.
As reading SMART data wakes up standby HDD drives, we skip them.
:param device: Partition path, e.g. `/dev/sda` :param device: Partition path, e.g. `/dev/sda`
:return: Temperature in celsius :return: Temperature in Celsius
""" """
# -n standby: Don't spin-up an HDD if it is in standby mode.
# -j: JSON output. # -j: JSON output.
# -a: Print all SMART device information. # -a: Print all SMART device information.
# For NVMe, this is equivalent to: '-H -i -c -A -l error -l selftest'. # For NVMe, this is equivalent to: '-H -i -c -A -l error -l selftest'.
# -H: Print health status. # -H: Print health status.
# -A: Prints only the vendor specific SMART Attributes. # -A: Prints only the vendor specific SMART Attributes.
returncode, stdout, stderr = execute_capture(['smartctl', '-j', '-A', f'{device}']) returncode, stdout, stderr = execute_capture(['smartctl', '-n', 'standby', '-j', '-A', f'{device}'])
if returncode == 2 and 'Device is in STANDBY mode' in stdout:
raise LoggerReadEx(f'Could not read drive temperature as it is in standby mode: {device}')
if returncode != 0: if returncode != 0:
raise LoggerReadEx(f'smartctl failed with returncode {returncode}\nstdout: {stdout}\nstderr: {stderr}') raise LoggerReadEx(f'smartctl failed with returncode {returncode}\nstdout: {stdout}\nstderr: {stderr}')
j = json.loads(stdout) j = json.loads(stdout)

View File

@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import math
from typing import Literal
from pathlib import Path
import psutil
from de.p1st.monitor import datetime_util
from de.p1st.monitor.logger import Logger
from de.p1st.monitor.logger_ex import LoggerReadEx
from de.p1st.monitor.warn_data import WarnData
class DriveTempLogger(Logger):
    """
    Logs the temperature range (minimum and maximum) over all drives
    that `psutil` reports under one sensor name (`drivetemp` or `nvme`).
    """

    def __init__(self,
                 type_: Literal['drivetemp', 'nvme'],
                 warn_if_above: int = None,
                 warn_threshold: int = 1,
                 warn_data_range: int = 1,
                 ):
        """
        :param type_: HDD -> drivetemp, NVMe -> nvme
        :param warn_if_above: Temperature limit in °C. `None` means no limit was configured.
        :param warn_threshold: Passed on unchanged to the `Logger` base class.
        :param warn_data_range: Passed on unchanged to the `Logger` base class.
        """
        # The critical limit is derived from the warning limit (10 °C above it).
        # Without a warning limit we can't derive one, so we pass `None` as well
        # instead of failing with a TypeError on `None + 10`.
        # NOTE(review): assumes the `Logger` base class accepts `None` for both limits - confirm.
        critical_if_above = None if warn_if_above is None else warn_if_above + 10
        super().__init__(warn_threshold,
                         warn_data_range,
                         warn_if_above,
                         critical_if_above
                         )
        self.type = type_

    def get_warn_data(self, data: list[any]) -> WarnData:
        """
        :param data: One row as returned by `read_data()`: [datetime, min_temp, max_temp]
        :return: Warn data built from the row's datetime and its maximum temperature.
        """
        min_temp = data[1]
        max_temp = data[2]
        message = f'Temperature of drive type {self.type} is in range {min_temp}:{max_temp}'
        # The hottest drive decides whether a warning is raised.
        return WarnData(data[0], max_temp, message)

    def read_data(self) -> list[any]:
        """
        :return: [current datetime, min temperature, max temperature]
        """
        min_temp, max_temp = self.get_drive_temp()
        return [
            datetime_util.now(),
            min_temp,
            max_temp
        ]

    def data_schema(self) -> list[str]:
        """
        :return: Column descriptions matching the rows returned by `read_data()`.
        """
        return ['datetime#Date', 'float#MinTemperature', 'float#MaxTemperature']

    def get_log_file(self) -> Path:
        """
        :return: Path of the CSV log file; one file per sensor type.
        """
        return self.get_log_dir() / f'drive-temp_{self.type}.csv'

    #
    # HELPERS
    #

    def get_drive_temp(self) -> tuple[float, float]:
        """
        Use `psutil` Python library to get HDD/SSD temperature.

        https://psutil.readthedocs.io/en/latest/index.html#psutil.sensors_temperatures

        Not sure if this changed the results:

            sudo modprobe drivetemp

        Example output of `psutil.sensors_temperatures()`:

            {
                'nvme': [
                    shwtemp(label='Composite', current=37.85, high=81.85, critical=85.85)
                ],
                'pch_skylake': [...],
                'coretemp': [...],
                'drivetemp': [
                    shwtemp(label='', current=23.0, high=65.0, critical=85.0),
                    shwtemp(label='', current=25.0, high=55.0, critical=70.0),
                    shwtemp(label='', current=24.0, high=60.0, critical=85.0),
                    shwtemp(label='', current=22.0, high=60.0, critical=85.0)]
            }

        Problem: If one has multiple drives attached, they can't be distinguished.
        https://github.com/giampaolo/psutil/issues/1902

        Therefore, we currently accumulate the maximum and minimum values of all drives of the same type.

        :return: (min_temp, max_temp) in Celsius over all drives of type `self.type`.
        :raises LoggerReadEx: If the sensor is not present or has no entries.
        """
        data = psutil.sensors_temperatures(fahrenheit=False)
        if self.type not in data:
            raise LoggerReadEx(f'Sensor {self.type} not found')

        entries = data[self.type]
        if not entries:
            raise LoggerReadEx(f'Sensor {self.type} has no entries')

        # Previously the accumulators were swapped (min was folded with `max()` and
        # vice versa), so the coolest drive was reported as the maximum.
        temperatures = [entry.current for entry in entries]
        return min(temperatures), max(temperatures)

View File

@ -54,7 +54,7 @@ class TempLogger(Logger):
def get_temp(self) -> float: def get_temp(self) -> float:
""" """
:return: Temperature in celsius :return: Temperature in Celsius
""" """
data = psutil.sensors_temperatures(fahrenheit=False) data = psutil.sensors_temperatures(fahrenheit=False)
if self.name not in data: if self.name not in data: