commit 003841d66ce84ad6709e1ea7fc3755d8e38c981b Author: Daniel Langbein Date: Thu Jun 15 16:26:11 2023 +0200 import diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8cace66 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +/.idea/ +__pycache__/ + +# pip build +/src/de.p1st.monitor.egg-info/ +/dist/ +/build/ +/venv/ + +# makepkg +/packaging/python-de-p1st-monitor-git-*-any.pkg.tar.zst +/packaging/de-p1st-monitor/ diff --git a/.run/main (export).run.xml b/.run/main (export).run.xml new file mode 100644 index 0000000..d8dbb3a --- /dev/null +++ b/.run/main (export).run.xml @@ -0,0 +1,24 @@ + + + + + \ No newline at end of file diff --git a/.run/main (help).run.xml b/.run/main (help).run.xml new file mode 100644 index 0000000..f8a5842 --- /dev/null +++ b/.run/main (help).run.xml @@ -0,0 +1,24 @@ + + + + + \ No newline at end of file diff --git a/.run/main (log).run.xml b/.run/main (log).run.xml new file mode 100644 index 0000000..073e9c6 --- /dev/null +++ b/.run/main (log).run.xml @@ -0,0 +1,24 @@ + + + + + \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2fe83c7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,32 @@ +The Clear BSD License + +Copyright (c) 2023 Daniel Langbein +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted (subject to the limitations in the disclaimer +below) provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1f4c59a --- /dev/null +++ b/Makefile @@ -0,0 +1,47 @@ +PKGNAME := de-p1st-monitor + +.PHONY: all +all: install-pkgbuild + +.PHONY: install-pkgbuild +install-pkgbuild: cron ## Install with pacman (on Arch Linux) + sudo pacman -S --needed base-devel + cd packaging && makepkg -fCcsri && rm -rf $(PKGNAME) + ${MAKE} install-files + +.PHONY: install-pip +install-pip: notify cron ## Install with pip + sudo python3 -m pip install --upgrade --force-reinstall . 
+ ${MAKE} install-files + +.PHONY: install-files +install-files: + sudo install -m0644 cron.d/$(PKGNAME) /etc/cron.d/$(PKGNAME) + + sudo install --directory -m755 /etc/$(PKGNAME)/ + sudo install -m0644 cfg/* /etc/$(PKGNAME)/ + +.PHONY: notify ## Check if exec-notify is installed. +notify: + # `type` does not work e.g. on Ubuntu 18.04 + which exec-notify + +.PHONY: cron ## Check if cron (e.g. cronie) is running. +cron: + # Check if cron.d exists + stat /etc/cron.d/ + # Check if cron is running + pgrep cron + + +.PHONY: clean-pkgbuild +clean-pkgbuild: clean-files + sudo pacman -Rns python-$(PKGNAME)-git + +.PHONY: clean-pip +clean-pip: clean-files + sudo python3 -m pip uninstall -y $(PKGNAME) + +.PHONY: clean-files +clean-files: + sudo rm -rf /etc/cron.d/$(PKGNAME) /etc/$(PKGNAME) /var/log/$(PKGNAME).cron diff --git a/README.md b/README.md new file mode 100644 index 0000000..9efafba --- /dev/null +++ b/README.md @@ -0,0 +1,248 @@ +# de-p1st-monitor + +## Research + +See [./research](./research). + +- HDD temp: + - Modern hard drives will throttle their read and write speeds + when the drive reaches a critical pre-set temperature + (usually around 60°C) + - 20-50°C (short-term) + - 20-40°C (long-term usage) +- SSD temp: + - Most SSDs implement thermal throttling as a safety feature + if a drive gets too hot. As the driver approaches the 70ºC limit + that most manufacturers set, the more likely it is that the + drive will start to slow itself down to prevent failure. + - 30-50°C + +## Keep it simple! + +Lines of code including docstrings and comments: + +```shell +find ./src -name '*.py' | xargs wc -l +#=> 1394 total +``` + +## Configuration + +See [cfg/yodaTux.ini](cfg/yodaTux.ini) for a configuration file covering all config options. + +## Installation + +Install dependencies: + +- on Arch Linux + +```shell +# TODO +# Optional: 1-wire temperature sensor. +sudo pacman -S digitemp # TODO: configure your sensor +``` + +- on Ubuntu + +```shell +sudo apt-get install python3-pip + +# Ubuntu 18.04 and below +sudo apt-get install python3-setuptools +sudo apt-get install python3-wheel + +sudo apt-get install python3-psutil + +# Ubuntu 18.04 and below: psutil < 5.6.2 +sudo apt-get install python3-dev +sudo apt-get install build-essential +# Ubuntu 20.04 and below: psutil < 5.6.2 +sudo python3 -m pip install psutil --upgrade +``` + +Install: + +- on Arch Linux + +```shell +make +``` + +- on Ubuntu + +```shell +make install-pip +``` + +## Usage + +### Command line interface + +``` +usage: de-p1st-monitor [-h] [--config CONFIG] [--export] + +Iterates over all config sections. For each section the current sensor data is +read and logged to a .csv file. + +options: + -h, --help show this help message and exit + --config CONFIG, -c CONFIG + Path to .ini configuration file. + --export, -e If `True`, export .csv files and print their paths to + stdout. No sensor data is logged during this. +``` + +### Periodic logging + +Add a cron entry executing this e.g. 
every 3 Minutes: + +```shell +de-p1st-monitor +``` + +## Example log files + +```shell +ssh nas 'tail -n 1 /var/log/de-p1st-monitor/*' +``` +``` +==> /var/log/de-p1st-monitor/cpu_15min.csv <== +20230315T103001,0.10400390625 + +==> /var/log/de-p1st-monitor/cpu_1min.csv <== +20230315T103001,0.03076171875 + +==> /var/log/de-p1st-monitor/cpu_5min.csv <== +20230315T103001,0.0301513671875 + +==> /var/log/de-p1st-monitor/drive_20d86155-30d4-404c-95e8-c701cfb16ca5.csv <== +20230315T103001,24 + +==> /var/log/de-p1st-monitor/drive_4651c3f1-e4b8-45aa-a823-df762530a307.csv <== +20230315T103001,21 + +==> /var/log/de-p1st-monitor/drive_68c349e8-5118-4773-9fd5-5dbad9acee4e.csv <== +20230315T103001,29 + +==> /var/log/de-p1st-monitor/drive_b8ef1da9-d76d-44b4-86d4-71c82c888b6f.csv <== +20230315T103001,28 + +==> /var/log/de-p1st-monitor/filesystem_3CBA-B4EA.csv <== +20230315T103001,0.228 + +==> /var/log/de-p1st-monitor/filesystem_a454430b-dee3-4b6b-8325-f7bdb9435ed1.csv <== +20230314T231501,0.762 + +==> /var/log/de-p1st-monitor/filesystem_b8ef1da9-d76d-44b4-86d4-71c82c888b6f.csv <== +20230315T103001,0.034 + +==> /var/log/de-p1st-monitor/filesystem_c385a436-0288-486f-a2b9-c64c2db667e7.csv <== +20230315T103001,0.374 + +==> /var/log/de-p1st-monitor/memory.csv <== +20230315T103001,4127,15329 + +==> /var/log/de-p1st-monitor/net_enp0s31f6.csv <== +20230315T103001,69366974632,58725303985,20230304T173014 + +==> /var/log/de-p1st-monitor/swap.csv <== +20230315T103001,25,4095 + +==> /var/log/de-p1st-monitor/temp_coretemp_Core 0.csv <== +20230315T103001,26.0 + +==> /var/log/de-p1st-monitor/temp_coretemp_Core 1.csv <== +20230315T103001,34.0 + +==> /var/log/de-p1st-monitor/temp_coretemp_Package id 0.csv <== +20230315T103001,35.0 +``` + +## Plots + +### Creating plots with graph-cli + +1) Export and fetch data + +```shell +ssh_target=rootnas +dst=~/de-p1st-monitor-"${ssh_target}" +files="${dst}".files + +# Export .csv files on SSH target and save list of exported files to $files. 
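+# (--export prints one exported .csv path per line; see "Command line interface" above.)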
+ssh "${ssh_target}" 'de-p1st-monitor --export' > "${files}"
+
+rm -rf "${dst}"
+mkdir -p "${dst}"
+
+rsync --checksum --archive --progress --human-readable --delete \
+  --files-from="${files}" "${ssh_target}":/ "${dst}"
+mv "${dst}"/var/log/de-p1st-monitor/* "${dst}"
+rm -r "${dst}"/var "${files}"
+
+cd "${dst}"
+```
+
+2) Install the Python package `graph-cli`
+
+```shell
+python -m venv ~/de-p1st-monitor.venv
+source ~/de-p1st-monitor.venv/bin/activate
+pip install graph-cli
+```
+
+3) Create plots
+
+Create one plot for each .csv file:
+
+```shell
+sample_duration=4H
+
+for file in *.csv; do
+  graph "${file}" -x 1 --resample "${sample_duration}" --figsize 1600x1000 -o "${file}".resample-"${sample_duration}"-mean.png || {
+    echo "Error while processing ${file}"
+  }
+done
+
+for file in {swap,memory}.csv {temp_,drive_,net_,cpu_,filesystem_}*.csv; do
+  graph "${file}" -x 1 --resample "${sample_duration}" --resample-action max --figsize 1600x1000 -o "${file}".resample-"${sample_duration}"-max.png || {
+    echo "Error while processing ${file}"
+  }
+done
+```
+
+4) Optionally, create more plots
+
+Some self-explanatory examples:
+
+```shell
+# x and y axis by column name
+graph cpu_1min.csv -x 'datetime#Date' -y 'float#LoadAverage1min' --resample 1H -o cpu_1min_resample-1H.png
+# x and y axis by column number
+graph cpu_1min.csv -x 1 -y 2 --resample 1H -o cpu_1min_resample-1H.png
+# specify x axis; use all other axes for y
+graph cpu_1min.csv -x 1 --resample 1H -o cpu_1min_resample-1H.png
+# increased plot size
+graph cpu_1min.csv -x 1 --resample 1H --figsize 1600x1000 -o cpu_1min_resample-1H.png
+```
+
+```shell
+# resample using sum
+graph net_enp0s31f6.csv.exported.csv -x 1 --resample 1H --resample-action sum --figsize 1600x1000 -o net_enp0s31f6.csv.exported_resample-1H-sum.png
+```
+
+```shell
+# resample using max
+graph cpu_1min.csv -x 1 --resample 1H --resample-action max --figsize 1600x1000 -o cpu_1min_resample-1H-max.png
+```
+
+### Example plots
+
+![img](images/cpu_1min.csv.resample-1H.png)
+![img](images/drive_68c349e8-5118-4773-9fd5-5dbad9acee4e.csv.resample-1H.png)
+![img](images/filesystem_c385a436-0288-486f-a2b9-c64c2db667e7.csv.resample-1H.png)
+![img](images/memory.csv.resample-1H.png)
+![img](images/net_enp0s31f6.csv.exported.csv.resample-1H.png)
+![img](images/swap.csv.resample-1H.png)
+![img](images/temp_coretemp_Package%20id%200.csv.resample-1H.png)
diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..43093fa
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,264 @@
+# TODOs
+
+## Public IP address
+
+Log the public IP address. Reuse the `netcup-dns` Python functions.
+
+## Rewrite
+
+* ~~easier configuration~~
+* ~~easier read/write from/to csv~~
+* ~~use classes & objects~~
+* ~~create plots?~~
+
+* Don't emit another warning if a warning was already sent and the value has decreased since the previous log
+  * Example:
+    * log1: 30°C OK
+    * log2: 40°C Warning sent
+    * log3: 35°C Still above limit, but don't send warning again as value decreased
+    * log4: 37°C Send another warning: The value increased since last logging
+
+## Use Grafana to visualize metrics
+
+One can use Prometheus + Grafana to collect and visualize server metrics.
+
+> https://geekflare.com/best-open-source-monitoring-software/
+> This list won’t be complete without including two fantastic open-source solutions – Prometheus and Grafana. Its DIY solution where you use Prometheus to scrape the metrics from server, OS, applications and use Grafana to visualize them.
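+
+For orientation, a minimal and untested sketch of that DIY pairing, using the upstream default images and ports (a real setup additionally needs a `prometheus.yml` that lists the exporter as scrape target):
+
+```shell
+# Expose host metrics on :9100 for Prometheus to scrape.
+sudo docker run --rm -p 9100:9100 prom/node-exporter
+# Prometheus web UI on :9090; scrape targets come from a mounted prometheus.yml.
+sudo docker run --rm -p 9090:9090 prom/prometheus
+# Grafana then adds Prometheus as a data source (see the grafana-oss container below).
+```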
+ +As we do already collect logs, we should do some research on how to +import data into Grafana. + +### Time series + +* https://grafana.com/docs/grafana/latest/fundamentals/timeseries/#introduction-to-time-series + +E.g. CPU and memory usage, sensor data. + +* https://grafana.com/docs/grafana/latest/fundamentals/timeseries/#time-series-databases + +A time series database (TSDB) is a database explicitly designed for time series data. + +Some supported TSDBs are: + +* Graphite +* InfluxDB +* Prometheus + +### Installation + +* https://grafana.com/docs/grafana/latest/setup-grafana/installation/docker/#alpine-image-recommended +* https://grafana.com/docs/grafana/latest/setup-grafana/installation/docker/#install-official-and-community-grafana-plugins + +* https://grafana.com/grafana/plugins/marcusolsson-csv-datasource/?tab=installation + * https://grafana.github.io/grafana-csv-datasource/ +* https://grafana.com/grafana/plugins/marcusolsson-json-datasource/?tab=installation + * https://grafana.github.io/grafana-json-datasource/ + +```shell +sudo docker run --rm \ + -p 3000:3000 \ + --name=grafana \ + -e "GF_INSTALL_PLUGINS=marcusolsson-json-datasource,marcusolsson-csv-datasource" \ + grafana/grafana-oss +``` + +TODO: test csv or json data import tools + +## Netdata - Can be exported to Grafana + +* https://github.com/netdata/netdata/blob/master/docs/getting-started/introduction.md + +## Monit - An existing monitoring service + +### General notes and links + +* Monit is a widely used service for system monitoring. + * OPNsense uses Monit: https://docs.opnsense.org/manual/monit.html + +* Short slideshow presentation: https://mmonit.com/monit/#slideshow +* https://wiki.ubuntuusers.de/Monit/ + +* Excellent configuration and usage summary in the Arch Linux Wiki: https://wiki.archlinux.org/title/Monit + +* Examples + * https://mmonit.com/wiki/Monit/ConfigurationExamples + * One can use the returncode or stdout of an executed shell script + * https://mmonit.com/wiki/Monit/ConfigurationExamples#HDDHealth + ``` + check program HDD_Health with path "/usr/local/etc/monit/scripts/sdahealth.sh" + every 120 cycles + if content != "PASSED" then alert + # if status > 0 then alert + group health + ``` +* Documentation + * Event queue - Store events (notifications) if mail server is not reachable + * https://mmonit.com/monit/documentation/monit.html#Event-queue + ``` + set eventqueue basedir /var/monit + ``` + * https://mmonit.com/monit/documentation/monit.html#SPACE-USAGE-TEST + ``` + check filesystem rootfs with path / + if space usage > 90% then alert + ``` + * https://mmonit.com/monit/documentation/monit.html#PROGRAM-STATUS-TEST + ``` + check program myscript with path /usr/local/bin/myscript.sh + if status != 0 then alert + ``` + * https://mmonit.com/monit/documentation/monit.html#PROGRAM-OUTPUT-CONTENT-TEST + * https://mmonit.com/monit/documentation/monit.html#Link-upload-and-download-bytes + ``` + check network eth0 with interface eth0 + if upload > 500 kB/s then alert + if total downloaded > 1 GB in last 2 hours then alert + if total downloaded > 10 GB in last day then alert + ``` + +* https://mmonit.com/monit/documentation/monit.html#MANAGE-YOUR-MONIT-INSTANCES + +### Monitoring all your monit instances + +* Monit itself does only monitor the current system +* Multi-server monitoring is a paid extra service called M/Monit :/ +* But there are other open source services for this + * https://github.com/monmon-io/monmon#why-did-you-create-monmon + +### Setup + +Install and start: + +```shell +sudo pacman 
-S --needed monit lm_sensors smartmontools +sudo systemctl start monit +sudo systemctl status monit | grep 'Active: active (running)' +``` + +Print default configuration: + +```shell +sudo cat /etc/monitrc | grep -v '^#' +#=> set daemon 30 +#=> - A cycle is 30 seconds long. +#=> set log syslog +#=> - We will overwrite this config value later on. +#=> set httpd port 2812 +#=> - Only listen on localhost with username admin and pwd monit. +``` + +Include `monit.d`: + +```shell +sudo mkdir -p /etc/monit.d/ +! sudo cat /etc/monitrc | grep -q '^include' && echo 'include /etc/monit.d/*' | sudo tee -a /etc/monitrc +``` + +Log to file: + +```shell +sudo install -m700 /dev/stdin /etc/monit.d/log <<< 'set log /var/log/monit.log' +sudo systemctl restart monit +# tail -f /var/log/monit.log +``` + +System: + +```shell +sudo install -m700 /dev/stdin /etc/monit.d/system <<< 'check system $HOST + if filedescriptors >= 80% then alert + if loadavg (5min) > 2 for 4 cycles then alert + if memory usage > 75% for 4 cycles then alert + if swap usage > 50% for 4 cycles then alert' +sudo systemctl restart monit +``` + +Filesystem: + +```shell +sudo install -m700 /dev/stdin /etc/monit.d/fs <<< 'check filesystem rootfs with path / + if space usage > 80% then alert' +sudo systemctl restart monit +``` + +SSL options: + +* https://mmonit.com/monit/documentation/monit.html#SSL-OPTIONS + +```shell +sudo install -m700 /dev/stdin /etc/monit.d/ssl <<< '# Enable certificate verification for all SSL connections +set ssl options { + verify: enable +}' +sudo systemctl restart monit +``` + +Mailserver, alerts and eventqueue: + +* https://mmonit.com/monit/documentation/monit.html#Setting-a-mail-server-for-alert-delivery +* https://mmonit.com/monit/documentation/monit.html#Setting-an-error-reminder +* https://mmonit.com/monit/documentation/monit.html#Event-queue + * If no mail server is available, Monit can queue events in the local file-system for retry until the mail server recovers. + * By default, the queue is disabled and if the alert handler fails, Monit will simply drop the alert message. + +```shell +sudo install -m700 /dev/stdin /etc/monit.d/mail <<< 'set mailserver smtp.mail.de + port 465 + username "langbein@mail.de" + password "qiXF6cUgfvSVqd0pAoFTqZEHIcUKzc3n" + using SSL + with timeout 20 seconds + +set mail-format { + from: langbein@mail.de + subject: $SERVICE - $EVENT at $DATE + message: Monit $ACTION $SERVICE at $DATE on $HOST: $DESCRIPTION. +} + +set alert daniel@systemli.org with reminder on 10 cycles + +set eventqueue basedir /var/monit' +sudo systemctl restart monit +sudo monit -v | grep 'Mail' +``` + +Test alert: + +* https://wiki.ubuntuusers.de/Monit/#E-Mail-Benachrichtigungen-testen +* It is enough to restart monit. It will send an email that it's state has changed (stopped/started). +* But if desired, one can also create a test for a non-existing file: + +```shell +sudo install -m700 /dev/stdin /etc/monit.d/alerttest <<< 'check file alerttest with path /.nonexistent.file' +sudo systemctl restart monit +``` + +Example script - run a speedtest: + +```shell +sudo pacman -S --needed speedtest-cli +sudo install -m700 /dev/stdin /etc/monit.d/speedtest <<< 'check program speedtest with path /usr/bin/speedtest-cli + every 120 cycles + if status != 0 then alert' +sudo systemctl restart monit +``` + +Check config syntax: + +```shell +sudo monit -t +``` + +################## TODOS ########################## + +* See Firefox bookmark folder 20230219_monit. 
+* Disk health +* BTRFS balance +* Save disk usage and temperatures to CSV log file + * e.g. by using `check program check-and-log-temp.sh` monit configuration + * Or: do checks by monit and every couple minutes run `check program log-system-info.sh` + +### Monit behind Nginx + +TODO: Nginx reverse proxy with basic authentication. diff --git a/cfg/yodaNas.ini b/cfg/yodaNas.ini new file mode 100644 index 0000000..1d4ff29 --- /dev/null +++ b/cfg/yodaNas.ini @@ -0,0 +1,79 @@ +[logging] +dir = /var/log/de-p1st-monitor/ + +[filesystem.1] +; NVME +mountpoint = / +warn_if_above = 0.75 +[filesystem.2] +; NVME +mountpoint = /boot +warn_if_above = 0.75 +[filesystem.3] +; 12TB1 +uuid = c385a436-0288-486f-a2b9-c64c2db667e7 +warn_if_above = 0.66 +[filesystem.4] +; 3TB1 and 3TB2 +uuid = a454430b-dee3-4b6b-8325-f7bdb9435ed1 +warn_if_above = 0.85 +unmounted_ok = true + +[memory] +warn_if_above = 0.85 +[swap] +warn_if_above = 0.85 + +[cpu1] +warn_if_above = 3.0 +warn_threshold = 2 +warn_data_range = 2 +[cpu5] +warn_if_above = 2.0 +warn_threshold = 2 +warn_data_range = 2 +[cpu15] +warn_if_above = 1.0 +warn_threshold = 2 +warn_data_range = 2 + +[temp.1] +sensor = coretemp +label = Package id 0 +warn_if_above = 60 +[temp.2] +sensor = coretemp +label = Core 0 +warn_if_above = 60 +[temp.3] +sensor = coretemp +label = Core 1 +warn_if_above = 60 + +[network.1] +network_interface = enp0s31f6 + +[drive.1] +; NVME /dev/nvme0n1p3 +; TODO NVME 49 warn, 55 limit +uuid = b8ef1da9-d76d-44b4-86d4-71c82c888b6f +warn_if_above = 50 +[drive.2] +; HDD 12TB1 +; TODO HDD 39 warn, 45 limit +uuid = 68c349e8-5118-4773-9fd5-5dbad9acee4e +warn_if_above = 40 +[drive.3] +; HDD 3TB1 +uuid = 20d86155-30d4-404c-95e8-c701cfb16ca5 +warn_if_above = 40 +[drive.4] +; HDD 3TB2 +uuid = 4651c3f1-e4b8-45aa-a823-df762530a307 +warn_if_above = 40 + +; TODO digitemp sensor +;[digitemp_DS9097.1] +;cfg = /root/.digitemprc +;sensor_num = 0 +;name = room-temp diff --git a/cfg/yodaTux.ini b/cfg/yodaTux.ini new file mode 100644 index 0000000..d9c1f63 --- /dev/null +++ b/cfg/yodaTux.ini @@ -0,0 +1,79 @@ +[logging] +; The CSV logfiles are saved in this directory. +dir = /var/log/de-p1st-monitor/ + + +[temp.1] +; `sensor` and `label` are used to identify one temperature value. +sensor = k10temp +label = Tctl + +; Warn if temperature is above this value. +; Unit: °C +warn_if_above = 80 + +; Send warning if critical values were reached 2 times during the last 4 logs. +warn_threshold = 2 +warn_data_range = 4 + +[temp.2] +sensor = amdgpu +label = edge +warn_if_above = 50 +warn_threshold = 2 +warn_data_range = 4 + + +[network.1] +network_interface = wlan0 + + +[memory] +; Warn if memory usage is above this value. +; Range: (0.0, 1.0) +warn_if_above = 0.1 + +[swap] +; Warn if swap usage is above this value. +; Range: (0.0, 1.0) +warn_if_above = 0.5 + + +[cpu1] +; Warn if CPU load of the last 1 minute is above this value. +; Range: (0.0, infinite) +; `1.0` corresponds to 100% CPU utilisation. +; However, there can be more processes in the queue than can be processed. +; As a result, the value can go above `1.0`. +warn_if_above = 0.95 +[cpu5] +; Warn if CPU load of the last 5 minutes is above this value. +warn_if_above = 0.85 +[cpu15] +; Warn if CPU load of the last 15 minutes is above this value. +warn_if_above = 0.75 + + +[filesystem.1] +; Either `uuid` or `mountpoint` must be given. +; +; If both are given but the UUID of the disk mounted at `mountpoint` differs from `uuid`, then an exception is raised. 
+uuid = 7fb12542-bd59-4727-9beb-7cf1f79f8293 +mountpoint = / + +; If `true` don't log or warn if the filesystem is not found. +unmounted_ok = true + +; Warn if disk usage is above this value. +; Range: (0.0, 1.0) +warn_if_above = 0.1 + + +[drive.1] +; Either `uuid` or `device` must be given. +;uuid = +device = /dev/nvme0n1p3 + +; Warn if temperature is above this value. +; Unit: °C +warn_if_above = 25 diff --git a/cron.d/de-p1st-monitor b/cron.d/de-p1st-monitor new file mode 100644 index 0000000..1a7ad20 --- /dev/null +++ b/cron.d/de-p1st-monitor @@ -0,0 +1,11 @@ +# Run command every 3min +# - https://crontab.guru/every-3-minutes +# `/etc/cron.d/` requires user field +# - https://unix.stackexchange.com/questions/458713/how-are-files-under-etc-cron-d-used#comment1019389_458715 +# Some users report that files in `/etc/cron.d/` containing `-` are not executed +# - https://unix.stackexchange.com/questions/296347/crontab-never-running-while-in-etc-cron-d#comment640748_296351 +# PATH is restricted to `/bin:/usr/bin` but `exec-notify` resides in `/usr/local/bin/` +# - https://serverfault.com/a/449652 + +PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin/ +*/3 * * * * root exec-notify de-p1st-monitor > /var/log/de-p1st-monitor.cron 2>&1 diff --git a/images/cpu_1min.csv.resample-1H.png b/images/cpu_1min.csv.resample-1H.png new file mode 100644 index 0000000..cb08bc5 Binary files /dev/null and b/images/cpu_1min.csv.resample-1H.png differ diff --git a/images/drive_68c349e8-5118-4773-9fd5-5dbad9acee4e.csv.resample-1H.png b/images/drive_68c349e8-5118-4773-9fd5-5dbad9acee4e.csv.resample-1H.png new file mode 100644 index 0000000..6d10881 Binary files /dev/null and b/images/drive_68c349e8-5118-4773-9fd5-5dbad9acee4e.csv.resample-1H.png differ diff --git a/images/filesystem_c385a436-0288-486f-a2b9-c64c2db667e7.csv.resample-1H.png b/images/filesystem_c385a436-0288-486f-a2b9-c64c2db667e7.csv.resample-1H.png new file mode 100644 index 0000000..bdf4c5c Binary files /dev/null and b/images/filesystem_c385a436-0288-486f-a2b9-c64c2db667e7.csv.resample-1H.png differ diff --git a/images/memory.csv.resample-1H.png b/images/memory.csv.resample-1H.png new file mode 100644 index 0000000..bc4b24f Binary files /dev/null and b/images/memory.csv.resample-1H.png differ diff --git a/images/net_enp0s31f6.csv.exported.csv.resample-1H.png b/images/net_enp0s31f6.csv.exported.csv.resample-1H.png new file mode 100644 index 0000000..5bc8634 Binary files /dev/null and b/images/net_enp0s31f6.csv.exported.csv.resample-1H.png differ diff --git a/images/swap.csv.resample-1H.png b/images/swap.csv.resample-1H.png new file mode 100644 index 0000000..17ee04e Binary files /dev/null and b/images/swap.csv.resample-1H.png differ diff --git a/images/temp_coretemp_Package id 0.csv.resample-1H.png b/images/temp_coretemp_Package id 0.csv.resample-1H.png new file mode 100644 index 0000000..88796b7 Binary files /dev/null and b/images/temp_coretemp_Package id 0.csv.resample-1H.png differ diff --git a/packaging/PKGBUILD b/packaging/PKGBUILD new file mode 100644 index 0000000..fdf567e --- /dev/null +++ b/packaging/PKGBUILD @@ -0,0 +1,56 @@ +# Maintainer: Daniel Langbein < daniel [ at ] systemli [ dot ] org > + +# This PKGBUILD is based on the instructions from the Arch Linux wiki: +# https://wiki.archlinux.org/title/Python_package_guidelines + +_name=de-p1st-monitor +pkgname="python-$_name-git" +pkgver=r202.f3f2f46 +pkgrel=1 +pkgdesc='periodically monitor and warn' +arch=(any) +url="https://git.privacy1st.de/langfingaz/$_name" 
+license=('custom:BSD-3-Clause-Clear-License') + +provides=(de-p1st-monitor) +depends=(python exec-notify) +makedepends=(git python-build python-installer python-wheel) +optdepends=('python-psutil: CPU, memory, network monitoring' + 'digitemp: USB temperature sensor' + 'smartmontools: disk temperature monitoring') + +source=("git+https://git.privacy1st.de/langfingaz/$_name.git") +b2sums=(SKIP) + +# If there are no tags then use number of revisions since beginning of the history: +# https://wiki.archlinux.org/title/VCS_package_guidelines +pkgver() { + cd "$_name" + printf "r%s.%s" "$(git rev-list --count HEAD)" "$(git rev-parse --short=7 HEAD)" +} + +prepare() { + git -C "$srcdir/$_name" clean -dfx +} + +build() { + # cd "$_name-$pkgver" + cd "$_name" + python -m build --wheel --no-isolation +} + +package() { + # cd "$_name-$pkgver" + cd "$_name" + python -m installer --destdir="$pkgdir" dist/*.whl +} + +check(){ + cd "$srcdir/$_name" + + # For nosetests + # nosetests + + # For pytest + # pytest +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c035a4f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,8 @@ +# https://packaging.python.org/tutorials/packaging-projects/#creating-pyproject-toml + +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6d18ea2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +psutil>=5.9 diff --git a/research/AKCP Monitoring - How Temperature affects IT Storage.pdf b/research/AKCP Monitoring - How Temperature affects IT Storage.pdf new file mode 100644 index 0000000..c4f7ba7 Binary files /dev/null and b/research/AKCP Monitoring - How Temperature affects IT Storage.pdf differ diff --git a/research/Failure Trends in a Large Disk Drive Population/bibtext.txt b/research/Failure Trends in a Large Disk Drive Population/bibtext.txt new file mode 100644 index 0000000..9526351 --- /dev/null +++ b/research/Failure Trends in a Large Disk Drive Population/bibtext.txt @@ -0,0 +1,9 @@ +@inproceedings{32774, +title = {Failure Trends in a Large Disk Drive Population}, +author = {Eduardo Pinheiro and Wolf-Dietrich Weber and Luiz André Barroso}, +year = {2007}, +booktitle = {5th USENIX Conference on File and Storage Technologies (FAST 2007)}, +pages = {17-29} +} + + diff --git a/research/Failure Trends in a Large Disk Drive Population/paper.pdf b/research/Failure Trends in a Large Disk Drive Population/paper.pdf new file mode 100644 index 0000000..c89ae95 Binary files /dev/null and b/research/Failure Trends in a Large Disk Drive Population/paper.pdf differ diff --git a/research/Failure Trends in a Large Disk Drive Population/paper.xopp b/research/Failure Trends in a Large Disk Drive Population/paper.xopp new file mode 100644 index 0000000..4252f06 Binary files /dev/null and b/research/Failure Trends in a Large Disk Drive Population/paper.xopp differ diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..b27e4a0 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,35 @@ +; setup.cfg is the configuration file for setuptools. 
+; https://packaging.python.org/tutorials/packaging-projects/#configuring-metadata + +[metadata] +name = de.p1st.monitor +version = 0.8.0 +author = Daniel Langbein +author_email = daniel@systemli.org +description = periodically monitor and warn +long_description = file: README.md +long_description_content_type = text/markdown +url = https://git.privacy1st.de/langfingaz/de-p1st-monitor +project_urls = + Bug Tracker = https://git.privacy1st.de/langfingaz/de-p1st-monitor/issues + +; https://pypi.org/classifiers/ +classifiers = + Development Status :: 4 - Beta + Programming Language :: Python :: 3 + ; License :: BSD 3-Clause Clear License + Operating System :: Unix + +[options] +package_dir = + = src +packages = find: +python_requires = >=3.6.9 + +[options.packages.find] +where = src + +[options.entry_points] +; https://setuptools.readthedocs.io/en/latest/userguide/entry_point.html +console_scripts= + de-p1st-monitor = de.p1st.monitor.main:main diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..0abbd0a --- /dev/null +++ b/setup.py @@ -0,0 +1,4 @@ +# This file is required for `pip install` on Ubuntu 18.04. +# It loads `setup.cfg`. +from setuptools import setup +setup() diff --git a/src/de/__init__.py b/src/de/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/de/p1st/__init__.py b/src/de/p1st/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/de/p1st/monitor/__init__.py b/src/de/p1st/monitor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/de/p1st/monitor/cfg/__init__.py b/src/de/p1st/monitor/cfg/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/de/p1st/monitor/cfg/loggers.py b/src/de/p1st/monitor/cfg/loggers.py new file mode 100644 index 0000000..b8df095 --- /dev/null +++ b/src/de/p1st/monitor/cfg/loggers.py @@ -0,0 +1,106 @@ +import configparser +from pathlib import Path + +from de.p1st.monitor.cfg.singleton import get_cfg +from de.p1st.monitor.logger_ex import LoggerArgEx +from de.p1st.monitor.loggers.cpu import CPULogger1, CPULogger5, CPULogger15 +from de.p1st.monitor.loggers.drive import DriveLogger +from de.p1st.monitor.loggers.filesystem import FilesystemLogger +from de.p1st.monitor.loggers.memory import MemoryLogger +from de.p1st.monitor.loggers.network import NetworkLogger +from de.p1st.monitor.loggers.swap import SwapLogger +from de.p1st.monitor.loggers.temp import TempLogger +from de.p1st.monitor.logger import Logger + +def get_or_raise(cfg: configparser.SectionProxy, key: str) -> str: + if key in cfg: + return cfg[key] + else: + raise LoggerArgEx(f'Missing key {key} in section {cfg.name}') + +def get_loggers() -> tuple[list[Logger], list[LoggerArgEx]]: + def temp(cfg: configparser.SectionProxy) -> Logger: + sensor = get_or_raise(cfg, 'sensor') + label = get_or_raise(cfg, 'label') + warn_if_above = float(cfg['warn_if_above']) if 'warn_if_above' in cfg else None + warn_threshold = int(cfg.get('warn_threshold', '1')) + warn_data_range = int(cfg.get('warn_data_range', '1')) + return TempLogger(sensor, label, warn_if_above, warn_threshold, warn_data_range) + def cpu1(cfg: configparser.SectionProxy) -> Logger: + warn_if_above = float(cfg['warn_if_above']) if 'warn_if_above' in cfg else None + warn_threshold = int(cfg.get('warn_threshold', '1')) + warn_data_range = int(cfg.get('warn_data_range', '1')) + return CPULogger1(warn_if_above, warn_threshold, warn_data_range) + + def cpu5(cfg: configparser.SectionProxy) -> Logger: + warn_if_above = float(cfg['warn_if_above']) if 
'warn_if_above' in cfg else None + warn_threshold = int(cfg.get('warn_threshold', '1')) + warn_data_range = int(cfg.get('warn_data_range', '1')) + return CPULogger5(warn_if_above, warn_threshold, warn_data_range) + + def cpu15(cfg: configparser.SectionProxy) -> Logger: + warn_if_above = float(cfg['warn_if_above']) if 'warn_if_above' in cfg else None + warn_threshold = int(cfg.get('warn_threshold', '1')) + warn_data_range = int(cfg.get('warn_data_range', '1')) + return CPULogger15(warn_if_above, warn_threshold, warn_data_range) + + def net(cfg: configparser.SectionProxy) -> Logger: + network_interface = get_or_raise(cfg, 'network_interface') + return NetworkLogger(network_interface) + + def filesystem(cfg: configparser.SectionProxy) -> Logger: + uuid = cfg.get('uuid', None) + mountpoint = Path(cfg.get('mountpoint')) if 'mountpoint' in cfg else None + unmounted_ok = bool(cfg.get('unmounted_ok', 'false')) + warn_if_above = float(cfg.get('warn_if_above', '1.0')) + warn_threshold = int(cfg.get('warn_threshold', '1')) + warn_data_range = int(cfg.get('warn_data_range', '1')) + return FilesystemLogger(uuid, mountpoint, unmounted_ok, warn_if_above, warn_threshold, warn_data_range) + + def drive(cfg: configparser.SectionProxy) -> Logger: + uuid = cfg.get('uuid', None) + device = Path(cfg.get('device')) if 'device' in cfg else None + warn_if_above = int(cfg['warn_if_above']) if 'warn_if_above' in cfg else None + warn_threshold = int(cfg.get('warn_threshold', '1')) + warn_data_range = int(cfg.get('warn_data_range', '1')) + return DriveLogger(uuid, device, warn_if_above, warn_threshold, warn_data_range) + + def memory(cfg: configparser.SectionProxy) -> Logger: + warn_if_above = float(cfg.get('warn_if_above', '1.0')) + warn_threshold = int(cfg.get('warn_threshold', '1')) + warn_data_range = int(cfg.get('warn_data_range', '1')) + return MemoryLogger(warn_if_above, warn_threshold, warn_data_range) + def swap(cfg: configparser.SectionProxy) -> Logger: + warn_if_above = float(cfg.get('warn_if_above', '1.0')) + warn_threshold = int(cfg.get('warn_threshold', '1')) + warn_data_range = int(cfg.get('warn_data_range', '1')) + return SwapLogger(warn_if_above, warn_threshold, warn_data_range) + + + mapping = { + 'temp': temp, + 'cpu1': cpu1, + 'cpu5': cpu5, + 'cpu15': cpu15, + 'network': net, + 'filesystem': filesystem, + 'drive': drive, + 'memory': memory, + 'swap': swap, + } + + loggers = [] + exceptions = [] + cfg: configparser.ConfigParser = get_cfg() + for section_name in cfg.sections(): + if section_name == 'logging': + continue + prefix = section_name.split('.', maxsplit=1)[0] + try: + loggers.append( + mapping[prefix](cfg[section_name]) + ) + except LoggerArgEx as e: + exceptions.append(e) + + return loggers, exceptions diff --git a/src/de/p1st/monitor/cfg/logging_dir.py b/src/de/p1st/monitor/cfg/logging_dir.py new file mode 100644 index 0000000..025796c --- /dev/null +++ b/src/de/p1st/monitor/cfg/logging_dir.py @@ -0,0 +1,11 @@ +from pathlib import Path + +from de.p1st.monitor.cfg.singleton import get_cfg + + +def logging_dir() -> Path: + cfg = get_cfg() + default = '/var/log/de-p1st-monitor' + if 'logging' not in cfg: + return Path(default) + return Path(cfg['logging'].get('dir', default)) diff --git a/src/de/p1st/monitor/cfg/singleton.py b/src/de/p1st/monitor/cfg/singleton.py new file mode 100644 index 0000000..53721ad --- /dev/null +++ b/src/de/p1st/monitor/cfg/singleton.py @@ -0,0 +1,30 @@ +import configparser +from pathlib import Path + +_cfg: configparser.ConfigParser | None = None + + +def 
init_cfg(config_file: Path = None):
+    global _cfg
+
+    if _cfg is not None:
+        raise ValueError('already initialized')
+
+    if config_file is None:
+        import socket
+        hostname = socket.gethostname()
+        config_file = Path(f'/etc/de-p1st-monitor/{hostname}.ini')
+
+    if not config_file.exists():
+        raise Exception(f'Configuration file does not exist! {config_file}')
+
+    _cfg = configparser.ConfigParser()
+    _cfg.read(config_file)
+
+
+def get_cfg() -> configparser.ConfigParser:
+    global _cfg
+
+    if _cfg is None:
+        raise ValueError('uninitialized')
+    return _cfg
diff --git a/src/de/p1st/monitor/csv_util.py b/src/de/p1st/monitor/csv_util.py
new file mode 100644
index 0000000..3a8d76d
--- /dev/null
+++ b/src/de/p1st/monitor/csv_util.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+import csv
+from collections import deque
+from pathlib import Path
+
+
+def read(file: Path) -> list[list[str]]:
+    """
+    Returns all rows from the CSV file `file`.
+    """
+    with open(file, newline='') as csvfile:
+        reader = csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
+        return [row for row in reader]
+
+
+def read_last(file: Path, num_rows: int, skip: int = 0) -> list[list[str]]:
+    """
+    Returns the last `num_rows` from the CSV file `file`.
+
+    :param file:
+    :param num_rows:
+    :param skip: If given, the first `skip` rows are skipped.
+    """
+    with open(file, newline='') as csvfile:
+        reader = csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
+
+        # Skip the first `skip` rows.
+        for i in range(skip):
+            try:
+                next(reader)
+            except StopIteration:
+                break  # EOF
+
+        # Read all other rows but only keep the last `num_rows` rows.
+        q = deque(reader, num_rows)
+        # Return the last `num_rows` as list.
+        return [row for row in q]
+
+
+def write(file: Path,
+          rows: list[list[str]],
+          header: list[str] = None,
+          create_parent_dirs: bool = True,
+          recreate_file: bool = False) -> None:
+    """
+    Create a new .csv file if missing, or append to the existing .csv file.
+
+    :param file:
+    :param rows: The rows to write as csv table to file.
+    :param header: If given, it is inserted as first row into the csv table.
+    :param create_parent_dirs: If `file.parent` does not exist, create it.
+    :param recreate_file: Never append, always recreate the .csv file.
+    """
+    if create_parent_dirs and not file.parent.exists():
+        file.parent.mkdir(parents=True, exist_ok=False)
+    if recreate_file and file.exists():
+        file.unlink(missing_ok=False)
+    if file.exists():
+        append(file, rows)
+    else:
+        if header is not None:
+            rows = [header] + rows
+        create(file, rows)
+
+        # This verification only holds for freshly created files;
+        # after an append the file contains more lines than `rows`.
+        text = file.read_text()
+        if text.count('\n') != len(rows) or not text.endswith('\n'):
+            raise Exception(f'Created a new csv file with {len(rows)} rows but it does not have {len(rows)} lines. 
' + f'Make sure that there are no concurrent writes to this file!') + + +def create(file: Path, rows: list[list[str]]) -> None: + with open(file, 'x', newline='') as csvfile: + writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + writer.writerows(rows) + + +def append(file: Path, rows: list[list[str]]) -> None: + with open(file, 'a', newline='') as csvfile: + writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + writer.writerows(rows) + + +def test(): + file = Path('/var/log/de-p1st-monitor/cpu_avg.csv') + data = read_last(file, 4, 10) + print(data) + + +if __name__ == '__main__': + test() diff --git a/src/de/p1st/monitor/datetime_util.py b/src/de/p1st/monitor/datetime_util.py new file mode 100755 index 0000000..d8402df --- /dev/null +++ b/src/de/p1st/monitor/datetime_util.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +from datetime import datetime, timezone + + +def test(): + dt = datetime.now() + + print('non UTC:') + print(dt) + + print('\nUTC:') + print(now()) + print(to_str(now())) + print(now_str()) + print(from_str(to_str(now()))) + + print('\nlocalized:') + print(dt.tzinfo) + dt = dt.replace(tzinfo=timezone.utc) + print(dt) + + +def now() -> datetime: + return datetime.now(timezone.utc) + + +def now_str() -> str: + return to_str(now()) + + +def to_str(dt: datetime) -> str: + return dt.strftime(fmt()) + + +def from_str(dt_str: str) -> datetime: + dt = datetime.strptime(dt_str, fmt()) + return dt.replace(tzinfo=timezone.utc) + + +def fmt() -> str: + return '%Y%m%dT%H%M%S' + + +def fmt_len() -> int: + return 13 + + +if __name__ == '__main__': + test() diff --git a/src/de/p1st/monitor/exec_capture.py b/src/de/p1st/monitor/exec_capture.py new file mode 100644 index 0000000..b5e7cd3 --- /dev/null +++ b/src/de/p1st/monitor/exec_capture.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import subprocess + + +def execute_capture(command: list[str]) -> tuple[int, str, str]: + completed: subprocess.CompletedProcess = subprocess.run( + command, + capture_output=True, + text=True, + ) + return completed.returncode, completed.stdout, completed.stderr diff --git a/src/de/p1st/monitor/logger.py b/src/de/p1st/monitor/logger.py new file mode 100644 index 0000000..e892d23 --- /dev/null +++ b/src/de/p1st/monitor/logger.py @@ -0,0 +1,230 @@ +from pathlib import Path +from abc import ABC, abstractmethod + +from de.p1st.monitor import csv_util +from de.p1st.monitor.cfg.logging_dir import logging_dir +from de.p1st.monitor.string_conversion import to_string, from_string +from de.p1st.monitor.warn import WarnMessage, WarnLevel + + +# https://www.geeksforgeeks.org/abstract-classes-in-python/ +class Logger(ABC): + def __init__(self, + warn_threshold: int = 1, + warn_data_range: int = 1, + warn_if_above: int | float = None, + critical_if_above: int | float = None, + ): + self.data: list[any] | None = None + # True if the data held by this object is already appended to the logfile. + self.logged = False + + self.warn_threshold = warn_threshold + self.warn_data_range = warn_data_range + + # Either both variables are given, or both are None + if warn_if_above is not None and critical_if_above is not None: + assert critical_if_above > warn_if_above + else: + assert warn_if_above is None and critical_if_above is None + self.warn_if_above = warn_if_above + self.critical_if_above = critical_if_above + + def export_data(self) -> Path: + """ + This method is intended to be overriden in a subclass! 
+ + With most loggers the `get_log_file()` is ready-to-use. + In this case this method simply returns `get_log_file()`. + + But some loggers require postprocessing of that data before it can be used. + In this case this method creates a new .csv file and returns it. + + @return: Path to .csv file with ready-to-use data. + """ + return self.get_log_file() + + def check(self) -> WarnMessage: + """ + Checks the latest `self.warn_data_range` datasets for problems using `self.check_data()`. + + If at least `self.warn_threshold` problems are found, + then a WarnMessage with the highest reported WarnLevel is returned. + + If at least one WarnLevel is above NORMAL, + then a WarnMessage is returned independent of the number of problems. + """ + datasets = self.get_datasets(self.warn_data_range) + warnings = [self.check_data(data) for data in datasets] + warnings = [warning for warning in warnings + if not warning.level == WarnLevel.NONE] + + if len(warnings) == 0: + return WarnMessage(WarnLevel.NONE) + # max() must not be called with an empty list. + highest_warn_level = max([warning.level for warning in warnings]) + + messages: list[str] = [warning.message for warning in warnings] + message = f'{len(warnings)} of the last {self.warn_data_range} datasets are above limits:\n\t' \ + + '\n\t'.join(messages) + + if highest_warn_level > WarnLevel.NORMAL: + return WarnMessage(highest_warn_level, message) + if len(warnings) >= self.warn_threshold: + return WarnMessage(highest_warn_level, message) + return WarnMessage(WarnLevel.NONE) + + @abstractmethod + def check_data(self, data: list[any]) -> WarnMessage: + """ + Check the given data for problems. + Return a WarnLevel indicating how serious the problems are. + + If there are no problems, return `WarnLevel.NONE`. + """ + raise ValueError('Subclasses must implement this') + + def get_all_datasets(self) -> list[list[any]]: + # See also: self.get_datasets() + + if self.get_log_file().exists(): + # We skip the first row as it is the data schema. + raw = csv_util.read(self.get_log_file())[1:] + data = [self.get_data_from_row(row) for row in raw] + else: + data = [] + + if not self.logged and self.has_data(): + data.append(self.get_data()) + + return data + + def get_datasets(self, num: int) -> list[list[any]]: + """ + Returns the last `num` datasets (including the current dataset). + """ + if not self.logged and self.has_data(): + # We will append the current data manually. + # Thus, we need to read one less line from the CSV file. + read_last = num - 1 + else: + read_last = num + + if self.get_log_file().exists(): + # Read rows from CSV file. + # We skip the first row as it is the data schema. + # We keep only the last `read_last` rows. + raw = csv_util.read_last(self.get_log_file(), read_last, 1) + # Convert from string to data types defined in the data schema. + data = [self.get_data_from_row(row) for row in raw] + else: + data = [] + + if not self.logged and self.has_data(): + # We append the current data. + # It has not yet been logged and is therefore not included in the CSV file we just read. + data.append(self.get_data()) + + return data + + def log(self, skip_if_no_data: bool = False) -> None: + """ + Appends the current data (e.g. temperature of a sensor) + to a logfile. + + :param skip_if_no_data: Can be used to do nothing if no data is available. If one is sure to have called update() previously, this can be set to True. + :raise Exception: If method is called but no data is available. Please do call update() first to avoid this! 
+ """ + if self.logged: + return + if skip_if_no_data and not self.has_data(): + return + + csv_util.write(file=self.get_log_file(), rows=[self.get_data_as_row()], header=self.data_schema()) + self.logged = True + + def update(self): + self.set_data(self.read_data()) + self.logged = False + + @abstractmethod + def read_data(self) -> list[any] | None: + """ + Collects current data (e.g. temperature of a sensor). + + Might return None if sensor is detached / not available. + + :raise LoggerReadEx: + """ + raise ValueError('Subclasses must implement this') + + @abstractmethod + def data_schema(self) -> list[str]: + """ + Describes the type and meaning of the elements in self.values(). + + Returns a list with elements f'{data-type}#{column-description}'. + + Example: + ['datetime#Date', 'float#Disk usage'] + """ + raise ValueError('Subclasses must implement this') + + def get_data_from_row(self, data: list[str]) -> list[any]: + return [ + from_string(v, type_str) + for v, type_str + in zip(data, self.data_type_strs()) + ] + + def get_data_as_row(self) -> list[str]: + """ + Returns `self.get_data()` as string list that can easily be added as row to a CSV file. + """ + return self.as_row(self.get_data()) + + def as_row(self, data: list, data_schema: list[str] = None) -> list[str]: + """ + Returns the given `data` as string list that can easily be added as row to a CSV file. + """ + if data_schema is None: + data_schema = self.data_schema() + return [ + to_string(v, type_str) + for v, type_str + in zip(data, self.data_type_strs(data_schema)) + ] + + def has_data(self) -> bool: + return self.data is not None + + def get_data(self) -> list[any]: + """ + Returns the last data collected by `self.update()`. + """ + if self.has_data(): + return self.data + else: + raise ValueError(f'Data has not yet been read. 
{self.__str__()}')
+
+    def set_data(self, data: list[any] | None):
+        self.data = data
+
+    def data_type_strs(self, data_schema: list[str] = None) -> list[str]:
+        if data_schema is None:
+            data_schema = self.data_schema()
+        return [x.split('#', maxsplit=1)[0] for x in data_schema]
+
+    @abstractmethod
+    def get_log_file(self) -> Path:
+        raise ValueError('Subclasses must implement this')
+
+    @classmethod
+    def get_log_dir(cls) -> Path:
+        return logging_dir()
+
+    def __str__(self) -> str:
+        key_value_strings = [f'classname: {type(self).__name__}']
+        for key, value in vars(self).items():
+            key_value_strings.append(f'{key}: {value}')
+        return ', '.join(key_value_strings)
diff --git a/src/de/p1st/monitor/logger_ex.py b/src/de/p1st/monitor/logger_ex.py
new file mode 100644
index 0000000..7785326
--- /dev/null
+++ b/src/de/p1st/monitor/logger_ex.py
@@ -0,0 +1,14 @@
+class LoggerReadEx(Exception):
+    """
+    Raised by Logger subclasses if
+    - sensor data could not be read
+    """
+    pass
+
+
+class LoggerArgEx(Exception):
+    """
+    Raised by Logger subclasses if
+    - a Logger object was created with illegal arguments
+    """
+    pass
diff --git a/src/de/p1st/monitor/loggers/__init__.py b/src/de/p1st/monitor/loggers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/de/p1st/monitor/loggers/cpu.py b/src/de/p1st/monitor/loggers/cpu.py
new file mode 100644
index 0000000..fc5ec71
--- /dev/null
+++ b/src/de/p1st/monitor/loggers/cpu.py
@@ -0,0 +1,90 @@
+from pathlib import Path
+from abc import abstractmethod
+from typing import Literal
+
+import psutil
+
+from de.p1st.monitor import datetime_util
+
+from de.p1st.monitor.logger import Logger
+from de.p1st.monitor.warn import WarnMessage, WarnLevel
+
+
+class CPULogger(Logger):
+    def __init__(self,
+                 warn_if_above: float = None,
+                 warn_threshold: int = 1,
+                 warn_data_range: int = 1,
+                 ):
+        # `Logger.__init__()` expects either both limits or none of them.
+        critical_if_above = warn_if_above * 1.5 if warn_if_above is not None else None
+        super().__init__(warn_threshold,
+                         warn_data_range,
+                         warn_if_above,
+                         critical_if_above)
+        self.warn_if_above = warn_if_above
+
+    def check_data(self, data: list[any]) -> WarnMessage:
+        if self.warn_if_above is None:
+            # No limits configured, nothing to check.
+            return WarnMessage(WarnLevel.NONE)
+
+        load_avg = data[1]
+        message = f'CPU load avg of the last {self.get_load_timespan()} minutes is at {load_avg}'
+
+        if load_avg > self.critical_if_above:
+            return WarnMessage(WarnLevel.HIGH, message)
+        if load_avg > self.warn_if_above:
+            return WarnMessage(WarnLevel.NORMAL, message)
+        return WarnMessage(WarnLevel.NONE)
+
+    def read_data(self) -> list[any] | None:
+        return [
+            datetime_util.now(),
+            self.get_load(self.get_load_timespan())
+        ]
+
+    def data_schema(self) -> list[str]:
+        return [
+            'datetime#Date',
+            f'float#LoadAverage{self.get_load_timespan()}min'
+        ]
+
+    def get_log_file(self) -> Path:
+        return self.get_log_dir() / f'cpu_{self.get_load_timespan()}min.csv'
+
+    @abstractmethod
+    def get_load_timespan(self) -> Literal[1, 5, 15]:
+        raise ValueError('Subclasses must implement this')
+
+    #
+    # HELPERS
+    #
+
+    @staticmethod
+    def get_load(minutes: Literal[1, 5, 15]) -> float:
+        """
+        :param minutes: avg of last 1/5/15 minutes
+        :return: Average CPU load of the last 1/5/15 minutes, normalized by CPU count
+        """
+        idx_dict = {
+            1: 0,
+            5: 1,
+            15: 2,
+        }
+        idx = idx_dict[minutes]
+
+        # Number of processes in the system run queue averaged over
+        # the last 1, 5, and 15 minutes:
+        # one, five, fifteen = psutil.getloadavg()
+
+        # Load percentage during the selected timespan.
+        # This value has been tested to be correct on my AMD Ryzen 4800H CPU.
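+        # getloadavg() reports run-queue length averaged over 1, 5 and 15 minutes;
+        # dividing by the logical core count normalizes it so that 1.0 means all
+        # cores are fully busy on average.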
+        return psutil.getloadavg()[idx] / psutil.cpu_count()
+
+
+class CPULogger1(CPULogger):
+    def get_load_timespan(self) -> Literal[1, 5, 15]:
+        return 1
+
+
+class CPULogger5(CPULogger):
+    def get_load_timespan(self) -> Literal[1, 5, 15]:
+        return 5
+
+
+class CPULogger15(CPULogger):
+    def get_load_timespan(self) -> Literal[1, 5, 15]:
+        return 15
diff --git a/src/de/p1st/monitor/loggers/drive.py b/src/de/p1st/monitor/loggers/drive.py
new file mode 100644
index 0000000..4627869
--- /dev/null
+++ b/src/de/p1st/monitor/loggers/drive.py
@@ -0,0 +1,108 @@
+import json
+from pathlib import Path
+
+from de.p1st.monitor import datetime_util
+
+from de.p1st.monitor.exec_capture import execute_capture
+from de.p1st.monitor.logger import Logger
+from de.p1st.monitor.logger_ex import LoggerArgEx, LoggerReadEx
+from de.p1st.monitor.warn import WarnLevel, WarnMessage
+
+
+class BlkidException(Exception):
+    pass
+
+
+class DriveLogger(Logger):
+    def __init__(self, uuid: str = None,
+                 device: Path = None,
+                 warn_if_above: int = None,
+                 warn_threshold: int = 1,
+                 warn_data_range: int = 1,
+                 ):
+        # `Logger.__init__()` expects either both limits or none of them.
+        critical_if_above = warn_if_above + 10 if warn_if_above is not None else None
+        super().__init__(warn_threshold,
+                         warn_data_range,
+                         warn_if_above,
+                         critical_if_above
+                         )
+
+        if uuid is None and device is None:
+            raise LoggerArgEx('uuid or device required')
+
+        if uuid is None:
+            try:
+                self.uuid = self.get_partition_uuid(device)
+            except BlkidException as e:
+                raise LoggerArgEx(getattr(e, 'message', e))
+        else:
+            self.uuid = uuid
+
+        if device is None:
+            self.device = self.get_partition_path(uuid)
+        else:
+            self.device = device
+
+        self.warn_if_above = warn_if_above
+
+    def check_data(self, data: list[any]) -> WarnMessage:
+        if self.warn_if_above is None:
+            # No limits configured, nothing to check.
+            return WarnMessage(WarnLevel.NONE)
+
+        temp = data[1]
+        message = f'Temperature of {self.uuid} is at {temp}'
+
+        if temp > self.critical_if_above:
+            return WarnMessage(WarnLevel.HIGH, message)
+        if temp > self.warn_if_above:
+            return WarnMessage(WarnLevel.NORMAL, message)
+        return WarnMessage(WarnLevel.NONE)
+
+    def read_data(self) -> list[any]:
+        return [
+            datetime_util.now(),
+            self.get_temp_from_device(self.device),
+        ]
+
+    def data_schema(self) -> list[str]:
+        return ['datetime#Date', 'int#Temperature']
+
+    def get_log_file(self) -> Path:
+        # self.device might change over time.
+        # Thus, we use self.uuid to identify a partition.
+        return self.get_log_dir() / f'drive_{self.uuid}.csv'
+
+    #
+    # HELPERS
+    #
+
+    @classmethod
+    def get_partition_path(cls, uuid: str) -> Path:
+        """
+        :return: Partition path, e.g. /dev/sda1
+        """
+        return Path(f'/dev/disk/by-uuid/{uuid}').resolve()
+
+    @classmethod
+    def get_partition_uuid(cls, device: Path) -> str:
+        """
+        :param device: E.g. /dev/sda1
+        :return: UUID of e.g. partition /dev/sda1
+        :raise BlkidException: If UUID could not be determined.
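+        Implementation: runs `blkid -s UUID -o value <device>` and returns its stripped stdout.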
+ """ + returncode, stdout, stderr = execute_capture(['blkid', '-s', 'UUID', '-o', 'value', f'{device}']) + + if returncode != 0: + raise BlkidException(f'blkid failed with returncode {returncode}\nstdout: {stdout}\nstderr: {stderr}') + + return stdout.strip() + + @classmethod + def get_temp_from_device(cls, device: Path) -> int: + """ + :param device: For example `/dev/sda` or `/dev/disk/by-uuid/` + :return: Temperature in celsius + """ + returncode, stdout, stderr = execute_capture(['smartctl', '-j', '-a', f'{device}']) + + if returncode != 0: + raise LoggerReadEx(f'smartctl failed with returncode {returncode}\nstdout: {stdout}\nstderr: {stderr}') + j = json.loads(stdout) + + return j['temperature']['current'] diff --git a/src/de/p1st/monitor/loggers/filesystem.py b/src/de/p1st/monitor/loggers/filesystem.py new file mode 100644 index 0000000..28eed57 --- /dev/null +++ b/src/de/p1st/monitor/loggers/filesystem.py @@ -0,0 +1,164 @@ +from pathlib import Path + +import psutil +from de.p1st.monitor import datetime_util +from de.p1st.monitor.exec_capture import execute_capture + +from de.p1st.monitor.logger import Logger +from de.p1st.monitor.logger_ex import LoggerArgEx, LoggerReadEx +from de.p1st.monitor.warn import WarnLevel, WarnMessage + + +class NotMounted(Exception): + pass + + +class FilesystemLogger(Logger): + def __init__(self, uuid: str = None, + mountpoint: Path = None, + unmounted_ok: bool = False, + warn_if_above: float = 1.0, + warn_threshold: int = 1, + warn_data_range: int = 1, + ): + + # The space between disk is at `self.warn_if_above` and disk is full at `1.0`. + buffer = 1 - warn_if_above + critical_if_above = warn_if_above + 0.5 * buffer + super().__init__(warn_threshold, + warn_data_range, + warn_if_above, + critical_if_above, + ) + + if uuid is None and mountpoint is None: + raise LoggerArgEx('uuid or mountpoint required') + + self.uuid = uuid + self.mountpoint = mountpoint + self.unmounted_ok = unmounted_ok + self.warn_if_above = warn_if_above + + self.mounted = True + + # + # + # + + # If uuid and mountpoint are both specified, + # raise warning if unexpected uuid is mounted at mountpoint. 
+ if self.mountpoint is not None and self.uuid is not None: + try: + actual_uuid = self.get_uuid(self.mountpoint) + self.mounted = True + if self.uuid != actual_uuid: + raise LoggerReadEx(f'Expected {self.uuid} at {self.mountpoint} but got {actual_uuid}') + except NotMounted as e: + if self.unmounted_ok: + self.mounted = False + else: + raise LoggerArgEx(getattr(e, 'message', e)) + + # Try to get UUID (if only mountpoint given) + if self.uuid is None: + try: + self.uuid = self.get_uuid(self.mountpoint) + self.mounted = True + except NotMounted as e: + if self.unmounted_ok: + self.mounted = False + else: + raise LoggerArgEx(getattr(e, 'message', e)) + + # Try to get mountpoint (if only uuid given) + if self.mountpoint is None: + try: + self.mountpoint = self.get_mountpoint(self.uuid) + self.mounted = True + except NotMounted as e: + if self.unmounted_ok: + self.mounted = False + else: + raise LoggerReadEx(getattr(e, 'message', e)) + + def check_data(self, data: list[any]) -> WarnMessage: + if not self.mounted: + return WarnMessage(WarnLevel.NONE) + + disk_usage = data[1] + message = f'Disk usage of {self.uuid} ist at {disk_usage}' + + if disk_usage > self.critical_if_above: + return WarnMessage(WarnLevel.HIGH, message) + if disk_usage > self.warn_if_above: + return WarnMessage(WarnLevel.NORMAL, message) + return WarnMessage(WarnLevel.NONE) + + def read_data(self) -> list[any] | None: + if not self.mounted: + return None + + disk_usage: float = self.get_disk_usage(self.mountpoint) + return [ + datetime_util.now(), + disk_usage, + ] + + def data_schema(self) -> list[str]: + return ['datetime#Date', 'float#Disk usage'] + + def get_log_file(self) -> Path: + # The mountpoint of a filesystem might change overtime. + # Thus, we use self.uuid to identify a filesystem. + return self.get_log_dir() / f'filesystem_{self.uuid}.csv' + + # + # HELPERS + # + + @classmethod + def get_disk_usage(cls, mountpoint: Path) -> float: + """ + :returns: used space / total space + """ + return psutil.disk_usage(str(mountpoint)).percent / 100.0 + + @classmethod + def get_mountpoint(cls, uuid: str) -> Path: + """ + Throws an error if the corresponding partition is not mounted. + """ + + partition_list: list[psutil._common.sdiskpart] = psutil.disk_partitions(all=False) + partitions: dict[Path, psutil._common.sdiskpart] = {Path(partition.device).resolve(): partition for partition in + partition_list} + + partition_path = cls.get_partition_path(uuid) + if partition_path not in partitions: + raise NotMounted( + f'Partition {partition_path} is probably not mounted ' + f'as it is not in psutil partition list: {partitions}') + + partition = partitions[partition_path] + return Path(partition.mountpoint) + + @classmethod + def get_uuid(cls, mountpoint: Path) -> str: + # Returns the UUID of the device mounted at `/`. + # Fails if there is no disk mounted at `/`. + # + # findmnt / -o UUID -n + + returncode, stdout, stderr = execute_capture(['findmnt', str(mountpoint), '-o', 'UUID', '-n']) + if returncode != 0: + raise NotMounted( + f'No partition mounted at {mountpoint}. Stderr of findmnt: {stderr}') + + return stdout.strip() + + @classmethod + def get_partition_path(cls, uuid: str) -> Path: + """ + :return: Partition path, e.g. 
diff --git a/src/de/p1st/monitor/loggers/memory.py b/src/de/p1st/monitor/loggers/memory.py
new file mode 100644
index 0000000..9135438
--- /dev/null
+++ b/src/de/p1st/monitor/loggers/memory.py
@@ -0,0 +1,74 @@
+from pathlib import Path
+
+import psutil
+from de.p1st.monitor import datetime_util
+
+from de.p1st.monitor.logger import Logger
+from de.p1st.monitor.warn import WarnMessage, WarnLevel
+
+
+class MemoryLogger(Logger):
+    def __init__(self,
+                 warn_if_above: float = 1.0,
+                 warn_threshold: int = 1,
+                 warn_data_range: int = 1,
+                 ):
+
+        # `buffer` is the headroom between the warn threshold `warn_if_above`
+        # and full memory at `1.0`; the critical threshold sits halfway in between.
+        buffer = 1 - warn_if_above
+        critical_if_above = warn_if_above + 0.5 * buffer
+        super().__init__(warn_threshold,
+                         warn_data_range,
+                         warn_if_above,
+                         critical_if_above)
+        self.warn_if_above = warn_if_above
+
+    def check_data(self, data: list[any]) -> WarnMessage:
+        used_mb = data[1]
+        total_available_mb = data[3]
+        message = f'Memory usage is at {used_mb} MB of {total_available_mb} MB'
+
+        used = used_mb / total_available_mb
+
+        if used > self.critical_if_above:
+            return WarnMessage(WarnLevel.HIGH, message)
+        if used > self.warn_if_above:
+            return WarnMessage(WarnLevel.NORMAL, message)
+        return WarnMessage(WarnLevel.NONE)
+
+    def read_data(self) -> list[any]:
+        used_mb, free_mb, available_mb, total_mb = self.get_memory()
+        used_and_cached_mb = total_mb - free_mb
+        total_available_mb = used_mb + available_mb
+        return [
+            datetime_util.now(),
+            used_mb,
+            used_and_cached_mb,
+            total_available_mb,
+        ]
+
+    def data_schema(self) -> list[str]:
+        return ['datetime#Date', 'int#Used memory in MB', 'int#Used and cached in MB',
+                'int#Total available memory in MB']
+
+    def get_log_file(self) -> Path:
+        return self.get_log_dir() / 'memory.csv'
+
+    #
+    # HELPERS
+    #
+
+    @classmethod
+    def get_memory(cls) -> tuple[int, int, int, int]:
+        """
+        :return: Tuple[used memory in MB, free memory in MB, available memory in MB,
+            total memory in MB]. This does not include swap.
+        """
+        mb = 1024 * 1024
+        mem = psutil.virtual_memory()
+
+        # mem.available:
+        #     The memory that can be given instantly to processes,
+        #     excluding swap.
+        # mem.total:
+        #     Total physical memory (excluding swap).
+        # Note: mem.used + mem.available != mem.total
+        return int(mem.used / mb), int(mem.free / mb), int(mem.available / mb), int(mem.total / mb)
diff --git a/src/de/p1st/monitor/loggers/network.py b/src/de/p1st/monitor/loggers/network.py
new file mode 100644
index 0000000..2d6c860
--- /dev/null
+++ b/src/de/p1st/monitor/loggers/network.py
@@ -0,0 +1,113 @@
+import sys
+from datetime import datetime, timezone, timedelta
+from pathlib import Path
+
+import psutil
+
+from de.p1st.monitor import datetime_util, csv_util
+from de.p1st.monitor.logger import Logger
+from de.p1st.monitor.logger_ex import LoggerReadEx
+from de.p1st.monitor.warn import WarnLevel, WarnMessage
+
+
+class NetworkLogger(Logger):
+    def __init__(self, network_interface: str):
+        super().__init__()
+        self.network_interface = network_interface
+
+    def export_data(self) -> Path:
+        data = self.get_all_datasets()
+
+        export_schema = [
+            'datetime#Date',
+            'float#Bytes sent per second',
+            'float#Bytes received per second',
+        ]
+        export_data = []
+        # Derive one export row from each pair of consecutive samples.
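+        # The raw byte counters are cumulative since boot, so the delta between
+        # two samples divided by the elapsed time yields bytes per second.
+        # Each exported timestamp is the midpoint of the two sample times.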
+        for prev_row, curr_row in zip(data[:-1], data[1:]):
+            # If the boot time differs, there was a reboot between the two
+            # data points, and the sent/received deltas are invalid.
+            if prev_row[3] != curr_row[3]:
+                continue
+
+            elapsed_time: timedelta = curr_row[0] - prev_row[0]
+            delta_sent = curr_row[1] - prev_row[1]
+            delta_received = curr_row[2] - prev_row[2]
+
+            if delta_sent < 0 or delta_received < 0:
+                print(f'bytes received/sent counter overflowed after {prev_row[0]}',
+                      file=sys.stderr)
+                continue
+
+            elapsed_seconds = elapsed_time.total_seconds()
+            export_data.append([
+                # datetime#Date
+                prev_row[0] + 0.5 * elapsed_time,
+                # float#Bytes sent per second
+                delta_sent / elapsed_seconds,
+                # float#Bytes received per second
+                delta_received / elapsed_seconds,
+            ])
+
+        export_file = self.get_log_file().parent.joinpath(self.get_log_file().name + '.exported.csv')
+        rows = [self.as_row(export_row, export_schema) for export_row in export_data]
+        csv_util.write(file=export_file, rows=rows, header=export_schema, recreate_file=True)
+        return export_file
+
+    def check_data(self, data: list[any]) -> WarnMessage:
+        return WarnMessage(WarnLevel.NONE)
+
+    def data_schema(self) -> list[str]:
+        return [
+            'datetime#Date',
+            'int#Bytes sent since boot',
+            'int#Bytes received since boot',
+            'datetime#Boot date',
+        ]
+
+    def read_data(self) -> list[any]:
+        sent, received = self.get_net_usage()
+        return [
+            datetime_util.now(),
+            sent,
+            received,
+            self.get_boot_time(),
+        ]
+
+    def get_log_file(self) -> Path:
+        return self.get_log_dir() / f'net_{self.network_interface}.csv'
+
+    #
+    # HELPERS
+    #
+
+    def get_net_usage(self) -> tuple[int, int]:
+        """
+        Warning: The returned values may overflow if the system is running for a long time.
+
+        :return: bytes sent, bytes received
+        """
+        # noinspection PyTypeChecker
+        nics_data: dict[str, psutil._common.snetio] = psutil.net_io_counters(pernic=True, nowrap=True)
+
+        if self.network_interface not in nics_data:
+            raise LoggerReadEx(f'Network interface {self.network_interface} not found')
+
+        nic_data = nics_data[self.network_interface]
+        return nic_data.bytes_sent, nic_data.bytes_recv
+
+    @classmethod
+    def get_boot_time(cls) -> datetime:
+        epoch_seconds = psutil.boot_time()
+        return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
+
+
+def test():
+    from de.p1st.monitor.cfg.singleton import init_cfg
+    init_cfg()
+
+    logger = NetworkLogger('wlp1s0')
+    logger.update()
+    logger.log()
+
+
+if __name__ == '__main__':
+    test()
diff --git a/src/de/p1st/monitor/loggers/swap.py b/src/de/p1st/monitor/loggers/swap.py
new file mode 100644
index 0000000..133d89d
--- /dev/null
+++ b/src/de/p1st/monitor/loggers/swap.py
@@ -0,0 +1,69 @@
+from pathlib import Path
+
+import psutil
+from de.p1st.monitor import datetime_util
+
+from de.p1st.monitor.logger import Logger
+from de.p1st.monitor.warn import WarnMessage, WarnLevel
+
+
+class SwapLogger(Logger):
+    def __init__(self,
+                 warn_if_above: float = 1.0,
+                 warn_threshold: int = 1,
+                 warn_data_range: int = 1,
+                 ):
+
+        # `buffer` is the headroom between the warn threshold `warn_if_above`
+        # and full swap at `1.0`; the critical threshold sits halfway in between.
+        buffer = 1 - warn_if_above
+        critical_if_above = warn_if_above + 0.5 * buffer
+        super().__init__(warn_threshold,
+                         warn_data_range,
+                         warn_if_above,
+                         critical_if_above)
+        self.warn_if_above = warn_if_above
+
+    def check_data(self, data: list[any]) -> WarnMessage:
+        used_mb = data[1]
+        total_mb = data[2]
+        message = f'Swap usage is at {used_mb} MB of {total_mb} MB'
+
+        if used_mb == 0 and total_mb == 0:
+            # Swap not enabled.
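+            # Returning early here also avoids the division by zero in
+            # `used_mb / total_mb` below.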
+            return WarnMessage(WarnLevel.NONE)
+
+        usage = used_mb / total_mb
+
+        if usage > self.critical_if_above:
+            return WarnMessage(WarnLevel.HIGH, message)
+        if usage > self.warn_if_above:
+            return WarnMessage(WarnLevel.NORMAL, message)
+        return WarnMessage(WarnLevel.NONE)
+
+    def read_data(self) -> list[any]:
+        used_mb, total_mb = self.get_swap()
+        return [
+            datetime_util.now(),
+            used_mb,
+            total_mb,
+        ]
+
+    def data_schema(self) -> list[str]:
+        return ['datetime#Date', 'int#Used swap in MB', 'int#Total swap in MB']
+
+    def get_log_file(self) -> Path:
+        return self.get_log_dir() / 'swap.csv'
+
+    #
+    # HELPERS
+    #
+
+    @classmethod
+    def get_swap(cls) -> tuple[int, int]:
+        """
+        :return: Tuple[used swap in MB, total swap in MB].
+        """
+        mb = 1024 * 1024
+        mem = psutil.swap_memory()
+
+        return int(mem.used / mb), int(mem.total / mb)
diff --git a/src/de/p1st/monitor/loggers/temp.py b/src/de/p1st/monitor/loggers/temp.py
new file mode 100644
index 0000000..de4ff59
--- /dev/null
+++ b/src/de/p1st/monitor/loggers/temp.py
@@ -0,0 +1,82 @@
+from pathlib import Path
+
+import psutil
+
+from de.p1st.monitor import datetime_util
+from de.p1st.monitor.logger import Logger
+from de.p1st.monitor.logger_ex import LoggerReadEx
+from de.p1st.monitor.warn import WarnMessage, WarnLevel
+
+
+class TempLogger(Logger):
+    def __init__(self, sensor_name: str,
+                 sensor_label: str,
+                 warn_if_above: float = None,
+                 warn_threshold: int = 1,
+                 warn_data_range: int = 1,
+                 ):
+
+        # Guard against `None + 10` if no warn threshold is given.
+        critical_if_above = None if warn_if_above is None else warn_if_above + 10
+        super().__init__(warn_threshold,
+                         warn_data_range,
+                         warn_if_above,
+                         critical_if_above)
+        self.name = sensor_name
+        self.label = sensor_label
+
+        self.warn_if_above = warn_if_above
+
+    def check_data(self, data: list[any]) -> WarnMessage:
+        if self.warn_if_above is None:
+            # No threshold configured, nothing to warn about.
+            return WarnMessage(WarnLevel.NONE)
+
+        temp = data[1]
+        message = f'Temperature of {self.name} {self.label} is at {temp}'
+
+        if temp > self.critical_if_above:
+            return WarnMessage(WarnLevel.HIGH, message)
+        if temp > self.warn_if_above:
+            return WarnMessage(WarnLevel.NORMAL, message)
+        return WarnMessage(WarnLevel.NONE)
+
+    def read_data(self) -> list[any]:
+        return [
+            datetime_util.now(),
+            self.get_temp()
+        ]
+
+    def data_schema(self) -> list[str]:
+        return [
+            'datetime#Date',
+            'float#Temperature'
+        ]
+
+    def get_log_file(self) -> Path:
+        return self.get_log_dir() / f'temp_{self.name}_{self.label}.csv'
+
+    #
+    # HELPERS
+    #
+
+    def get_temp(self) -> float:
+        """
+        :return: Temperature in degrees Celsius
+        """
+        data = psutil.sensors_temperatures(fahrenheit=False)
+        if self.name not in data:
+            raise LoggerReadEx(f'Sensor {self.name} not found')
+        for i in data[self.name]:
+            if i.label == self.label:
+                return i.current
+        raise LoggerReadEx(f'Label {self.label} of sensor {self.name} not found')
+
+
+def test():
+    from de.p1st.monitor.cfg import singleton
+    singleton.init_cfg()
+
+    logger = TempLogger('amdgpu', 'edge', 47, 2, 4)
+    logger.update()
+    logger.log()
+    logger.check().print()
+
+
+if __name__ == '__main__':
+    test()
diff --git a/src/de/p1st/monitor/main.py b/src/de/p1st/monitor/main.py
new file mode 100755
index 0000000..df05697
--- /dev/null
+++ b/src/de/p1st/monitor/main.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import argparse
+import sys
+from pathlib import Path
+
+from de.p1st.monitor.cfg.singleton import init_cfg
+from de.p1st.monitor.cfg.loggers import get_loggers
+from de.p1st.monitor.logger_ex import LoggerReadEx
+
+
+def main():
+    parser = argparse.ArgumentParser(prog='de-p1st-monitor',
+                                     description='Iterates over all config sections. '
+                                                 'For each section the current sensor data is read '
+                                                 'and logged to a .csv file.')
+    parser.add_argument('--config', '-c', default=None, type=Path,
+                        help='Path to .ini configuration file.')
+    parser.add_argument('--export', '-e', default=False, action='store_true',
+                        help='If `True`, export .csv files and print their paths to stdout. '
+                             'No sensor data is logged during this.')
+    args = parser.parse_args()
+    init_cfg(args.config)
+
+    if args.export:
+        export()
+    else:
+        log()
+
+
+def export():
+    loggers, logger_arg_exs = get_loggers()
+    if len(logger_arg_exs) > 0:
+        print('\nCONFIGURATION ERROR: Could not instantiate some of the loggers!', file=sys.stderr)
+        print_exs(logger_arg_exs, [f'{n}.' for n in range(1, 1 + len(logger_arg_exs))])
+        sys.exit(1)
+
+    for logger in loggers:
+        export_path: Path = logger.export_data()
+        print(export_path)
+
+
+def log():
+    loggers, logger_arg_exs = get_loggers()
+    logger_read_exs = []
+    logger_warnings = 0
+    for logger_ct, logger in enumerate(loggers, start=1):
+        print(f'Running logger {logger_ct}/{len(loggers)} ...')
+        try:
+            logger.update()
+        except LoggerReadEx as e:
+            logger_read_exs.append(e)
+            continue
+        # After logger.update() there might still be no data.
+        # Example: FilesystemLogger if the partition is not mounted (and unmounted_ok is True).
+        logger.log(skip_if_no_data=True)
+        if logger.check().print().is_warning():
+            logger_warnings += 1
+
+    if len(logger_arg_exs) > 0:
+        print('\nCONFIGURATION ERROR: Could not instantiate some of the loggers!', file=sys.stderr)
+        print_exs(logger_arg_exs, [f'{n}.' for n in range(1, 1 + len(logger_arg_exs))])
+    if len(logger_read_exs) > 0:
+        print('\nRUNTIME ERROR: Some loggers could not fetch sensor data!', file=sys.stderr)
+        print_exs(logger_read_exs, [f'{n}.' for n in range(1, 1 + len(logger_read_exs))])
+
+    if len(logger_arg_exs) + len(logger_read_exs) > 0 or logger_warnings > 0:
+        sys.exit(1)
+
+
+def print_exs(exs: list[Exception], headers: list):
+    for e, header in zip(exs, headers):
+        # Indent str(e) with \t
+        body = '\t' + '\n\t'.join(str(e).splitlines())
+
+        print(f'{header}\n{body}', file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/de/p1st/monitor/string_conversion.py b/src/de/p1st/monitor/string_conversion.py
new file mode 100644
index 0000000..88edb6a
--- /dev/null
+++ b/src/de/p1st/monitor/string_conversion.py
@@ -0,0 +1,24 @@
+from typing import Callable
+
+from de.p1st.monitor import datetime_util
+
+
+def data_types() -> dict[str, dict[str, Callable[[any], any]]]:
+    """
+    Returns a dictionary. Its key-value pairs contain the following:
+
+    Key: Name of type.
+    Value: Dict containing to_string and from_string conversion methods, called 'to' and 'from'.
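+
+    Illustrative example (using the helpers defined below):
+        to_string(3.5, 'float')      # -> '3.5'
+        from_string('3.5', 'float')  # -> 3.5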
+ """ + return { + 'str': {'to': lambda x: x, 'from': lambda x: x}, + 'int': {'to': lambda x: str(x), 'from': lambda x: int(x)}, + 'float': {'to': lambda x: str(x), 'from': lambda x: float(x)}, + 'datetime': {'to': datetime_util.to_str, 'from': datetime_util.from_str}, + } + +def to_string(v: any, type_str: str) -> str: + return data_types()[type_str]['to'](v) + +def from_string(v: str, type_str: str) -> any: + return data_types()[type_str]['from'](v) diff --git a/src/de/p1st/monitor/warn.py b/src/de/p1st/monitor/warn.py new file mode 100644 index 0000000..0a8648e --- /dev/null +++ b/src/de/p1st/monitor/warn.py @@ -0,0 +1,49 @@ +from __future__ import annotations +import sys +from enum import Enum +from functools import total_ordering + + +# https://docs.python.org/3/library/functools.html#functools.total_ordering +@total_ordering +class WarnLevel(Enum): + NONE = 0 # Not a warning. Everything is ok. + LOW = 1 + NORMAL = 2 + HIGH = 3 + + def __eq__(self, other): + if isinstance(other, WarnLevel): + return self.value == other.value + return NotImplemented + + def __lt__(self, other): + if isinstance(other, WarnLevel): + return self.value < other.value + return NotImplemented + + +class WarnMessage: + def __init__(self, level: WarnLevel, message: str = None): + self.level = level + self.message = message + + def is_warning(self) -> bool: + return self.level > WarnLevel.NONE + + def print(self, default_message: str = 'Warning!') -> WarnMessage: + """ + return: self + """ + message = default_message if self.message is None else self.message + + if self.level == WarnLevel.NONE: + pass + elif self.level == WarnLevel.LOW: + print(message) + elif self.level == WarnLevel.NORMAL: + print(message, file=sys.stderr) + elif self.level == WarnLevel.HIGH: + print(f'[CRITICAL] {message}', file=sys.stderr) + + return self