import
12
.gitignore
vendored
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
/.idea/
|
||||||
|
__pycache__/
|
||||||
|
|
||||||
|
# pip build
|
||||||
|
/src/de.p1st.monitor.egg-info/
|
||||||
|
/dist/
|
||||||
|
/build/
|
||||||
|
/venv/
|
||||||
|
|
||||||
|
# makepkg
|
||||||
|
/packaging/python-de-p1st-monitor-git-*-any.pkg.tar.zst
|
||||||
|
/packaging/de-p1st-monitor/
|
24
.run/main (export).run.xml
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
<component name="ProjectRunConfigurationManager">
|
||||||
|
<configuration default="false" name="main (export)" type="PythonConfigurationType" factoryName="Python">
|
||||||
|
<module name="de-p1st-monitor" />
|
||||||
|
<option name="INTERPRETER_OPTIONS" value="" />
|
||||||
|
<option name="PARENT_ENVS" value="true" />
|
||||||
|
<envs>
|
||||||
|
<env name="PYTHONUNBUFFERED" value="1" />
|
||||||
|
</envs>
|
||||||
|
<option name="SDK_HOME" value="" />
|
||||||
|
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src/de/p1st/monitor" />
|
||||||
|
<option name="IS_MODULE_SDK" value="true" />
|
||||||
|
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||||
|
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||||
|
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||||
|
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/de/p1st/monitor/main.py" />
|
||||||
|
<option name="PARAMETERS" value="-c $PROJECT_DIR$/cfg/yodaTux.ini --export" />
|
||||||
|
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||||
|
<option name="EMULATE_TERMINAL" value="false" />
|
||||||
|
<option name="MODULE_MODE" value="false" />
|
||||||
|
<option name="REDIRECT_INPUT" value="false" />
|
||||||
|
<option name="INPUT_FILE" value="" />
|
||||||
|
<method v="2" />
|
||||||
|
</configuration>
|
||||||
|
</component>
|
24
.run/main (help).run.xml
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
<component name="ProjectRunConfigurationManager">
|
||||||
|
<configuration default="false" name="main (help)" type="PythonConfigurationType" factoryName="Python">
|
||||||
|
<module name="de-p1st-monitor" />
|
||||||
|
<option name="INTERPRETER_OPTIONS" value="" />
|
||||||
|
<option name="PARENT_ENVS" value="true" />
|
||||||
|
<envs>
|
||||||
|
<env name="PYTHONUNBUFFERED" value="1" />
|
||||||
|
</envs>
|
||||||
|
<option name="SDK_HOME" value="" />
|
||||||
|
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src/de/p1st/monitor" />
|
||||||
|
<option name="IS_MODULE_SDK" value="true" />
|
||||||
|
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||||
|
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||||
|
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||||
|
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/de/p1st/monitor/main.py" />
|
||||||
|
<option name="PARAMETERS" value="--help" />
|
||||||
|
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||||
|
<option name="EMULATE_TERMINAL" value="false" />
|
||||||
|
<option name="MODULE_MODE" value="false" />
|
||||||
|
<option name="REDIRECT_INPUT" value="false" />
|
||||||
|
<option name="INPUT_FILE" value="" />
|
||||||
|
<method v="2" />
|
||||||
|
</configuration>
|
||||||
|
</component>
|
24
.run/main (log).run.xml
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
<component name="ProjectRunConfigurationManager">
|
||||||
|
<configuration default="false" name="main (log)" type="PythonConfigurationType" factoryName="Python">
|
||||||
|
<module name="de-p1st-monitor" />
|
||||||
|
<option name="INTERPRETER_OPTIONS" value="" />
|
||||||
|
<option name="PARENT_ENVS" value="true" />
|
||||||
|
<envs>
|
||||||
|
<env name="PYTHONUNBUFFERED" value="1" />
|
||||||
|
</envs>
|
||||||
|
<option name="SDK_HOME" value="" />
|
||||||
|
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src/de/p1st/monitor" />
|
||||||
|
<option name="IS_MODULE_SDK" value="true" />
|
||||||
|
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||||
|
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||||
|
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||||
|
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/de/p1st/monitor/main.py" />
|
||||||
|
<option name="PARAMETERS" value="-c $PROJECT_DIR$/cfg/yodaTux.ini" />
|
||||||
|
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||||
|
<option name="EMULATE_TERMINAL" value="false" />
|
||||||
|
<option name="MODULE_MODE" value="false" />
|
||||||
|
<option name="REDIRECT_INPUT" value="false" />
|
||||||
|
<option name="INPUT_FILE" value="" />
|
||||||
|
<method v="2" />
|
||||||
|
</configuration>
|
||||||
|
</component>
|
32
LICENSE
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
The Clear BSD License
|
||||||
|
|
||||||
|
Copyright (c) 2023 Daniel Langbein
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted (subject to the limitations in the disclaimer
|
||||||
|
below) provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of the copyright holder nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from this
|
||||||
|
software without specific prior written permission.
|
||||||
|
|
||||||
|
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
|
||||||
|
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
||||||
|
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
||||||
|
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||||
|
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
|
||||||
|
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
POSSIBILITY OF SUCH DAMAGE.
|
47
Makefile
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
PKGNAME := de-p1st-monitor
|
||||||
|
|
||||||
|
.PHONY: all
|
||||||
|
all: install-pkgbuild
|
||||||
|
|
||||||
|
.PHONY: install-pkgbuild
|
||||||
|
install-pkgbuild: cron ## Install with pacman (on Arch Linux)
|
||||||
|
sudo pacman -S --needed base-devel
|
||||||
|
cd packaging && makepkg -fCcsri && rm -rf $(PKGNAME)
|
||||||
|
${MAKE} install-files
|
||||||
|
|
||||||
|
.PHONY: install-pip
|
||||||
|
install-pip: notify cron ## Install with pip
|
||||||
|
sudo python3 -m pip install --upgrade --force-reinstall .
|
||||||
|
${MAKE} install-files
|
||||||
|
|
||||||
|
.PHONY: install-files
|
||||||
|
install-files:
|
||||||
|
sudo install -m0644 cron.d/$(PKGNAME) /etc/cron.d/$(PKGNAME)
|
||||||
|
|
||||||
|
sudo install --directory -m755 /etc/$(PKGNAME)/
|
||||||
|
sudo install -m0644 cfg/* /etc/$(PKGNAME)/
|
||||||
|
|
||||||
|
.PHONY: notify ## Check if exec-notify is installed.
|
||||||
|
notify:
|
||||||
|
# `type` does not work e.g. on Ubuntu 18.04
|
||||||
|
which exec-notify
|
||||||
|
|
||||||
|
.PHONY: cron ## Check if cron (e.g. cronie) is running.
|
||||||
|
cron:
|
||||||
|
# Check if cron.d exists
|
||||||
|
stat /etc/cron.d/
|
||||||
|
# Check if cron is running
|
||||||
|
pgrep cron
|
||||||
|
|
||||||
|
|
||||||
|
.PHONY: clean-pkgbuild
|
||||||
|
clean-pkgbuild: clean-files
|
||||||
|
sudo pacman -Rns python-$(PKGNAME)-git
|
||||||
|
|
||||||
|
.PHONY: clean-pip
|
||||||
|
clean-pip: clean-files
|
||||||
|
sudo python3 -m pip uninstall -y $(PKGNAME)
|
||||||
|
|
||||||
|
.PHONY: clean-files
|
||||||
|
clean-files:
|
||||||
|
sudo rm -rf /etc/cron.d/$(PKGNAME) /etc/$(PKGNAME) /var/log/$(PKGNAME).cron
|
248
README.md
Normal file
@ -0,0 +1,248 @@
|
|||||||
|
# de-p1st-monitor
|
||||||
|
|
||||||
|
## Research
|
||||||
|
|
||||||
|
See [./research](./research).
|
||||||
|
|
||||||
|
- HDD temp:
|
||||||
|
- Modern hard drives will throttle their read and write speeds
|
||||||
|
when the drive reaches a critical pre-set temperature
|
||||||
|
(usually around 60°C)
|
||||||
|
- 20-50°C (short-term)
|
||||||
|
- 20-40°C (long-term usage)
|
||||||
|
- SSD temp:
|
||||||
|
- Most SSDs implement thermal throttling as a safety feature
|
||||||
|
if a drive gets too hot. As the driver approaches the 70ºC limit
|
||||||
|
that most manufacturers set, the more likely it is that the
|
||||||
|
drive will start to slow itself down to prevent failure.
|
||||||
|
- 30-50°C
|
||||||
|
|
||||||
|
## Keep it simple!
|
||||||
|
|
||||||
|
Lines of code including docstrings and comments:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
find ./src -name '*.py' | xargs wc -l
|
||||||
|
#=> 1394 total
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
See [cfg/yodaTux.ini](cfg/yodaTux.ini) for a configuration file covering all config options.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
Install dependencies:
|
||||||
|
|
||||||
|
- on Arch Linux
|
||||||
|
|
||||||
|
```shell
|
||||||
|
# TODO
|
||||||
|
# Optional: 1-wire temperature sensor.
|
||||||
|
sudo pacman -S digitemp # TODO: configure your sensor
|
||||||
|
```
|
||||||
|
|
||||||
|
- on Ubuntu
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo apt-get install python3-pip
|
||||||
|
|
||||||
|
# Ubuntu 18.04 and below
|
||||||
|
sudo apt-get install python3-setuptools
|
||||||
|
sudo apt-get install python3-wheel
|
||||||
|
|
||||||
|
sudo apt-get install python3-psutil
|
||||||
|
|
||||||
|
# Ubuntu 18.04 and below: psutil < 5.6.2
|
||||||
|
sudo apt-get install python3-dev
|
||||||
|
sudo apt-get install build-essential
|
||||||
|
# Ubuntu 20.04 and below: psutil < 5.6.2
|
||||||
|
sudo python3 -m pip install psutil --upgrade
|
||||||
|
```
|
||||||
|
|
||||||
|
Install:
|
||||||
|
|
||||||
|
- on Arch Linux
|
||||||
|
|
||||||
|
```shell
|
||||||
|
make
|
||||||
|
```
|
||||||
|
|
||||||
|
- on Ubuntu
|
||||||
|
|
||||||
|
```shell
|
||||||
|
make install-pip
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Command line interface
|
||||||
|
|
||||||
|
```
|
||||||
|
usage: de-p1st-monitor [-h] [--config CONFIG] [--export]
|
||||||
|
|
||||||
|
Iterates over all config sections. For each section the current sensor data is
|
||||||
|
read and logged to a .csv file.
|
||||||
|
|
||||||
|
options:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
--config CONFIG, -c CONFIG
|
||||||
|
Path to .ini configuration file.
|
||||||
|
--export, -e If `True`, export .csv files and print their paths to
|
||||||
|
stdout. No sensor data is logged during this.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Periodic logging
|
||||||
|
|
||||||
|
Add a cron entry executing this e.g. every 3 Minutes:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
de-p1st-monitor
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example log files
|
||||||
|
|
||||||
|
```shell
|
||||||
|
ssh nas 'tail -n 1 /var/log/de-p1st-monitor/*'
|
||||||
|
```
|
||||||
|
```
|
||||||
|
==> /var/log/de-p1st-monitor/cpu_15min.csv <==
|
||||||
|
20230315T103001,0.10400390625
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/cpu_1min.csv <==
|
||||||
|
20230315T103001,0.03076171875
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/cpu_5min.csv <==
|
||||||
|
20230315T103001,0.0301513671875
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/drive_20d86155-30d4-404c-95e8-c701cfb16ca5.csv <==
|
||||||
|
20230315T103001,24
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/drive_4651c3f1-e4b8-45aa-a823-df762530a307.csv <==
|
||||||
|
20230315T103001,21
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/drive_68c349e8-5118-4773-9fd5-5dbad9acee4e.csv <==
|
||||||
|
20230315T103001,29
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/drive_b8ef1da9-d76d-44b4-86d4-71c82c888b6f.csv <==
|
||||||
|
20230315T103001,28
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/filesystem_3CBA-B4EA.csv <==
|
||||||
|
20230315T103001,0.228
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/filesystem_a454430b-dee3-4b6b-8325-f7bdb9435ed1.csv <==
|
||||||
|
20230314T231501,0.762
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/filesystem_b8ef1da9-d76d-44b4-86d4-71c82c888b6f.csv <==
|
||||||
|
20230315T103001,0.034
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/filesystem_c385a436-0288-486f-a2b9-c64c2db667e7.csv <==
|
||||||
|
20230315T103001,0.374
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/memory.csv <==
|
||||||
|
20230315T103001,4127,15329
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/net_enp0s31f6.csv <==
|
||||||
|
20230315T103001,69366974632,58725303985,20230304T173014
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/swap.csv <==
|
||||||
|
20230315T103001,25,4095
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/temp_coretemp_Core 0.csv <==
|
||||||
|
20230315T103001,26.0
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/temp_coretemp_Core 1.csv <==
|
||||||
|
20230315T103001,34.0
|
||||||
|
|
||||||
|
==> /var/log/de-p1st-monitor/temp_coretemp_Package id 0.csv <==
|
||||||
|
20230315T103001,35.0
|
||||||
|
```
|
||||||
|
|
||||||
|
## Plots
|
||||||
|
|
||||||
|
### Creating plots with graph-cli
|
||||||
|
|
||||||
|
1) Export and fetch data
|
||||||
|
|
||||||
|
```shell
|
||||||
|
ssh_target=rootnas
|
||||||
|
dst=~/de-p1st-monitor-"${ssh_target}"
|
||||||
|
files="${dst}".files
|
||||||
|
|
||||||
|
# Export .csv files on SSH target and save list of exported files to $files.
|
||||||
|
ssh "${ssh_target}" 'de-p1st-monitor --export' > "${files}"
|
||||||
|
|
||||||
|
rm -rf "${dst}"
|
||||||
|
mkdir -p "${dst}"
|
||||||
|
|
||||||
|
rsync --checksum --archive --progress --human-readable --delete \
|
||||||
|
--files-from="${files}" "${ssh_target}":/ "${dst}"
|
||||||
|
mv "${dst}"/var/log/de-p1st-monitor/* "${dst}"
|
||||||
|
rm -r "${dst}"/var "${files}"
|
||||||
|
|
||||||
|
cd "${dst}"
|
||||||
|
```
|
||||||
|
|
||||||
|
2) Install (python) `graph-cli`
|
||||||
|
|
||||||
|
```shell
|
||||||
|
python -m venv ~/de-p1st-monitor.venv
|
||||||
|
source ~/de-p1st-monitor.venv/bin/activate
|
||||||
|
pip install graph-cli
|
||||||
|
```
|
||||||
|
|
||||||
|
3) Create plots
|
||||||
|
|
||||||
|
Create one plot for each .csv file:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sample_duration=4H
|
||||||
|
|
||||||
|
for file in *.csv; do
|
||||||
|
graph "${file}" -x 1 --resample "${sample_duration}" --figsize 1600x1000 -o "${file}".resample-"${sample_duration}"-mean.png || {
|
||||||
|
echo "Error while processing ${file}"
|
||||||
|
}
|
||||||
|
done
|
||||||
|
|
||||||
|
for file in {swap,memory}.csv {temp_,drive_,net_,cpu_,filesystem_}*.csv; do
|
||||||
|
graph "${file}" -x 1 --resample "${sample_duration}" --resample-action max --figsize 1600x1000 -o "${file}".resample-"${sample_duration}"-max.png || {
|
||||||
|
echo "Error while processing ${file}"
|
||||||
|
}
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
4) Optionally, create more plots
|
||||||
|
|
||||||
|
Some self-explaining examples:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
# x and y axis by column name
|
||||||
|
graph cpu_1min.csv -x 'datetime#Date' -y 'float#LoadAverage1min' --resample 1H -o cpu_1min_resample-1H.png
|
||||||
|
# x and y axis by column number
|
||||||
|
graph cpu_1min.csv -x 1 -y 2 --resample 1H -o cpu_1min_resample-1H.png
|
||||||
|
# specify x axis; use all other axes for y
|
||||||
|
graph cpu_1min.csv -x 1 --resample 1H -o cpu_1min_resample-1H.png
|
||||||
|
# increased plot size
|
||||||
|
graph cpu_1min.csv -x 1 --resample 1H --figsize 1600x1000 -o cpu_1min_resample-1H.png
|
||||||
|
```
|
||||||
|
|
||||||
|
```shel
|
||||||
|
# resample using sum
|
||||||
|
graph net_enp0s31f6.csv.exported.csv -x 1 --resample 1H --resample-action sum --figsize 1600x1000 -o net_enp0s31f6.csv.exported_resample-1H-sum.png
|
||||||
|
```
|
||||||
|
|
||||||
|
```shel
|
||||||
|
# resample using max
|
||||||
|
graph cpu_1min.csv -x 1 --resample 1H --resample-action max --figsize 1600x1000 -o cpu_1min_resample-1H-max.png
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Example plots
|
||||||
|
|
||||||
|
![img](images/cpu_1min.csv.resample-1H.png)
|
||||||
|
![img](images/drive_68c349e8-5118-4773-9fd5-5dbad9acee4e.csv.resample-1H.png)
|
||||||
|
![img](images/filesystem_c385a436-0288-486f-a2b9-c64c2db667e7.csv.resample-1H.png)
|
||||||
|
![img](images/memory.csv.resample-1H.png)
|
||||||
|
![img](images/net_enp0s31f6.csv.exported.csv.resample-1H.png)
|
||||||
|
![img](images/swap.csv.resample-1H.png)
|
||||||
|
![img](images/temp_coretemp_Package%20id%200.csv.resample-1H.png)
|
264
TODO.md
Normal file
@ -0,0 +1,264 @@
|
|||||||
|
# TODOs
|
||||||
|
|
||||||
|
## Public IP address
|
||||||
|
|
||||||
|
Logg the public IP address. Reuse `netcup-dns` python functions.
|
||||||
|
|
||||||
|
## Rewrite
|
||||||
|
|
||||||
|
~~* easier configuration
|
||||||
|
~~* easier read/write from/to csv~~
|
||||||
|
~~* use classes & objects~~~~
|
||||||
|
|
||||||
|
~~* create plots?~~
|
||||||
|
|
||||||
|
* Don't send emit warning again, if during previous log a lower warning was emitted
|
||||||
|
* Example:
|
||||||
|
* log1: 30°C OK
|
||||||
|
* log2: 40°C Warning sent
|
||||||
|
* log3: 35°C Still above limit, but don't send warning again as value decreased
|
||||||
|
* log4: 37°C Send another warning: The value increased since last logging
|
||||||
|
|
||||||
|
## Use Grafana to visualize metrics
|
||||||
|
|
||||||
|
One can use Prometheus + Grafana to collect and visualize server metrics.
|
||||||
|
|
||||||
|
> https://geekflare.com/best-open-source-monitoring-software/
|
||||||
|
> This list won’t be complete without including two fantastic open-source solutions – Prometheus and Grafana. Its DIY solution where you use Prometheus to scrape the metrics from server, OS, applications and use Grafana to visualize them.
|
||||||
|
|
||||||
|
As we do already collect logs, we should do some research on how to
|
||||||
|
import data into Grafana.
|
||||||
|
|
||||||
|
### Time series
|
||||||
|
|
||||||
|
* https://grafana.com/docs/grafana/latest/fundamentals/timeseries/#introduction-to-time-series
|
||||||
|
|
||||||
|
E.g. CPU and memory usage, sensor data.
|
||||||
|
|
||||||
|
* https://grafana.com/docs/grafana/latest/fundamentals/timeseries/#time-series-databases
|
||||||
|
|
||||||
|
A time series database (TSDB) is a database explicitly designed for time series data.
|
||||||
|
|
||||||
|
Some supported TSDBs are:
|
||||||
|
|
||||||
|
* Graphite
|
||||||
|
* InfluxDB
|
||||||
|
* Prometheus
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
* https://grafana.com/docs/grafana/latest/setup-grafana/installation/docker/#alpine-image-recommended
|
||||||
|
* https://grafana.com/docs/grafana/latest/setup-grafana/installation/docker/#install-official-and-community-grafana-plugins
|
||||||
|
|
||||||
|
* https://grafana.com/grafana/plugins/marcusolsson-csv-datasource/?tab=installation
|
||||||
|
* https://grafana.github.io/grafana-csv-datasource/
|
||||||
|
* https://grafana.com/grafana/plugins/marcusolsson-json-datasource/?tab=installation
|
||||||
|
* https://grafana.github.io/grafana-json-datasource/
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo docker run --rm \
|
||||||
|
-p 3000:3000 \
|
||||||
|
--name=grafana \
|
||||||
|
-e "GF_INSTALL_PLUGINS=marcusolsson-json-datasource,marcusolsson-csv-datasource" \
|
||||||
|
grafana/grafana-oss
|
||||||
|
```
|
||||||
|
|
||||||
|
TODO: test csv or json data import tools
|
||||||
|
|
||||||
|
## Netdata - Can be exported to Grafana
|
||||||
|
|
||||||
|
* https://github.com/netdata/netdata/blob/master/docs/getting-started/introduction.md
|
||||||
|
|
||||||
|
## Monit - An existing monitoring service
|
||||||
|
|
||||||
|
### General notes and links
|
||||||
|
|
||||||
|
* Monit is a widely used service for system monitoring.
|
||||||
|
* OPNsense uses Monit: https://docs.opnsense.org/manual/monit.html
|
||||||
|
|
||||||
|
* Short slideshow presentation: https://mmonit.com/monit/#slideshow
|
||||||
|
* https://wiki.ubuntuusers.de/Monit/
|
||||||
|
|
||||||
|
* Excellent configuration and usage summary in the Arch Linux Wiki: https://wiki.archlinux.org/title/Monit
|
||||||
|
|
||||||
|
* Examples
|
||||||
|
* https://mmonit.com/wiki/Monit/ConfigurationExamples
|
||||||
|
* One can use the returncode or stdout of an executed shell script
|
||||||
|
* https://mmonit.com/wiki/Monit/ConfigurationExamples#HDDHealth
|
||||||
|
```
|
||||||
|
check program HDD_Health with path "/usr/local/etc/monit/scripts/sdahealth.sh"
|
||||||
|
every 120 cycles
|
||||||
|
if content != "PASSED" then alert
|
||||||
|
# if status > 0 then alert
|
||||||
|
group health
|
||||||
|
```
|
||||||
|
* Documentation
|
||||||
|
* Event queue - Store events (notifications) if mail server is not reachable
|
||||||
|
* https://mmonit.com/monit/documentation/monit.html#Event-queue
|
||||||
|
```
|
||||||
|
set eventqueue basedir /var/monit
|
||||||
|
```
|
||||||
|
* https://mmonit.com/monit/documentation/monit.html#SPACE-USAGE-TEST
|
||||||
|
```
|
||||||
|
check filesystem rootfs with path /
|
||||||
|
if space usage > 90% then alert
|
||||||
|
```
|
||||||
|
* https://mmonit.com/monit/documentation/monit.html#PROGRAM-STATUS-TEST
|
||||||
|
```
|
||||||
|
check program myscript with path /usr/local/bin/myscript.sh
|
||||||
|
if status != 0 then alert
|
||||||
|
```
|
||||||
|
* https://mmonit.com/monit/documentation/monit.html#PROGRAM-OUTPUT-CONTENT-TEST
|
||||||
|
* https://mmonit.com/monit/documentation/monit.html#Link-upload-and-download-bytes
|
||||||
|
```
|
||||||
|
check network eth0 with interface eth0
|
||||||
|
if upload > 500 kB/s then alert
|
||||||
|
if total downloaded > 1 GB in last 2 hours then alert
|
||||||
|
if total downloaded > 10 GB in last day then alert
|
||||||
|
```
|
||||||
|
|
||||||
|
* https://mmonit.com/monit/documentation/monit.html#MANAGE-YOUR-MONIT-INSTANCES
|
||||||
|
|
||||||
|
### Monitoring all your monit instances
|
||||||
|
|
||||||
|
* Monit itself does only monitor the current system
|
||||||
|
* Multi-server monitoring is a paid extra service called M/Monit :/
|
||||||
|
* But there are other open source services for this
|
||||||
|
* https://github.com/monmon-io/monmon#why-did-you-create-monmon
|
||||||
|
|
||||||
|
### Setup
|
||||||
|
|
||||||
|
Install and start:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo pacman -S --needed monit lm_sensors smartmontools
|
||||||
|
sudo systemctl start monit
|
||||||
|
sudo systemctl status monit | grep 'Active: active (running)'
|
||||||
|
```
|
||||||
|
|
||||||
|
Print default configuration:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo cat /etc/monitrc | grep -v '^#'
|
||||||
|
#=> set daemon 30
|
||||||
|
#=> - A cycle is 30 seconds long.
|
||||||
|
#=> set log syslog
|
||||||
|
#=> - We will overwrite this config value later on.
|
||||||
|
#=> set httpd port 2812
|
||||||
|
#=> - Only listen on localhost with username admin and pwd monit.
|
||||||
|
```
|
||||||
|
|
||||||
|
Include `monit.d`:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo mkdir -p /etc/monit.d/
|
||||||
|
! sudo cat /etc/monitrc | grep -q '^include' && echo 'include /etc/monit.d/*' | sudo tee -a /etc/monitrc
|
||||||
|
```
|
||||||
|
|
||||||
|
Log to file:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo install -m700 /dev/stdin /etc/monit.d/log <<< 'set log /var/log/monit.log'
|
||||||
|
sudo systemctl restart monit
|
||||||
|
# tail -f /var/log/monit.log
|
||||||
|
```
|
||||||
|
|
||||||
|
System:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo install -m700 /dev/stdin /etc/monit.d/system <<< 'check system $HOST
|
||||||
|
if filedescriptors >= 80% then alert
|
||||||
|
if loadavg (5min) > 2 for 4 cycles then alert
|
||||||
|
if memory usage > 75% for 4 cycles then alert
|
||||||
|
if swap usage > 50% for 4 cycles then alert'
|
||||||
|
sudo systemctl restart monit
|
||||||
|
```
|
||||||
|
|
||||||
|
Filesystem:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo install -m700 /dev/stdin /etc/monit.d/fs <<< 'check filesystem rootfs with path /
|
||||||
|
if space usage > 80% then alert'
|
||||||
|
sudo systemctl restart monit
|
||||||
|
```
|
||||||
|
|
||||||
|
SSL options:
|
||||||
|
|
||||||
|
* https://mmonit.com/monit/documentation/monit.html#SSL-OPTIONS
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo install -m700 /dev/stdin /etc/monit.d/ssl <<< '# Enable certificate verification for all SSL connections
|
||||||
|
set ssl options {
|
||||||
|
verify: enable
|
||||||
|
}'
|
||||||
|
sudo systemctl restart monit
|
||||||
|
```
|
||||||
|
|
||||||
|
Mailserver, alerts and eventqueue:
|
||||||
|
|
||||||
|
* https://mmonit.com/monit/documentation/monit.html#Setting-a-mail-server-for-alert-delivery
|
||||||
|
* https://mmonit.com/monit/documentation/monit.html#Setting-an-error-reminder
|
||||||
|
* https://mmonit.com/monit/documentation/monit.html#Event-queue
|
||||||
|
* If no mail server is available, Monit can queue events in the local file-system for retry until the mail server recovers.
|
||||||
|
* By default, the queue is disabled and if the alert handler fails, Monit will simply drop the alert message.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo install -m700 /dev/stdin /etc/monit.d/mail <<< 'set mailserver smtp.mail.de
|
||||||
|
port 465
|
||||||
|
username "langbein@mail.de"
|
||||||
|
password "qiXF6cUgfvSVqd0pAoFTqZEHIcUKzc3n"
|
||||||
|
using SSL
|
||||||
|
with timeout 20 seconds
|
||||||
|
|
||||||
|
set mail-format {
|
||||||
|
from: langbein@mail.de
|
||||||
|
subject: $SERVICE - $EVENT at $DATE
|
||||||
|
message: Monit $ACTION $SERVICE at $DATE on $HOST: $DESCRIPTION.
|
||||||
|
}
|
||||||
|
|
||||||
|
set alert daniel@systemli.org with reminder on 10 cycles
|
||||||
|
|
||||||
|
set eventqueue basedir /var/monit'
|
||||||
|
sudo systemctl restart monit
|
||||||
|
sudo monit -v | grep 'Mail'
|
||||||
|
```
|
||||||
|
|
||||||
|
Test alert:
|
||||||
|
|
||||||
|
* https://wiki.ubuntuusers.de/Monit/#E-Mail-Benachrichtigungen-testen
|
||||||
|
* It is enough to restart monit. It will send an email that it's state has changed (stopped/started).
|
||||||
|
* But if desired, one can also create a test for a non-existing file:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo install -m700 /dev/stdin /etc/monit.d/alerttest <<< 'check file alerttest with path /.nonexistent.file'
|
||||||
|
sudo systemctl restart monit
|
||||||
|
```
|
||||||
|
|
||||||
|
Example script - run a speedtest:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo pacman -S --needed speedtest-cli
|
||||||
|
sudo install -m700 /dev/stdin /etc/monit.d/speedtest <<< 'check program speedtest with path /usr/bin/speedtest-cli
|
||||||
|
every 120 cycles
|
||||||
|
if status != 0 then alert'
|
||||||
|
sudo systemctl restart monit
|
||||||
|
```
|
||||||
|
|
||||||
|
Check config syntax:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
sudo monit -t
|
||||||
|
```
|
||||||
|
|
||||||
|
################## TODOS ##########################
|
||||||
|
|
||||||
|
* See Firefox bookmark folder 20230219_monit.
|
||||||
|
* Disk health
|
||||||
|
* BTRFS balance
|
||||||
|
* Save disk usage and temperatures to CSV log file
|
||||||
|
* e.g. by using `check program check-and-log-temp.sh` monit configuration
|
||||||
|
* Or: do checks by monit and every couple minutes run `check program log-system-info.sh`
|
||||||
|
|
||||||
|
### Monit behind Nginx
|
||||||
|
|
||||||
|
TODO: Nginx reverse proxy with basic authentication.
|
79
cfg/yodaNas.ini
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
[logging]
|
||||||
|
dir = /var/log/de-p1st-monitor/
|
||||||
|
|
||||||
|
[filesystem.1]
|
||||||
|
; NVME
|
||||||
|
mountpoint = /
|
||||||
|
warn_if_above = 0.75
|
||||||
|
[filesystem.2]
|
||||||
|
; NVME
|
||||||
|
mountpoint = /boot
|
||||||
|
warn_if_above = 0.75
|
||||||
|
[filesystem.3]
|
||||||
|
; 12TB1
|
||||||
|
uuid = c385a436-0288-486f-a2b9-c64c2db667e7
|
||||||
|
warn_if_above = 0.66
|
||||||
|
[filesystem.4]
|
||||||
|
; 3TB1 and 3TB2
|
||||||
|
uuid = a454430b-dee3-4b6b-8325-f7bdb9435ed1
|
||||||
|
warn_if_above = 0.85
|
||||||
|
unmounted_ok = true
|
||||||
|
|
||||||
|
[memory]
|
||||||
|
warn_if_above = 0.85
|
||||||
|
[swap]
|
||||||
|
warn_if_above = 0.85
|
||||||
|
|
||||||
|
[cpu1]
|
||||||
|
warn_if_above = 3.0
|
||||||
|
warn_threshold = 2
|
||||||
|
warn_data_range = 2
|
||||||
|
[cpu5]
|
||||||
|
warn_if_above = 2.0
|
||||||
|
warn_threshold = 2
|
||||||
|
warn_data_range = 2
|
||||||
|
[cpu15]
|
||||||
|
warn_if_above = 1.0
|
||||||
|
warn_threshold = 2
|
||||||
|
warn_data_range = 2
|
||||||
|
|
||||||
|
[temp.1]
|
||||||
|
sensor = coretemp
|
||||||
|
label = Package id 0
|
||||||
|
warn_if_above = 60
|
||||||
|
[temp.2]
|
||||||
|
sensor = coretemp
|
||||||
|
label = Core 0
|
||||||
|
warn_if_above = 60
|
||||||
|
[temp.3]
|
||||||
|
sensor = coretemp
|
||||||
|
label = Core 1
|
||||||
|
warn_if_above = 60
|
||||||
|
|
||||||
|
[network.1]
|
||||||
|
network_interface = enp0s31f6
|
||||||
|
|
||||||
|
[drive.1]
|
||||||
|
; NVME /dev/nvme0n1p3
|
||||||
|
; TODO NVME 49 warn, 55 limit
|
||||||
|
uuid = b8ef1da9-d76d-44b4-86d4-71c82c888b6f
|
||||||
|
warn_if_above = 50
|
||||||
|
[drive.2]
|
||||||
|
; HDD 12TB1
|
||||||
|
; TODO HDD 39 warn, 45 limit
|
||||||
|
uuid = 68c349e8-5118-4773-9fd5-5dbad9acee4e
|
||||||
|
warn_if_above = 40
|
||||||
|
[drive.3]
|
||||||
|
; HDD 3TB1
|
||||||
|
uuid = 20d86155-30d4-404c-95e8-c701cfb16ca5
|
||||||
|
warn_if_above = 40
|
||||||
|
[drive.4]
|
||||||
|
; HDD 3TB2
|
||||||
|
uuid = 4651c3f1-e4b8-45aa-a823-df762530a307
|
||||||
|
warn_if_above = 40
|
||||||
|
|
||||||
|
; TODO digitemp sensor
|
||||||
|
;[digitemp_DS9097.1]
|
||||||
|
;cfg = /root/.digitemprc
|
||||||
|
;sensor_num = 0
|
||||||
|
;name = room-temp
|
79
cfg/yodaTux.ini
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
[logging]
|
||||||
|
; The CSV logfiles are saved in this directory.
|
||||||
|
dir = /var/log/de-p1st-monitor/
|
||||||
|
|
||||||
|
|
||||||
|
[temp.1]
|
||||||
|
; `sensor` and `label` are used to identify one temperature value.
|
||||||
|
sensor = k10temp
|
||||||
|
label = Tctl
|
||||||
|
|
||||||
|
; Warn if temperature is above this value.
|
||||||
|
; Unit: °C
|
||||||
|
warn_if_above = 80
|
||||||
|
|
||||||
|
; Send warning if critical values were reached 2 times during the last 4 logs.
|
||||||
|
warn_threshold = 2
|
||||||
|
warn_data_range = 4
|
||||||
|
|
||||||
|
[temp.2]
|
||||||
|
sensor = amdgpu
|
||||||
|
label = edge
|
||||||
|
warn_if_above = 50
|
||||||
|
warn_threshold = 2
|
||||||
|
warn_data_range = 4
|
||||||
|
|
||||||
|
|
||||||
|
[network.1]
|
||||||
|
network_interface = wlan0
|
||||||
|
|
||||||
|
|
||||||
|
[memory]
|
||||||
|
; Warn if memory usage is above this value.
|
||||||
|
; Range: (0.0, 1.0)
|
||||||
|
warn_if_above = 0.1
|
||||||
|
|
||||||
|
[swap]
|
||||||
|
; Warn if swap usage is above this value.
|
||||||
|
; Range: (0.0, 1.0)
|
||||||
|
warn_if_above = 0.5
|
||||||
|
|
||||||
|
|
||||||
|
[cpu1]
|
||||||
|
; Warn if CPU load of the last 1 minute is above this value.
|
||||||
|
; Range: (0.0, infinite)
|
||||||
|
; `1.0` corresponds to 100% CPU utilisation.
|
||||||
|
; However, there can be more processes in the queue than can be processed.
|
||||||
|
; As a result, the value can go above `1.0`.
|
||||||
|
warn_if_above = 0.95
|
||||||
|
[cpu5]
|
||||||
|
; Warn if CPU load of the last 5 minutes is above this value.
|
||||||
|
warn_if_above = 0.85
|
||||||
|
[cpu15]
|
||||||
|
; Warn if CPU load of the last 15 minutes is above this value.
|
||||||
|
warn_if_above = 0.75
|
||||||
|
|
||||||
|
|
||||||
|
[filesystem.1]
|
||||||
|
; Either `uuid` or `mountpoint` must be given.
|
||||||
|
;
|
||||||
|
; If both are given but the UUID of the disk mounted at `mountpoint` differs from `uuid`, then an exception is raised.
|
||||||
|
uuid = 7fb12542-bd59-4727-9beb-7cf1f79f8293
|
||||||
|
mountpoint = /
|
||||||
|
|
||||||
|
; If `true` don't log or warn if the filesystem is not found.
|
||||||
|
unmounted_ok = true
|
||||||
|
|
||||||
|
; Warn if disk usage is above this value.
|
||||||
|
; Range: (0.0, 1.0)
|
||||||
|
warn_if_above = 0.1
|
||||||
|
|
||||||
|
|
||||||
|
[drive.1]
|
||||||
|
; Either `uuid` or `device` must be given.
|
||||||
|
;uuid =
|
||||||
|
device = /dev/nvme0n1p3
|
||||||
|
|
||||||
|
; Warn if temperature is above this value.
|
||||||
|
; Unit: °C
|
||||||
|
warn_if_above = 25
|
11
cron.d/de-p1st-monitor
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
# Run command every 3min
|
||||||
|
# - https://crontab.guru/every-3-minutes
|
||||||
|
# `/etc/cron.d/` requires user field
|
||||||
|
# - https://unix.stackexchange.com/questions/458713/how-are-files-under-etc-cron-d-used#comment1019389_458715
|
||||||
|
# Some users report that files in `/etc/cron.d/` containing `-` are not executed
|
||||||
|
# - https://unix.stackexchange.com/questions/296347/crontab-never-running-while-in-etc-cron-d#comment640748_296351
|
||||||
|
# PATH is restricted to `/bin:/usr/bin` but `exec-notify` resides in `/usr/local/bin/`
|
||||||
|
# - https://serverfault.com/a/449652
|
||||||
|
|
||||||
|
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin/
|
||||||
|
*/3 * * * * root exec-notify de-p1st-monitor > /var/log/de-p1st-monitor.cron 2>&1
|
BIN
images/cpu_1min.csv.resample-1H.png
Normal file
After Width: | Height: | Size: 114 KiB |
After Width: | Height: | Size: 118 KiB |
After Width: | Height: | Size: 89 KiB |
BIN
images/memory.csv.resample-1H.png
Normal file
After Width: | Height: | Size: 107 KiB |
BIN
images/net_enp0s31f6.csv.exported.csv.resample-1H.png
Normal file
After Width: | Height: | Size: 138 KiB |
BIN
images/swap.csv.resample-1H.png
Normal file
After Width: | Height: | Size: 107 KiB |
BIN
images/temp_coretemp_Package id 0.csv.resample-1H.png
Normal file
After Width: | Height: | Size: 132 KiB |
56
packaging/PKGBUILD
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
# Maintainer: Daniel Langbein < daniel [ at ] systemli [ dot ] org >
|
||||||
|
|
||||||
|
# This PKGBUILD is based on the instructions from the Arch Linux wiki:
|
||||||
|
# https://wiki.archlinux.org/title/Python_package_guidelines
|
||||||
|
|
||||||
|
_name=de-p1st-monitor
|
||||||
|
pkgname="python-$_name-git"
|
||||||
|
pkgver=r202.f3f2f46
|
||||||
|
pkgrel=1
|
||||||
|
pkgdesc='periodically monitor and warn'
|
||||||
|
arch=(any)
|
||||||
|
url="https://git.privacy1st.de/langfingaz/$_name"
|
||||||
|
license=('custom:BSD-3-Clause-Clear-License')
|
||||||
|
|
||||||
|
provides=(de-p1st-monitor)
|
||||||
|
depends=(python exec-notify)
|
||||||
|
makedepends=(git python-build python-installer python-wheel)
|
||||||
|
optdepends=('python-psutil: CPU, memory, network monitoring'
|
||||||
|
'digitemp: USB temperature sensor'
|
||||||
|
'smartmontools: disk temperature monitoring')
|
||||||
|
|
||||||
|
source=("git+https://git.privacy1st.de/langfingaz/$_name.git")
|
||||||
|
b2sums=(SKIP)
|
||||||
|
|
||||||
|
# If there are no tags then use number of revisions since beginning of the history:
|
||||||
|
# https://wiki.archlinux.org/title/VCS_package_guidelines
|
||||||
|
pkgver() {
|
||||||
|
cd "$_name"
|
||||||
|
printf "r%s.%s" "$(git rev-list --count HEAD)" "$(git rev-parse --short=7 HEAD)"
|
||||||
|
}
|
||||||
|
|
||||||
|
prepare() {
|
||||||
|
git -C "$srcdir/$_name" clean -dfx
|
||||||
|
}
|
||||||
|
|
||||||
|
build() {
|
||||||
|
# cd "$_name-$pkgver"
|
||||||
|
cd "$_name"
|
||||||
|
python -m build --wheel --no-isolation
|
||||||
|
}
|
||||||
|
|
||||||
|
package() {
|
||||||
|
# cd "$_name-$pkgver"
|
||||||
|
cd "$_name"
|
||||||
|
python -m installer --destdir="$pkgdir" dist/*.whl
|
||||||
|
}
|
||||||
|
|
||||||
|
check(){
|
||||||
|
cd "$srcdir/$_name"
|
||||||
|
|
||||||
|
# For nosetests
|
||||||
|
# nosetests
|
||||||
|
|
||||||
|
# For pytest
|
||||||
|
# pytest
|
||||||
|
}
|
8
pyproject.toml
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
# https://packaging.python.org/tutorials/packaging-projects/#creating-pyproject-toml
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = [
|
||||||
|
"setuptools>=42",
|
||||||
|
"wheel"
|
||||||
|
]
|
||||||
|
build-backend = "setuptools.build_meta"
|
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
psutil>=5.9
|
@ -0,0 +1,9 @@
|
|||||||
|
@inproceedings{32774,
|
||||||
|
title = {Failure Trends in a Large Disk Drive Population},
|
||||||
|
author = {Eduardo Pinheiro and Wolf-Dietrich Weber and Luiz André Barroso},
|
||||||
|
year = {2007},
|
||||||
|
booktitle = {5th USENIX Conference on File and Storage Technologies (FAST 2007)},
|
||||||
|
pages = {17-29}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
35
setup.cfg
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
; setup.cfg is the configuration file for setuptools.
|
||||||
|
; https://packaging.python.org/tutorials/packaging-projects/#configuring-metadata
|
||||||
|
|
||||||
|
[metadata]
|
||||||
|
name = de.p1st.monitor
|
||||||
|
version = 0.8.0
|
||||||
|
author = Daniel Langbein
|
||||||
|
author_email = daniel@systemli.org
|
||||||
|
description = periodically monitor and warn
|
||||||
|
long_description = file: README.md
|
||||||
|
long_description_content_type = text/markdown
|
||||||
|
url = https://git.privacy1st.de/langfingaz/de-p1st-monitor
|
||||||
|
project_urls =
|
||||||
|
Bug Tracker = https://git.privacy1st.de/langfingaz/de-p1st-monitor/issues
|
||||||
|
|
||||||
|
; https://pypi.org/classifiers/
|
||||||
|
classifiers =
|
||||||
|
Development Status :: 4 - Beta
|
||||||
|
Programming Language :: Python :: 3
|
||||||
|
; License :: BSD 3-Clause Clear License
|
||||||
|
Operating System :: Unix
|
||||||
|
|
||||||
|
[options]
|
||||||
|
package_dir =
|
||||||
|
= src
|
||||||
|
packages = find:
|
||||||
|
python_requires = >=3.6.9
|
||||||
|
|
||||||
|
[options.packages.find]
|
||||||
|
where = src
|
||||||
|
|
||||||
|
[options.entry_points]
|
||||||
|
; https://setuptools.readthedocs.io/en/latest/userguide/entry_point.html
|
||||||
|
console_scripts=
|
||||||
|
de-p1st-monitor = de.p1st.monitor.main:main
|
4
setup.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
# This file is required for `pip install` on Ubuntu 18.04.
|
||||||
|
# It loads `setup.cfg`.
|
||||||
|
from setuptools import setup
|
||||||
|
setup()
|
0
src/de/__init__.py
Normal file
0
src/de/p1st/__init__.py
Normal file
0
src/de/p1st/monitor/__init__.py
Normal file
0
src/de/p1st/monitor/cfg/__init__.py
Normal file
106
src/de/p1st/monitor/cfg/loggers.py
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
import configparser
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from de.p1st.monitor.cfg.singleton import get_cfg
|
||||||
|
from de.p1st.monitor.logger_ex import LoggerArgEx
|
||||||
|
from de.p1st.monitor.loggers.cpu import CPULogger1, CPULogger5, CPULogger15
|
||||||
|
from de.p1st.monitor.loggers.drive import DriveLogger
|
||||||
|
from de.p1st.monitor.loggers.filesystem import FilesystemLogger
|
||||||
|
from de.p1st.monitor.loggers.memory import MemoryLogger
|
||||||
|
from de.p1st.monitor.loggers.network import NetworkLogger
|
||||||
|
from de.p1st.monitor.loggers.swap import SwapLogger
|
||||||
|
from de.p1st.monitor.loggers.temp import TempLogger
|
||||||
|
from de.p1st.monitor.logger import Logger
|
||||||
|
|
||||||
|
def get_or_raise(cfg: configparser.SectionProxy, key: str) -> str:
|
||||||
|
if key in cfg:
|
||||||
|
return cfg[key]
|
||||||
|
else:
|
||||||
|
raise LoggerArgEx(f'Missing key {key} in section {cfg.name}')
|
||||||
|
|
||||||
|
def get_loggers() -> tuple[list[Logger], list[LoggerArgEx]]:
|
||||||
|
def temp(cfg: configparser.SectionProxy) -> Logger:
|
||||||
|
sensor = get_or_raise(cfg, 'sensor')
|
||||||
|
label = get_or_raise(cfg, 'label')
|
||||||
|
warn_if_above = float(cfg['warn_if_above']) if 'warn_if_above' in cfg else None
|
||||||
|
warn_threshold = int(cfg.get('warn_threshold', '1'))
|
||||||
|
warn_data_range = int(cfg.get('warn_data_range', '1'))
|
||||||
|
return TempLogger(sensor, label, warn_if_above, warn_threshold, warn_data_range)
|
||||||
|
def cpu1(cfg: configparser.SectionProxy) -> Logger:
|
||||||
|
warn_if_above = float(cfg['warn_if_above']) if 'warn_if_above' in cfg else None
|
||||||
|
warn_threshold = int(cfg.get('warn_threshold', '1'))
|
||||||
|
warn_data_range = int(cfg.get('warn_data_range', '1'))
|
||||||
|
return CPULogger1(warn_if_above, warn_threshold, warn_data_range)
|
||||||
|
|
||||||
|
def cpu5(cfg: configparser.SectionProxy) -> Logger:
|
||||||
|
warn_if_above = float(cfg['warn_if_above']) if 'warn_if_above' in cfg else None
|
||||||
|
warn_threshold = int(cfg.get('warn_threshold', '1'))
|
||||||
|
warn_data_range = int(cfg.get('warn_data_range', '1'))
|
||||||
|
return CPULogger5(warn_if_above, warn_threshold, warn_data_range)
|
||||||
|
|
||||||
|
def cpu15(cfg: configparser.SectionProxy) -> Logger:
|
||||||
|
warn_if_above = float(cfg['warn_if_above']) if 'warn_if_above' in cfg else None
|
||||||
|
warn_threshold = int(cfg.get('warn_threshold', '1'))
|
||||||
|
warn_data_range = int(cfg.get('warn_data_range', '1'))
|
||||||
|
return CPULogger15(warn_if_above, warn_threshold, warn_data_range)
|
||||||
|
|
||||||
|
def net(cfg: configparser.SectionProxy) -> Logger:
|
||||||
|
network_interface = get_or_raise(cfg, 'network_interface')
|
||||||
|
return NetworkLogger(network_interface)
|
||||||
|
|
||||||
|
def filesystem(cfg: configparser.SectionProxy) -> Logger:
|
||||||
|
uuid = cfg.get('uuid', None)
|
||||||
|
mountpoint = Path(cfg.get('mountpoint')) if 'mountpoint' in cfg else None
|
||||||
|
unmounted_ok = bool(cfg.get('unmounted_ok', 'false'))
|
||||||
|
warn_if_above = float(cfg.get('warn_if_above', '1.0'))
|
||||||
|
warn_threshold = int(cfg.get('warn_threshold', '1'))
|
||||||
|
warn_data_range = int(cfg.get('warn_data_range', '1'))
|
||||||
|
return FilesystemLogger(uuid, mountpoint, unmounted_ok, warn_if_above, warn_threshold, warn_data_range)
|
||||||
|
|
||||||
|
def drive(cfg: configparser.SectionProxy) -> Logger:
|
||||||
|
uuid = cfg.get('uuid', None)
|
||||||
|
device = Path(cfg.get('device')) if 'device' in cfg else None
|
||||||
|
warn_if_above = int(cfg['warn_if_above']) if 'warn_if_above' in cfg else None
|
||||||
|
warn_threshold = int(cfg.get('warn_threshold', '1'))
|
||||||
|
warn_data_range = int(cfg.get('warn_data_range', '1'))
|
||||||
|
return DriveLogger(uuid, device, warn_if_above, warn_threshold, warn_data_range)
|
||||||
|
|
||||||
|
def memory(cfg: configparser.SectionProxy) -> Logger:
|
||||||
|
warn_if_above = float(cfg.get('warn_if_above', '1.0'))
|
||||||
|
warn_threshold = int(cfg.get('warn_threshold', '1'))
|
||||||
|
warn_data_range = int(cfg.get('warn_data_range', '1'))
|
||||||
|
return MemoryLogger(warn_if_above, warn_threshold, warn_data_range)
|
||||||
|
def swap(cfg: configparser.SectionProxy) -> Logger:
|
||||||
|
warn_if_above = float(cfg.get('warn_if_above', '1.0'))
|
||||||
|
warn_threshold = int(cfg.get('warn_threshold', '1'))
|
||||||
|
warn_data_range = int(cfg.get('warn_data_range', '1'))
|
||||||
|
return SwapLogger(warn_if_above, warn_threshold, warn_data_range)
|
||||||
|
|
||||||
|
|
||||||
|
mapping = {
|
||||||
|
'temp': temp,
|
||||||
|
'cpu1': cpu1,
|
||||||
|
'cpu5': cpu5,
|
||||||
|
'cpu15': cpu15,
|
||||||
|
'network': net,
|
||||||
|
'filesystem': filesystem,
|
||||||
|
'drive': drive,
|
||||||
|
'memory': memory,
|
||||||
|
'swap': swap,
|
||||||
|
}
|
||||||
|
|
||||||
|
loggers = []
|
||||||
|
exceptions = []
|
||||||
|
cfg: configparser.ConfigParser = get_cfg()
|
||||||
|
for section_name in cfg.sections():
|
||||||
|
if section_name == 'logging':
|
||||||
|
continue
|
||||||
|
prefix = section_name.split('.', maxsplit=1)[0]
|
||||||
|
try:
|
||||||
|
loggers.append(
|
||||||
|
mapping[prefix](cfg[section_name])
|
||||||
|
)
|
||||||
|
except LoggerArgEx as e:
|
||||||
|
exceptions.append(e)
|
||||||
|
|
||||||
|
return loggers, exceptions
|
11
src/de/p1st/monitor/cfg/logging_dir.py
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from de.p1st.monitor.cfg.singleton import get_cfg
|
||||||
|
|
||||||
|
|
||||||
|
def logging_dir() -> Path:
|
||||||
|
cfg = get_cfg()
|
||||||
|
default = '/var/log/de-p1st-monitor'
|
||||||
|
if 'logging' not in cfg:
|
||||||
|
return Path(default)
|
||||||
|
return Path(cfg['logging'].get('dir', default))
|
30
src/de/p1st/monitor/cfg/singleton.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
import configparser
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
_cfg: configparser.ConfigParser | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def init_cfg(config_file: Path = None):
|
||||||
|
global _cfg
|
||||||
|
|
||||||
|
if _cfg is not None:
|
||||||
|
raise ValueError('already initialized')
|
||||||
|
|
||||||
|
if config_file is None:
|
||||||
|
import socket
|
||||||
|
hostname = socket.gethostname()
|
||||||
|
config_file = Path(f'/etc/de-p1st-monitor/{hostname}.ini')
|
||||||
|
|
||||||
|
if not config_file.exists():
|
||||||
|
raise Exception(f'Configuration file does not exist! {config_file}')
|
||||||
|
|
||||||
|
_cfg = configparser.ConfigParser()
|
||||||
|
_cfg.read(config_file)
|
||||||
|
|
||||||
|
|
||||||
|
def get_cfg() -> configparser.ConfigParser:
|
||||||
|
global _cfg
|
||||||
|
|
||||||
|
if _cfg is None:
|
||||||
|
raise ValueError('uninitialized')
|
||||||
|
return _cfg
|
90
src/de/p1st/monitor/csv_util.py
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import csv
|
||||||
|
from collections import deque
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def read(file: Path) -> list[list[str]]:
|
||||||
|
"""
|
||||||
|
Returns all rows from the CSV file `file`.
|
||||||
|
"""
|
||||||
|
with open(file, newline='') as csvfile:
|
||||||
|
reader = csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
|
||||||
|
return [row for row in reader]
|
||||||
|
|
||||||
|
|
||||||
|
def read_last(file: Path, num_rows: int, skip: int = 0) -> list[list[str]]:
|
||||||
|
"""
|
||||||
|
Returns the last `num_rows` from the CSV file `file`.
|
||||||
|
|
||||||
|
:param file:
|
||||||
|
:param num_rows:
|
||||||
|
:param skip: If given, the first `skip` rows are skipped.
|
||||||
|
"""
|
||||||
|
with open(file, newline='') as csvfile:
|
||||||
|
reader = csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
|
||||||
|
|
||||||
|
# Skip the first `skip` rows.
|
||||||
|
for i in range(skip):
|
||||||
|
try:
|
||||||
|
next(reader)
|
||||||
|
except StopIteration:
|
||||||
|
break # EOF
|
||||||
|
|
||||||
|
# Read all other rows but only keep the last `num_rows` rows.
|
||||||
|
q = deque(reader, num_rows)
|
||||||
|
# Return the last `num_rows` as list.
|
||||||
|
return [row for row in q]
|
||||||
|
|
||||||
|
|
||||||
|
def write(file: Path,
|
||||||
|
rows: list[list[str]],
|
||||||
|
header: list[str] = None,
|
||||||
|
create_parent_dirs: bool = True,
|
||||||
|
recreate_file: bool = False) -> None:
|
||||||
|
"""
|
||||||
|
Create new .csv file if missing or append to existing .csv file.
|
||||||
|
|
||||||
|
:param file:
|
||||||
|
:param rows: The rows to write as csv table to file.
|
||||||
|
:param header: If given will be inserted as first row into the csv table.
|
||||||
|
:param create_parent_dirs: If `file.parent` does not exist, create it.
|
||||||
|
:param recreate_file: Never append, always recreate the .csv file.
|
||||||
|
"""
|
||||||
|
if create_parent_dirs and not file.parent.exists():
|
||||||
|
file.parent.mkdir(parents=True, exist_ok=False)
|
||||||
|
if recreate_file and file.exists():
|
||||||
|
file.unlink(missing_ok=False)
|
||||||
|
if file.exists():
|
||||||
|
append(file, rows)
|
||||||
|
else:
|
||||||
|
if header is not None:
|
||||||
|
rows = [header] + rows
|
||||||
|
create(file, rows)
|
||||||
|
|
||||||
|
text = file.read_text()
|
||||||
|
if text.count('\n') != len(rows) or not text.endswith('\n'):
|
||||||
|
raise Exception(f'Created a new csv file with {len(rows)} rows but it does not have {len(rows)} lines. '
|
||||||
|
f'Make sure that there are no concurrent writes to this file!')
|
||||||
|
|
||||||
|
|
||||||
|
def create(file: Path, rows: list[list[str]]) -> None:
|
||||||
|
with open(file, 'x', newline='') as csvfile:
|
||||||
|
writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
|
||||||
|
writer.writerows(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def append(file: Path, rows: list[list[str]]) -> None:
|
||||||
|
with open(file, 'a', newline='') as csvfile:
|
||||||
|
writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
|
||||||
|
writer.writerows(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def test():
|
||||||
|
file = Path('/var/log/de-p1st-monitor/cpu_avg.csv')
|
||||||
|
data = read_last(file, 4, 10)
|
||||||
|
print(data)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
test()
|
49
src/de/p1st/monitor/datetime_util.py
Executable file
@ -0,0 +1,49 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
|
||||||
|
def test():
|
||||||
|
dt = datetime.now()
|
||||||
|
|
||||||
|
print('non UTC:')
|
||||||
|
print(dt)
|
||||||
|
|
||||||
|
print('\nUTC:')
|
||||||
|
print(now())
|
||||||
|
print(to_str(now()))
|
||||||
|
print(now_str())
|
||||||
|
print(from_str(to_str(now())))
|
||||||
|
|
||||||
|
print('\nlocalized:')
|
||||||
|
print(dt.tzinfo)
|
||||||
|
dt = dt.replace(tzinfo=timezone.utc)
|
||||||
|
print(dt)
|
||||||
|
|
||||||
|
|
||||||
|
def now() -> datetime:
|
||||||
|
return datetime.now(timezone.utc)
|
||||||
|
|
||||||
|
|
||||||
|
def now_str() -> str:
|
||||||
|
return to_str(now())
|
||||||
|
|
||||||
|
|
||||||
|
def to_str(dt: datetime) -> str:
|
||||||
|
return dt.strftime(fmt())
|
||||||
|
|
||||||
|
|
||||||
|
def from_str(dt_str: str) -> datetime:
|
||||||
|
dt = datetime.strptime(dt_str, fmt())
|
||||||
|
return dt.replace(tzinfo=timezone.utc)
|
||||||
|
|
||||||
|
|
||||||
|
def fmt() -> str:
|
||||||
|
return '%Y%m%dT%H%M%S'
|
||||||
|
|
||||||
|
|
||||||
|
def fmt_len() -> int:
|
||||||
|
return 13
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
test()
|
12
src/de/p1st/monitor/exec_capture.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
|
def execute_capture(command: list[str]) -> tuple[int, str, str]:
|
||||||
|
completed: subprocess.CompletedProcess = subprocess.run(
|
||||||
|
command,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
)
|
||||||
|
return completed.returncode, completed.stdout, completed.stderr
|
230
src/de/p1st/monitor/logger.py
Normal file
@ -0,0 +1,230 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
from de.p1st.monitor import csv_util
|
||||||
|
from de.p1st.monitor.cfg.logging_dir import logging_dir
|
||||||
|
from de.p1st.monitor.string_conversion import to_string, from_string
|
||||||
|
from de.p1st.monitor.warn import WarnMessage, WarnLevel
|
||||||
|
|
||||||
|
|
||||||
|
# https://www.geeksforgeeks.org/abstract-classes-in-python/
|
||||||
|
class Logger(ABC):
|
||||||
|
def __init__(self,
|
||||||
|
warn_threshold: int = 1,
|
||||||
|
warn_data_range: int = 1,
|
||||||
|
warn_if_above: int | float = None,
|
||||||
|
critical_if_above: int | float = None,
|
||||||
|
):
|
||||||
|
self.data: list[any] | None = None
|
||||||
|
# True if the data held by this object is already appended to the logfile.
|
||||||
|
self.logged = False
|
||||||
|
|
||||||
|
self.warn_threshold = warn_threshold
|
||||||
|
self.warn_data_range = warn_data_range
|
||||||
|
|
||||||
|
# Either both variables are given, or both are None
|
||||||
|
if warn_if_above is not None and critical_if_above is not None:
|
||||||
|
assert critical_if_above > warn_if_above
|
||||||
|
else:
|
||||||
|
assert warn_if_above is None and critical_if_above is None
|
||||||
|
self.warn_if_above = warn_if_above
|
||||||
|
self.critical_if_above = critical_if_above
|
||||||
|
|
||||||
|
def export_data(self) -> Path:
|
||||||
|
"""
|
||||||
|
This method is intended to be overriden in a subclass!
|
||||||
|
|
||||||
|
With most loggers the `get_log_file()` is ready-to-use.
|
||||||
|
In this case this method simply returns `get_log_file()`.
|
||||||
|
|
||||||
|
But some loggers require postprocessing of that data before it can be used.
|
||||||
|
In this case this method creates a new .csv file and returns it.
|
||||||
|
|
||||||
|
@return: Path to .csv file with ready-to-use data.
|
||||||
|
"""
|
||||||
|
return self.get_log_file()
|
||||||
|
|
||||||
|
def check(self) -> WarnMessage:
|
||||||
|
"""
|
||||||
|
Checks the latest `self.warn_data_range` datasets for problems using `self.check_data()`.
|
||||||
|
|
||||||
|
If at least `self.warn_threshold` problems are found,
|
||||||
|
then a WarnMessage with the highest reported WarnLevel is returned.
|
||||||
|
|
||||||
|
If at least one WarnLevel is above NORMAL,
|
||||||
|
then a WarnMessage is returned independent of the number of problems.
|
||||||
|
"""
|
||||||
|
datasets = self.get_datasets(self.warn_data_range)
|
||||||
|
warnings = [self.check_data(data) for data in datasets]
|
||||||
|
warnings = [warning for warning in warnings
|
||||||
|
if not warning.level == WarnLevel.NONE]
|
||||||
|
|
||||||
|
if len(warnings) == 0:
|
||||||
|
return WarnMessage(WarnLevel.NONE)
|
||||||
|
# max() must not be called with an empty list.
|
||||||
|
highest_warn_level = max([warning.level for warning in warnings])
|
||||||
|
|
||||||
|
messages: list[str] = [warning.message for warning in warnings]
|
||||||
|
message = f'{len(warnings)} of the last {self.warn_data_range} datasets are above limits:\n\t' \
|
||||||
|
+ '\n\t'.join(messages)
|
||||||
|
|
||||||
|
if highest_warn_level > WarnLevel.NORMAL:
|
||||||
|
return WarnMessage(highest_warn_level, message)
|
||||||
|
if len(warnings) >= self.warn_threshold:
|
||||||
|
return WarnMessage(highest_warn_level, message)
|
||||||
|
return WarnMessage(WarnLevel.NONE)
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def check_data(self, data: list[any]) -> WarnMessage:
|
||||||
|
"""
|
||||||
|
Check the given data for problems.
|
||||||
|
Return a WarnLevel indicating how serious the problems are.
|
||||||
|
|
||||||
|
If there are no problems, return `WarnLevel.NONE`.
|
||||||
|
"""
|
||||||
|
raise ValueError('Subclasses must implement this')
|
||||||
|
|
||||||
|
def get_all_datasets(self) -> list[list[any]]:
|
||||||
|
# See also: self.get_datasets()
|
||||||
|
|
||||||
|
if self.get_log_file().exists():
|
||||||
|
# We skip the first row as it is the data schema.
|
||||||
|
raw = csv_util.read(self.get_log_file())[1:]
|
||||||
|
data = [self.get_data_from_row(row) for row in raw]
|
||||||
|
else:
|
||||||
|
data = []
|
||||||
|
|
||||||
|
if not self.logged and self.has_data():
|
||||||
|
data.append(self.get_data())
|
||||||
|
|
||||||
|
return data
|
||||||
|
|
||||||
|
def get_datasets(self, num: int) -> list[list[any]]:
|
||||||
|
"""
|
||||||
|
Returns the last `num` datasets (including the current dataset).
|
||||||
|
"""
|
||||||
|
if not self.logged and self.has_data():
|
||||||
|
# We will append the current data manually.
|
||||||
|
# Thus, we need to read one less line from the CSV file.
|
||||||
|
read_last = num - 1
|
||||||
|
else:
|
||||||
|
read_last = num
|
||||||
|
|
||||||
|
if self.get_log_file().exists():
|
||||||
|
# Read rows from CSV file.
|
||||||
|
# We skip the first row as it is the data schema.
|
||||||
|
# We keep only the last `read_last` rows.
|
||||||
|
raw = csv_util.read_last(self.get_log_file(), read_last, 1)
|
||||||
|
# Convert from string to data types defined in the data schema.
|
||||||
|
data = [self.get_data_from_row(row) for row in raw]
|
||||||
|
else:
|
||||||
|
data = []
|
||||||
|
|
||||||
|
if not self.logged and self.has_data():
|
||||||
|
# We append the current data.
|
||||||
|
# It has not yet been logged and is therefore not included in the CSV file we just read.
|
||||||
|
data.append(self.get_data())
|
||||||
|
|
||||||
|
return data
|
||||||
|
|
||||||
|
def log(self, skip_if_no_data: bool = False) -> None:
|
||||||
|
"""
|
||||||
|
Appends the current data (e.g. temperature of a sensor)
|
||||||
|
to a logfile.
|
||||||
|
|
||||||
|
:param skip_if_no_data: Can be used to do nothing if no data is available. If one is sure to have called update() previously, this can be set to True.
|
||||||
|
:raise Exception: If method is called but no data is available. Please do call update() first to avoid this!
|
||||||
|
"""
|
||||||
|
if self.logged:
|
||||||
|
return
|
||||||
|
if skip_if_no_data and not self.has_data():
|
||||||
|
return
|
||||||
|
|
||||||
|
csv_util.write(file=self.get_log_file(), rows=[self.get_data_as_row()], header=self.data_schema())
|
||||||
|
self.logged = True
|
||||||
|
|
||||||
|
def update(self):
|
||||||
|
self.set_data(self.read_data())
|
||||||
|
self.logged = False
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def read_data(self) -> list[any] | None:
|
||||||
|
"""
|
||||||
|
Collects current data (e.g. temperature of a sensor).
|
||||||
|
|
||||||
|
Might return None if sensor is detached / not available.
|
||||||
|
|
||||||
|
:raise LoggerReadEx:
|
||||||
|
"""
|
||||||
|
raise ValueError('Subclasses must implement this')
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def data_schema(self) -> list[str]:
|
||||||
|
"""
|
||||||
|
Describes the type and meaning of the elements in self.values().
|
||||||
|
|
||||||
|
Returns a list with elements f'{data-type}#{column-description}'.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
['datetime#Date', 'float#Disk usage']
|
||||||
|
"""
|
||||||
|
raise ValueError('Subclasses must implement this')
|
||||||
|
|
||||||
|
def get_data_from_row(self, data: list[str]) -> list[any]:
|
||||||
|
return [
|
||||||
|
from_string(v, type_str)
|
||||||
|
for v, type_str
|
||||||
|
in zip(data, self.data_type_strs())
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_data_as_row(self) -> list[str]:
|
||||||
|
"""
|
||||||
|
Returns `self.get_data()` as string list that can easily be added as row to a CSV file.
|
||||||
|
"""
|
||||||
|
return self.as_row(self.get_data())
|
||||||
|
|
||||||
|
def as_row(self, data: list, data_schema: list[str] = None) -> list[str]:
|
||||||
|
"""
|
||||||
|
Returns the given `data` as string list that can easily be added as row to a CSV file.
|
||||||
|
"""
|
||||||
|
if data_schema is None:
|
||||||
|
data_schema = self.data_schema()
|
||||||
|
return [
|
||||||
|
to_string(v, type_str)
|
||||||
|
for v, type_str
|
||||||
|
in zip(data, self.data_type_strs(data_schema))
|
||||||
|
]
|
||||||
|
|
||||||
|
def has_data(self) -> bool:
|
||||||
|
return self.data is not None
|
||||||
|
|
||||||
|
def get_data(self) -> list[any]:
|
||||||
|
"""
|
||||||
|
Returns the last data collected by `self.update()`.
|
||||||
|
"""
|
||||||
|
if self.has_data():
|
||||||
|
return self.data
|
||||||
|
else:
|
||||||
|
raise ValueError(f'Data has not yet been read. {self.__str__()}')
|
||||||
|
|
||||||
|
def set_data(self, data: list[any] | None):
|
||||||
|
self.data = data
|
||||||
|
|
||||||
|
def data_type_strs(self, data_schema: list[str] = None) -> list[str]:
|
||||||
|
if data_schema is None:
|
||||||
|
data_schema = self.data_schema()
|
||||||
|
return [x.split('#', maxsplit=1)[0] for x in data_schema]
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_log_file(self) -> Path:
|
||||||
|
raise ValueError('Subclasses must implement this')
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_log_dir(cls) -> Path:
|
||||||
|
return logging_dir()
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
key_value_strings = [f'classname: {type(self).__name__}']
|
||||||
|
for key, value in vars(self).items():
|
||||||
|
key_value_strings.append(f'{key}: {value}')
|
||||||
|
return ', '.join(key_value_strings)
|
14
src/de/p1st/monitor/logger_ex.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
class LoggerReadEx(Exception):
|
||||||
|
"""
|
||||||
|
Used by Logger subclasses if
|
||||||
|
- sensor data could not be read
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class LoggerArgEx(Exception):
|
||||||
|
"""
|
||||||
|
Used by Logger subclasses if
|
||||||
|
- Logger object created with illegal arguments
|
||||||
|
"""
|
||||||
|
pass
|
0
src/de/p1st/monitor/loggers/__init__.py
Normal file
90
src/de/p1st/monitor/loggers/cpu.py
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from abc import abstractmethod
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
|
||||||
|
from de.p1st.monitor import datetime_util
|
||||||
|
|
||||||
|
from de.p1st.monitor.logger import Logger
|
||||||
|
from de.p1st.monitor.warn import WarnMessage, WarnLevel
|
||||||
|
|
||||||
|
|
||||||
|
class CPULogger(Logger):
|
||||||
|
def __init__(self,
|
||||||
|
warn_if_above: float = None,
|
||||||
|
warn_threshold: int = 1,
|
||||||
|
warn_data_range: int = 1,
|
||||||
|
):
|
||||||
|
|
||||||
|
critical_if_above = warn_if_above * 1.5
|
||||||
|
super().__init__(warn_threshold,
|
||||||
|
warn_data_range,
|
||||||
|
warn_if_above,
|
||||||
|
critical_if_above)
|
||||||
|
self.warn_if_above = warn_if_above
|
||||||
|
|
||||||
|
def check_data(self, data: list[any]) -> WarnMessage:
|
||||||
|
load_avg = data[1]
|
||||||
|
message = f'CPU load avg of last {self.get_load_timespan()} minutes is at {load_avg}'
|
||||||
|
|
||||||
|
if load_avg > self.critical_if_above:
|
||||||
|
return WarnMessage(WarnLevel.HIGH, message)
|
||||||
|
if load_avg > self.warn_if_above:
|
||||||
|
return WarnMessage(WarnLevel.NORMAL, message)
|
||||||
|
return WarnMessage(WarnLevel.NONE)
|
||||||
|
|
||||||
|
def read_data(self) -> list[any] | None:
|
||||||
|
return [
|
||||||
|
datetime_util.now(),
|
||||||
|
self.get_load(self.get_load_timespan())
|
||||||
|
]
|
||||||
|
|
||||||
|
def data_schema(self) -> list[str]:
|
||||||
|
return [
|
||||||
|
'datetime#Date',
|
||||||
|
f'float#LoadAverage{self.get_load_timespan()}min'
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_log_file(self) -> Path:
|
||||||
|
return self.get_log_dir() / f'cpu_{self.get_load_timespan()}min.csv'
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_load_timespan(self) -> Literal[1, 5, 15]:
|
||||||
|
raise ValueError('Subclasses must implement this')
|
||||||
|
|
||||||
|
#
|
||||||
|
# HELPERS
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_load(minutes: Literal[1, 5, 15]) -> float:
|
||||||
|
"""
|
||||||
|
:param minutes: avg of last 1/5/15 minutes
|
||||||
|
:return: Average CPU load of last 1/5/15 minutes
|
||||||
|
"""
|
||||||
|
idx_dict = {
|
||||||
|
1: 0,
|
||||||
|
5: 1,
|
||||||
|
15: 2
|
||||||
|
}
|
||||||
|
idx = idx_dict[minutes]
|
||||||
|
|
||||||
|
# Number of processes in the system run queue averaged over
|
||||||
|
# the last 1, 5, and 15 minutes:
|
||||||
|
# one, five, fifteen = psutil.getloadavg()
|
||||||
|
|
||||||
|
# Load percentage during last 5 minutes.
|
||||||
|
# This value has been tested to be correct on my AMD Ryzen 4800H CPU.
|
||||||
|
return psutil.getloadavg()[idx] / psutil.cpu_count()
|
||||||
|
|
||||||
|
class CPULogger1(CPULogger):
|
||||||
|
def get_load_timespan(self) -> Literal[1, 5, 15]:
|
||||||
|
return 1
|
||||||
|
class CPULogger5(CPULogger):
|
||||||
|
def get_load_timespan(self) -> Literal[1, 5, 15]:
|
||||||
|
return 5
|
||||||
|
class CPULogger15(CPULogger):
|
||||||
|
def get_load_timespan(self) -> Literal[1, 5, 15]:
|
||||||
|
return 15
|
108
src/de/p1st/monitor/loggers/drive.py
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from de.p1st.monitor import datetime_util
|
||||||
|
|
||||||
|
from de.p1st.monitor.exec_capture import execute_capture
|
||||||
|
from de.p1st.monitor.logger import Logger
|
||||||
|
from de.p1st.monitor.logger_ex import LoggerArgEx, LoggerReadEx
|
||||||
|
from de.p1st.monitor.warn import WarnLevel, WarnMessage
|
||||||
|
|
||||||
|
class BlkidException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class DriveLogger(Logger):
    """Logs the temperature of a drive, identified by the UUID of one of its partitions."""

    def __init__(self, uuid: str = None,
                 device: Path = None,
                 warn_if_above: int = None,
                 warn_threshold: int = 1,
                 warn_data_range: int = 1,
                 ):
        """
        :param uuid: UUID of a partition on the drive. Derived from `device` if omitted.
        :param device: Partition path, e.g. /dev/sda1. Derived from `uuid` if omitted.
        :param warn_if_above: Warn if the temperature (celsius) exceeds this value.
            If None, check_data() never warns.
        :raise LoggerArgEx: If neither uuid nor device is given, or the UUID lookup fails.
        """
        # BUGFIX: the original unconditionally computed `warn_if_above + 10`,
        # which raises TypeError when warn_if_above is left at its default None.
        critical_if_above = None if warn_if_above is None else warn_if_above + 10
        super().__init__(warn_threshold,
                         warn_data_range,
                         warn_if_above,
                         critical_if_above
                         )

        if uuid is None and device is None:
            raise LoggerArgEx('uuid or device required')

        if uuid is None:
            try:
                self.uuid = self.get_partition_uuid(device)
            except BlkidException as e:
                raise LoggerArgEx(getattr(e, 'message', e))
        else:
            self.uuid = uuid

        if device is None:
            self.device = self.get_partition_path(uuid)
        else:
            self.device = device

        self.warn_if_above = warn_if_above

    def check_data(self, data: list[any]) -> WarnMessage:
        """Warn if the logged temperature exceeds the configured thresholds."""
        # No threshold configured -> never warn (also avoids `int > None` TypeError).
        if self.warn_if_above is None:
            return WarnMessage(WarnLevel.NONE)

        temp = data[1]
        message = f'Temperature of {self.uuid} is at {temp}'

        if temp > self.critical_if_above:
            return WarnMessage(WarnLevel.HIGH, message)
        if temp > self.warn_if_above:
            return WarnMessage(WarnLevel.NORMAL, message)
        return WarnMessage(WarnLevel.NONE)

    def read_data(self) -> list[any]:
        """:return: [timestamp, current temperature in celsius]"""
        return [
            datetime_util.now(),
            self.get_temp_from_device(self.device),
        ]

    def data_schema(self) -> list[str]:
        return ['datetime#Date', 'int#Temperature']

    def get_log_file(self) -> Path:
        # self.device might change overtime.
        # Thus, we use self.uuid to identify a partition.
        return self.get_log_dir() / f'drive_{self.uuid}.csv'

    #
    # HELPERS
    #

    @classmethod
    def get_partition_path(cls, uuid: str) -> Path:
        """
        :return: Partition path, e.g. /dev/sda1
        """
        return Path(f'/dev/disk/by-uuid/{uuid}').resolve()

    @classmethod
    def get_partition_uuid(cls, device: Path) -> str:
        """
        :param device: E.g. /dev/sda1
        :return: UUID of e.g. partition /dev/sda1
        :raise BlkidException: If UUID could not be determined.
        """
        returncode, stdout, stderr = execute_capture(['blkid', '-s', 'UUID', '-o', 'value', f'{device}'])

        if returncode != 0:
            raise BlkidException(f'blkid failed with returncode {returncode}\nstdout: {stdout}\nstderr: {stderr}')

        return stdout.strip()

    @classmethod
    def get_temp_from_device(cls, device: Path) -> int:
        """
        :param device: For example `/dev/sda` or `/dev/disk/by-uuid/<uuid>`
        :return: Temperature in celsius
        :raise LoggerReadEx: If smartctl fails.
        """
        returncode, stdout, stderr = execute_capture(['smartctl', '-j', '-a', f'{device}'])

        if returncode != 0:
            raise LoggerReadEx(f'smartctl failed with returncode {returncode}\nstdout: {stdout}\nstderr: {stderr}')
        j = json.loads(stdout)

        return j['temperature']['current']
|
164
src/de/p1st/monitor/loggers/filesystem.py
Normal file
@ -0,0 +1,164 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
from de.p1st.monitor import datetime_util
|
||||||
|
from de.p1st.monitor.exec_capture import execute_capture
|
||||||
|
|
||||||
|
from de.p1st.monitor.logger import Logger
|
||||||
|
from de.p1st.monitor.logger_ex import LoggerArgEx, LoggerReadEx
|
||||||
|
from de.p1st.monitor.warn import WarnLevel, WarnMessage
|
||||||
|
|
||||||
|
|
||||||
|
class NotMounted(Exception):
    """Raised when a filesystem that should be inspected is not currently mounted."""
    pass
|
||||||
|
|
||||||
|
|
||||||
|
class FilesystemLogger(Logger):
    """Logs the disk usage of a filesystem, identified by UUID and/or mountpoint."""

    def __init__(self, uuid: str = None,
                 mountpoint: Path = None,
                 unmounted_ok: bool = False,
                 warn_if_above: float = 1.0,
                 warn_threshold: int = 1,
                 warn_data_range: int = 1,
                 ):
        """
        :param uuid: Filesystem UUID. Derived from `mountpoint` if omitted.
        :param mountpoint: Where the filesystem is mounted. Derived from `uuid` if omitted.
        :param unmounted_ok: If True, an unmounted filesystem is skipped instead of raising.
        :param warn_if_above: Warn if the used/total ratio exceeds this fraction (0..1).
        :raise LoggerArgEx: If neither uuid nor mountpoint is given, or the filesystem
            is not mounted while unmounted_ok is False.
        :raise LoggerReadEx: If both uuid and mountpoint are given but a different
            UUID is mounted at that mountpoint.
        """
        # The space between disk is at `self.warn_if_above` and disk is full at `1.0`.
        buffer = 1 - warn_if_above
        critical_if_above = warn_if_above + 0.5 * buffer
        super().__init__(warn_threshold,
                         warn_data_range,
                         warn_if_above,
                         critical_if_above,
                         )

        if uuid is None and mountpoint is None:
            raise LoggerArgEx('uuid or mountpoint required')

        self.uuid = uuid
        self.mountpoint = mountpoint
        self.unmounted_ok = unmounted_ok
        self.warn_if_above = warn_if_above

        self.mounted = True

        # If uuid and mountpoint are both specified,
        # raise warning if unexpected uuid is mounted at mountpoint.
        if self.mountpoint is not None and self.uuid is not None:
            actual_uuid = self._if_mounted(lambda: self.get_uuid(self.mountpoint))
            if actual_uuid is not None and self.uuid != actual_uuid:
                raise LoggerReadEx(f'Expected {self.uuid} at {self.mountpoint} but got {actual_uuid}')

        # Try to get UUID (if only mountpoint given).
        if self.uuid is None:
            self.uuid = self._if_mounted(lambda: self.get_uuid(self.mountpoint))

        # Try to get mountpoint (if only uuid given).
        # CONSISTENCY FIX: the original raised LoggerReadEx in this branch while the
        # two analogous branches above raise LoggerArgEx for the same condition.
        if self.mountpoint is None:
            self.mountpoint = self._if_mounted(lambda: self.get_mountpoint(self.uuid))

    def _if_mounted(self, supplier):
        """Evaluate `supplier`, tracking self.mounted.

        :param supplier: Zero-argument callable that may raise NotMounted.
        :return: The supplier's value, or None if NotMounted was raised and
            unmounted_ok is set.
        :raise LoggerArgEx: If NotMounted was raised and unmounted_ok is False.
        """
        try:
            value = supplier()
            self.mounted = True
            return value
        except NotMounted as e:
            if self.unmounted_ok:
                self.mounted = False
                return None
            raise LoggerArgEx(getattr(e, 'message', e))

    def check_data(self, data: list[any]) -> WarnMessage:
        """Warn if the logged disk usage exceeds the configured thresholds."""
        if not self.mounted:
            return WarnMessage(WarnLevel.NONE)

        disk_usage = data[1]
        message = f'Disk usage of {self.uuid} is at {disk_usage}'

        if disk_usage > self.critical_if_above:
            return WarnMessage(WarnLevel.HIGH, message)
        if disk_usage > self.warn_if_above:
            return WarnMessage(WarnLevel.NORMAL, message)
        return WarnMessage(WarnLevel.NONE)

    def read_data(self) -> list[any] | None:
        """:return: [timestamp, used/total ratio], or None if the filesystem is not mounted."""
        if not self.mounted:
            return None

        disk_usage: float = self.get_disk_usage(self.mountpoint)
        return [
            datetime_util.now(),
            disk_usage,
        ]

    def data_schema(self) -> list[str]:
        return ['datetime#Date', 'float#Disk usage']

    def get_log_file(self) -> Path:
        # The mountpoint of a filesystem might change overtime.
        # Thus, we use self.uuid to identify a filesystem.
        return self.get_log_dir() / f'filesystem_{self.uuid}.csv'

    #
    # HELPERS
    #

    @classmethod
    def get_disk_usage(cls, mountpoint: Path) -> float:
        """
        :returns: used space / total space
        """
        return psutil.disk_usage(str(mountpoint)).percent / 100.0

    @classmethod
    def get_mountpoint(cls, uuid: str) -> Path:
        """
        :return: Mountpoint of the partition with the given UUID.
        :raise NotMounted: If the corresponding partition is not mounted.
        """
        partition_list: list[psutil._common.sdiskpart] = psutil.disk_partitions(all=False)
        partitions: dict[Path, psutil._common.sdiskpart] = {Path(partition.device).resolve(): partition for partition in
                                                            partition_list}

        partition_path = cls.get_partition_path(uuid)
        if partition_path not in partitions:
            raise NotMounted(
                f'Partition {partition_path} is probably not mounted '
                f'as it is not in psutil partition list: {partitions}')

        partition = partitions[partition_path]
        return Path(partition.mountpoint)

    @classmethod
    def get_uuid(cls, mountpoint: Path) -> str:
        """
        :return: UUID of the filesystem mounted at `mountpoint`.
        :raise NotMounted: If findmnt reports nothing mounted there.
        """
        # Equivalent shell command: findmnt <mountpoint> -o UUID -n
        returncode, stdout, stderr = execute_capture(['findmnt', str(mountpoint), '-o', 'UUID', '-n'])
        if returncode != 0:
            raise NotMounted(
                f'No partition mounted at {mountpoint}. Stderr of findmnt: {stderr}')

        return stdout.strip()

    @classmethod
    def get_partition_path(cls, uuid: str) -> Path:
        """
        :return: Partition path, e.g. /dev/sda1
        """
        return Path(f'/dev/disk/by-uuid/{uuid}').resolve()
|
74
src/de/p1st/monitor/loggers/memory.py
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
from de.p1st.monitor import datetime_util
|
||||||
|
|
||||||
|
from de.p1st.monitor.logger import Logger
|
||||||
|
from de.p1st.monitor.warn import WarnMessage, WarnLevel
|
||||||
|
|
||||||
|
|
||||||
|
class MemoryLogger(Logger):
    """Logs RAM usage (used / used+cached / total available, in MB)."""

    def __init__(self,
                 warn_if_above: float = 1.0,
                 warn_threshold: int = 1,
                 warn_data_range: int = 1,
                 ):
        """
        :param warn_if_above: Warn if the used/available ratio exceeds this fraction (0..1).
        """
        # The space between memory is at `self.warn_if_above` and memory is full at `1.0`.
        buffer = 1 - warn_if_above
        critical_if_above = warn_if_above + 0.5 * buffer
        super().__init__(warn_threshold,
                         warn_data_range,
                         warn_if_above,
                         critical_if_above)
        self.warn_if_above = warn_if_above

    def check_data(self, data: list[any]) -> WarnMessage:
        """Warn if the used/available memory ratio exceeds the configured thresholds."""
        used_mb = data[1]
        total_available_mb = data[3]
        message = f'Memory usage is at {used_mb} MB of {total_available_mb} MB'

        used = used_mb / total_available_mb

        if used > self.critical_if_above:
            return WarnMessage(WarnLevel.HIGH, message)
        if used > self.warn_if_above:
            return WarnMessage(WarnLevel.NORMAL, message)
        return WarnMessage(WarnLevel.NONE)

    def read_data(self) -> list[any]:
        """:return: [timestamp, used MB, used+cached MB, total available MB]"""
        used_mb, free_mb, available_mb, total_mb = self.get_memory()
        # Everything that is not completely free, i.e. in use or used as cache.
        used_and_cached_mb = total_mb - free_mb
        # Upper bound of memory that processes could occupy right now.
        total_available_mb = used_mb + available_mb
        return [
            datetime_util.now(),
            used_mb,
            used_and_cached_mb,
            total_available_mb,
        ]

    def data_schema(self) -> list[str]:
        return ['datetime#Date', 'int#Used memory in MB', 'int#Used and cached in MB', 'int#Total available memory in MB']

    def get_log_file(self) -> Path:
        # Only one memory log per host, hence no identifier in the name.
        return self.get_log_dir() / 'memory.csv'

    #
    # HELPERS
    #

    @classmethod
    def get_memory(cls) -> tuple[int, int, int, int]:
        """
        :return: Tuple[used memory in MB, free memory in MB, available memory in MB,
            total memory in MB]. This does not include swap.
        """
        # DOCFIX: the original docstring claimed a 3-tuple; four values are returned.
        mb = 1024 * 1024
        mem = psutil.virtual_memory()

        # mem.available:
        #   The memory that can be given instantly to processes, excluding swap.
        # mem.total:
        #   Total physical memory (exclusive swap).
        # Note that mem.used + mem.available != mem.total
        return int(mem.used / mb), int(mem.free / mb), int(mem.available / mb), int(mem.total / mb)
|
113
src/de/p1st/monitor/loggers/network.py
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
import sys
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
|
||||||
|
from de.p1st.monitor import datetime_util, csv_util
|
||||||
|
from de.p1st.monitor.logger import Logger
|
||||||
|
from de.p1st.monitor.logger_ex import LoggerReadEx
|
||||||
|
from de.p1st.monitor.warn import WarnLevel, WarnMessage
|
||||||
|
|
||||||
|
|
||||||
|
class NetworkLogger(Logger):
    """Logs absolute bytes sent/received on one network interface (counters since boot)."""

    def __init__(self, network_interface: str):
        """
        :param network_interface: Interface name as reported by psutil, e.g. 'eth0'.
        """
        super().__init__()
        self.network_interface = network_interface

    def export_data(self) -> Path:
        """Derive per-second transfer rates from the logged absolute counters.

        Writes them to a sibling `.exported.csv` file and returns its path.
        Dataset pairs spanning a reboot, a counter overflow, or a zero time
        delta are skipped.
        """
        data = self.get_all_datasets()

        export_schema = [
            'datetime#Date',
            'float#Bytes sent per second',
            'float#Bytes received per second',
        ]
        export_data = []
        # Each exported row is derived from a pair of consecutive datasets.
        for prev_row, curr_row in zip(data[:-1], data[1:]):
            # If the boot time differs, there was a reboot between the two
            # data points -> the sent/received deltas are invalid.
            if prev_row[3] != curr_row[3]:
                continue

            elapsed_time: timedelta = curr_row[0] - prev_row[0]
            elapsed_seconds = elapsed_time.total_seconds()
            # BUGFIX: two datasets with identical timestamps would otherwise
            # raise ZeroDivisionError below.
            if elapsed_seconds <= 0:
                continue

            delta_sent = curr_row[1] - prev_row[1]
            delta_received = curr_row[2] - prev_row[2]

            if delta_sent < 0 or delta_received < 0:
                print(f'bytes received/sent counter did overflow after {prev_row[0]}',
                      file=sys.stderr)
                continue

            export_data.append([
                # datetime#Date: midpoint between the two samples.
                prev_row[0] + 0.5 * elapsed_time,
                # float#Bytes sent per second
                delta_sent / elapsed_seconds,
                # float#Bytes received per second
                delta_received / elapsed_seconds,
            ])

        export_file = self.get_log_file().parent.joinpath(self.get_log_file().name + '.exported.csv')
        rows = [self.as_row(export_row, export_schema) for export_row in export_data]
        csv_util.write(file=export_file, rows=rows, header=export_schema, recreate_file=True)
        return export_file

    def check_data(self, data: list[any]) -> WarnMessage:
        # Raw byte counters never trigger warnings.
        return WarnMessage(WarnLevel.NONE)

    def data_schema(self) -> list[str]:
        return [
            'datetime#Date',
            'int#Bytes sent since boot',
            'int#Bytes received since boot',
            'datetime#Boot date',
        ]

    def read_data(self) -> list[any]:
        """:return: [timestamp, bytes sent, bytes received, boot time]"""
        sent, received = self.get_net_usage()
        return [
            datetime_util.now(),
            sent,
            received,
            self.get_boot_time(),
        ]

    def get_log_file(self) -> Path:
        return self.get_log_dir() / f'net_{self.network_interface}.csv'

    #
    # HELPERS
    #

    def get_net_usage(self) -> tuple[int, int]:
        """
        Warning: The returned values may overflow if the system is running for a long time.

        :return: bytes sent, bytes received
        :raise LoggerReadEx: If the configured interface does not exist.
        """
        # noinspection PyTypeChecker
        nics_data: dict[str, psutil._common.snetio] = psutil.net_io_counters(pernic=True, nowrap=True)

        if self.network_interface not in nics_data:
            raise LoggerReadEx(f'Network interface {self.network_interface} not found')

        nic_data = nics_data[self.network_interface]
        return nic_data.bytes_sent, nic_data.bytes_recv

    @classmethod
    def get_boot_time(cls) -> datetime:
        """:return: System boot time as a timezone-aware UTC datetime."""
        epoch_seconds = psutil.boot_time()
        return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
|
||||||
|
|
||||||
|
def test():
    """Manual smoke test: log one dataset for a hard-coded network interface."""
    from de.p1st.monitor.cfg.singleton import init_cfg
    init_cfg()

    net_logger = NetworkLogger('wlp1s0')
    net_logger.update()
    net_logger.log()


if __name__ == '__main__':
    test()
|
69
src/de/p1st/monitor/loggers/swap.py
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
from de.p1st.monitor import datetime_util
|
||||||
|
|
||||||
|
from de.p1st.monitor.logger import Logger
|
||||||
|
from de.p1st.monitor.warn import WarnMessage, WarnLevel
|
||||||
|
|
||||||
|
|
||||||
|
class SwapLogger(Logger):
    """Logs swap usage (used / total, in MB)."""

    def __init__(self,
                 warn_if_above: float = 1.0,
                 warn_threshold: int = 1,
                 warn_data_range: int = 1,
                 ):
        """
        :param warn_if_above: Warn if the used/total ratio exceeds this fraction (0..1).
        """
        # The space between swap is at `self.warn_if_above` and swap is full at `1.0`.
        buffer = 1 - warn_if_above
        critical_if_above = warn_if_above + 0.5 * buffer
        super().__init__(warn_threshold,
                         warn_data_range,
                         warn_if_above,
                         critical_if_above)
        self.warn_if_above = warn_if_above

    def check_data(self, data: list[any]) -> WarnMessage:
        """Warn if the used/total swap ratio exceeds the configured thresholds."""
        used_mb = data[1]
        total_mb = data[2]
        message = f'Swap usage is at {used_mb} MB of {total_mb} MB'

        if used_mb == 0 and total_mb == 0:
            # Swap not enabled (also avoids division by zero below).
            return WarnMessage(WarnLevel.NONE)

        usage = used_mb / total_mb

        if usage > self.critical_if_above:
            return WarnMessage(WarnLevel.HIGH, message)
        if usage > self.warn_if_above:
            return WarnMessage(WarnLevel.NORMAL, message)
        return WarnMessage(WarnLevel.NONE)

    def read_data(self) -> list[any]:
        """:return: [timestamp, used swap MB, total swap MB]"""
        used_mb, total_mb = self.get_swap()
        return [
            datetime_util.now(),
            used_mb,
            total_mb,
        ]

    def data_schema(self) -> list[str]:
        return ['datetime#Date', 'int#Used swap in MB', 'int#Total swap in MB']

    def get_log_file(self) -> Path:
        # Only one swap log per host, hence no identifier in the name.
        return self.get_log_dir() / 'swap.csv'

    #
    # HELPERS
    #

    @classmethod
    def get_swap(cls) -> tuple[int, int]:
        """
        :return: Tuple[used swap in MB, total swap in MB].
        """
        # CONSISTENCY: annotated as `tuple[int, int]` (the original used the
        # non-standard `(int, int)` form), matching MemoryLogger.get_memory().
        mb = 1024 * 1024
        mem = psutil.swap_memory()

        return int(mem.used / mb), int(mem.total / mb)
|
82
src/de/p1st/monitor/loggers/temp.py
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
|
||||||
|
from de.p1st.monitor import datetime_util
|
||||||
|
from de.p1st.monitor.logger import Logger
|
||||||
|
from de.p1st.monitor.logger_ex import LoggerReadEx
|
||||||
|
from de.p1st.monitor.warn import WarnMessage, WarnLevel
|
||||||
|
|
||||||
|
|
||||||
|
class TempLogger(Logger):
    """Logs a psutil temperature sensor, identified by sensor name and label."""

    def __init__(self, sensor_name: str,
                 sensor_label: str,
                 warn_if_above: float = None,
                 warn_threshold: int = 1,
                 warn_data_range: int = 1,
                 ):
        """
        :param sensor_name: psutil sensor name, e.g. 'amdgpu'.
        :param sensor_label: Label within that sensor, e.g. 'edge'.
        :param warn_if_above: Warn if the temperature (celsius) exceeds this value.
            If None, check_data() never warns.
        """
        # BUGFIX: the original unconditionally computed `warn_if_above + 10`,
        # which raises TypeError when warn_if_above is left at its default None.
        critical_if_above = None if warn_if_above is None else warn_if_above + 10
        super().__init__(warn_threshold,
                         warn_data_range,
                         warn_if_above,
                         critical_if_above)
        self.name = sensor_name
        self.label = sensor_label

        self.warn_if_above = warn_if_above

    def check_data(self, data: list[any]) -> WarnMessage:
        """Warn if the logged temperature exceeds the configured thresholds."""
        # No threshold configured -> never warn (also avoids `float > None` TypeError).
        if self.warn_if_above is None:
            return WarnMessage(WarnLevel.NONE)

        temp = data[1]
        message = f'Temperature of {self.name} {self.label} is at {temp}'

        if temp > self.critical_if_above:
            return WarnMessage(WarnLevel.HIGH, message)
        if temp > self.warn_if_above:
            return WarnMessage(WarnLevel.NORMAL, message)
        return WarnMessage(WarnLevel.NONE)

    def read_data(self) -> list[any]:
        """:return: [timestamp, current temperature in celsius]"""
        return [
            datetime_util.now(),
            self.get_temp()
        ]

    def data_schema(self) -> list[str]:
        return [
            'datetime#Date',
            'float#Temperature'
        ]

    def get_log_file(self) -> Path:
        return self.get_log_dir() / f'temp_{self.name}_{self.label}.csv'

    #
    # HELPERS
    #

    def get_temp(self) -> float:
        """
        :return: Temperature in celsius
        :raise LoggerReadEx: If the sensor or its label cannot be found.
        """
        data = psutil.sensors_temperatures(fahrenheit=False)
        if self.name not in data:
            raise LoggerReadEx(f'Sensor {self.name} not found')
        for reading in data[self.name]:
            if reading.label == self.label:
                return reading.current
        raise LoggerReadEx(f'Label {self.label} of sensor {self.name} not found')
|
||||||
|
|
||||||
|
|
||||||
|
def test():
    """Manual smoke test: read, log and check the amdgpu 'edge' temperature."""
    from de.p1st.monitor.cfg import singleton
    singleton.init_cfg()

    temp_logger = TempLogger('amdgpu', 'edge', 47, 2, 4)
    temp_logger.update()
    temp_logger.log()
    temp_logger.check().print()


if __name__ == '__main__':
    test()
|
83
src/de/p1st/monitor/main.py
Executable file
@ -0,0 +1,83 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from de.p1st.monitor.cfg.singleton import init_cfg
|
||||||
|
from de.p1st.monitor.cfg.loggers import get_loggers
|
||||||
|
from de.p1st.monitor.logger_ex import LoggerReadEx
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse CLI arguments, load the configuration, then either export or log."""
    parser = argparse.ArgumentParser(prog='de-p1st-monitor',
                                     description='Iterates over all config sections. '
                                                 'For each section the current sensor data is read '
                                                 'and logged to a .csv file.')
    parser.add_argument('--config', '-c', default=None, type=Path,
                        help='Path to .ini configuration file.')
    parser.add_argument('--export', '-e', default=False, action='store_true',
                        help='If `True`, export .csv files and print their paths to stdout. '
                             'No sensor data is logged during this.')
    # (Removed the commented-out, dead `type=bool` variant of --export.)
    args = parser.parse_args()
    init_cfg(args.config)

    if args.export:
        export()
    else:
        log()
|
||||||
|
|
||||||
|
|
||||||
|
def export():
    """Export every configured logger's .csv file and print the resulting paths.

    Exits with status 1 if any logger could not be instantiated.
    """
    loggers, config_errors = get_loggers()
    if len(config_errors) > 0:
        print('\nCONFIGURATION ERROR: Could not instantiate some of the loggers!', file=sys.stderr)
        print_exs(config_errors, [f'{n}.' for n in range(1, 1 + len(config_errors))])
        exit(1)

    for logger in loggers:
        export_path: Path = logger.export_data()
        print(export_path)
|
||||||
|
|
||||||
|
|
||||||
|
def log():
    """Run every configured logger once, then report all collected problems.

    Exits with status 1 if any configuration error, read error or warning occurred.
    """
    loggers, config_errors = get_loggers()
    read_errors = []
    warning_count = 0

    total = len(loggers)
    for number, logger in enumerate(loggers, start=1):
        print(f'Running logger {number}/{total} ...')
        try:
            logger.update()
        except LoggerReadEx as e:
            read_errors.append(e)
            continue
        # After logger.update() there might still be no data.
        # Example: FilesystemLogger if partition is not mounted (and unmounted_ok is True)
        logger.log(skip_if_no_data=True)
        if logger.check().print().is_warning():
            warning_count += 1

    if config_errors:
        print('\nCONFIGURATION ERROR: Could not instantiate some of the loggers!', file=sys.stderr)
        print_exs(config_errors, [f'{n}.' for n in range(1, 1 + len(config_errors))])
    if read_errors:
        print('\nRUNTIME ERROR: Some loggers could not fetch sensor data!', file=sys.stderr)
        print_exs(read_errors, [f'{n}.' for n in range(1, 1 + len(read_errors))])

    if config_errors or read_errors or warning_count > 0:
        exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def print_exs(exs: list[Exception], headers: list):
    """Print each exception to stderr, prefixed by its header and tab-indented."""
    for header, e in zip(headers, exs):
        # Indent every line of the exception text with a tab.
        body = '\t' + '\n\t'.join(str(e).splitlines())
        print(f'{header}\n{body}', file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this module directly as a script.
if __name__ == '__main__':
    main()
|
24
src/de/p1st/monitor/string_conversion.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
from de.p1st.monitor import datetime_util
|
||||||
|
|
||||||
|
|
||||||
|
def data_types() -> dict[str, dict[str, Callable[[any], any]]]:
    """
    Returns a dictionary. Its key-value pairs contain the following:

    Key: Name of type.
    Value: Dict containing to_string and from_string conversion methods, called 'to' and 'from'.
    """
    def identity(x):
        return x

    return {
        'str': {'to': identity, 'from': identity},
        'int': {'to': str, 'from': int},
        'float': {'to': str, 'from': float},
        'datetime': {'to': datetime_util.to_str, 'from': datetime_util.from_str},
    }
|
||||||
|
|
||||||
|
def to_string(v: any, type_str: str) -> str:
    """Serialize `v` using the 'to' converter registered for `type_str`."""
    converters = data_types()[type_str]
    return converters['to'](v)
|
||||||
|
|
||||||
|
def from_string(v: str, type_str: str) -> any:
    """Parse `v` using the 'from' converter registered for `type_str`."""
    converters = data_types()[type_str]
    return converters['from'](v)
|
49
src/de/p1st/monitor/warn.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
import sys
|
||||||
|
from enum import Enum
|
||||||
|
from functools import total_ordering
|
||||||
|
|
||||||
|
|
||||||
|
# https://docs.python.org/3/library/functools.html#functools.total_ordering
|
||||||
|
@total_ordering
class WarnLevel(Enum):
    """Severity of a warning; members are totally ordered by their numeric value."""
    NONE = 0  # Not a warning. Everything is ok.
    LOW = 1
    NORMAL = 2
    HIGH = 3

    def __eq__(self, other):
        if isinstance(other, WarnLevel):
            return self.value == other.value
        return NotImplemented

    def __lt__(self, other):
        if isinstance(other, WarnLevel):
            return self.value < other.value
        return NotImplemented

    # BUGFIX: defining __eq__ implicitly sets __hash__ to None, which made
    # WarnLevel members unhashable (unusable in sets or as dict keys).
    def __hash__(self):
        return hash(self.value)
|
||||||
|
|
||||||
|
|
||||||
|
class WarnMessage:
    """A warning with a severity level and an optional message text."""

    def __init__(self, level: WarnLevel, message: str = None):
        self.level = level
        self.message = message

    def is_warning(self) -> bool:
        """True if the level is anything above NONE."""
        return self.level > WarnLevel.NONE

    def print(self, default_message: str = 'Warning!') -> WarnMessage:
        """
        Print the message to a stream matching the severity level.

        return: self
        """
        text = self.message if self.message is not None else default_message

        if self.level == WarnLevel.LOW:
            print(text)
        elif self.level == WarnLevel.NORMAL:
            print(text, file=sys.stderr)
        elif self.level == WarnLevel.HIGH:
            print(f'[CRITICAL] {text}', file=sys.stderr)
        # WarnLevel.NONE: nothing to report.

        return self
|