From a52c830c3e9b8ad2e290727a11757854381e5f56 Mon Sep 17 00:00:00 2001
From: Daniel Langbein
Date: Wed, 28 Aug 2024 21:03:14 +0200
Subject: [PATCH] improved systemd service

add wants network-online.target, better error handling
---
 hosts/yodaHedgehog/host-specific.nix | 42 ++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/hosts/yodaHedgehog/host-specific.nix b/hosts/yodaHedgehog/host-specific.nix
index 24da615..fccefd6 100644
--- a/hosts/yodaHedgehog/host-specific.nix
+++ b/hosts/yodaHedgehog/host-specific.nix
@@ -15,6 +15,11 @@ let
   # The "stay-awake" file is located at `${backup-source}:${stay-awake-file}`.
   # Example: ssh rootNas 'touch yodaHedgehog.stay-awake'
   stay-awake-file = "${config.networking.hostName}.stay-awake";
+
+  # Maximum number of attempts to establish an SSH connection with ${backup-source}.
+  retries = "10";
+  # How many seconds to wait between failed SSH connection attempts to ${backup-source}.
+  wait-seconds = "15";
 in
 {
   assertions = [{
@@ -37,6 +42,7 @@ in
   };
   systemd.services."daily-backup-and-suspend" = {
     after = [ "network-online.target" ];
+    wants = [ "network-online.target" ];
     # Packages required for this script.
     # For `ssh` and `journalwatch`, there are assertions above.
     path = with pkgs; [
@@ -57,23 +63,35 @@ in
     # Script to execute as main process.
     script = ''
       set -eu -o pipefail
-      #printf '%s\n' 'Starting backup script.'

-      # Wait until ${backup-source} is reachable.
-      #
-      # This test is necessary because of the following:
-      # If the system wakes up at 12:05, it is not directly connected to the Internet.
-      # The config option `after = [ "network-online.target" ];` does not help in this regard.
-      # Thus, `btrbk` might fail with the following error while connecting to ${backup-source}:
-      # ssh: Could not resolve hostname p1st.de: Name or service not known
-      #
-      while :; do
+      for i in $(seq 1 ${retries}); do
+        # Check if ${backup-source} is reachable via SSH.
+        #
+        # This check is useful if ${backup-source} is disconnected for a short period.
+        # Additionally, this is necessary because of the following issue:
+        # If the system resumes at 12:05, it is not directly connected to the Internet, even if "after" and "wants" are set to "network-online.target".
+        # TODO: How can we fix this?
+        # TODO: Once fixed, send notification already after first failed connection attempt (instead of fourth).
+        #
         result="$(ssh ${backup-source} 'echo ${backup-source}')" && e=0 || e=$?
-        if [ "''${e}" = 0 ] && [ "''${result}" = '${backup-source}' ]; then
+        if [ "''${e}" = 0 ] && [ "''${result}" = "${backup-source}" ]; then
+          # Continue if successful.
           break
         fi
+        # Otherwise do some error handling and try again.
+
         printf '%s\n' 'Delaying backup due to SSH connectivity problems.'
-        sleep 10s
+        # After the fourth failed connection attempt, send a notification by email.
+        if [ "''${i}" = "4" ]; then
+          printf '%s\n\n%s' 'Subject: ${config.networking.hostName}' 'Error connecting to ${backup-source}. Will retry in some seconds.' | sendmail -f langbein@mail.de daniel@systemli.org
+        fi
+        # After ${retries} failed connection attempts, send a second notification by email and give up.
+        if [ "''${i}" = "${retries}" ]; then
+          printf '%s\n\n%s' 'Subject: ${config.networking.hostName}' 'Error connecting to ${backup-source} for ${retries} times. Giving up!' | sendmail -f langbein@mail.de daniel@systemli.org
+          exit 1
+        fi
+        # Wait some seconds before repeating.
+        sleep "${wait-seconds}"s
       done

       # Pull BTRFS snapshots from ${backup-source}.