From e1a4dc7aa6a1034fb9e3fe9e06489e0b46048671 Mon Sep 17 00:00:00 2001 From: clore Date: Tue, 10 Dec 2024 23:48:01 +0000 Subject: [PATCH] HiveOS: on NVIDIA driver update, if unloading the nvidia module fails -> restart clore hosting --- hosting.py | 4 ++++ lib/nvidia_driver_update.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/hosting.py b/hosting.py index e662f69..1eb91b2 100644 --- a/hosting.py +++ b/hosting.py @@ -34,6 +34,10 @@ elif config.service: xfs_state = xfs.init() + if os.path.isfile("/opt/clore-hosting/.run_hive_driver_update"): + utils.run_command("PATH=/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./ nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force") + utils.run_command("systemctl restart docker") + os.remove("/opt/clore-hosting/.run_hive_driver_update") if os.path.isfile(config.restart_docker_flag_file): utils.run_command("systemctl restart docker") os.remove(config.restart_docker_flag_file) diff --git a/lib/nvidia_driver_update.py b/lib/nvidia_driver_update.py index ecd5511..429253e 100644 --- a/lib/nvidia_driver_update.py +++ b/lib/nvidia_driver_update.py @@ -44,6 +44,10 @@ async def run_update(is_hive = False): if is_hive: background_job.temporarly_disable(14400) driver_update_code, driver_update_stdout, driver_update_stderr = await utils.async_run_command("nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force", 14400, non_interactive_env_hive) + if driver_update_code == 1 and "Unload modules failed (nvidia)" in driver_update_stdout: + async with aiofiles.open("/opt/clore-hosting/.run_hive_driver_update", mode='w') as file: + await file.write("") + os._exit(0) background_job.enable() if driver_update_code == 0: async with aiofiles.open(config.restart_docker_flag_file, mode='w') as file: