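# hosting/lib/nvidia_driver_update.py
#
# (Repository web-viewer residue kept for reference: 70 lines, 3.5 KiB, Python;
# "Raw" / "Normal View" / "History" were page navigation links.)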

import asyncio
import logging
import os

import aiofiles
import aiofiles.os

from lib import config as config_module
from lib import docker_interface
from lib import background_job
from lib import utils
# Shared configuration object exposed by lib.config.
config = config_module.config

# Environment passed to commands on regular hosts: a fixed system PATH and
# DEBIAN_FRONTEND=noninteractive so package tooling does not prompt.
non_interactive_env = {
    "DEBIAN_FRONTEND": "noninteractive",
    "PATH": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
}

# Same as above, but with the /hive tool directories first on PATH and the
# current directory ("./") appended; used for the hive driver-update command.
non_interactive_env_hive = {
    "DEBIAN_FRONTEND": "noninteractive",
    "PATH": "/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./",
}

# Major driver version this module upgrades to; hosts already reporting this
# major version or newer are left untouched.
INTENDED_DRIVER_VERSION = 550
async def get_running_containers():
    """Fetch the container list from ``docker_interface`` without blocking the loop.

    The blocking ``docker_interface.get_containers(False)`` call is executed in
    a worker thread via ``asyncio.to_thread``.

    Returns:
        The value returned by ``docker_interface.get_containers(False)``
        (callers in this module expect a list of container objects), or
        ``False`` if the lookup raised.
    """
    try:
        return await asyncio.to_thread(docker_interface.get_containers, False)
    except Exception:
        # Best effort: callers treat a non-list result as "state unknown" and
        # skip their work, but the failure is logged instead of being dropped.
        logging.getLogger(__name__).warning(
            "Could not list docker containers", exc_info=True
        )
        return False
# Timeout (seconds) given to the long-running driver installation commands and
# to the temporary background-job suspension on hive hosts.
_DRIVER_UPDATE_TIMEOUT_SECONDS = 14400


def _needs_driver_update(smi_stdout):
    """Return True if the first line of nvidia-smi output is an outdated driver.

    The first line is expected to look like ``"535.129.03"``. Only a numeric
    major version strictly between 300 and ``INTENDED_DRIVER_VERSION``
    qualifies for an update.
    """
    version_parts = smi_stdout.split('\n')[0].split('.')
    if len(version_parts) < 2 or not version_parts[0].isdigit():
        return False
    return 300 < int(version_parts[0]) < INTENDED_DRIVER_VERSION


def _order_is_running(containers):
    """Return True if any container name starts with "clore-order" or "C."."""
    return any(
        container.name.startswith(("clore-order", "C."))
        for container in containers
    )


async def _update_driver_hive():
    """Run the hive ``nvidia-driver-update`` tool and exit the process when needed."""
    background_job.temporarly_disable(_DRIVER_UPDATE_TIMEOUT_SECONDS)
    try:
        # NOTE(review): the installer is fetched over plain HTTP from a bare IP
        # address; consider an HTTPS source and/or checksum verification.
        update_code, update_stdout, _ = await utils.async_run_command(
            "nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force",
            _DRIVER_UPDATE_TIMEOUT_SECONDS,
            env=non_interactive_env_hive,
        )
    except Exception:
        # Do not leave background jobs suspended if the command itself blew up.
        background_job.enable()
        raise

    if update_code == 1 and "Unload modules failed" in update_stdout:
        # Leave a marker file and terminate immediately (os._exit skips all
        # cleanup). Presumably an external script picks the marker up and
        # performs the update while this service is down - verify.
        async with aiofiles.open("/opt/clore-hosting/.run_hive_driver_update", mode='w') as file:
            await file.write("")
        os._exit(0)

    background_job.enable()
    if update_code == 0:
        # Create the docker-restart flag, clear the update flag, then exit.
        async with aiofiles.open(config.restart_docker_flag_file, mode='w') as file:
            await file.write("")
        await aiofiles.os.remove(config.update_driver_550_flag)
        os._exit(0)


async def _update_driver_apt():
    """Install the driver through apt and reboot the host on success."""
    update_code, _, _ = await utils.async_run_command(
        "apt update -y && apt install nvidia-driver-550 -y",
        _DRIVER_UPDATE_TIMEOUT_SECONDS,
        env=non_interactive_env,
    )
    if update_code == 0:
        await aiofiles.os.remove(config.update_driver_550_flag)
        # On ubuntu it's just safest to reboot to get new driver version working properly
        await utils.async_run_command("reboot")


async def run_update(is_hive = False):
    """Upgrade the NVIDIA driver when it is outdated and no order is running.

    Steps:
      1. Query the installed driver version with ``nvidia-smi``.
      2. Stop unless the major version is above 300 and below
         ``INTENDED_DRIVER_VERSION``.
      3. Stop if the container list is unavailable or any container name
         starts with "clore-order" or "C.".
      4. Run the hive updater (``is_hive=True``) or the apt-based install.

    Args:
        is_hive: Use the hive ``nvidia-driver-update`` tool instead of apt.

    Returns:
        None. Any exception is logged and suppressed so the caller's polling
        loop keeps running. The hive path may terminate the process via
        ``os._exit(0)``.
    """
    try:
        code, stdout, stderr = await utils.async_run_command(
            "nvidia-smi --query-gpu=driver_version --format=csv,noheader",
            20,
            env=non_interactive_env,
        )
        if code != 0 or stderr != '' or not stdout:
            return
        if not _needs_driver_update(stdout):
            return

        running_containers = await get_running_containers()
        if not isinstance(running_containers, list):
            # Container state unknown: do not risk interrupting an order.
            return
        if _order_is_running(running_containers):
            return

        if is_hive:
            await _update_driver_hive()
        else:
            await _update_driver_apt()
    except Exception:
        # Best effort: never propagate, but do not hide the failure either.
        logging.getLogger(__name__).exception(
            "NVIDIA driver update attempt raised an exception"
        )
async def update_loop(is_hive):
    """Poll forever for the driver-update flag file and act on it.

    Every 300 seconds this checks whether ``config.update_driver_550_flag``
    exists and, if so, awaits ``run_update(is_hive)``.

    Args:
        is_hive: Forwarded unchanged to ``run_update``.
    """
    poll_interval_seconds = 300
    while True:
        if await aiofiles.os.path.exists(config.update_driver_550_flag):
            await run_update(is_hive)
        await asyncio.sleep(poll_interval_seconds)