2024-12-10 22:28:51 +00:00
|
|
|
from lib import config as config_module
|
|
|
|
from lib import docker_interface
|
2024-12-10 23:00:09 +00:00
|
|
|
from lib import background_job
|
2024-12-10 22:28:51 +00:00
|
|
|
from lib import utils
|
|
|
|
import asyncio
|
|
|
|
import aiofiles
|
|
|
|
import os
|
|
|
|
|
|
|
|
# Module-level alias for the settings object exposed by lib.config. This file
# reads `restart_docker_flag_file` and `update_driver_550_flag` from it.
config = config_module.config
|
|
|
|
|
|
|
|
# Environment passed to the nvidia-smi probe and the apt-based driver install,
# so that Debian tooling runs without interactive prompts.
non_interactive_env = dict(
    DEBIAN_FRONTEND='noninteractive',
    PATH='/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
)
|
|
|
|
|
|
|
|
# Environment passed to the hive `nvidia-driver-update` command: same as the
# generic non-interactive environment, with the /hive tool directories placed
# first on PATH and the current directory appended.
non_interactive_env_hive = dict(
    DEBIAN_FRONTEND='noninteractive',
    PATH='/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./',
)
|
|
|
|
|
|
|
|
# Target NVIDIA driver major version. run_update() only attempts an upgrade
# when the major version reported by nvidia-smi is strictly below this value.
INTENDED_DRIVER_VERSION = 550
|
|
|
|
|
|
|
|
async def get_running_containers():
    """Fetch the container list from the docker interface off the event loop.

    Calls ``docker_interface.get_containers(False)`` in a worker thread.

    Returns:
        Whatever ``docker_interface.get_containers`` returns on success, or
        ``False`` if any exception is raised while obtaining it.
    """
    try:
        return await asyncio.to_thread(docker_interface.get_containers, False)
    except Exception:
        # Best effort: callers treat a non-list result as "state unknown".
        return False
|
|
|
|
|
|
|
|
async def run_update(is_hive=False):
    """Upgrade the NVIDIA driver to the 550 series when it is safe to do so.

    Steps, each of which aborts the attempt (returning ``None``) when its
    precondition is not met:

    1. Query the installed driver version through ``nvidia-smi``. Continue
       only if the command succeeded with empty stderr and the major version
       is greater than 300 and lower than ``INTENDED_DRIVER_VERSION``.
    2. List containers. Continue only if a list was obtained and no container
       name starts with ``"clore-order"`` or ``"C."``.
    3. Run the platform-specific update:

       * ``is_hive`` true: temporarily disable the background job, run
         ``nvidia-driver-update``, then re-enable the background job.
         If the tool exits with code 1 and reports "Unload modules failed",
         an empty ``/opt/clore-hosting/.run_hive_driver_update`` file is
         written and the process exits immediately. On exit code 0 the
         docker-restart flag file is written, the update flag is removed and
         the process exits.
       * otherwise: install ``nvidia-driver-550`` through apt; on success
         remove the update flag and reboot the machine.

    Args:
        is_hive: Selects the hive update path instead of the apt path.

    Returns:
        None. Failures are logged and swallowed so the caller's polling loop
        keeps running.
    """
    import logging

    # `import aiofiles` alone does not guarantee that the `aiofiles.os`
    # submodule is loaded; import it explicitly before using it below.
    import aiofiles.os

    try:
        code, stdout, stderr = await utils.async_run_command("nvidia-smi --query-gpu=driver_version --format=csv,noheader", 20, env=non_interactive_env)
        if code != 0 or stderr != '' or not stdout:
            return

        # First output line looks like "<major>.<minor>[.<patch>]".
        version_parts = stdout.split('\n')[0].split('.')
        if len(version_parts) < 2 or not version_parts[0].isdigit():
            return
        if not 300 < int(version_parts[0]) < INTENDED_DRIVER_VERSION:
            return

        running_containers = await get_running_containers()
        if not isinstance(running_containers, list):
            # get_running_containers() returns False on failure.
            return

        # Never touch the driver while a container with an order-style name
        # is present.
        if any(container.name.startswith(("clore-order", "C.")) for container in running_containers):
            return

        if is_hive:
            background_job.temporarly_disable(14400)
            try:
                # NOTE(review): the installer is fetched over plain HTTP from a
                # hard-coded IP and then executed - confirm this is acceptable.
                driver_update_code, driver_update_stdout, _ = await utils.async_run_command("nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force", 14400, non_interactive_env_hive)
            except Exception:
                # Do not leave the background job disabled when the update
                # command itself could not be run.
                background_job.enable()
                raise

            if driver_update_code == 1 and "Unload modules failed" in driver_update_stdout:
                # Leave a marker file and terminate; presumably an external
                # supervisor picks the marker up - verify against deployment.
                async with aiofiles.open("/opt/clore-hosting/.run_hive_driver_update", mode='w') as file:
                    await file.write("")
                os._exit(0)

            background_job.enable()

            if driver_update_code == 0:
                async with aiofiles.open(config.restart_docker_flag_file, mode='w') as file:
                    await file.write("")
                await aiofiles.os.remove(config.update_driver_550_flag)
                os._exit(0)
        else:
            driver_update_code, _, _ = await utils.async_run_command("apt update -y && apt install nvidia-driver-550 -y", 14400, non_interactive_env)
            if driver_update_code == 0:
                await aiofiles.os.remove(config.update_driver_550_flag)
                await utils.async_run_command("reboot") # On ubuntu it's just safest to reboot to get new driver version working properly
    except Exception:
        # Best effort by design: record the failure instead of hiding it, and
        # let update_loop() retry on its next iteration.
        logging.getLogger(__name__).exception("NVIDIA driver update attempt failed")
|
|
|
|
|
|
|
|
async def update_loop(is_hive):
    """Poll forever for the driver-update flag and run the update when set.

    Every 300 seconds, checks whether ``config.update_driver_550_flag``
    exists and, if it does, awaits ``run_update(is_hive)``.

    Args:
        is_hive: Forwarded unchanged to ``run_update``.

    Returns:
        Never returns under normal operation.
    """
    # `import aiofiles` alone does not guarantee that the `aiofiles.os`
    # submodule is loaded. Without this, the first existence check could raise
    # AttributeError and end the loop, since nothing here catches it.
    import aiofiles.os

    while True:
        if await aiofiles.os.path.exists(config.update_driver_550_flag):
            await run_update(is_hive)
        await asyncio.sleep(300)
|