63 lines
3.0 KiB
Python
63 lines
3.0 KiB
Python
|
import asyncio
import logging
import os

import aiofiles
import aiofiles.os

from lib import config as config_module
from lib import docker_interface
from lib import utils
|
||
|
|
||
|
# Shared settings object from lib.config; this module reads its flag-file paths
# (restart_docker_flag_file and update_driver_550_flag) from it.
config = config_module.config
|
||
|
|
||
|
# Environment handed to the nvidia-smi query and the apt-based driver install:
# DEBIAN_FRONTEND=noninteractive plus a fixed system PATH.
non_interactive_env = dict(
    DEBIAN_FRONTEND='noninteractive',
    PATH='/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
)
|
||
|
|
||
|
# Environment handed to the driver updater when run_update() is called with
# is_hive=True: same as the regular one, but PATH additionally starts with
# /hive/bin:/hive/sbin and ends with the current directory (./).
non_interactive_env_hive = dict(
    DEBIAN_FRONTEND='noninteractive',
    PATH='/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./',
)
|
||
|
|
||
|
# Major NVIDIA driver version targeted by run_update(): an upgrade is only
# attempted when the installed major version is below this value (and above 300).
INTENDED_DRIVER_VERSION = 550
|
||
|
|
||
|
async def get_running_containers():
    """Fetch the container list from ``docker_interface`` in a worker thread.

    Returns:
        Whatever ``docker_interface.get_containers(False)`` returns, or
        ``False`` when that call raises any ``Exception``. Callers must
        therefore check the type of the result before iterating it.
    """
    try:
        # get_containers is a blocking call, so it is pushed off the event loop.
        return await asyncio.to_thread(docker_interface.get_containers, False)
    except Exception:
        return False
|
||
|
|
||
|
def _installed_driver_major(smi_output):
    """Return the major driver version from ``nvidia-smi`` output, or ``None``.

    Only the first output line is inspected; it must contain at least one dot
    and its first dot-separated component must be all digits.
    """
    parts = smi_output.split('\n')[0].split('.')
    if len(parts) >= 2 and parts[0].isdigit():
        return int(parts[0])
    return None


def _is_order_container(name):
    """Return True when a container name starts with ``clore-order`` or ``C.``."""
    return name.startswith(("clore-order", "C."))


async def run_update(is_hive=False):
    """Upgrade the NVIDIA driver to the 550 series when it is safe to do so.

    The update is attempted only when all of the following hold:

    * ``nvidia-smi`` exits with code 0, prints nothing on stderr and reports a
      major driver version strictly between 300 and ``INTENDED_DRIVER_VERSION``;
    * ``get_running_containers()`` returns a list (it returns ``False`` on
      failure);
    * no container in that list has a name starting with ``clore-order`` or
      ``C.``.

    Args:
        is_hive: When true, run ``nvidia-driver-update`` with the hive
            environment; on success create ``config.restart_docker_flag_file``,
            remove ``config.update_driver_550_flag`` and terminate the process
            with ``os._exit(0)``. When false, install ``nvidia-driver-550``
            through apt; on success remove the flag file and reboot.

    Returns:
        None. The function is best-effort: any ``Exception`` is logged and
        swallowed so the caller's polling loop keeps running.
    """
    try:
        code, stdout, stderr = await utils.async_run_command(
            "nvidia-smi --query-gpu=driver_version --format=csv,noheader",
            20,
            env=non_interactive_env,
        )
        if code != 0 or stderr != '' or not stdout:
            return

        major = _installed_driver_major(stdout)
        if major is None or not 300 < major < INTENDED_DRIVER_VERSION:
            return

        running_containers = await get_running_containers()
        if not isinstance(running_containers, list):
            # get_running_containers() signals failure with False; without a
            # reliable container list the update is not attempted.
            return
        if any(_is_order_container(container.name) for container in running_containers):
            return

        if is_hive:
            # NOTE(review): the installer is fetched over plain HTTP from a
            # hard-coded IP address and no integrity check is visible here --
            # consider HTTPS and/or checksum verification.
            update_code, _, _ = await utils.async_run_command(
                "nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force",
                14400,
                env=non_interactive_env_hive,
            )
            if update_code == 0:
                # Create (or truncate) the restart flag as an empty file.
                async with aiofiles.open(config.restart_docker_flag_file, mode='w') as file:
                    await file.write("")
                await aiofiles.os.remove(config.update_driver_550_flag)
                # Exits immediately, skipping interpreter cleanup handlers.
                os._exit(0)
        else:
            update_code, _, _ = await utils.async_run_command(
                "apt update -y && apt install nvidia-driver-550 -y",
                14400,
                env=non_interactive_env,
            )
            if update_code == 0:
                await aiofiles.os.remove(config.update_driver_550_flag)
                # On ubuntu it's just safest to reboot to get new driver version working properly
                await utils.async_run_command("reboot")
    except Exception:
        # Deliberately best-effort, but leave a trace instead of failing silently.
        logging.getLogger(__name__).exception("NVIDIA driver update attempt failed")
|
||
|
|
||
|
async def update_loop(is_hive):
    """Poll forever for the driver-update flag file and run the update when present.

    On each pass the loop checks whether ``config.update_driver_550_flag``
    exists; if it does, ``run_update(is_hive)`` is awaited. The loop then
    sleeps 300 seconds before checking again. It never returns.

    Args:
        is_hive: Forwarded unchanged to ``run_update``.
    """
    while True:
        if await aiofiles.os.path.exists(config.update_driver_550_flag):
            await run_update(is_hive)
        await asyncio.sleep(300)
|