"""Periodic NVIDIA driver updater.

While the flag file ``config.update_driver_550_flag`` exists, the installed
driver version is checked every few minutes and, when it is older than
``INTENDED_DRIVER_VERSION`` and no order container is running, a driver
update is attempted (Hive-specific installer or ``apt``).
"""
from lib import config as config_module
from lib import docker_interface
from lib import background_job
from lib import utils
import asyncio
import logging
import aiofiles
import aiofiles.os  # explicit: ``import aiofiles`` alone does not guarantee the ``os`` submodule is loaded
import os

config = config_module.config

logger = logging.getLogger(__name__)

non_interactive_env = {
    'DEBIAN_FRONTEND': 'noninteractive',
    'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
}

non_interactive_env_hive = {
    'DEBIAN_FRONTEND': 'noninteractive',
    'PATH': '/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./',
}

# Drivers whose major version is below this value are considered outdated.
INTENDED_DRIVER_VERSION = 550

# Major versions at or below this value are ignored by the update check.
# NOTE(review): presumably a sanity bound against bogus nvidia-smi output - confirm.
_MIN_DRIVER_MAJOR = 300

# Container name prefixes that mark a running order; no update is attempted
# while such a container exists.
_ORDER_CONTAINER_PREFIXES = ("clore-order", "C.")

# Timeout passed to the update commands; on Hive it is also the duration
# passed to background_job.temporarly_disable().
_UPDATE_TIMEOUT_SECONDS = 14400

# Pause between two iterations of update_loop().
_CHECK_INTERVAL_SECONDS = 300


async def get_running_containers():
    """Return the result of ``docker_interface.get_containers(False)``.

    The blocking call is executed in a worker thread. Any exception is
    converted into a ``False`` return value so callers can treat "could not
    query docker" as "do not proceed".
    """
    try:
        return await asyncio.to_thread(docker_interface.get_containers, False)
    except Exception:
        return False


def _outdated_driver_major(nvidia_smi_stdout):
    """Return the driver major version if it calls for an update, else ``None``.

    Only the first output line is inspected. It must look like
    ``<major>.<minor>[...]`` with a numeric major strictly between
    ``_MIN_DRIVER_MAJOR`` and ``INTENDED_DRIVER_VERSION``.
    """
    version_parts = nvidia_smi_stdout.split('\n')[0].split('.')
    if len(version_parts) < 2 or not version_parts[0].isdigit():
        return None
    major = int(version_parts[0])
    if _MIN_DRIVER_MAJOR < major < INTENDED_DRIVER_VERSION:
        return major
    return None


def _order_container_running(containers):
    """Return ``True`` if any container name starts with an order prefix."""
    return any(
        container.name.startswith(_ORDER_CONTAINER_PREFIXES)
        for container in containers
    )


async def _update_driver_hive():
    """Run the Hive driver updater and hand over to the follow-up steps.

    Background jobs are paused for the duration of the update. The process is
    terminated with ``os._exit(0)`` both when the updater reports
    "Unload modules failed" (after creating a marker file) and after a
    successful update (after creating the docker-restart flag and removing
    the update flag).
    """
    background_job.temporarly_disable(_UPDATE_TIMEOUT_SECONDS)
    # NOTE(review): the installer is fetched over plain HTTP from a bare IP
    # address and is executed by the updater - consider an authenticated
    # (HTTPS / checksum-verified) source.
    update_code, update_stdout, _ = await utils.async_run_command(
        "nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force",
        _UPDATE_TIMEOUT_SECONDS,
        env=non_interactive_env_hive,
    )

    if update_code == 1 and "Unload modules failed" in update_stdout:
        # Leave an empty marker file and stop the process immediately.
        # NOTE(review): presumably an external script picks the marker up and
        # performs the update while this service is down - confirm.
        async with aiofiles.open("/opt/clore-hosting/.run_hive_driver_update", mode='w') as file:
            await file.write("")
        os._exit(0)

    background_job.enable()

    if update_code == 0:
        # Request a docker restart, clear the update request and exit.
        async with aiofiles.open(config.restart_docker_flag_file, mode='w') as file:
            await file.write("")
        await aiofiles.os.remove(config.update_driver_550_flag)
        os._exit(0)


async def _update_driver_ubuntu():
    """Install the driver through ``apt`` and reboot on success."""
    update_code, _, _ = await utils.async_run_command(
        "apt update -y && apt install nvidia-driver-550 -y",
        _UPDATE_TIMEOUT_SECONDS,
        env=non_interactive_env,
    )
    if update_code == 0:
        await aiofiles.os.remove(config.update_driver_550_flag)
        # On ubuntu it's just safest to reboot to get new driver version working properly
        await utils.async_run_command("reboot")


async def run_update(is_hive = False):
    """Update the NVIDIA driver if it is outdated and the machine is idle.

    The update is skipped when ``nvidia-smi`` fails or prints to stderr, when
    the reported driver major is not in the outdated range, when the running
    containers cannot be listed, or when an order container is running.

    Args:
        is_hive: use the Hive updater instead of ``apt``.

    Failures are logged and never propagated (best-effort operation).
    """
    try:
        code, stdout, stderr = await utils.async_run_command(
            "nvidia-smi --query-gpu=driver_version --format=csv,noheader",
            20,
            env=non_interactive_env,
        )
        if code != 0 or stderr != '' or not stdout:
            return
        if _outdated_driver_major(stdout) is None:
            return

        running_containers = await get_running_containers()
        if not isinstance(running_containers, list):
            return
        if _order_container_running(running_containers):
            return

        if is_hive:
            await _update_driver_hive()
        else:
            await _update_driver_ubuntu()
    except Exception:
        # Best-effort: keep the caller alive, but leave a trace of the failure.
        logger.exception("NVIDIA driver update attempt failed")


async def update_loop(is_hive):
    """Check forever for the update flag file and run the update when present.

    Args:
        is_hive: forwarded to ``run_update``.

    An error during one check is logged and the loop continues, so a single
    transient failure does not stop future checks.
    """
    while True:
        try:
            if await aiofiles.os.path.exists(config.update_driver_550_flag):
                await run_update(is_hive)
        except Exception:
            logger.exception("NVIDIA driver update check failed")
        await asyncio.sleep(_CHECK_INTERVAL_SECONDS)