From b54bd78a45203d689617f6f0c7e70781bab1bdb9 Mon Sep 17 00:00:00 2001 From: clore Date: Tue, 10 Dec 2024 22:28:51 +0000 Subject: [PATCH 1/5] driver update to 550 flag, pull selftest for partner enabled machines --- clore_hosting/main.py | 18 ++++++++--- lib/clore_partner.py | 28 ++++++++++++++++- lib/config.py | 3 +- lib/nvidia_driver_update.py | 63 +++++++++++++++++++++++++++++++++++++ 4 files changed, 106 insertions(+), 6 deletions(-) create mode 100644 lib/nvidia_driver_update.py diff --git a/clore_hosting/main.py b/clore_hosting/main.py index f9031c5..ceb3a57 100644 --- a/clore_hosting/main.py +++ b/clore_hosting/main.py @@ -1,5 +1,6 @@ from lib import config as config_module from lib import logging as logging_lib +from lib import nvidia_driver_update from lib import log_streaming_task from lib import run_startup_script from lib import hive_miner_interface @@ -108,7 +109,7 @@ class CloreClient: "partner_service": utils.unix_timestamp() } self.max_service_inactivity = 600 # seconds - self.no_restart_services = ["partner_service"] # Services that are allowed to run indefinetly without triggering the app to restart + self.no_restart_services = ["partner_service", "specs_service"] # Services that are allowed to run indefinetly without triggering the app to restart if config.debug_ws_peer: self.ws_peers[str(config.debug_ws_peer)]={ @@ -148,6 +149,8 @@ class CloreClient: self.partner_forwarding_ips = [] self.start_time = utils.unix_timestamp() + self.runned_pull_selftest = False + async def service(self): global container_log_broken @@ -164,9 +167,10 @@ class CloreClient: task8 = asyncio.create_task(self.background_pow_data_collection(monitoring)) task9 = asyncio.create_task(self.partner_service(monitoring)) monitoring_task = asyncio.create_task(self.monitoring_service(monitoring)) + driver_update_task = asyncio.create_task(nvidia_driver_update.update_loop(self.is_hive)) # Wait for both tasks to complete (they won't in this case) - await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, task9, monitoring_task) + await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, task9, monitoring_task, driver_update_task) async def monitoring_service(self, monitoring): while True: @@ -475,7 +479,7 @@ class CloreClient: if self.xfs_state == "active": self.allowed_images.append({ "repository": "vastai/test", - "allowed_tags": ["bandwidth-test-nvidia"] + "allowed_tags": ["bandwidth-test-nvidia", "selftest"] }) if not config.debug_ws_peer: for pure_ws_peer in result.ws_peers: @@ -505,7 +509,7 @@ class CloreClient: async def submit_specs(self, current_specs): try: if type(current_specs) == dict: - current_specs["backend_version"]=19 + current_specs["backend_version"]=20 current_specs["update_hw"]=True smallest_pcie_width = 999 for gpu in current_specs["gpus"]["nvidia"]: @@ -544,6 +548,12 @@ class CloreClient: self.last_hw_specs_submit=utils.unix_timestamp() await self.submit_specs(current_specs) await self.update_realtime_data(current_specs) + try: + if self.xfs_state == "active" and len(current_specs["gpus"]["nvidia"]) > 0 and not self.runned_pull_selftest: + await clore_partner.check_to_pull_selftest(current_specs) + self.runned_pull_selftest = True + except Exception as partner_exception: + pass except Exception as e: log.debug(f"FAIL | specs_service() | {e}") await asyncio.sleep(7) diff --git a/lib/clore_partner.py b/lib/clore_partner.py index 1e971ed..5f95c6e 100644 --- a/lib/clore_partner.py +++ b/lib/clore_partner.py @@ -22,6 +22,11 @@ MANDATORY_PACKEGES = ['dmidecode', 'openvpn', 'iproute2'] DUMMY_WORKLOAD_CONTAINER = "cloreai/partner-dummy-workload" +non_interactive_env = { + 'DEBIAN_FRONTEND': 'noninteractive', + 'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', +} + host_facts_location = os.path.join(config.clore_partner_base_dir, "host_facts") partner_cache_location = os.path.join(config.clore_partner_base_dir, "partner_cache") @@ -235,4 +240,25 @@ def filter_partner_dummy_workload_container(containers): remaining_containers.append(container) return remaining_containers except Exception as e: - return containers \ No newline at end of file + return containers + +auto_pull_selftest_gpus = ["NVIDIA GeForce RTX 3090", "NVIDIA GeForce RTX 4090"] + +async def check_to_pull_selftest(current_specs): + try: + min_width = 16 + gpu_total_vram = 0 + gpu_name = '' + mixed_cards = False + for idx, nvidia_gpu in enumerate(current_specs["gpus"]["nvidia"]): + if idx > 0 and nvidia_gpu["name"] != gpu_name: + mixed_cards = True + gpu_name = nvidia_gpu["name"] + if nvidia_gpu["pcie_width"] < min_width: + min_width = nvidia_gpu["pcie_width"] + if " MiB" in nvidia_gpu["mem_total"]: + gpu_total_vram += int(nvidia_gpu["mem_total"]) + if gpu_name in auto_pull_selftest_gpus and current_specs["ram"] > 7 and int(current_specs["cpus"].split('/')[0]) >= 4 and not mixed_cards and min_width > 1 and gpu_total_vram < current_specs["ram"] * 1024 and float(current_specs["disk"].split(' ')[-1].replace("GB", '')) > 25: + await utils.async_run_command("docker pull vastai/test:selftest", 14400, non_interactive_env) + except Exception as e: + pass \ No newline at end of file diff --git a/lib/config.py b/lib/config.py index 2df2c5c..92a0c37 100644 --- a/lib/config.py +++ b/lib/config.py @@ -42,7 +42,8 @@ hard_config = { "openvpn_forwarding_tun_device": "tun1313", "forwarding_ip_route_table_id": 100, "clore_partner_container_name": "clore-partner-service", - "restart_docker_flag_file": "/opt/clore-hosting/.restart_docker" + "restart_docker_flag_file": "/opt/clore-hosting/.restart_docker", + "update_driver_550_flag": "/opt/clore-hosting/.update_550" } parser = argparse.ArgumentParser(description='Example argparse usage') diff --git a/lib/nvidia_driver_update.py b/lib/nvidia_driver_update.py new file mode 100644 index 0000000..4e9308e --- /dev/null +++ b/lib/nvidia_driver_update.py @@ -0,0 +1,63 @@ +from lib import config as config_module +from lib import docker_interface +from lib import utils +import asyncio +import aiofiles +import os + +config = config_module.config + +non_interactive_env = { + 'DEBIAN_FRONTEND': 'noninteractive', + 'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', +} + +non_interactive_env_hive = { + 'DEBIAN_FRONTEND': 'noninteractive', + 'PATH': '/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./', +} + +INTENDED_DRIVER_VERSION = 550 + +async def get_running_containers(): + try: + containers = await asyncio.to_thread(docker_interface.get_containers, False) + return containers + except Exception as e: + return False + +async def run_update(is_hive = False): + try: + code, stdout, stderr = await utils.async_run_command("nvidia-smi --query-gpu=driver_version --format=csv,noheader", 20, env=non_interactive_env) + if code == 0 and stderr == '' and stdout: + driver_version = stdout.split('\n')[0].split('.') + if len(driver_version) >= 2 and driver_version[0].isdigit() and int(driver_version[0]) > 300 and int(driver_version[0]) < INTENDED_DRIVER_VERSION: + running_containers = await get_running_containers() + if type(running_containers) == list: + order_running = False + for container in running_containers: + if container.name[:11] == "clore-order" or container.name[:2] == "C.": + order_running = True + break + if not order_running: + if is_hive: + driver_update_code, driver_update_stdout, driver_update_stderr = await utils.async_run_command("nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force", 14400, non_interactive_env_hive) + if driver_update_code == 0: + async with aiofiles.open(config.restart_docker_flag_file, mode='w') as file: + await file.write("") + await aiofiles.os.remove(config.update_driver_550_flag) + os._exit(0) + else: + driver_update_code, driver_update_stdout, driver_update_stderr = await utils.async_run_command("apt update -y && apt install nvidia-driver-550 -y", 14400, non_interactive_env) + if driver_update_code == 0: + await aiofiles.os.remove(config.update_driver_550_flag) + await utils.async_run_command("reboot") # On ubuntu it's just safest to reboot to get new driver version working properly + except Exception as e: + pass + +async def update_loop(is_hive): + while True: + flag_exists = await aiofiles.os.path.exists(config.update_driver_550_flag) + if flag_exists: + await run_update(is_hive) + await asyncio.sleep(300) \ No newline at end of file From 4f1807e3ab0b632181c31a7f4ba3774b3a4fa73e Mon Sep 17 00:00:00 2001 From: clore Date: Tue, 10 Dec 2024 23:00:09 +0000 Subject: [PATCH 2/5] disable background job while updating nvidia driver on HiveOS --- lib/nvidia_driver_update.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/nvidia_driver_update.py b/lib/nvidia_driver_update.py index 4e9308e..ecd5511 100644 --- a/lib/nvidia_driver_update.py +++ b/lib/nvidia_driver_update.py @@ -1,5 +1,6 @@ from lib import config as config_module from lib import docker_interface +from lib import background_job from lib import utils import asyncio import aiofiles @@ -41,7 +42,9 @@ async def run_update(is_hive = False): break if not order_running: if is_hive: + background_job.temporarly_disable(14400) driver_update_code, driver_update_stdout, driver_update_stderr = await utils.async_run_command("nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force", 14400, non_interactive_env_hive) + background_job.enable() if driver_update_code == 0: async with aiofiles.open(config.restart_docker_flag_file, mode='w') as file: await file.write("") From 8b9f89ff64e2739e480ddae498ce21a326b4d1a5 Mon Sep 17 00:00:00 2001 From: clore Date: Tue, 10 Dec 2024 23:11:25 +0000 Subject: [PATCH 3/5] fix exception --- lib/clore_partner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/clore_partner.py b/lib/clore_partner.py index 5f95c6e..20d397a 100644 --- a/lib/clore_partner.py +++ b/lib/clore_partner.py @@ -257,7 +257,7 @@ async def check_to_pull_selftest(current_specs): if nvidia_gpu["pcie_width"] < min_width: min_width = nvidia_gpu["pcie_width"] if " MiB" in nvidia_gpu["mem_total"]: - gpu_total_vram += int(nvidia_gpu["mem_total"]) + gpu_total_vram += int(nvidia_gpu["mem_total"].replace(" MiB", '')) if gpu_name in auto_pull_selftest_gpus and current_specs["ram"] > 7 and int(current_specs["cpus"].split('/')[0]) >= 4 and not mixed_cards and min_width > 1 and gpu_total_vram < current_specs["ram"] * 1024 and float(current_specs["disk"].split(' ')[-1].replace("GB", '')) > 25: await utils.async_run_command("docker pull vastai/test:selftest", 14400, non_interactive_env) except Exception as e: From e1a4dc7aa6a1034fb9e3fe9e06489e0b46048671 Mon Sep 17 00:00:00 2001 From: clore Date: Tue, 10 Dec 2024 23:48:01 +0000 Subject: [PATCH 4/5] HiveOS on nvidia driver update - if fails to unload nvidia -> restart clore hosting --- hosting.py | 4 ++++ lib/nvidia_driver_update.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/hosting.py b/hosting.py index e662f69..1eb91b2 100644 --- a/hosting.py +++ b/hosting.py @@ -34,6 +34,10 @@ elif config.service: xfs_state = xfs.init() + if os.path.isfile("/opt/clore-hosting/.run_hive_driver_update"): + utils.run_command("PATH=/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./ nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force") + utils.run_command("systemctl restart docker") + os.remove("/opt/clore-hosting/.run_hive_driver_update") if os.path.isfile(config.restart_docker_flag_file): utils.run_command("systemctl restart docker") os.remove(config.restart_docker_flag_file) diff --git a/lib/nvidia_driver_update.py b/lib/nvidia_driver_update.py index ecd5511..429253e 100644 --- a/lib/nvidia_driver_update.py +++ b/lib/nvidia_driver_update.py @@ -44,6 +44,10 @@ async def run_update(is_hive = False): if is_hive: background_job.temporarly_disable(14400) driver_update_code, driver_update_stdout, driver_update_stderr = await utils.async_run_command("nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force", 14400, non_interactive_env_hive) + if driver_update_code == 1 and "Unload modules failed (nvidia)" in driver_update_stdout: + async with aiofiles.open("/opt/clore-hosting/.run_hive_driver_update", mode='w') as file: + await file.write("") + os._exit(0) background_job.enable() if driver_update_code == 0: async with aiofiles.open(config.restart_docker_flag_file, mode='w') as file: From 7c369ae36bf56ed1b829210ee01dff52e1a3aca7 Mon Sep 17 00:00:00 2001 From: clore Date: Tue, 10 Dec 2024 23:54:59 +0000 Subject: [PATCH 5/5] in case we are already in progress of driver update on hive, we can allow to stop docker --- hosting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hosting.py b/hosting.py index 1eb91b2..b9d53bb 100644 --- a/hosting.py +++ b/hosting.py @@ -35,7 +35,7 @@ elif config.service: xfs_state = xfs.init() if os.path.isfile("/opt/clore-hosting/.run_hive_driver_update"): - utils.run_command("PATH=/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./ nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force") + utils.run_command("systemctl stop docker && PATH=/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./ nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force") utils.run_command("systemctl restart docker") os.remove("/opt/clore-hosting/.run_hive_driver_update") if os.path.isfile(config.restart_docker_flag_file):