Nvidia driver 550 install flag | auto pull selftest #2

Merged
clore merged 5 commits from xfs into main 2024-12-11 00:24:01 +00:00
4 changed files with 106 additions and 6 deletions
Showing only changes of commit b54bd78a45 - Show all commits

View File

@ -1,5 +1,6 @@
from lib import config as config_module
from lib import logging as logging_lib
from lib import nvidia_driver_update
from lib import log_streaming_task
from lib import run_startup_script
from lib import hive_miner_interface
@ -108,7 +109,7 @@ class CloreClient:
"partner_service": utils.unix_timestamp()
}
self.max_service_inactivity = 600 # seconds
self.no_restart_services = ["partner_service"] # Services that are allowed to run indefinetly without triggering the app to restart
self.no_restart_services = ["partner_service", "specs_service"] # Services that are allowed to run indefinetly without triggering the app to restart
if config.debug_ws_peer:
self.ws_peers[str(config.debug_ws_peer)]={
@ -148,6 +149,8 @@ class CloreClient:
self.partner_forwarding_ips = []
self.start_time = utils.unix_timestamp()
self.runned_pull_selftest = False
async def service(self):
global container_log_broken
@ -164,9 +167,10 @@ class CloreClient:
task8 = asyncio.create_task(self.background_pow_data_collection(monitoring))
task9 = asyncio.create_task(self.partner_service(monitoring))
monitoring_task = asyncio.create_task(self.monitoring_service(monitoring))
driver_update_task = asyncio.create_task(nvidia_driver_update.update_loop(self.is_hive))
# Wait for all service tasks to complete (they won't in this case)
await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, task9, monitoring_task)
await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, task9, monitoring_task, driver_update_task)
async def monitoring_service(self, monitoring):
while True:
@ -475,7 +479,7 @@ class CloreClient:
if self.xfs_state == "active":
self.allowed_images.append({
"repository": "vastai/test",
"allowed_tags": ["bandwidth-test-nvidia"]
"allowed_tags": ["bandwidth-test-nvidia", "selftest"]
})
if not config.debug_ws_peer:
for pure_ws_peer in result.ws_peers:
@ -505,7 +509,7 @@ class CloreClient:
async def submit_specs(self, current_specs):
try:
if type(current_specs) == dict:
current_specs["backend_version"]=19
current_specs["backend_version"]=20
current_specs["update_hw"]=True
smallest_pcie_width = 999
for gpu in current_specs["gpus"]["nvidia"]:
@ -544,6 +548,12 @@ class CloreClient:
self.last_hw_specs_submit=utils.unix_timestamp()
await self.submit_specs(current_specs)
await self.update_realtime_data(current_specs)
try:
if self.xfs_state == "active" and len(current_specs["gpus"]["nvidia"]) > 0 and not self.runned_pull_selftest:
await clore_partner.check_to_pull_selftest(current_specs)
self.runned_pull_selftest = True
except Exception as partner_exception:
pass
except Exception as e:
log.debug(f"FAIL | specs_service() | {e}")
await asyncio.sleep(7)

View File

@ -22,6 +22,11 @@ MANDATORY_PACKEGES = ['dmidecode', 'openvpn', 'iproute2']
DUMMY_WORKLOAD_CONTAINER = "cloreai/partner-dummy-workload"
non_interactive_env = {
'DEBIAN_FRONTEND': 'noninteractive',
'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
}
host_facts_location = os.path.join(config.clore_partner_base_dir, "host_facts")
partner_cache_location = os.path.join(config.clore_partner_base_dir, "partner_cache")
@ -235,4 +240,25 @@ def filter_partner_dummy_workload_container(containers):
remaining_containers.append(container)
return remaining_containers
except Exception as e:
return containers
return containers
# GPU models for which the self-test image is pulled automatically.
auto_pull_selftest_gpus = ["NVIDIA GeForce RTX 3090", "NVIDIA GeForce RTX 4090"]


def _selftest_prerequisites_met(current_specs):
    """Return True when the reported specs satisfy every self-test pull criterion.

    Reads ``current_specs["gpus"]["nvidia"]`` (entries with ``name``,
    ``pcie_width`` and ``mem_total``), ``current_specs["ram"]``,
    ``current_specs["cpus"]`` and ``current_specs["disk"]``.
    Raises KeyError/TypeError/ValueError/AttributeError/IndexError on
    malformed specs; the caller decides how to handle that.
    """
    gpus = current_specs["gpus"]["nvidia"]
    if not gpus:
        return False

    # All cards must be the same, supported model.
    gpu_names = [gpu["name"] for gpu in gpus]
    if len(set(gpu_names)) > 1 or gpu_names[-1] not in auto_pull_selftest_gpus:
        return False

    # Narrowest PCIe link among the cards, capped at 16 like the original baseline.
    min_width = min(16, min(gpu["pcie_width"] for gpu in gpus))
    if min_width <= 1:
        return False

    # mem_total is a string such as "24576 MiB"; the unit suffix has to be
    # removed before int(), otherwise the conversion raises ValueError.
    # Entries without the " MiB" suffix are not counted.
    gpu_total_vram = sum(
        int(gpu["mem_total"].replace(" MiB", ""))
        for gpu in gpus
        if " MiB" in gpu["mem_total"]
    )

    ram = current_specs["ram"]  # presumably GiB, since it is scaled by 1024 below - TODO confirm
    if ram <= 7 or gpu_total_vram >= ram * 1024:
        return False

    # "cpus" is split on '/' and the leading number must be at least 4.
    if int(current_specs["cpus"].split('/')[0]) < 4:
        return False

    # The last space-separated token of "disk" carries the size with a "GB" suffix.
    disk_gb = float(current_specs["disk"].split(' ')[-1].replace("GB", ''))
    return disk_gb > 25


async def check_to_pull_selftest(current_specs):
    """Pull ``vastai/test:selftest`` when the host specs qualify for it.

    Never raises: malformed specs and failures of the pull command are
    treated as best-effort misses.

    Returns:
        True when the host qualified and a pull was attempted (the outcome of
        the pull itself is not reported), False otherwise.
    """
    try:
        eligible = _selftest_prerequisites_met(current_specs)
    except (KeyError, IndexError, TypeError, ValueError, AttributeError):
        # Specs are missing fields or have an unexpected shape; skip quietly.
        return False
    if not eligible:
        return False

    try:
        await utils.async_run_command("docker pull vastai/test:selftest", 14400, non_interactive_env)
    except Exception:
        # Deliberately best-effort: a failed pull must not disturb the caller.
        pass
    return True

View File

@ -42,7 +42,8 @@ hard_config = {
"openvpn_forwarding_tun_device": "tun1313",
"forwarding_ip_route_table_id": 100,
"clore_partner_container_name": "clore-partner-service",
"restart_docker_flag_file": "/opt/clore-hosting/.restart_docker"
"restart_docker_flag_file": "/opt/clore-hosting/.restart_docker",
"update_driver_550_flag": "/opt/clore-hosting/.update_550"
}
parser = argparse.ArgumentParser(description='Example argparse usage')

View File

@ -0,0 +1,63 @@
from lib import config as config_module
from lib import docker_interface
from lib import utils
import asyncio
import aiofiles
import os
# Configuration object exposed by lib.config; read below for the flag-file paths.
config = config_module.config
# Environment handed to the nvidia-smi and apt commands in run_update().
# DEBIAN_FRONTEND=noninteractive keeps apt/dpkg from prompting for input.
non_interactive_env = {
'DEBIAN_FRONTEND': 'noninteractive',
'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
}
# Environment handed to the nvidia-driver-update command on hive hosts:
# same as above, with /hive/bin, /hive/sbin and ./ added to PATH.
non_interactive_env_hive = {
'DEBIAN_FRONTEND': 'noninteractive',
'PATH': '/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./',
}
# Target driver major version; run_update() only acts when the installed
# major version is below this value.
INTENDED_DRIVER_VERSION = 550
async def get_running_containers():
    """Query docker_interface.get_containers(False) in a worker thread.

    Returns whatever that call returns, or False if it raises.
    """
    try:
        # The blocking docker call is moved off the event loop.
        return await asyncio.to_thread(docker_interface.get_containers, False)
    except Exception:
        return False
async def run_update(is_hive = False):
    """Upgrade the NVIDIA driver to the 550 series when it is safe to do so.

    The update is attempted only when nvidia-smi reports a driver major
    version above 300 and below INTENDED_DRIVER_VERSION, and no container
    whose name starts with "clore-order" or "C." is running.

    Args:
        is_hive: when true, use the `nvidia-driver-update` command with the
            hive environment; otherwise install through apt and reboot.

    Returns:
        None. Every failure is swallowed; the update flag file stays in place
        unless the update command exits with code 0.
    """
    try:
        # `import aiofiles` alone does not load the `aiofiles.os` submodule;
        # import it explicitly so `aiofiles.os.remove` below cannot fail with
        # AttributeError (which would be swallowed and skip the reboot/exit).
        import aiofiles.os

        code, stdout, stderr = await utils.async_run_command(
            "nvidia-smi --query-gpu=driver_version --format=csv,noheader", 20, env=non_interactive_env
        )
        if code != 0 or stderr != '' or not stdout:
            return

        # Only the first reported GPU is inspected, e.g. "535.129.03" -> ["535", "129", "03"].
        driver_version = stdout.split('\n')[0].split('.')
        if len(driver_version) < 2 or not driver_version[0].isdigit():
            return
        if not 300 < int(driver_version[0]) < INTENDED_DRIVER_VERSION:
            return

        running_containers = await get_running_containers()
        if not isinstance(running_containers, list):
            # get_running_containers() returns False when the query failed.
            return
        # Do not touch the driver while an order container is running.
        if any(container.name.startswith(("clore-order", "C.")) for container in running_containers):
            return

        if is_hive:
            # NOTE(review): the installer is fetched over plain HTTP from a bare
            # IP address - confirm the integrity of this download is acceptable.
            driver_update_code, _, _ = await utils.async_run_command(
                "nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force",
                14400,
                non_interactive_env_hive,
            )
            if driver_update_code == 0:
                # Leave an empty restart-docker flag file, drop the update flag, then exit the process.
                async with aiofiles.open(config.restart_docker_flag_file, mode='w') as file:
                    await file.write("")
                await aiofiles.os.remove(config.update_driver_550_flag)
                os._exit(0)
        else:
            driver_update_code, _, _ = await utils.async_run_command(
                "apt update -y && apt install nvidia-driver-550 -y", 14400, non_interactive_env
            )
            if driver_update_code == 0:
                await aiofiles.os.remove(config.update_driver_550_flag)
                await utils.async_run_command("reboot") # On ubuntu it's just safest to reboot to get new driver version working properly
    except Exception:
        # Deliberately best-effort: update_loop() retries on its next pass.
        pass
async def update_loop(is_hive):
    """Check every 300 seconds for the driver-update flag file and act on it.

    Runs forever. When config.update_driver_550_flag exists, run_update() is
    awaited with the given ``is_hive`` value.

    Args:
        is_hive: forwarded unchanged to run_update().
    """
    # `import aiofiles` alone does not load the `aiofiles.os` submodule;
    # import it explicitly so the existence check below cannot fail with
    # AttributeError.
    import aiofiles.os

    while True:
        try:
            if await aiofiles.os.path.exists(config.update_driver_550_flag):
                await run_update(is_hive)
        except Exception:
            # Keep the loop alive: this coroutine is awaited through
            # asyncio.gather in service(), so an escaping exception would
            # propagate there instead of just skipping one pass.
            pass
        await asyncio.sleep(300)