Nvidia driver 550 install flag | auto pull selftest #2
|
@ -1,5 +1,6 @@
|
|||
from lib import config as config_module
|
||||
from lib import logging as logging_lib
|
||||
from lib import nvidia_driver_update
|
||||
from lib import log_streaming_task
|
||||
from lib import run_startup_script
|
||||
from lib import hive_miner_interface
|
||||
|
@ -108,7 +109,7 @@ class CloreClient:
|
|||
"partner_service": utils.unix_timestamp()
|
||||
}
|
||||
self.max_service_inactivity = 600 # seconds
|
||||
self.no_restart_services = ["partner_service"] # Services that are allowed to run indefinetly without triggering the app to restart
|
||||
self.no_restart_services = ["partner_service", "specs_service"] # Services that are allowed to run indefinetly without triggering the app to restart
|
||||
|
||||
if config.debug_ws_peer:
|
||||
self.ws_peers[str(config.debug_ws_peer)]={
|
||||
|
@ -148,6 +149,8 @@ class CloreClient:
|
|||
self.partner_forwarding_ips = []
|
||||
self.start_time = utils.unix_timestamp()
|
||||
|
||||
self.runned_pull_selftest = False
|
||||
|
||||
async def service(self):
|
||||
global container_log_broken
|
||||
|
||||
|
@ -164,9 +167,10 @@ class CloreClient:
|
|||
task8 = asyncio.create_task(self.background_pow_data_collection(monitoring))
|
||||
task9 = asyncio.create_task(self.partner_service(monitoring))
|
||||
monitoring_task = asyncio.create_task(self.monitoring_service(monitoring))
|
||||
driver_update_task = asyncio.create_task(nvidia_driver_update.update_loop(self.is_hive))
|
||||
|
||||
# Wait for both tasks to complete (they won't in this case)
|
||||
await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, task9, monitoring_task)
|
||||
await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, task9, monitoring_task, driver_update_task)
|
||||
|
||||
async def monitoring_service(self, monitoring):
|
||||
while True:
|
||||
|
@ -475,7 +479,7 @@ class CloreClient:
|
|||
if self.xfs_state == "active":
|
||||
self.allowed_images.append({
|
||||
"repository": "vastai/test",
|
||||
"allowed_tags": ["bandwidth-test-nvidia"]
|
||||
"allowed_tags": ["bandwidth-test-nvidia", "selftest"]
|
||||
})
|
||||
if not config.debug_ws_peer:
|
||||
for pure_ws_peer in result.ws_peers:
|
||||
|
@ -505,7 +509,7 @@ class CloreClient:
|
|||
async def submit_specs(self, current_specs):
|
||||
try:
|
||||
if type(current_specs) == dict:
|
||||
current_specs["backend_version"]=19
|
||||
current_specs["backend_version"]=20
|
||||
current_specs["update_hw"]=True
|
||||
smallest_pcie_width = 999
|
||||
for gpu in current_specs["gpus"]["nvidia"]:
|
||||
|
@ -544,6 +548,12 @@ class CloreClient:
|
|||
self.last_hw_specs_submit=utils.unix_timestamp()
|
||||
await self.submit_specs(current_specs)
|
||||
await self.update_realtime_data(current_specs)
|
||||
try:
|
||||
if self.xfs_state == "active" and len(current_specs["gpus"]["nvidia"]) > 0 and not self.runned_pull_selftest:
|
||||
await clore_partner.check_to_pull_selftest(current_specs)
|
||||
self.runned_pull_selftest = True
|
||||
except Exception as partner_exception:
|
||||
pass
|
||||
except Exception as e:
|
||||
log.debug(f"FAIL | specs_service() | {e}")
|
||||
await asyncio.sleep(7)
|
||||
|
|
|
@ -34,6 +34,10 @@ elif config.service:
|
|||
|
||||
xfs_state = xfs.init()
|
||||
|
||||
if os.path.isfile("/opt/clore-hosting/.run_hive_driver_update"):
|
||||
utils.run_command("systemctl stop docker && PATH=/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./ nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force")
|
||||
utils.run_command("systemctl restart docker")
|
||||
os.remove("/opt/clore-hosting/.run_hive_driver_update")
|
||||
if os.path.isfile(config.restart_docker_flag_file):
|
||||
utils.run_command("systemctl restart docker")
|
||||
os.remove(config.restart_docker_flag_file)
|
||||
|
|
|
@ -22,6 +22,11 @@ MANDATORY_PACKEGES = ['dmidecode', 'openvpn', 'iproute2']
|
|||
|
||||
DUMMY_WORKLOAD_CONTAINER = "cloreai/partner-dummy-workload"
|
||||
|
||||
non_interactive_env = {
|
||||
'DEBIAN_FRONTEND': 'noninteractive',
|
||||
'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
|
||||
}
|
||||
|
||||
host_facts_location = os.path.join(config.clore_partner_base_dir, "host_facts")
|
||||
partner_cache_location = os.path.join(config.clore_partner_base_dir, "partner_cache")
|
||||
|
||||
|
@ -235,4 +240,25 @@ def filter_partner_dummy_workload_container(containers):
|
|||
remaining_containers.append(container)
|
||||
return remaining_containers
|
||||
except Exception as e:
|
||||
return containers
|
||||
return containers
|
||||
|
||||
# GPU models for which the self-test image is pre-pulled automatically.
auto_pull_selftest_gpus = ["NVIDIA GeForce RTX 3090", "NVIDIA GeForce RTX 4090"]

async def check_to_pull_selftest(current_specs):
    """Pre-pull the ``vastai/test:selftest`` Docker image when the host looks eligible.

    Best-effort: any error (missing keys, unexpected value formats, docker
    failure) is swallowed so spec submission is never disrupted.

    Args:
        current_specs: host specs dict; assumed to contain
            ``gpus.nvidia`` (list of dicts with ``name``, ``pcie_width``,
            ``mem_total`` like ``"24576 MiB"``), ``ram`` (GiB, numeric),
            ``cpus`` (``"threads/..."``-style string) and ``disk``
            (string ending in ``"<n>GB"``) — TODO confirm against the
            specs producer.
    """
    try:
        min_width = 16       # narrowest PCIe link width seen so far (starts at max x16)
        gpu_total_vram = 0   # summed VRAM across all GPUs, in MiB
        gpu_name = ''        # name of the last GPU seen
        mixed_cards = False  # set when not all GPUs share the same model name
        for idx, nvidia_gpu in enumerate(current_specs["gpus"]["nvidia"]):
            # Any name differing from the previous card marks the rig as mixed.
            if idx > 0 and nvidia_gpu["name"] != gpu_name:
                mixed_cards = True
            gpu_name = nvidia_gpu["name"]
            if nvidia_gpu["pcie_width"] < min_width:
                min_width = nvidia_gpu["pcie_width"]
            # Only MiB-formatted totals are counted; other formats are skipped.
            if " MiB" in nvidia_gpu["mem_total"]:
                gpu_total_vram += int(nvidia_gpu["mem_total"].replace(" MiB", ''))
        # Eligibility gate: supported GPU model, > 7 GiB RAM, >= 4 CPU threads,
        # uniform cards, every card wider than PCIe x1, total VRAM below host
        # RAM (MiB vs GiB*1024), and more than 25 GB of disk.
        if gpu_name in auto_pull_selftest_gpus and current_specs["ram"] > 7 and int(current_specs["cpus"].split('/')[0]) >= 4 and not mixed_cards and min_width > 1 and gpu_total_vram < current_specs["ram"] * 1024 and float(current_specs["disk"].split(' ')[-1].replace("GB", '')) > 25:
            # Pull can be slow on weak links; allow up to 4 hours.
            await utils.async_run_command("docker pull vastai/test:selftest", 14400, non_interactive_env)
    except Exception as e:
        pass
|
|
@ -42,7 +42,8 @@ hard_config = {
|
|||
"openvpn_forwarding_tun_device": "tun1313",
|
||||
"forwarding_ip_route_table_id": 100,
|
||||
"clore_partner_container_name": "clore-partner-service",
|
||||
"restart_docker_flag_file": "/opt/clore-hosting/.restart_docker"
|
||||
"restart_docker_flag_file": "/opt/clore-hosting/.restart_docker",
|
||||
"update_driver_550_flag": "/opt/clore-hosting/.update_550"
|
||||
}
|
||||
|
||||
parser = argparse.ArgumentParser(description='Example argparse usage')
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
import asyncio
import os

import aiofiles
# aiofiles.os is a submodule and is NOT pulled in by "import aiofiles";
# this file calls aiofiles.os.remove / aiofiles.os.path.exists, so it must
# be imported explicitly.
import aiofiles.os

from lib import background_job
from lib import config as config_module
from lib import docker_interface
from lib import utils
|
||||
|
||||
config = config_module.config
|
||||
|
||||
non_interactive_env = {
|
||||
'DEBIAN_FRONTEND': 'noninteractive',
|
||||
'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
|
||||
}
|
||||
|
||||
non_interactive_env_hive = {
|
||||
'DEBIAN_FRONTEND': 'noninteractive',
|
||||
'PATH': '/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./',
|
||||
}
|
||||
|
||||
INTENDED_DRIVER_VERSION = 550
|
||||
|
||||
async def get_running_containers():
    """Fetch the current Docker containers without blocking the event loop.

    Returns:
        list: container objects from ``docker_interface.get_containers``,
        or ``False`` when the query fails for any reason (callers type-check
        the result before iterating).
    """
    try:
        # get_containers is a blocking Docker API call; run it in a worker
        # thread so the asyncio loop stays responsive.
        return await asyncio.to_thread(docker_interface.get_containers, False)
    except Exception:
        # Keep the established False sentinel: run_update() detects failure
        # via `type(...) == list` and simply skips the update cycle.
        return False
|
||||
|
||||
async def run_update(is_hive = False):
    """Attempt to upgrade the NVIDIA driver toward the 550 series.

    Runs only when: nvidia-smi reports a plausible driver major version
    below INTENDED_DRIVER_VERSION, the container list can be read, and no
    customer order container (name prefixed "clore-order" or "C.") is
    running. On Hive OS the hive `nvidia-driver-update` installer is used;
    otherwise the driver is installed via apt and the machine is rebooted.

    Side effects: may write flag files under /opt/clore-hosting, remove the
    update flag, and terminate the whole process via os._exit(0).
    Best-effort: every failure is swallowed so the caller's poll loop keeps
    running.

    Args:
        is_hive (bool): True when running on Hive OS.
    """
    try:
        code, stdout, stderr = await utils.async_run_command("nvidia-smi --query-gpu=driver_version --format=csv,noheader", 20, env=non_interactive_env)
        if code == 0 and stderr == '' and stdout:
            # First line, split on '.', e.g. "535.183.01" -> ["535","183","01"].
            driver_version = stdout.split('\n')[0].split('.')
            # Sanity-check the major version (> 300 rejects garbage output)
            # and only proceed when it is older than the intended 550 series.
            if len(driver_version) >= 2 and driver_version[0].isdigit() and int(driver_version[0]) > 300 and int(driver_version[0]) < INTENDED_DRIVER_VERSION:
                running_containers = await get_running_containers()
                # get_running_containers() returns False on failure; a list
                # (possibly empty) means the query succeeded.
                if type(running_containers) == list:
                    order_running = False
                    for container in running_containers:
                        # Customer workloads: "clore-order*" or "C.*" names.
                        if container.name[:11] == "clore-order" or container.name[:2] == "C.":
                            order_running = True
                            break
                    # Never touch the driver while an order is running.
                    if not order_running:
                        if is_hive:
                            # Pause background jobs for up to 4h while the
                            # hive installer runs.
                            background_job.temporarly_disable(14400)
                            driver_update_code, driver_update_stdout, driver_update_stderr = await utils.async_run_command("nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force", 14400, non_interactive_env_hive)
                            # If the kernel module could not be unloaded,
                            # leave a marker so startup code retries the
                            # install with docker stopped, then restart the
                            # whole agent.
                            if driver_update_code == 1 and "Unload modules failed (nvidia)" in driver_update_stdout:
                                async with aiofiles.open("/opt/clore-hosting/.run_hive_driver_update", mode='w') as file:
                                    await file.write("")
                                os._exit(0)
                            background_job.enable()
                            if driver_update_code == 0:
                                # Success: schedule a docker restart on next
                                # boot, clear the update flag, and restart
                                # the agent process.
                                async with aiofiles.open(config.restart_docker_flag_file, mode='w') as file:
                                    await file.write("")
                                await aiofiles.os.remove(config.update_driver_550_flag)
                                os._exit(0)
                        else:
                            driver_update_code, driver_update_stdout, driver_update_stderr = await utils.async_run_command("apt update -y && apt install nvidia-driver-550 -y", 14400, non_interactive_env)
                            if driver_update_code == 0:
                                await aiofiles.os.remove(config.update_driver_550_flag)
                                await utils.async_run_command("reboot") # On ubuntu it's just safest to reboot to get new driver version working properly
    except Exception as e:
        pass
|
||||
|
||||
async def update_loop(is_hive):
    """Poll every 5 minutes for the driver-550 update flag file.

    Whenever the flag file exists, a driver update attempt is made via
    run_update(); the loop itself never exits.

    Args:
        is_hive: True when running on Hive OS (forwarded to run_update).
    """
    while True:
        if await aiofiles.os.path.exists(config.update_driver_550_flag):
            await run_update(is_hive)
        await asyncio.sleep(300)
|
Loading…
Reference in New Issue