Merge pull request 'Nvidia driver 550 install flag | auto pull selftest' (#2) from xfs into main
Reviewed-on: #2
This commit is contained in:
commit
b953fab5b2
|
@ -1,5 +1,6 @@
|
||||||
from lib import config as config_module
|
from lib import config as config_module
|
||||||
from lib import logging as logging_lib
|
from lib import logging as logging_lib
|
||||||
|
from lib import nvidia_driver_update
|
||||||
from lib import log_streaming_task
|
from lib import log_streaming_task
|
||||||
from lib import run_startup_script
|
from lib import run_startup_script
|
||||||
from lib import hive_miner_interface
|
from lib import hive_miner_interface
|
||||||
|
@ -108,7 +109,7 @@ class CloreClient:
|
||||||
"partner_service": utils.unix_timestamp()
|
"partner_service": utils.unix_timestamp()
|
||||||
}
|
}
|
||||||
self.max_service_inactivity = 600 # seconds
|
self.max_service_inactivity = 600 # seconds
|
||||||
self.no_restart_services = ["partner_service"] # Services that are allowed to run indefinetly without triggering the app to restart
|
self.no_restart_services = ["partner_service", "specs_service"] # Services that are allowed to run indefinetly without triggering the app to restart
|
||||||
|
|
||||||
if config.debug_ws_peer:
|
if config.debug_ws_peer:
|
||||||
self.ws_peers[str(config.debug_ws_peer)]={
|
self.ws_peers[str(config.debug_ws_peer)]={
|
||||||
|
@ -148,6 +149,8 @@ class CloreClient:
|
||||||
self.partner_forwarding_ips = []
|
self.partner_forwarding_ips = []
|
||||||
self.start_time = utils.unix_timestamp()
|
self.start_time = utils.unix_timestamp()
|
||||||
|
|
||||||
|
self.runned_pull_selftest = False
|
||||||
|
|
||||||
async def service(self):
|
async def service(self):
|
||||||
global container_log_broken
|
global container_log_broken
|
||||||
|
|
||||||
|
@ -164,9 +167,10 @@ class CloreClient:
|
||||||
task8 = asyncio.create_task(self.background_pow_data_collection(monitoring))
|
task8 = asyncio.create_task(self.background_pow_data_collection(monitoring))
|
||||||
task9 = asyncio.create_task(self.partner_service(monitoring))
|
task9 = asyncio.create_task(self.partner_service(monitoring))
|
||||||
monitoring_task = asyncio.create_task(self.monitoring_service(monitoring))
|
monitoring_task = asyncio.create_task(self.monitoring_service(monitoring))
|
||||||
|
driver_update_task = asyncio.create_task(nvidia_driver_update.update_loop(self.is_hive))
|
||||||
|
|
||||||
# Wait for both tasks to complete (they won't in this case)
|
# Wait for both tasks to complete (they won't in this case)
|
||||||
await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, task9, monitoring_task)
|
await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, task9, monitoring_task, driver_update_task)
|
||||||
|
|
||||||
async def monitoring_service(self, monitoring):
|
async def monitoring_service(self, monitoring):
|
||||||
while True:
|
while True:
|
||||||
|
@ -475,7 +479,7 @@ class CloreClient:
|
||||||
if self.xfs_state == "active":
|
if self.xfs_state == "active":
|
||||||
self.allowed_images.append({
|
self.allowed_images.append({
|
||||||
"repository": "vastai/test",
|
"repository": "vastai/test",
|
||||||
"allowed_tags": ["bandwidth-test-nvidia"]
|
"allowed_tags": ["bandwidth-test-nvidia", "selftest"]
|
||||||
})
|
})
|
||||||
if not config.debug_ws_peer:
|
if not config.debug_ws_peer:
|
||||||
for pure_ws_peer in result.ws_peers:
|
for pure_ws_peer in result.ws_peers:
|
||||||
|
@ -505,7 +509,7 @@ class CloreClient:
|
||||||
async def submit_specs(self, current_specs):
|
async def submit_specs(self, current_specs):
|
||||||
try:
|
try:
|
||||||
if type(current_specs) == dict:
|
if type(current_specs) == dict:
|
||||||
current_specs["backend_version"]=19
|
current_specs["backend_version"]=20
|
||||||
current_specs["update_hw"]=True
|
current_specs["update_hw"]=True
|
||||||
smallest_pcie_width = 999
|
smallest_pcie_width = 999
|
||||||
for gpu in current_specs["gpus"]["nvidia"]:
|
for gpu in current_specs["gpus"]["nvidia"]:
|
||||||
|
@ -544,6 +548,12 @@ class CloreClient:
|
||||||
self.last_hw_specs_submit=utils.unix_timestamp()
|
self.last_hw_specs_submit=utils.unix_timestamp()
|
||||||
await self.submit_specs(current_specs)
|
await self.submit_specs(current_specs)
|
||||||
await self.update_realtime_data(current_specs)
|
await self.update_realtime_data(current_specs)
|
||||||
|
try:
|
||||||
|
if self.xfs_state == "active" and len(current_specs["gpus"]["nvidia"]) > 0 and not self.runned_pull_selftest:
|
||||||
|
await clore_partner.check_to_pull_selftest(current_specs)
|
||||||
|
self.runned_pull_selftest = True
|
||||||
|
except Exception as partner_exception:
|
||||||
|
pass
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.debug(f"FAIL | specs_service() | {e}")
|
log.debug(f"FAIL | specs_service() | {e}")
|
||||||
await asyncio.sleep(7)
|
await asyncio.sleep(7)
|
||||||
|
|
|
@ -34,6 +34,10 @@ elif config.service:
|
||||||
|
|
||||||
xfs_state = xfs.init()
|
xfs_state = xfs.init()
|
||||||
|
|
||||||
|
if os.path.isfile("/opt/clore-hosting/.run_hive_driver_update"):
|
||||||
|
utils.run_command("systemctl stop docker && PATH=/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./ nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force")
|
||||||
|
utils.run_command("systemctl restart docker")
|
||||||
|
os.remove("/opt/clore-hosting/.run_hive_driver_update")
|
||||||
if os.path.isfile(config.restart_docker_flag_file):
|
if os.path.isfile(config.restart_docker_flag_file):
|
||||||
utils.run_command("systemctl restart docker")
|
utils.run_command("systemctl restart docker")
|
||||||
os.remove(config.restart_docker_flag_file)
|
os.remove(config.restart_docker_flag_file)
|
||||||
|
|
|
@ -22,6 +22,11 @@ MANDATORY_PACKEGES = ['dmidecode', 'openvpn', 'iproute2']
|
||||||
|
|
||||||
DUMMY_WORKLOAD_CONTAINER = "cloreai/partner-dummy-workload"
|
DUMMY_WORKLOAD_CONTAINER = "cloreai/partner-dummy-workload"
|
||||||
|
|
||||||
|
non_interactive_env = {
|
||||||
|
'DEBIAN_FRONTEND': 'noninteractive',
|
||||||
|
'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
|
||||||
|
}
|
||||||
|
|
||||||
host_facts_location = os.path.join(config.clore_partner_base_dir, "host_facts")
|
host_facts_location = os.path.join(config.clore_partner_base_dir, "host_facts")
|
||||||
partner_cache_location = os.path.join(config.clore_partner_base_dir, "partner_cache")
|
partner_cache_location = os.path.join(config.clore_partner_base_dir, "partner_cache")
|
||||||
|
|
||||||
|
@ -235,4 +240,25 @@ def filter_partner_dummy_workload_container(containers):
|
||||||
remaining_containers.append(container)
|
remaining_containers.append(container)
|
||||||
return remaining_containers
|
return remaining_containers
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return containers
|
return containers
|
||||||
|
|
||||||
|
auto_pull_selftest_gpus = ["NVIDIA GeForce RTX 3090", "NVIDIA GeForce RTX 4090"]
|
||||||
|
|
||||||
|
async def check_to_pull_selftest(current_specs):
    """Pre-pull the ``vastai/test:selftest`` image when the host qualifies.

    The image is pulled only when every condition below holds:

    * all NVIDIA GPUs report the same name and that name is listed in
      ``auto_pull_selftest_gpus``;
    * ``current_specs["ram"]`` is greater than 7;
    * the first ``/``-separated field of ``current_specs["cpus"]`` is >= 4;
    * the narrowest PCIe link width across the GPUs is greater than 1;
    * summed GPU memory (MiB) is below ``current_specs["ram"] * 1024``;
    * the last space-separated field of ``current_specs["disk"]`` (with
      ``GB`` stripped) is greater than 25.

    Any exception (missing key, unparsable value, failing command) is
    swallowed; in that case nothing is pulled and ``None`` is returned.
    """
    try:
        cards = current_specs["gpus"]["nvidia"]

        names = [card["name"] for card in cards]
        # The name of the last card is the one checked against the allow-list.
        last_name = names[-1] if names else ''
        single_model = len(set(names)) <= 1

        # Link width is capped at 16, so only narrower links lower the result.
        narrowest_link = min([16] + [card["pcie_width"] for card in cards])

        # Cards whose memory string lacks the " MiB" suffix are not counted.
        vram_mib = sum(
            int(card["mem_total"].replace(" MiB", ''))
            for card in cards
            if " MiB" in card["mem_total"]
        )

        if last_name not in auto_pull_selftest_gpus:
            return
        if not (current_specs["ram"] > 7):
            return
        if not (int(current_specs["cpus"].split('/')[0]) >= 4):
            return
        if not single_model:
            return
        if not (narrowest_link > 1):
            return
        if not (vram_mib < current_specs["ram"] * 1024):
            return
        free_disk_gb = float(current_specs["disk"].split(' ')[-1].replace("GB", ''))
        if not (free_disk_gb > 25):
            return

        await utils.async_run_command("docker pull vastai/test:selftest", 14400, non_interactive_env)
    except Exception:
        # Best effort: a failed eligibility check or pull must never propagate.
        pass
|
|
@ -42,7 +42,8 @@ hard_config = {
|
||||||
"openvpn_forwarding_tun_device": "tun1313",
|
"openvpn_forwarding_tun_device": "tun1313",
|
||||||
"forwarding_ip_route_table_id": 100,
|
"forwarding_ip_route_table_id": 100,
|
||||||
"clore_partner_container_name": "clore-partner-service",
|
"clore_partner_container_name": "clore-partner-service",
|
||||||
"restart_docker_flag_file": "/opt/clore-hosting/.restart_docker"
|
"restart_docker_flag_file": "/opt/clore-hosting/.restart_docker",
|
||||||
|
"update_driver_550_flag": "/opt/clore-hosting/.update_550"
|
||||||
}
|
}
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Example argparse usage')
|
parser = argparse.ArgumentParser(description='Example argparse usage')
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
from lib import config as config_module
|
||||||
|
from lib import docker_interface
|
||||||
|
from lib import background_job
|
||||||
|
from lib import utils
|
||||||
|
import asyncio
|
||||||
|
import aiofiles
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Shared configuration object provided by lib.config.
config = config_module.config

# Environment for shell commands on a regular host: suppress interactive
# package prompts and pin PATH to the standard system directories.
non_interactive_env = dict(
    DEBIAN_FRONTEND='noninteractive',
    PATH='/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
)

# Same environment with a PATH that also searches the /hive directories
# (and the current directory), used for the HiveOS update command.
non_interactive_env_hive = {
    **non_interactive_env,
    'PATH': '/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./',
}

# Driver major version the updater brings hosts up to.
INTENDED_DRIVER_VERSION = 550
|
||||||
|
|
||||||
|
async def get_running_containers():
    """Fetch the container list from Docker without blocking the event loop.

    Runs ``docker_interface.get_containers(False)`` in a worker thread.

    Returns:
        The value returned by ``docker_interface.get_containers``, or
        ``False`` if the call raised any exception.
    """
    try:
        return await asyncio.to_thread(docker_interface.get_containers, False)
    except Exception:
        # False (rather than an empty list) signals "could not query Docker".
        return False
|
||||||
|
|
||||||
|
async def run_update(is_hive = False):
    """Upgrade the NVIDIA driver to the 550 series when it is safe to do so.

    The update is attempted only when all of the following hold:

    * ``nvidia-smi`` succeeds with empty stderr and reports a driver whose
      major version is above 300 and below ``INTENDED_DRIVER_VERSION``;
    * the Docker container list could be fetched;
    * no container named ``clore-order*`` or ``C.*`` is running.

    Args:
        is_hive: When true, use the ``nvidia-driver-update`` tool with the
            hive PATH; otherwise install ``nvidia-driver-550`` through apt
            and reboot.

    Returns:
        None. On some paths the process terminates itself via ``os._exit(0)``
        or issues ``reboot``.

    Every exception is swallowed on purpose; the flag file stays in place
    when the update did not complete.
    """
    # `import aiofiles` does not load the `aiofiles.os` submodule; import it
    # explicitly so `aiofiles.os.remove` below cannot fail with AttributeError
    # (which the blanket handler would hide, leaving the flag in place forever).
    import aiofiles.os

    try:
        code, stdout, stderr = await utils.async_run_command("nvidia-smi --query-gpu=driver_version --format=csv,noheader", 20, env=non_interactive_env)
        if not (code == 0 and stderr == '' and stdout):
            return

        # Only the first reported GPU is inspected, e.g. "535.129.03".
        version_parts = stdout.split('\n')[0].split('.')
        if len(version_parts) < 2 or not version_parts[0].isdigit():
            return
        major_version = int(version_parts[0])
        if not (300 < major_version < INTENDED_DRIVER_VERSION):
            return

        running_containers = await get_running_containers()
        if not isinstance(running_containers, list):
            # False means Docker could not be queried; do not risk an update.
            return

        # Never replace the driver underneath a running order container.
        if any(container.name.startswith(("clore-order", "C.")) for container in running_containers):
            return

        if is_hive:
            background_job.temporarly_disable(14400)
            # NOTE(review): the installer is fetched over plain HTTP from a
            # fixed IP and run with --force; confirm this is acceptable.
            driver_update_code, driver_update_stdout, _ = await utils.async_run_command("nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force", 14400, non_interactive_env_hive)

            if driver_update_code == 1 and "Unload modules failed (nvidia)" in driver_update_stdout:
                # Kernel modules are still in use: leave a marker file and
                # terminate the process.
                async with aiofiles.open("/opt/clore-hosting/.run_hive_driver_update", mode='w') as file:
                    await file.write("")
                os._exit(0)

            background_job.enable()

            if driver_update_code == 0:
                # Create the docker-restart flag file, clear the update
                # request, then terminate the process.
                async with aiofiles.open(config.restart_docker_flag_file, mode='w') as file:
                    await file.write("")
                await aiofiles.os.remove(config.update_driver_550_flag)
                os._exit(0)
        else:
            driver_update_code, _, _ = await utils.async_run_command("apt update -y && apt install nvidia-driver-550 -y", 14400, non_interactive_env)
            if driver_update_code == 0:
                await aiofiles.os.remove(config.update_driver_550_flag)
                await utils.async_run_command("reboot") # On ubuntu it's just safest to reboot to get new driver version working properly
    except Exception:
        # Deliberately best effort: a failed attempt must not propagate.
        pass
|
||||||
|
|
||||||
|
async def update_loop(is_hive):
    """Poll for the driver-update flag file forever and act on it.

    Every 300 seconds, checks whether ``config.update_driver_550_flag``
    exists and, if so, awaits ``run_update(is_hive)``.

    Args:
        is_hive: Forwarded unchanged to ``run_update``.

    This coroutine never returns.
    """
    # `import aiofiles` does not load the `aiofiles.os` submodule. Without
    # this explicit import the first `aiofiles.os.path.exists` call can raise
    # AttributeError, which is not caught here and would end this task.
    import aiofiles.os

    while True:
        flag_exists = await aiofiles.os.path.exists(config.update_driver_550_flag)
        if flag_exists:
            await run_update(is_hive)
        await asyncio.sleep(300)
|
Loading…
Reference in New Issue