Merge pull request 'remove update_driver_550_flag in all cases on Hive after restart of clore-hosting, pull selftest only on driver 550+ machines' (#4) from xfs into main

Reviewed-on: #4
This commit is contained in:
clore 2024-12-11 09:00:10 +00:00
commit d2c4bb6044
2 changed files with 7 additions and 1 deletion

View File

@@ -38,6 +38,10 @@ elif config.service:
utils.run_command("systemctl stop docker && PATH=/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./ nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force")
utils.run_command("systemctl restart docker")
os.remove("/opt/clore-hosting/.run_hive_driver_update")
try:
os.remove(config.update_driver_550_flag)
except Exception as e:
pass
if os.path.isfile(config.restart_docker_flag_file):
utils.run_command("systemctl restart docker")
os.remove(config.restart_docker_flag_file)

View File

@@ -250,15 +250,17 @@ async def check_to_pull_selftest(current_specs):
gpu_total_vram = 0
gpu_name = ''
mixed_cards = False
driver_version = 0
for idx, nvidia_gpu in enumerate(current_specs["gpus"]["nvidia"]):
if idx > 0 and nvidia_gpu["name"] != gpu_name:
mixed_cards = True
gpu_name = nvidia_gpu["name"]
driver_version = int(nvidia_gpu["driver"].split('.')[0])
if nvidia_gpu["pcie_width"] < min_width:
min_width = nvidia_gpu["pcie_width"]
if " MiB" in nvidia_gpu["mem_total"]:
gpu_total_vram += int(nvidia_gpu["mem_total"].replace(" MiB", ''))
if gpu_name in auto_pull_selftest_gpus and current_specs["ram"] > 7 and int(current_specs["cpus"].split('/')[0]) >= 4 and not mixed_cards and min_width > 1 and gpu_total_vram < current_specs["ram"] * 1024 and float(current_specs["disk"].split(' ')[-1].replace("GB", '')) > 25:
if driver_version >= 550 and gpu_name in auto_pull_selftest_gpus and current_specs["ram"] > 7 and int(current_specs["cpus"].split('/')[0]) >= 4 and not mixed_cards and min_width > 1 and gpu_total_vram < current_specs["ram"] * 1024 and float(current_specs["disk"].split(' ')[-1].replace("GB", '')) > 25:
await utils.async_run_command("docker pull vastai/test:selftest", 14400, non_interactive_env)
except Exception as e:
pass