Merge pull request 'Remove update_driver_550_flag in all cases on Hive after clore-hosting restarts; pull selftest only on machines with driver 550+' (#4) from xfs into main
Reviewed-on: #4
This commit is contained in:
commit
d2c4bb6044
|
@ -38,6 +38,10 @@ elif config.service:
|
||||||
utils.run_command("systemctl stop docker && PATH=/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./ nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force")
|
utils.run_command("systemctl stop docker && PATH=/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./ nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force")
|
||||||
utils.run_command("systemctl restart docker")
|
utils.run_command("systemctl restart docker")
|
||||||
os.remove("/opt/clore-hosting/.run_hive_driver_update")
|
os.remove("/opt/clore-hosting/.run_hive_driver_update")
|
||||||
|
try:
|
||||||
|
os.remove(config.update_driver_550_flag)
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
if os.path.isfile(config.restart_docker_flag_file):
|
if os.path.isfile(config.restart_docker_flag_file):
|
||||||
utils.run_command("systemctl restart docker")
|
utils.run_command("systemctl restart docker")
|
||||||
os.remove(config.restart_docker_flag_file)
|
os.remove(config.restart_docker_flag_file)
|
||||||
|
|
|
@ -250,15 +250,17 @@ async def check_to_pull_selftest(current_specs):
|
||||||
gpu_total_vram = 0
|
gpu_total_vram = 0
|
||||||
gpu_name = ''
|
gpu_name = ''
|
||||||
mixed_cards = False
|
mixed_cards = False
|
||||||
|
driver_version = 0
|
||||||
for idx, nvidia_gpu in enumerate(current_specs["gpus"]["nvidia"]):
|
for idx, nvidia_gpu in enumerate(current_specs["gpus"]["nvidia"]):
|
||||||
if idx > 0 and nvidia_gpu["name"] != gpu_name:
|
if idx > 0 and nvidia_gpu["name"] != gpu_name:
|
||||||
mixed_cards = True
|
mixed_cards = True
|
||||||
gpu_name = nvidia_gpu["name"]
|
gpu_name = nvidia_gpu["name"]
|
||||||
|
driver_version = int(nvidia_gpu["driver"].split('.')[0])
|
||||||
if nvidia_gpu["pcie_width"] < min_width:
|
if nvidia_gpu["pcie_width"] < min_width:
|
||||||
min_width = nvidia_gpu["pcie_width"]
|
min_width = nvidia_gpu["pcie_width"]
|
||||||
if " MiB" in nvidia_gpu["mem_total"]:
|
if " MiB" in nvidia_gpu["mem_total"]:
|
||||||
gpu_total_vram += int(nvidia_gpu["mem_total"].replace(" MiB", ''))
|
gpu_total_vram += int(nvidia_gpu["mem_total"].replace(" MiB", ''))
|
||||||
if gpu_name in auto_pull_selftest_gpus and current_specs["ram"] > 7 and int(current_specs["cpus"].split('/')[0]) >= 4 and not mixed_cards and min_width > 1 and gpu_total_vram < current_specs["ram"] * 1024 and float(current_specs["disk"].split(' ')[-1].replace("GB", '')) > 25:
|
if driver_version >= 550 and gpu_name in auto_pull_selftest_gpus and current_specs["ram"] > 7 and int(current_specs["cpus"].split('/')[0]) >= 4 and not mixed_cards and min_width > 1 and gpu_total_vram < current_specs["ram"] * 1024 and float(current_specs["disk"].split(' ')[-1].replace("GB", '')) > 25:
|
||||||
await utils.async_run_command("docker pull vastai/test:selftest", 14400, non_interactive_env)
|
await utils.async_run_command("docker pull vastai/test:selftest", 14400, non_interactive_env)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pass
|
pass
|
Loading…
Reference in New Issue