Nvidia driver 550 install flag | auto pull selftest #2
			
				
			
		
		
		
	| 
						 | 
				
			
			@ -1,5 +1,6 @@
 | 
			
		|||
from lib import config as config_module
 | 
			
		||||
from lib import logging as logging_lib
 | 
			
		||||
from lib import nvidia_driver_update
 | 
			
		||||
from lib import log_streaming_task
 | 
			
		||||
from lib import run_startup_script
 | 
			
		||||
from lib import hive_miner_interface
 | 
			
		||||
| 
						 | 
				
			
			@ -108,7 +109,7 @@ class CloreClient:
 | 
			
		|||
            "partner_service": utils.unix_timestamp()
 | 
			
		||||
        }
 | 
			
		||||
        self.max_service_inactivity = 600 # seconds
 | 
			
		||||
        self.no_restart_services = ["partner_service"] # Services that are allowed to run indefinetly without triggering the app to restart
 | 
			
		||||
        self.no_restart_services = ["partner_service", "specs_service"] # Services that are allowed to run indefinetly without triggering the app to restart
 | 
			
		||||
 | 
			
		||||
        if config.debug_ws_peer:
 | 
			
		||||
            self.ws_peers[str(config.debug_ws_peer)]={
 | 
			
		||||
| 
						 | 
				
			
			@ -148,6 +149,8 @@ class CloreClient:
 | 
			
		|||
        self.partner_forwarding_ips = []
 | 
			
		||||
        self.start_time = utils.unix_timestamp()
 | 
			
		||||
 | 
			
		||||
        self.runned_pull_selftest = False
 | 
			
		||||
 | 
			
		||||
    async def service(self):
 | 
			
		||||
        global container_log_broken
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -164,9 +167,10 @@ class CloreClient:
 | 
			
		|||
        task8 = asyncio.create_task(self.background_pow_data_collection(monitoring))
 | 
			
		||||
        task9 = asyncio.create_task(self.partner_service(monitoring))
 | 
			
		||||
        monitoring_task = asyncio.create_task(self.monitoring_service(monitoring))
 | 
			
		||||
        driver_update_task = asyncio.create_task(nvidia_driver_update.update_loop(self.is_hive))
 | 
			
		||||
 | 
			
		||||
        # Wait for all service tasks to complete (they are not expected to finish)
 | 
			
		||||
        await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, task9, monitoring_task)
 | 
			
		||||
        await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, task9, monitoring_task, driver_update_task)
 | 
			
		||||
 | 
			
		||||
    async def monitoring_service(self, monitoring):
 | 
			
		||||
        while True:
 | 
			
		||||
| 
						 | 
				
			
			@ -475,7 +479,7 @@ class CloreClient:
 | 
			
		|||
                                if self.xfs_state == "active":
 | 
			
		||||
                                    self.allowed_images.append({
 | 
			
		||||
                                        "repository": "vastai/test",
 | 
			
		||||
                                        "allowed_tags": ["bandwidth-test-nvidia"]
 | 
			
		||||
                                        "allowed_tags": ["bandwidth-test-nvidia", "selftest"]
 | 
			
		||||
                                    })
 | 
			
		||||
                                if not config.debug_ws_peer:
 | 
			
		||||
                                    for pure_ws_peer in result.ws_peers:
 | 
			
		||||
| 
						 | 
				
			
			@ -505,7 +509,7 @@ class CloreClient:
 | 
			
		|||
    async def submit_specs(self, current_specs):
 | 
			
		||||
        try:
 | 
			
		||||
            if type(current_specs) == dict:
 | 
			
		||||
                current_specs["backend_version"]=19
 | 
			
		||||
                current_specs["backend_version"]=20
 | 
			
		||||
                current_specs["update_hw"]=True
 | 
			
		||||
                smallest_pcie_width = 999
 | 
			
		||||
                for gpu in current_specs["gpus"]["nvidia"]:
 | 
			
		||||
| 
						 | 
				
			
			@ -544,6 +548,12 @@ class CloreClient:
 | 
			
		|||
                    self.last_hw_specs_submit=utils.unix_timestamp()
 | 
			
		||||
                    await self.submit_specs(current_specs)
 | 
			
		||||
                await self.update_realtime_data(current_specs)
 | 
			
		||||
                try:
 | 
			
		||||
                    if self.xfs_state == "active" and len(current_specs["gpus"]["nvidia"]) > 0 and not self.runned_pull_selftest:
 | 
			
		||||
                        await clore_partner.check_to_pull_selftest(current_specs)
 | 
			
		||||
                        self.runned_pull_selftest = True
 | 
			
		||||
                except Exception as partner_exception:
 | 
			
		||||
                    pass
 | 
			
		||||
            except Exception as e:
 | 
			
		||||
                log.debug(f"FAIL | specs_service() | {e}")
 | 
			
		||||
            await asyncio.sleep(7)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -22,6 +22,11 @@ MANDATORY_PACKEGES = ['dmidecode', 'openvpn', 'iproute2']
 | 
			
		|||
 | 
			
		||||
# Identifier of the partner dummy-workload container (presumably the image name - confirm against its users)
DUMMY_WORKLOAD_CONTAINER = "cloreai/partner-dummy-workload"

# Environment handed to shell commands that must run unattended:
# DEBIAN_FRONTEND=noninteractive keeps apt/dpkg from prompting, PATH is pinned to the standard system directories
non_interactive_env = dict(
    DEBIAN_FRONTEND='noninteractive',
    PATH='/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
)

# Paths located directly under the partner base directory taken from the config
host_facts_location = os.path.join(config.clore_partner_base_dir, "host_facts")
partner_cache_location = os.path.join(config.clore_partner_base_dir, "partner_cache")
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -235,4 +240,25 @@ def filter_partner_dummy_workload_container(containers):
 | 
			
		|||
                remaining_containers.append(container)
 | 
			
		||||
        return remaining_containers
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        return containers
 | 
			
		||||
        return containers
 | 
			
		||||
 | 
			
		||||
# GPU models for which the selftest image is pulled ahead of time
auto_pull_selftest_gpus = ["NVIDIA GeForce RTX 3090", "NVIDIA GeForce RTX 4090"]

async def check_to_pull_selftest(current_specs):
    """Pull `vastai/test:selftest` in advance when the machine specs qualify.

    The image is pulled only when all of the following hold:
      - every NVIDIA GPU has the same name and that name is in `auto_pull_selftest_gpus`
      - `current_specs["ram"]` is above 7 and the CPU count (text before '/' in `current_specs["cpus"]`) is at least 4
      - the narrowest PCIe link among the GPUs is wider than x1
      - the summed GPU memory (MiB) is below `current_specs["ram"] * 1024`
      - the number at the end of `current_specs["disk"]` (with "GB" removed) is above 25

    Args:
        current_specs: specs dict; read keys are "gpus" -> "nvidia" (list of dicts with
            "name", "pcie_width", "mem_total"), "ram", "cpus" and "disk".

    Returns:
        None. This is best effort: any error (missing key, unparsable value, failed pull) is swallowed.
    """
    try:
        min_width = 16
        gpu_total_vram = 0 # MiB, summed over all cards
        gpu_name = ''
        mixed_cards = False
        for idx, nvidia_gpu in enumerate(current_specs["gpus"]["nvidia"]):
            if idx > 0 and nvidia_gpu["name"] != gpu_name:
                mixed_cards = True
            gpu_name = nvidia_gpu["name"]
            if nvidia_gpu["pcie_width"] < min_width:
                min_width = nvidia_gpu["pcie_width"]
            # Count the memory of every card, not only of the ones that lower min_width,
            # and strip the unit first - int() can't parse a value that still contains " MiB"
            if " MiB" in nvidia_gpu["mem_total"]:
                gpu_total_vram += int(nvidia_gpu["mem_total"].replace(" MiB", ""))

        eligible = (
            gpu_name in auto_pull_selftest_gpus
            and current_specs["ram"] > 7
            and int(current_specs["cpus"].split('/')[0]) >= 4
            and not mixed_cards
            and min_width > 1
            and gpu_total_vram < current_specs["ram"] * 1024
            and float(current_specs["disk"].split(' ')[-1].replace("GB", '')) > 25
        )
        if eligible:
            # 14400 is passed as the second argument of async_run_command (presumably a timeout in seconds - confirm)
            await utils.async_run_command("docker pull vastai/test:selftest", 14400, non_interactive_env)
    except Exception:
        # Pre-pulling is only an optimisation, a failure here must never disturb the caller
        pass
 | 
			
		||||
| 
						 | 
				
			
			@ -42,7 +42,8 @@ hard_config = {
 | 
			
		|||
    "openvpn_forwarding_tun_device": "tun1313",
 | 
			
		||||
    "forwarding_ip_route_table_id": 100,
 | 
			
		||||
    "clore_partner_container_name": "clore-partner-service",
 | 
			
		||||
    "restart_docker_flag_file": "/opt/clore-hosting/.restart_docker"
 | 
			
		||||
    "restart_docker_flag_file": "/opt/clore-hosting/.restart_docker",
 | 
			
		||||
    "update_driver_550_flag": "/opt/clore-hosting/.update_550"
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
parser = argparse.ArgumentParser(description='Example argparse usage')
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1,63 @@
 | 
			
		|||
from lib import config as config_module
 | 
			
		||||
from lib import docker_interface
 | 
			
		||||
from lib import utils
 | 
			
		||||
import asyncio
 | 
			
		||||
import aiofiles
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
config = config_module.config
 | 
			
		||||
 | 
			
		||||
# Environment for unattended shell commands on a regular host:
# DEBIAN_FRONTEND=noninteractive keeps apt/dpkg from prompting, PATH is pinned to the standard system directories
non_interactive_env = dict(
    DEBIAN_FRONTEND='noninteractive',
    PATH='/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
)

# Variant used by the is_hive branch of run_update(): identical except that PATH
# additionally starts with /hive/bin:/hive/sbin and ends with ./
non_interactive_env_hive = dict(
    non_interactive_env,
    PATH='/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./',
)

# run_update() only acts when the installed driver major version is below this value
INTENDED_DRIVER_VERSION = 550
 | 
			
		||||
 | 
			
		||||
async def get_running_containers():
    """Fetch the container list without blocking the event loop.

    Runs `docker_interface.get_containers(False)` in a worker thread via `asyncio.to_thread`.

    Returns:
        Whatever `docker_interface.get_containers` returns, or `False` when anything goes wrong.
    """
    try:
        return await asyncio.to_thread(docker_interface.get_containers, False)
    except Exception:
        # Callers tell failure apart from success by checking the result type
        return False
 | 
			
		||||
 | 
			
		||||
async def run_update(is_hive = False):
    """Upgrade the NVIDIA driver to the 550 branch if the host runs an older driver and is idle.

    Flow:
      1. Ask nvidia-smi for the installed driver version. Continue only when the command succeeded
         without stderr output and the major version is above 300 and below INTENDED_DRIVER_VERSION.
      2. Fetch the container list. Stop when it can't be obtained or when any container name
         starts with "clore-order" or "C." (an order is running).
      3. is_hive: run `nvidia-driver-update`; on exit code 0 create the restart-docker flag file,
         delete the update flag file and terminate this process with os._exit(0).
         otherwise: run apt to install nvidia-driver-550; on exit code 0 delete the update flag
         file and run `reboot`.

    Args:
        is_hive: selects the HiveOS update path instead of the apt based one.

    Returns:
        None. Best effort: every exception is swallowed; update_loop() calls this again
        on its next cycle for as long as the flag file exists.
    """
    try:
        # Bind the submodule explicitly - `import aiofiles` alone is not guaranteed to expose `aiofiles.os`.
        # Without it the flag removal below could raise AttributeError after a successful install,
        # which would be swallowed and leave the host without the restart/reboot.
        import aiofiles.os as aiofiles_os

        code, stdout, stderr = await utils.async_run_command("nvidia-smi --query-gpu=driver_version --format=csv,noheader", 20, env=non_interactive_env)
        if code != 0 or stderr != '' or not stdout:
            return

        # Only the first output line (first GPU) is inspected
        driver_version = stdout.split('\n')[0].split('.')
        if len(driver_version) < 2 or not driver_version[0].isdigit():
            return
        if not 300 < int(driver_version[0]) < INTENDED_DRIVER_VERSION:
            return

        running_containers = await get_running_containers()
        if not isinstance(running_containers, list):
            return # get_running_containers() returns False on failure - don't update blindly

        order_running = any(
            container.name.startswith(("clore-order", "C."))
            for container in running_containers
        )
        if order_running:
            return

        if is_hive:
            # NOTE(review): the installer is fetched over plain HTTP from a bare IP; nothing in this
            # function verifies its integrity - confirm that this is acceptable
            driver_update_code, _, _ = await utils.async_run_command("nvidia-driver-update http://45.12.132.34/NVIDIA-Linux-x86_64-550.135.run --force", 14400, non_interactive_env_hive)
            if driver_update_code == 0:
                async with aiofiles.open(config.restart_docker_flag_file, mode='w') as file:
                    await file.write("")
                await aiofiles_os.remove(config.update_driver_550_flag)
                # Ends the process immediately (presumably so it gets restarted and picks up the restart flag - confirm)
                os._exit(0)
        else:
            driver_update_code, _, _ = await utils.async_run_command("apt update -y && apt install nvidia-driver-550 -y", 14400, non_interactive_env)
            if driver_update_code == 0:
                await aiofiles_os.remove(config.update_driver_550_flag)
                await utils.async_run_command("reboot") # On ubuntu it's just safest to reboot to get new driver version working properly
    except Exception:
        # Deliberately silent: a failed attempt is retried by update_loop()
        pass
 | 
			
		||||
 | 
			
		||||
async def update_loop(is_hive):
    """Poll for the driver-update flag file and run the update while it is present.

    Every 300 seconds checks whether `config.update_driver_550_flag` exists and,
    if it does, awaits `run_update(is_hive)`. Never returns.

    Args:
        is_hive: forwarded unchanged to run_update().
    """
    # Bind the submodule explicitly - `import aiofiles` alone is not guaranteed to expose `aiofiles.os`
    import aiofiles.os as aiofiles_os

    while True:
        try:
            flag_exists = await aiofiles_os.path.exists(config.update_driver_550_flag)
        except Exception:
            # This coroutine is gathered together with the other client services,
            # so a failing flag check must not escape; treat it as "no flag" and retry next cycle
            flag_exists = False
        if flag_exists:
            await run_update(is_hive)
        await asyncio.sleep(300)
 | 
			
		||||
		Loading…
	
		Reference in New Issue