Patch summary (review copy). Line breaks and indentation were reconstructed
from a whitespace-collapsed source; hunk line counts match the headers, but
context-line whitespace should be checked against the target tree before
applying.

  * clore_hosting/main.py, clore_hosting/ws_interface.py: hand the GPU name
    list and the is_hive flag to WebSocketClient and add them as "gpu_list"
    and "is_hive" to the message dict that carries "login".
  * lib/docker_cli_wrapper.py, lib/docker_deploy.py: add a `paused` option to
    create_container() that uses `docker create` instead of `docker run`;
    deploy() sets it when the container spec has a "paused" key.
    Review fix: `--detach` is now passed only on the `docker run` path,
    because `docker create` does not accept that flag.
  * lib/nvml.py: collect GPU names into gpu_name_list during init() and
    expose them through get_gpu_name_list().

NOTE(review): gpu_name_list is appended to in two hunks of nvml.init() (next
to the UUID check, and where gpu_spec["name"] is set) and is not cleared in
the visible code - confirm it cannot end up with duplicate entries.
NOTE(review): set_gpu_list()/set_is_hive() are instance methods; main.py
calls them on the name WebSocketClient - confirm that name is bound to an
instance there.

diff --git a/clore_hosting/main.py b/clore_hosting/main.py
index ceb3a57..28ce979 100644
--- a/clore_hosting/main.py
+++ b/clore_hosting/main.py
@@ -151,6 +151,9 @@ class CloreClient:
 
         self.runned_pull_selftest = False
 
+        WebSocketClient.set_gpu_list(nvml.get_gpu_name_list())
+        WebSocketClient.set_is_hive(self.is_hive)
+
     async def service(self):
         global container_log_broken
 
diff --git a/clore_hosting/ws_interface.py b/clore_hosting/ws_interface.py
index d3c5dba..429b3fa 100644
--- a/clore_hosting/ws_interface.py
+++ b/clore_hosting/ws_interface.py
@@ -55,7 +55,16 @@ class WebSocketClient:
 
         self.clore_partner_config = None
         self.forwarding_latency_measurment = None
+
+        self.gpu_list = []
+        self.is_hive = False
 
+    def set_gpu_list(self, gpu_list):
+        self.gpu_list = gpu_list
+
+    def set_is_hive(self, is_hive):
+        self.is_hive = is_hive
+
     def get_last_heartbeat(self):
         return self.last_heartbeat
 
@@ -113,7 +122,9 @@ class WebSocketClient:
                         "login":str(self.auth),
                         "xfs_state": self.xfs_state,
                         "type":"python",
-                        "clore_partner_support": True
+                        "clore_partner_support": True,
+                        "gpu_list": self.gpu_list,
+                        "is_hive": self.is_hive
                     }))
             except Exception as e:
                 log.debug(f"CLOREWS | Connection to {random_ws_peer} failed: {e} ❌")
diff --git a/lib/docker_cli_wrapper.py b/lib/docker_cli_wrapper.py
index 6c48989..0d905cc 100644
--- a/lib/docker_cli_wrapper.py
+++ b/lib/docker_cli_wrapper.py
@@ -9,11 +9,11 @@ import docker
 config = config_module.config
 log = logging_lib.log
 
-def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30):
+def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30, paused=False):
     # Sanitize and validate input
     container_options = sanitize_input(container_options)
 
-    command = ["docker", "run", "--detach", "--tty"]
+    command = ["docker", "create", "--tty"] if paused else ["docker", "run", "--detach", "--tty"]
 
     if "name" in container_options:
         command.extend(["--name", container_options["name"]])
diff --git a/lib/docker_deploy.py b/lib/docker_deploy.py
index 8116085..5304293 100644
--- a/lib/docker_deploy.py
+++ b/lib/docker_deploy.py
@@ -151,7 +151,7 @@ def deploy(validated_containers, allowed_running_containers=[], can_run_partner_
 
             if not validated_container["name"] in created_container_names and image_ready and not (not background_job.is_enabled() and background_job.is_background_job_container_name(validated_container["name"])):
                 if config.creation_engine == "wrapper":
-                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus)
+                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus, paused="paused" in validated_container)
                 else:
                     container = client.containers.create(**container_options)
                     if "ip" in validated_container:
diff --git a/lib/nvml.py b/lib/nvml.py
index 8049af6..59dade3 100644
--- a/lib/nvml.py
+++ b/lib/nvml.py
@@ -68,10 +68,11 @@ GPU_CORE_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
 
 is_hive = False
 all_gpus_data_list=[]
+gpu_name_list=[]
 get_data_fail=False
 
 def init(gpu_specs_file=None, allow_hive_binaries=True):
-    global is_hive, all_gpus_data_list, get_data_fail
+    global is_hive, all_gpus_data_list, get_data_fail, gpu_name_list
     log.info("Loading GPU OC specs [ working ]")
     try:
         pynvml.nvmlInit()
@@ -96,6 +97,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                 break
             gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
             gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
+            gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle))
             if not f"{i}-{gpu_uuid}" in parsed_specs_keys:
                 parsed_specs={}
                 regenerate_specs=True
@@ -117,6 +119,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                 gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
                 gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
                 gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
+                gpu_name_list.append(gpu_spec["name"])
                 gpu_spec["locks"] = mem_to_core_allowed_locks
 
                 pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
@@ -201,6 +204,10 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
         print(all_gpus_data_list)
     # Load GPU specs
 
+def get_gpu_name_list():
+    global gpu_name_list
+    return gpu_name_list
+
 def get_gpu_oc_specs():
     global get_data_fail
     if get_data_fail: