From 4e1e72da25467d8c5ee53dd46197e97dc25e957d Mon Sep 17 00:00:00 2001
From: clore
Date: Fri, 27 Dec 2024 01:31:59 +0000
Subject: [PATCH] gigaspot

---
 clore_hosting/main.py         |  3 +++
 clore_hosting/ws_interface.py | 13 ++++++++++++-
 lib/docker_cli_wrapper.py     |  4 ++--
 lib/docker_deploy.py          |  2 +-
 lib/nvml.py                   |  9 ++++++++-
 5 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/clore_hosting/main.py b/clore_hosting/main.py
index ceb3a57..28ce979 100644
--- a/clore_hosting/main.py
+++ b/clore_hosting/main.py
@@ -151,6 +151,9 @@ class CloreClient:
 
         self.runned_pull_selftest = False
 
+        WebSocketClient.set_gpu_list(nvml.get_gpu_name_list())
+        WebSocketClient.set_is_hive(self.is_hive)
+
     async def service(self):
         global container_log_broken
 
diff --git a/clore_hosting/ws_interface.py b/clore_hosting/ws_interface.py
index d3c5dba..429b3fa 100644
--- a/clore_hosting/ws_interface.py
+++ b/clore_hosting/ws_interface.py
@@ -55,7 +55,16 @@ class WebSocketClient:
         self.clore_partner_config = None
 
         self.forwarding_latency_measurment = None
+
+        self.gpu_list = []
+        self.is_hive = False
 
+    def set_gpu_list(self, gpu_list):
+        self.gpu_list = gpu_list
+
+    def set_is_hive(self, is_hive):
+        self.is_hive = is_hive
+
     def get_last_heartbeat(self):
         return self.last_heartbeat
 
@@ -113,7 +122,9 @@
                     "login":str(self.auth),
                     "xfs_state": self.xfs_state,
                     "type":"python",
-                    "clore_partner_support": True
+                    "clore_partner_support": True,
+                    "gpu_list": self.gpu_list,
+                    "is_hive": self.is_hive
                 }))
             except Exception as e:
                 log.debug(f"CLOREWS | Connection to {random_ws_peer} failed: {e} ❌")
diff --git a/lib/docker_cli_wrapper.py b/lib/docker_cli_wrapper.py
index 6c48989..0d905cc 100644
--- a/lib/docker_cli_wrapper.py
+++ b/lib/docker_cli_wrapper.py
@@ -9,11 +9,11 @@ import docker
 config = config_module.config
 log = logging_lib.log
 
-def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30):
+def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30, paused=False):
     # Sanitize and validate input
     container_options = sanitize_input(container_options)
 
-    command = ["docker", "run", "--detach", "--tty"]
+    command = ["docker", ("create" if paused else "run"), "--detach", "--tty"]
 
     if "name" in container_options:
         command.extend(["--name", container_options["name"]])
diff --git a/lib/docker_deploy.py b/lib/docker_deploy.py
index 8116085..5304293 100644
--- a/lib/docker_deploy.py
+++ b/lib/docker_deploy.py
@@ -151,7 +151,7 @@ def deploy(validated_containers, allowed_running_containers=[], can_run_partner_
 
         if not validated_container["name"] in created_container_names and image_ready and not (not background_job.is_enabled() and background_job.is_background_job_container_name(validated_container["name"])):
             if config.creation_engine == "wrapper":
-                docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus)
+                docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus, paused="paused" in validated_container)
             else:
                 container = client.containers.create(**container_options)
                 if "ip" in validated_container:
diff --git a/lib/nvml.py b/lib/nvml.py
index 8049af6..59dade3 100644
--- a/lib/nvml.py
+++ b/lib/nvml.py
@@ -68,10 +68,11 @@ GPU_CORE_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
 
 is_hive = False
 all_gpus_data_list=[]
+gpu_name_list=[]
 get_data_fail=False
 
 def init(gpu_specs_file=None, allow_hive_binaries=True):
-    global is_hive, all_gpus_data_list, get_data_fail
+    global is_hive, all_gpus_data_list, get_data_fail, gpu_name_list
     log.info("Loading GPU OC specs [ working ]")
     try:
         pynvml.nvmlInit()
@@ -96,6 +97,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                     break
                 gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                 gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
+                gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle))
                 if not f"{i}-{gpu_uuid}" in parsed_specs_keys:
                     parsed_specs={}
                     regenerate_specs=True
@@ -117,6 +119,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                 gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
                 gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
                 gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
+                gpu_name_list.append(gpu_spec["name"])
                 gpu_spec["locks"] = mem_to_core_allowed_locks
 
                 pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
@@ -201,6 +204,10 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
     print(all_gpus_data_list)
 
 # Load GPU specs
+def get_gpu_name_list():
+    global gpu_name_list
+    return gpu_name_list
+
 def get_gpu_oc_specs():
     global get_data_fail
     if get_data_fail:
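
Note (not part of the patch, illustrative only): a minimal standalone sketch of what the new `paused` parameter in docker_cli_wrapper.create_container() is expected to do, i.e. switch the wrapper from `docker run` to `docker create` so the container is prepared without being started. The helper name build_docker_command, the image tag and the sample values are hypothetical; the real function also handles GPU, network and the remaining container options.

# Illustrative sketch only; mirrors the verb selection added by this patch.
# "docker create" prepares a container without starting it, while
# "docker run" creates and starts it immediately.
def build_docker_command(name, image, paused=False):
    command = ["docker", ("create" if paused else "run"), "--detach", "--tty"]
    command.extend(["--name", name])
    command.append(image)
    return command

print(build_docker_command("spot-job", "ubuntu:22.04"))
# ['docker', 'run', '--detach', '--tty', '--name', 'spot-job', 'ubuntu:22.04']
print(build_docker_command("spot-job", "ubuntu:22.04", paused=True))
# ['docker', 'create', '--detach', '--tty', '--name', 'spot-job', 'ubuntu:22.04']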