Compare commits

..

4 Commits

Author SHA1 Message Date
clore 83e7d7f730 enforce image_cache_allowed_prefixes 2025-03-18 20:02:41 +00:00
clore 95ea84a9a0 do not touch container param 2025-03-18 19:21:40 +00:00
clore 5af4a5c635 fix - volume.name 2025-03-04 12:37:30 +03:00
clore 9ec9a14a0e image prefixes to cache, volumes 2025-03-01 02:35:45 +00:00
6 changed files with 61 additions and 48 deletions

View File

@@ -151,6 +151,7 @@ class CloreClient:
         self.start_time = utils.unix_timestamp()
         self.runned_pull_selftest = False
+        self.image_cache_allowed_prefixes = None
         WebSocketClient.set_gpu_list(nvml.get_gpu_name_list())
         WebSocketClient.set_is_hive(self.is_hive)
@@ -279,7 +280,7 @@ class CloreClient:
         if len(got_data)>0:
             self.p_needed_containers=got_data[len(got_data)-1]
-        if len(self.p_needed_containers)>0:
+        if len(self.p_needed_containers)>0 and self.image_cache_allowed_prefixes != None and len(self.containers) > 0:
             local_images = await get_local_images(no_latest_tag=True)
             partner_images = await clore_partner.get_partner_allowed_images()
             for local_image in local_images:
@@ -324,6 +325,11 @@ class CloreClient:
                         image_needed = True
                         del self.last_pull_progress[local_image]
                         break
+                for image_needed_prefix in self.image_cache_allowed_prefixes:
+                    if local_image[:len(image_needed_prefix)] == image_needed_prefix:
+                        image_needed = True
+                        del self.last_pull_progress[local_image]
+                        break
                 if not image_needed and removed_cnt < config.max_remove_images_per_run and config.delete_unused_containers and partner_images != None:
                     log.success(f"GOING TO REMOVE {local_image}")
                     with concurrent.futures.ThreadPoolExecutor() as pool:
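
Together with the prefix collection in the next hunk, this loop keeps any local image whose name starts with a whitelisted prefix out of the unused-image cleanup. The retention test in isolation (helper name and sample values are hypothetical; the slice comparison is the one used in the diff):

def is_image_kept_by_prefix(local_image, allowed_prefixes):
    # local_image[:len(prefix)] == prefix is equivalent to
    # local_image.startswith(prefix)
    for prefix in allowed_prefixes:
        if local_image[:len(prefix)] == prefix:
            return True
    return False

# Example: keep any "cloreai/" image cached, let others be removed.
assert is_image_kept_by_prefix("cloreai/jupyter:latest", ["cloreai/"])
assert not is_image_kept_by_prefix("ubuntu:22.04", ["cloreai/"])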
@@ -409,10 +415,13 @@ class CloreClient:
         tmp_images = []
         is_order_spot = False
+        self.image_cache_allowed_prefixes=[]
         for idx, container in enumerate(self.containers):
             if "spot" in container:
                 is_order_spot = True
+            if "allow_image_cache_prefix" in container:
+                self.image_cache_allowed_prefixes.append(container["allow_image_cache_prefix"])
             if "image" in container and "image" in container and container["image"]!="cloreai/hive-use-flightsheet":
                 log_pull = False
                 if "name" in container:
@@ -519,7 +528,7 @@ class CloreClient:
     async def submit_specs(self, current_specs):
         try:
             if type(current_specs) == dict:
-                current_specs["backend_version"]=21
+                current_specs["backend_version"]=23
                 current_specs["update_hw"]=True
                 smallest_pcie_width = 999
                 for gpu in current_specs["gpus"]["nvidia"]:
@@ -554,7 +563,6 @@ class CloreClient:
            try:
                await monitoring.put("specs_service")
                current_specs = await specs.get()
                if self.last_hw_specs_submit < (utils.unix_timestamp()-1800):
                    self.last_hw_specs_submit=utils.unix_timestamp()
                    await self.submit_specs(current_specs)
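
submit_specs is throttled: full hardware specs go out at most once per 1800 seconds (30 minutes). The same timestamp gate as a standalone sketch (class and names are hypothetical):

import time

class Throttle:
    def __init__(self, interval_seconds):
        self.interval = interval_seconds
        self.last_run = 0

    def should_run(self):
        # Mirrors: if self.last_hw_specs_submit < (utils.unix_timestamp()-1800)
        if self.last_run < (int(time.time()) - self.interval):
            self.last_run = int(time.time())
            return True
        return False

submit_gate = Throttle(1800)
if submit_gate.should_run():
    pass  # submit specs here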

View File

@@ -1,4 +0,0 @@
-GPU_ID_TO_NAME = {
-    "0x20c210de": "NVIDIA CMP 170HX",
-    "0x208210de": "NVIDIA CMP 170HX"
-}

View File

@@ -19,11 +19,6 @@ log = logging_lib.log
 def deploy(validated_containers, allowed_running_containers=[], can_run_partner_workloads=False):
     local_images = docker_interface.get_local_images()
     all_containers = docker_interface.get_containers(all=True)
-    is_hive = "hive" in get_specs.get_kernel()
-    # Deploy wireguard first
     wireguard_containers = []
     rest_containers = []
     for container in validated_containers:
@@ -40,6 +35,13 @@ def deploy(validated_containers, allowed_running_containers=[], can_run_partner_workloads=False):
     needed_running_names = []
     paused_names = []
+    all_use_volumes = []
+    allowed_container_prefixes = []
+    local_volume_list = docker_interface.list_volumes()
+    clore_volume_list = []
+    for volume in local_volume_list:
+        if volume.name[:6]=="clore_":
+            clore_volume_list.append(volume.name)
     created_container_names = []
     for container in all_containers:
@@ -62,6 +64,15 @@ def deploy(validated_containers, allowed_running_containers=[], can_run_partner_workloads=False):
             else:
                 needed_running_names.append(validated_container["name"])
+            if "mandatory_volumes" in validated_container:
+                for volume_name in validated_container["mandatory_volumes"]:
+                    if volume_name[:6] == "clore_" and not volume_name in clore_volume_list:
+                        docker_interface.create_volume(volume_name)
+                all_use_volumes += validated_container["mandatory_volumes"]
+            if "allowed_container_prefixes" in validated_container:
+                allowed_container_prefixes += validated_container["allowed_container_prefixes"]
             container_options = {
                 'image': validated_container["image"],
                 'name': validated_container["name"],
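
A validated container using the two new keys could look like the sketch below; the key names and the clore_ naming rule come from the diff, all values are hypothetical. clore_-prefixed volumes listed in mandatory_volumes are created if absent and recorded in all_use_volumes, which protects them from the cleanup pass later in deploy().

validated_container = {
    "image": "cloreai/jupyter:latest",        # hypothetical image
    "name": "C.1234",                         # hypothetical container name
    "mandatory_volumes": ["clore_datasets"],  # created on demand, kept alive
    "allowed_container_prefixes": ["partner_"]  # containers deploy() must not touch
}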
@@ -163,16 +174,35 @@ def deploy(validated_containers, allowed_running_containers=[], can_run_partner_workloads=False):
             log.debug(f"Container creation issue | {e}")
             pass
+    all_use_volumes=list(dict.fromkeys(all_use_volumes))
+    for volume in local_volume_list:
+        if volume.name[:6]=="clore_" and not volume.name in all_use_volumes:
+            try:
+                volume.remove()
+            except Exception as e:
+                pass
     all_running_container_names = []
     all_stopped_container_names = []
     for container in all_containers:
         if type(container.name)==str:
-            if container.status == "running":
+            do_not_touch_container = False
+            for container_prefix in allowed_container_prefixes:
+                try:
+                    if container.name[:len(container_prefix)] == container_prefix:
+                        do_not_touch_container = True
+                except Exception as e:
+                    pass
+            if container.status == "running" and not do_not_touch_container:
                 all_running_container_names.append(container.name)
-            else:
+            elif not do_not_touch_container:
                 all_stopped_container_names.append(container.name)
-            if background_job.is_background_job_container_name(container.name) and not background_job.is_enabled():
+            if do_not_touch_container:
+                pass
+            elif background_job.is_background_job_container_name(container.name) and not background_job.is_enabled():
                 if container.status == "running":
                     container.stop()
             elif container.name in needed_running_names and container.status != 'running':
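
The "do not touch container" change means any container whose name matches an allowed_container_prefixes entry is excluded from the running/stopped bookkeeping and from the background-job stop logic, so deploy() never stops or removes it. A standalone sketch of that classification (helper name and return values are hypothetical):

def classify(container, allowed_container_prefixes):
    # Same slice-based prefix test as the diff, wrapped for clarity.
    for prefix in allowed_container_prefixes:
        if container.name[:len(prefix)] == prefix:
            return "protected"  # deploy() leaves these containers alone
    return "running" if container.status == "running" else "stopped"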

View File

@@ -444,3 +444,8 @@ def configure_exec_opts(key="native.cgroupdriver", value="cgroupfs"):
 def is_docker_default_name_lenient(container_name): # Not a perfect solution, but it will do the job,
     pattern = r'^[a-z]+_[a-z]+$'
     return re.match(pattern, container_name) is not None
+def list_volumes():
+    return client.volumes.list()
+
+def create_volume(volume_name):
+    client.volumes.create(name=volume_name)
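
Both helpers delegate to the Docker SDK for Python (docker-py), where client is the module's Docker client. A usage sketch assuming the usual docker.from_env() construction (volume name is hypothetical):

import docker

client = docker.from_env()

# create_volume(): idempotent in practice, Docker returns the existing
# volume if one with that name already exists.
client.volumes.create(name="clore_datasets")

# list_volumes(): returns Volume objects exposing .name, .remove(), etc.
for volume in client.volumes.list():
    if volume.name[:6] == "clore_":
        print(volume.name)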

View File

@@ -4,7 +4,6 @@ import xml.etree.ElementTree as ET
 from lib import docker_interface
 from typing import Dict, List, Optional
 from lib import utils
-from lib import constants
 import subprocess
 import speedtest
 import platform
@@ -257,8 +256,8 @@ def get_gpu_info():
         except Exception as e:
             pass
-    nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total,pci.device_id --format=csv")
-    nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,pci.device_id --format=csv")
+    nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total --format=csv")
+    nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv")
     if "Failed to initialize NVML" in nvidia_smi_stdout or "Failed to initialize NVML" in nvidia_smi_stderr or "Failed to initialize NVML" in nvidia_smi_xl_stdout or "Failed to initialize NVML" in nvidia_smi_xl_stderr:
         nvml_err=True
@@ -268,15 +267,10 @@ def get_gpu_info():
         for index, line in enumerate(lines_xl):
             parts = [s.strip() for s in line.split(',')]
             if len(parts)>12 and index>0:
-                gpu_name_xl = parts[1]
-                gpu_id_xl = parts[13].lower()
-                if gpu_name_xl == "NVIDIA Graphics Device" and gpu_id_xl in constants.GPU_ID_TO_NAME:
-                    gpu_name_xl = constants.GPU_ID_TO_NAME[gpu_id_xl]
                 xl_gpu_info={
                     "id":index-1,
                     "timestamp": parts[0],
-                    "name": gpu_name_xl,
+                    "name": parts[1],
                     "pcie_bus": parts[2].split(':', 1)[1],
                     "driver": parts[3],
                     "pstate": parts[4],
@@ -287,7 +281,6 @@ def get_gpu_info():
                     "mem_free": parts[11],
                     "mem_used": parts[12]
                 }
                 try:
                     pci_query = parts[2][parts[2].find(':')+1:]
                     for index, valid_pci_dev in enumerate(valid_pci_dev_list):
@@ -303,13 +296,7 @@ def get_gpu_info():
            for line in lines:
                parts = line.split(',')
                if bool(re.match(r'^[0-9]+$', parts[0])):
-                    gpu_name = parts[1].strip()
-                    gpu_id = parts[5].strip().lower()
-                    if gpu_name == "NVIDIA Graphics Device" and gpu_id in constants.GPU_ID_TO_NAME:
-                        gpu_name = constants.GPU_ID_TO_NAME[gpu_id]
-                    gpu_str = f"{len(lines)-1}x {gpu_name}"
+                    gpu_str = f"{len(lines)-1}x {parts[1].strip()}"
                    gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2)
        except Exception as e:
            nvml_err=True
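
With pci.device_id dropped from both queries, the GPU name is taken verbatim from the second CSV column. A sketch of parsing the simplified query output (sample output is illustrative; the original gates on a numeric first column, this sketch just skips the header row):

# Illustrative output of:
#   nvidia-smi --query-gpu=index,name,uuid,serial,memory.total --format=csv
sample = """index, name, uuid, serial, memory.total [MiB]
0, NVIDIA GeForce RTX 4090, GPU-xxxx, N/A, 24564 MiB"""

lines = sample.split('\n')
for line in lines[1:]:  # skip the CSV header row
    parts = [s.strip() for s in line.split(',')]
    gpu_str = f"{len(lines)-1}x {parts[1]}"  # e.g. "1x NVIDIA GeForce RTX 4090"
    print(gpu_str)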

View File

@@ -1,7 +1,6 @@
 from lib import config as config_module
 from lib import logging as logging_lib
 from lib import get_specs
-from lib import constants
 config = config_module.config
 log = logging_lib.log
@@ -98,13 +97,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                break
            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
-            gpu_name = pynvml.nvmlDeviceGetName(gpu_handle)
-            gpu_device_id = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower()
-            if gpu_name == "NVIDIA Graphics Device" and gpu_device_id in constants.GPU_ID_TO_NAME:
-                gpu_name = constants.GPU_ID_TO_NAME[gpu_device_id]
-            gpu_name_list.append(gpu_name)
+            gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle))
            if not f"{i}-{gpu_uuid}" in parsed_specs_keys:
                parsed_specs={}
                regenerate_specs=True
@@ -125,13 +118,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                max_power_limit = int(power_limits[1] / 1000.0)
                gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
                gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
-                gpu_name_regen = pynvml.nvmlDeviceGetName(gpu_handle)
-                gpu_device_id_regen = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower()
-                if gpu_name_regen == "NVIDIA Graphics Device" and gpu_device_id_regen in constants.GPU_ID_TO_NAME:
-                    gpu_name_regen = constants.GPU_ID_TO_NAME[gpu_device_id_regen]
-                gpu_spec["name"] = gpu_name_regen
+                gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
                gpu_name_list.append(gpu_spec["name"])
                gpu_spec["locks"] = mem_to_core_allowed_locks