Compare commits

..

4 Commits

Author SHA1 Message Date
clore 83e7d7f730 enforce image_cache_allowed_prefixes 2025-03-18 20:02:41 +00:00
clore 95ea84a9a0 do not touch container param 2025-03-18 19:21:40 +00:00
clore 5af4a5c635 fix - volume.name 2025-03-04 12:37:30 +03:00
clore 9ec9a14a0e image prefixes to cache, volumes 2025-03-01 02:35:45 +00:00
6 changed files with 61 additions and 48 deletions

View File

@@ -151,6 +151,7 @@ class CloreClient:
         self.start_time = utils.unix_timestamp()
         self.runned_pull_selftest = False
+        self.image_cache_allowed_prefixes = None
         WebSocketClient.set_gpu_list(nvml.get_gpu_name_list())
         WebSocketClient.set_is_hive(self.is_hive)
@@ -279,7 +280,7 @@ class CloreClient:
         if len(got_data)>0:
             self.p_needed_containers=got_data[len(got_data)-1]
-        if len(self.p_needed_containers)>0:
+        if len(self.p_needed_containers)>0 and self.image_cache_allowed_prefixes != None and len(self.containers) > 0:
             local_images = await get_local_images(no_latest_tag=True)
             partner_images = await clore_partner.get_partner_allowed_images()
             for local_image in local_images:
@@ -324,6 +325,11 @@ class CloreClient:
                         image_needed = True
                         del self.last_pull_progress[local_image]
                         break
+                for image_needed_prefix in self.image_cache_allowed_prefixes:
+                    if local_image[:len(image_needed_prefix)] == image_needed_prefix:
+                        image_needed = True
+                        del self.last_pull_progress[local_image]
+                        break
                 if not image_needed and removed_cnt < config.max_remove_images_per_run and config.delete_unused_containers and partner_images != None:
                     log.success(f"GOING TO REMOVE {local_image}")
                     with concurrent.futures.ThreadPoolExecutor() as pool:
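
Together with the prefix collection in the next hunk, this loop keeps any local image whose name starts with a whitelisted prefix out of the unused-image cleanup. The retention test in isolation (helper name and sample values are hypothetical; the slice comparison is the one used in the diff):

def is_image_kept_by_prefix(local_image, allowed_prefixes):
    # local_image[:len(prefix)] == prefix is equivalent to
    # local_image.startswith(prefix)
    for prefix in allowed_prefixes:
        if local_image[:len(prefix)] == prefix:
            return True
    return False

# Example: keep any "cloreai/" image cached, let others be removed.
assert is_image_kept_by_prefix("cloreai/jupyter:latest", ["cloreai/"])
assert not is_image_kept_by_prefix("ubuntu:22.04", ["cloreai/"])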
@@ -409,10 +415,13 @@ class CloreClient:
         tmp_images = []
         is_order_spot = False
+        self.image_cache_allowed_prefixes=[]
         for idx, container in enumerate(self.containers):
             if "spot" in container:
                 is_order_spot = True
+            if "allow_image_cache_prefix" in container:
+                self.image_cache_allowed_prefixes.append(container["allow_image_cache_prefix"])
             if "image" in container and "image" in container and container["image"]!="cloreai/hive-use-flightsheet":
                 log_pull = False
                 if "name" in container:
@@ -519,7 +528,7 @@ class CloreClient:
     async def submit_specs(self, current_specs):
         try:
             if type(current_specs) == dict:
-                current_specs["backend_version"]=21
+                current_specs["backend_version"]=23
                 current_specs["update_hw"]=True
                 smallest_pcie_width = 999
                 for gpu in current_specs["gpus"]["nvidia"]:
@@ -554,7 +563,6 @@ class CloreClient:
            try:
                await monitoring.put("specs_service")
                current_specs = await specs.get()
                if self.last_hw_specs_submit < (utils.unix_timestamp()-1800):
                    self.last_hw_specs_submit=utils.unix_timestamp()
                    await self.submit_specs(current_specs)
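
submit_specs is throttled: full hardware specs go out at most once per 1800 seconds (30 minutes). The same timestamp gate as a standalone sketch (class and names are hypothetical):

import time

class Throttle:
    def __init__(self, interval_seconds):
        self.interval = interval_seconds
        self.last_run = 0

    def should_run(self):
        # Mirrors: if self.last_hw_specs_submit < (utils.unix_timestamp()-1800)
        if self.last_run < (int(time.time()) - self.interval):
            self.last_run = int(time.time())
            return True
        return False

submit_gate = Throttle(1800)
if submit_gate.should_run():
    pass  # submit specs here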

View File

@@ -1,4 +0,0 @@
-GPU_ID_TO_NAME = {
-    "0x20c210de": "NVIDIA CMP 170HX",
-    "0x208210de": "NVIDIA CMP 170HX"
-}

View File

@@ -19,11 +19,6 @@ log = logging_lib.log
 def deploy(validated_containers, allowed_running_containers=[], can_run_partner_workloads=False):
     local_images = docker_interface.get_local_images()
     all_containers = docker_interface.get_containers(all=True)
-    is_hive = "hive" in get_specs.get_kernel()
-    # Deploy wireguard first
     wireguard_containers = []
     rest_containers = []
     for container in validated_containers:
@@ -40,6 +35,13 @@ def deploy(validated_containers, allowed_running_containers=[], can_run_partner_workloads=False):
     needed_running_names = []
     paused_names = []
+    all_use_volumes = []
+    allowed_container_prefixes = []
+    local_volume_list = docker_interface.list_volumes()
+    clore_volume_list = []
+    for volume in local_volume_list:
+        if volume.name[:6]=="clore_":
+            clore_volume_list.append(volume.name)
     created_container_names = []
     for container in all_containers:
@@ -62,6 +64,15 @@ def deploy(validated_containers, allowed_running_containers=[], can_run_partner_workloads=False):
             else:
                 needed_running_names.append(validated_container["name"])
+            if "mandatory_volumes" in validated_container:
+                for volume_name in validated_container["mandatory_volumes"]:
+                    if volume_name[:6] == "clore_" and not volume_name in clore_volume_list:
+                        docker_interface.create_volume(volume_name)
+                all_use_volumes += validated_container["mandatory_volumes"]
+            if "allowed_container_prefixes" in validated_container:
+                allowed_container_prefixes += validated_container["allowed_container_prefixes"]
             container_options = {
                 'image': validated_container["image"],
                 'name': validated_container["name"],
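
A validated container using the two new keys could look like the sketch below; the key names and the clore_ naming rule come from the diff, all values are hypothetical. clore_-prefixed volumes listed in mandatory_volumes are created if absent and recorded in all_use_volumes, which protects them from the cleanup pass later in deploy().

validated_container = {
    "image": "cloreai/jupyter:latest",        # hypothetical image
    "name": "C.1234",                         # hypothetical container name
    "mandatory_volumes": ["clore_datasets"],  # created on demand, kept alive
    "allowed_container_prefixes": ["partner_"]  # containers deploy() must not touch
}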
@@ -163,16 +174,35 @@ def deploy(validated_containers, allowed_running_containers=[], can_run_partner_workloads=False):
             log.debug(f"Container creation issue | {e}")
             pass
+    all_use_volumes=list(dict.fromkeys(all_use_volumes))
+    for volume in local_volume_list:
+        if volume.name[:6]=="clore_" and not volume.name in all_use_volumes:
+            try:
+                volume.remove()
+            except Exception as e:
+                pass
     all_running_container_names = []
     all_stopped_container_names = []
     for container in all_containers:
         if type(container.name)==str:
-            if container.status == "running":
+            do_not_touch_container = False
+            for container_prefix in allowed_container_prefixes:
+                try:
+                    if container.name[:len(container_prefix)] == container_prefix:
+                        do_not_touch_container = True
+                except Exception as e:
+                    pass
+            if container.status == "running" and not do_not_touch_container:
                 all_running_container_names.append(container.name)
-            else:
+            elif not do_not_touch_container:
                 all_stopped_container_names.append(container.name)
-            if background_job.is_background_job_container_name(container.name) and not background_job.is_enabled():
+            if do_not_touch_container:
+                pass
+            elif background_job.is_background_job_container_name(container.name) and not background_job.is_enabled():
                 if container.status == "running":
                     container.stop()
             elif container.name in needed_running_names and container.status != 'running':
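
The "do not touch container" change means any container whose name matches an allowed_container_prefixes entry is excluded from the running/stopped bookkeeping and from the background-job stop logic, so deploy() never stops or removes it. A standalone sketch of that classification (helper name and return values are hypothetical):

def classify(container, allowed_container_prefixes):
    # Same slice-based prefix test as the diff, wrapped for clarity.
    for prefix in allowed_container_prefixes:
        if container.name[:len(prefix)] == prefix:
            return "protected"  # deploy() leaves these containers alone
    return "running" if container.status == "running" else "stopped"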

View File

@@ -444,3 +444,8 @@ def configure_exec_opts(key="native.cgroupdriver", value="cgroupfs"):
 def is_docker_default_name_lenient(container_name): # Not a perfect solution, but it will do the job,
     pattern = r'^[a-z]+_[a-z]+$'
     return re.match(pattern, container_name) is not None
+def list_volumes():
+    return client.volumes.list()
+
+def create_volume(volume_name):
+    client.volumes.create(name=volume_name)
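
Both helpers delegate to the Docker SDK for Python (docker-py), where client is the module's Docker client. A usage sketch assuming the usual docker.from_env() construction (volume name is hypothetical):

import docker

client = docker.from_env()

# create_volume(): idempotent in practice, Docker returns the existing
# volume if one with that name already exists.
client.volumes.create(name="clore_datasets")

# list_volumes(): returns Volume objects exposing .name, .remove(), etc.
for volume in client.volumes.list():
    if volume.name[:6] == "clore_":
        print(volume.name)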

View File

@@ -4,7 +4,6 @@ import xml.etree.ElementTree as ET
 from lib import docker_interface
 from typing import Dict, List, Optional
 from lib import utils
-from lib import constants
 import subprocess
 import speedtest
 import platform
@@ -257,8 +256,8 @@ def get_gpu_info():
         except Exception as e:
             pass
-    nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total,pci.device_id --format=csv")
-    nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,pci.device_id --format=csv")
+    nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total --format=csv")
+    nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv")
     if "Failed to initialize NVML" in nvidia_smi_stdout or "Failed to initialize NVML" in nvidia_smi_stderr or "Failed to initialize NVML" in nvidia_smi_xl_stdout or "Failed to initialize NVML" in nvidia_smi_xl_stderr:
         nvml_err=True
@@ -268,15 +267,10 @@ def get_gpu_info():
         for index, line in enumerate(lines_xl):
             parts = [s.strip() for s in line.split(',')]
             if len(parts)>12 and index>0:
-                gpu_name_xl = parts[1]
-                gpu_id_xl = parts[13].lower()
-                if gpu_name_xl == "NVIDIA Graphics Device" and gpu_id_xl in constants.GPU_ID_TO_NAME:
-                    gpu_name_xl = constants.GPU_ID_TO_NAME[gpu_id_xl]
                 xl_gpu_info={
                     "id":index-1,
                     "timestamp": parts[0],
-                    "name": gpu_name_xl,
+                    "name": parts[1],
                     "pcie_bus": parts[2].split(':', 1)[1],
                     "driver": parts[3],
                     "pstate": parts[4],
@@ -287,7 +281,6 @@ def get_gpu_info():
                     "mem_free": parts[11],
                     "mem_used": parts[12]
                 }
                 try:
                     pci_query = parts[2][parts[2].find(':')+1:]
                     for index, valid_pci_dev in enumerate(valid_pci_dev_list):
@@ -303,13 +296,7 @@ def get_gpu_info():
            for line in lines:
                parts = line.split(',')
                if bool(re.match(r'^[0-9]+$', parts[0])):
-                    gpu_name = parts[1].strip()
-                    gpu_id = parts[5].strip().lower()
-                    if gpu_name == "NVIDIA Graphics Device" and gpu_id in constants.GPU_ID_TO_NAME:
-                        gpu_name = constants.GPU_ID_TO_NAME[gpu_id]
-                    gpu_str = f"{len(lines)-1}x {gpu_name}"
+                    gpu_str = f"{len(lines)-1}x {parts[1].strip()}"
                    gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2)
        except Exception as e:
            nvml_err=True
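
With pci.device_id dropped from both queries, the GPU name is taken verbatim from the second CSV column. A sketch of parsing the simplified query output (sample output is illustrative; the original gates on a numeric first column, this sketch just skips the header row):

# Illustrative output of:
#   nvidia-smi --query-gpu=index,name,uuid,serial,memory.total --format=csv
sample = """index, name, uuid, serial, memory.total [MiB]
0, NVIDIA GeForce RTX 4090, GPU-xxxx, N/A, 24564 MiB"""

lines = sample.split('\n')
for line in lines[1:]:  # skip the CSV header row
    parts = [s.strip() for s in line.split(',')]
    gpu_str = f"{len(lines)-1}x {parts[1]}"  # e.g. "1x NVIDIA GeForce RTX 4090"
    print(gpu_str)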

View File

@@ -1,7 +1,6 @@
 from lib import config as config_module
 from lib import logging as logging_lib
 from lib import get_specs
-from lib import constants
 config = config_module.config
 log = logging_lib.log
@@ -98,13 +97,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                break
            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
-            gpu_name = pynvml.nvmlDeviceGetName(gpu_handle)
-            gpu_device_id = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower()
-            if gpu_name == "NVIDIA Graphics Device" and gpu_device_id in constants.GPU_ID_TO_NAME:
-                gpu_name = constants.GPU_ID_TO_NAME[gpu_device_id]
-            gpu_name_list.append(gpu_name)
+            gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle))
            if not f"{i}-{gpu_uuid}" in parsed_specs_keys:
                parsed_specs={}
                regenerate_specs=True
@@ -125,13 +118,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                max_power_limit = int(power_limits[1] / 1000.0)
                gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
                gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
-                gpu_name_regen = pynvml.nvmlDeviceGetName(gpu_handle)
-                gpu_device_id_regen = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower()
-                if gpu_name_regen == "NVIDIA Graphics Device" and gpu_device_id_regen in constants.GPU_ID_TO_NAME:
-                    gpu_name_regen = constants.GPU_ID_TO_NAME[gpu_device_id_regen]
-                gpu_spec["name"] = gpu_name_regen
+                gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
                gpu_name_list.append(gpu_spec["name"])
                gpu_spec["locks"] = mem_to_core_allowed_locks