Compare commits

...

4 Commits

Author SHA1 Message Date
clore 83e7d7f730 enforce image_cache_allowed_prefixes 2025-03-18 20:02:41 +00:00
clore 95ea84a9a0 do not touch container param 2025-03-18 19:21:40 +00:00
clore 5af4a5c635 fix - volume.name 2025-03-04 12:37:30 +03:00
clore 9ec9a14a0e image prefixes to cache, volumes 2025-03-01 02:35:45 +00:00
3 changed files with 55 additions and 11 deletions

View File

@ -151,6 +151,7 @@ class CloreClient:
self.start_time = utils.unix_timestamp()
self.runned_pull_selftest = False
self.image_cache_allowed_prefixes = None
WebSocketClient.set_gpu_list(nvml.get_gpu_name_list())
WebSocketClient.set_is_hive(self.is_hive)
@ -279,7 +280,7 @@ class CloreClient:
if len(got_data)>0:
self.p_needed_containers=got_data[len(got_data)-1]
if len(self.p_needed_containers)>0:
if len(self.p_needed_containers)>0 and self.image_cache_allowed_prefixes != None and len(self.containers) > 0:
local_images = await get_local_images(no_latest_tag=True)
partner_images = await clore_partner.get_partner_allowed_images()
for local_image in local_images:
@ -324,6 +325,11 @@ class CloreClient:
image_needed = True
del self.last_pull_progress[local_image]
break
for image_needed_prefix in self.image_cache_allowed_prefixes:
if local_image[:len(image_needed_prefix)] == image_needed_prefix:
image_needed = True
del self.last_pull_progress[local_image]
break
if not image_needed and removed_cnt < config.max_remove_images_per_run and config.delete_unused_containers and partner_images != None:
log.success(f"GOING TO REMOVE {local_image}")
with concurrent.futures.ThreadPoolExecutor() as pool:
@ -409,10 +415,13 @@ class CloreClient:
tmp_images = []
is_order_spot = False
self.image_cache_allowed_prefixes=[]
for idx, container in enumerate(self.containers):
if "spot" in container:
is_order_spot = True
if "allow_image_cache_prefix" in container:
self.image_cache_allowed_prefixes.append(container["allow_image_cache_prefix"])
if "image" in container and "image" in container and container["image"]!="cloreai/hive-use-flightsheet":
log_pull = False
if "name" in container:
@ -519,7 +528,7 @@ class CloreClient:
async def submit_specs(self, current_specs):
try:
if type(current_specs) == dict:
current_specs["backend_version"]=21
current_specs["backend_version"]=23
current_specs["update_hw"]=True
smallest_pcie_width = 999
for gpu in current_specs["gpus"]["nvidia"]:

View File

@ -19,11 +19,6 @@ log = logging_lib.log
def deploy(validated_containers, allowed_running_containers=[], can_run_partner_workloads=False):
local_images = docker_interface.get_local_images()
all_containers = docker_interface.get_containers(all=True)
is_hive = "hive" in get_specs.get_kernel()
# Deploy wireguard first
wireguard_containers = []
rest_containers = []
for container in validated_containers:
@ -40,6 +35,13 @@ def deploy(validated_containers, allowed_running_containers=[], can_run_partner_
needed_running_names = []
paused_names = []
all_use_volumes = []
allowed_container_prefixes = []
local_volume_list = docker_interface.list_volumes()
clore_volume_list = []
for volume in local_volume_list:
if volume.name[:6]=="clore_":
clore_volume_list.append(volume.name)
created_container_names = []
for container in all_containers:
@ -62,6 +64,15 @@ def deploy(validated_containers, allowed_running_containers=[], can_run_partner_
else:
needed_running_names.append(validated_container["name"])
if "mandatory_volumes" in validated_container:
for volume_name in validated_container["mandatory_volumes"]:
if volume_name[:6] == "clore_" and not volume_name in clore_volume_list:
docker_interface.create_volume(volume_name)
all_use_volumes += validated_container["mandatory_volumes"]
if "allowed_container_prefixes" in validated_container:
allowed_container_prefixes += validated_container["allowed_container_prefixes"]
container_options = {
'image': validated_container["image"],
'name': validated_container["name"],
@ -162,17 +173,36 @@ def deploy(validated_containers, allowed_running_containers=[], can_run_partner_
except Exception as e:
log.debug(f"Container creation issue | {e}")
pass
all_use_volumes=list(dict.fromkeys(all_use_volumes))
for volume in local_volume_list:
if volume.name[:6]=="clore_" and not volume.name in all_use_volumes:
try:
volume.remove()
except Exception as e:
pass
all_running_container_names = []
all_stopped_container_names = []
for container in all_containers:
if type(container.name)==str:
if container.status == "running":
do_not_touch_container = False
for container_prefix in allowed_container_prefixes:
try:
if container.name[:len(container_prefix)] == container_prefix:
do_not_touch_container = True
except Exception as e:
pass
if container.status == "running" and not do_not_touch_container:
all_running_container_names.append(container.name)
else:
elif not do_not_touch_container:
all_stopped_container_names.append(container.name)
if background_job.is_background_job_container_name(container.name) and not background_job.is_enabled():
if do_not_touch_container:
pass
elif background_job.is_background_job_container_name(container.name) and not background_job.is_enabled():
if container.status == "running":
container.stop()
elif container.name in needed_running_names and container.status != 'running':

View File

@ -443,4 +443,9 @@ def configure_exec_opts(key="native.cgroupdriver", value="cgroupfs"):
def is_docker_default_name_lenient(container_name):
    """Return True when *container_name* looks like ``<word>_<word>``.

    The whole name must consist of one run of lowercase ASCII letters, a
    single underscore, and another run of lowercase ASCII letters. Judging by
    the function name this is meant as a lenient check for Docker's
    auto-generated container names; it is not a perfect solution (any
    two-word lowercase name passes), but it does the job.

    Args:
        container_name: The container name to test (a ``str``).

    Returns:
        ``True`` if the entire name matches the pattern, ``False`` otherwise.
    """
    # fullmatch instead of match with '^...$': '$' also matches just before a
    # trailing newline, so "foo_bar\n" would otherwise be accepted.
    pattern = r'[a-z]+_[a-z]+'
    return re.fullmatch(pattern, container_name) is not None
def list_volumes():
    """Fetch the volumes known to the module-level ``client``.

    Thin wrapper that forwards to ``client.volumes.list()`` and hands its
    result back to the caller unchanged.
    """
    volume_manager = client.volumes
    return volume_manager.list()
def create_volume(volume_name):
    """Create a volume named *volume_name* through the module-level ``client``.

    Forwards to ``client.volumes.create`` with the name passed as the ``name``
    keyword argument. The result of that call is discarded, so this function
    returns ``None``; any exception raised by the call propagates.
    """
    volume_manager = client.volumes
    volume_manager.create(name=volume_name)