diff --git a/clore_hosting/docker_configurator.py b/clore_hosting/docker_configurator.py index e82c848..107395d 100644 --- a/clore_hosting/docker_configurator.py +++ b/clore_hosting/docker_configurator.py @@ -4,6 +4,7 @@ from lib import custom_entrypoint from lib import networking from lib import wireguard from lib import logging as logging_lib +from clore_hosting import utils as hosting_utils import shutil import os import re @@ -53,9 +54,12 @@ def configure(containers): for index, container in enumerate(containers): ok_custom_entrypoint = False + invalid_hostname = False if index < len(custom_entrypoint_state): ok_custom_entrypoint = custom_entrypoint_state[index] startup_script_name = f"{container['name']}.sh" + if "hostname" in container and not hosting_utils.validate_hostname(container["hostname"]): + invalid_hostname = True if "ip" in container and len(container["ip"])>6 and type(container["ip"])==str: if container["ip"][:8] == "; echo '": last_occurrence, text_after_last_ip = get_last_ip_occurrence_and_text(container["ip"]) @@ -95,14 +99,14 @@ def configure(containers): newly_created_networks.append(container["network"]) else: any_fail=True - if not any_fail and ok_custom_entrypoint: + if not any_fail and ok_custom_entrypoint and not invalid_hostname: valid_containers.append(container) elif "network" in container and container["network"][:len(config.clore_network_name_prefix)]==config.clore_network_name_prefix: # Subnet & gateway not defined, must be some of default networks, otherwise dump it if container["network"] in default_network_names: for docker_network in docker_networks: if docker_network["Name"]==container["network"]: for ipam in docker_network["IPAM"]: - if not ok_custom_entrypoint: + if not ok_custom_entrypoint or invalid_hostname: break elif not "ip" in container: valid_containers.append(container) diff --git a/clore_hosting/main.py b/clore_hosting/main.py index 8065a9f..19051b9 100644 --- a/clore_hosting/main.py +++ b/clore_hosting/main.py @@ -105,7 +105,16 @@ class CloreClient: self.ws_peers[str(config.debug_ws_peer)]={ "expiration":"immune" } - + + self.os_release = get_specs.get_os_release() + self.restart_docker = False + if "use_cgroupfs" in self.os_release: + self.updated_exec_opts = True if docker_interface.configure_exec_opts("native.cgroupdriver","cgroupfs") else False + if self.updated_exec_opts: + docker_info = docker_interface.get_info() + if "CgroupDriver" in docker_info and docker_info["CgroupDriver"]=="systemd": + self.restart_docker = True # Restart docker when it's loaded under systemd (accual restart will happen only if no orders running to not disrupt workload) + docker_interface.verify_docker_version() nvml.init() @@ -333,6 +342,7 @@ class CloreClient: print("STEP",step,'|',self.containers_set, self.containers if config.log_containers_strings else '') tasks = [] + running_order = False container_conf = WebSocketClient.get_containers() @@ -345,6 +355,7 @@ class CloreClient: log_pull = False if "name" in container: if "-order-" in container["name"]: + running_order=True log_pull=True image_config = { "image":container["image"], @@ -362,6 +373,12 @@ class CloreClient: if not image_config in tmp_images: tmp_images.append(image_config) + + if self.restart_docker and not running_order and len(self.containers)>0: + log.debug("Sending docker restart command") + utils.run_command_v2("systemctl restart docker") + self.restart_docker=False + if tmp_images!=self.needed_images: self.needed_images=tmp_images await pull_list.put(self.needed_images) @@ -426,7 +443,7 @@ class CloreClient: async def submit_specs(self, current_specs): try: if type(current_specs) == dict: - current_specs["backend_version"]=11 + current_specs["backend_version"]=12 current_specs["update_hw"]=True smallest_pcie_width = 999 for gpu in current_specs["gpus"]["nvidia"]: diff --git a/clore_hosting/utils.py b/clore_hosting/utils.py index 19d2b80..3773c46 100644 --- a/clore_hosting/utils.py +++ b/clore_hosting/utils.py @@ -10,5 +10,13 @@ def is_valid_websocket_url(url): return True return False +def validate_hostname(hostname): + # Define the regular expression pattern for a valid hostname + pattern = re.compile(r'^[a-zA-Z0-9._-]{1,63}$') + if pattern.match(hostname): + return True + else: + return False + def unix_timestamp(): return int(time.time()) \ No newline at end of file diff --git a/lib/docker_cli_wrapper.py b/lib/docker_cli_wrapper.py index f225f52..762459c 100644 --- a/lib/docker_cli_wrapper.py +++ b/lib/docker_cli_wrapper.py @@ -21,6 +21,9 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30): if "network_mode" in container_options: command.extend(["--network", container_options["network_mode"]]) + if "hostname" in container_options: + command.extend(["--hostname", container_options["hostname"]]) + if "cap_add" in container_options: for cap in container_options["cap_add"]: command.extend(["--cap-add", cap]) diff --git a/lib/docker_deploy.py b/lib/docker_deploy.py index aa07cb8..83838b2 100644 --- a/lib/docker_deploy.py +++ b/lib/docker_deploy.py @@ -76,6 +76,13 @@ def deploy(validated_containers): ) } + if "hostname" in validated_container: + container_options["hostname"]=validated_container["hostname"] + elif "clore-order-" in validated_container["name"]: + try: + container_options["hostname"] = f"O-{int(validated_container["name"][12:])}" + except Exception as eon: + pass if "network" in validated_container: container_options["network_mode"]=validated_container["network"] if "ip" in validated_container and config.creation_engine=="sdk": diff --git a/lib/docker_interface.py b/lib/docker_interface.py index ac60871..60e4c9f 100644 --- a/lib/docker_interface.py +++ b/lib/docker_interface.py @@ -50,6 +50,14 @@ class DockerNetwork(BaseModel): client = docker.from_env() low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock') +daemon_config_path = "/etc/docker/daemon.json" + +def get_info(): + try: + client_info = client.info() + return client_info + except Exception as e: + return {} def check_docker_connection(): try: @@ -346,16 +354,15 @@ def validate_and_secure_networks(): def get_daemon_config(): - config_path = "/etc/docker/daemon.json" try: - with open(config_path, 'r') as file: + with open(daemon_config_path, 'r') as file: config_data = json.load(file) return config_data except FileNotFoundError: - print(f"Error: {config_path} not found.") + print(f"Error: {daemon_config_path} not found.") return None except json.JSONDecodeError: - print(f"Error: Failed to parse JSON from {config_path}.") + print(f"Error: Failed to parse JSON from {daemon_config_path}.") return None def verify_docker_version(min_version="17.06"): @@ -367,4 +374,42 @@ def verify_docker_version(min_version="17.06"): os._exit(1) except Exception as e: log.error(f"Failed to verify docker version | {e}") - os._exit(1) \ No newline at end of file + os._exit(1) + +def configure_exec_opts(key="native.cgroupdriver", value="cgroupfs"): + deamon_config = get_daemon_config() + if deamon_config: + try: + if (not "exec-opts" in deamon_config or type(deamon_config["exec-opts"])!=list) and value!=None: + deamon_config["exec-opts"]=[f"{key}={value}"] + elif "exec-opts" in deamon_config: + new_exec_opts=[] + matched_key=False + for exec_opt in deamon_config["exec-opts"]: + if '=' in exec_opt: + exec_opt_key, exec_opt_value = exec_opt.split('=',1) + if exec_opt_key==key: + matched_key=True + if value!=None: + new_exec_opts.append(f"{key}={value}") + else: + new_exec_opts.append(exec_opt) + else: + new_exec_opts.append(exec_opt) + if not matched_key: + new_exec_opts.append(f"{key}={value}") + if len(new_exec_opts)==0: + del deamon_config["exec-opts"] + else: + if deamon_config["exec-opts"] == new_exec_opts: + return "Same" + deamon_config["exec-opts"]=new_exec_opts + json_string = json.dumps(deamon_config, indent=4) + with open(daemon_config_path, 'w') as file: + file.write(json_string) + return True + except Exception as e: + log.error(f"Failed 'configure_exec_opts' | {e}") + return False + else: + return False \ No newline at end of file diff --git a/lib/get_specs.py b/lib/get_specs.py index 7d5c622..673e3ac 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -43,6 +43,28 @@ def get_kernel(): def is_hive(): return "hive" in get_kernel() +def get_os_release(): + try: + with open("/etc/os-release") as f: + os_info = f.read() + os_release = {} + for line in os_info.split('\n'): + if '=' in line: + key, value = line.split('=', 1) + if value[:1]=='"' and value.endswith('"'): + value = value[1:len(value)-1] + os_release[key]=value + + needed_cgroupfs_versions = ["22.04", "22.10"] # Mitigate issue https://github.com/NVIDIA/nvidia-docker/issues/1730 + + if "NAME" in os_release and "VERSION_ID" in os_release: + if os_release["NAME"].lower() == "ubuntu" and os_release["VERSION_ID"] in needed_cgroupfs_versions: + os_release["use_cgroupfs"]=True + + return os_release + except Exception as e: + return {} + def drop_caches(): try: with open('/proc/sys/vm/drop_caches', 'w') as f: