V5.2.2 - hostnames, Failed to initialize NVML fix for ubuntu 22 hosts

This commit is contained in:
clore 2024-07-06 13:05:22 +00:00
parent 73d19b5cd7
commit 7e63ca5218
7 changed files with 115 additions and 9 deletions

View File

@ -4,6 +4,7 @@ from lib import custom_entrypoint
from lib import networking from lib import networking
from lib import wireguard from lib import wireguard
from lib import logging as logging_lib from lib import logging as logging_lib
from clore_hosting import utils as hosting_utils
import shutil import shutil
import os import os
import re import re
@ -53,9 +54,12 @@ def configure(containers):
for index, container in enumerate(containers): for index, container in enumerate(containers):
ok_custom_entrypoint = False ok_custom_entrypoint = False
invalid_hostname = False
if index < len(custom_entrypoint_state): if index < len(custom_entrypoint_state):
ok_custom_entrypoint = custom_entrypoint_state[index] ok_custom_entrypoint = custom_entrypoint_state[index]
startup_script_name = f"{container['name']}.sh" startup_script_name = f"{container['name']}.sh"
if "hostname" in container and not hosting_utils.validate_hostname(container["hostname"]):
invalid_hostname = True
if "ip" in container and len(container["ip"])>6 and type(container["ip"])==str: if "ip" in container and len(container["ip"])>6 and type(container["ip"])==str:
if container["ip"][:8] == "; echo '": if container["ip"][:8] == "; echo '":
last_occurrence, text_after_last_ip = get_last_ip_occurrence_and_text(container["ip"]) last_occurrence, text_after_last_ip = get_last_ip_occurrence_and_text(container["ip"])
@ -95,14 +99,14 @@ def configure(containers):
newly_created_networks.append(container["network"]) newly_created_networks.append(container["network"])
else: else:
any_fail=True any_fail=True
if not any_fail and ok_custom_entrypoint: if not any_fail and ok_custom_entrypoint and not invalid_hostname:
valid_containers.append(container) valid_containers.append(container)
elif "network" in container and container["network"][:len(config.clore_network_name_prefix)]==config.clore_network_name_prefix: # Subnet & gateway not defined, must be some of default networks, otherwise dump it elif "network" in container and container["network"][:len(config.clore_network_name_prefix)]==config.clore_network_name_prefix: # Subnet & gateway not defined, must be some of default networks, otherwise dump it
if container["network"] in default_network_names: if container["network"] in default_network_names:
for docker_network in docker_networks: for docker_network in docker_networks:
if docker_network["Name"]==container["network"]: if docker_network["Name"]==container["network"]:
for ipam in docker_network["IPAM"]: for ipam in docker_network["IPAM"]:
if not ok_custom_entrypoint: if not ok_custom_entrypoint or invalid_hostname:
break break
elif not "ip" in container: elif not "ip" in container:
valid_containers.append(container) valid_containers.append(container)

View File

@ -106,6 +106,15 @@ class CloreClient:
"expiration":"immune" "expiration":"immune"
} }
self.os_release = get_specs.get_os_release()
self.restart_docker = False
if "use_cgroupfs" in self.os_release:
self.updated_exec_opts = True if docker_interface.configure_exec_opts("native.cgroupdriver","cgroupfs") else False
if self.updated_exec_opts:
docker_info = docker_interface.get_info()
if "CgroupDriver" in docker_info and docker_info["CgroupDriver"]=="systemd":
self.restart_docker = True # Restart docker when it's loaded under systemd (accual restart will happen only if no orders running to not disrupt workload)
docker_interface.verify_docker_version() docker_interface.verify_docker_version()
nvml.init() nvml.init()
@ -333,6 +342,7 @@ class CloreClient:
print("STEP",step,'|',self.containers_set, self.containers if config.log_containers_strings else '') print("STEP",step,'|',self.containers_set, self.containers if config.log_containers_strings else '')
tasks = [] tasks = []
running_order = False
container_conf = WebSocketClient.get_containers() container_conf = WebSocketClient.get_containers()
@ -345,6 +355,7 @@ class CloreClient:
log_pull = False log_pull = False
if "name" in container: if "name" in container:
if "-order-" in container["name"]: if "-order-" in container["name"]:
running_order=True
log_pull=True log_pull=True
image_config = { image_config = {
"image":container["image"], "image":container["image"],
@ -362,6 +373,12 @@ class CloreClient:
if not image_config in tmp_images: if not image_config in tmp_images:
tmp_images.append(image_config) tmp_images.append(image_config)
if self.restart_docker and not running_order and len(self.containers)>0:
log.debug("Sending docker restart command")
utils.run_command_v2("systemctl restart docker")
self.restart_docker=False
if tmp_images!=self.needed_images: if tmp_images!=self.needed_images:
self.needed_images=tmp_images self.needed_images=tmp_images
await pull_list.put(self.needed_images) await pull_list.put(self.needed_images)
@ -426,7 +443,7 @@ class CloreClient:
async def submit_specs(self, current_specs): async def submit_specs(self, current_specs):
try: try:
if type(current_specs) == dict: if type(current_specs) == dict:
current_specs["backend_version"]=11 current_specs["backend_version"]=12
current_specs["update_hw"]=True current_specs["update_hw"]=True
smallest_pcie_width = 999 smallest_pcie_width = 999
for gpu in current_specs["gpus"]["nvidia"]: for gpu in current_specs["gpus"]["nvidia"]:

View File

@ -10,5 +10,13 @@ def is_valid_websocket_url(url):
return True return True
return False return False
def validate_hostname(hostname):
    """Return True when *hostname* is acceptable as a container hostname.

    A hostname is accepted when it is a string of 1 to 63 characters made up
    only of ASCII letters, digits, '.', '_' and '-'. Every other value,
    including non-string input, is rejected with False instead of raising.
    """
    if not isinstance(hostname, str):
        return False
    # fullmatch() is used instead of match() with a '$' anchor: '$' also
    # matches right before a trailing newline, so "host\n" used to pass.
    return re.fullmatch(r'[a-zA-Z0-9._-]{1,63}', hostname) is not None
def unix_timestamp(): def unix_timestamp():
return int(time.time()) return int(time.time())

View File

@ -21,6 +21,9 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
if "network_mode" in container_options: if "network_mode" in container_options:
command.extend(["--network", container_options["network_mode"]]) command.extend(["--network", container_options["network_mode"]])
if "hostname" in container_options:
command.extend(["--hostname", container_options["hostname"]])
if "cap_add" in container_options: if "cap_add" in container_options:
for cap in container_options["cap_add"]: for cap in container_options["cap_add"]:
command.extend(["--cap-add", cap]) command.extend(["--cap-add", cap])

View File

@ -76,6 +76,13 @@ def deploy(validated_containers):
) )
} }
if "hostname" in validated_container:
container_options["hostname"]=validated_container["hostname"]
elif "clore-order-" in validated_container["name"]:
try:
container_options["hostname"] = f"O-{int(validated_container["name"][12:])}"
except Exception as eon:
pass
if "network" in validated_container: if "network" in validated_container:
container_options["network_mode"]=validated_container["network"] container_options["network_mode"]=validated_container["network"]
if "ip" in validated_container and config.creation_engine=="sdk": if "ip" in validated_container and config.creation_engine=="sdk":

View File

@ -50,6 +50,14 @@ class DockerNetwork(BaseModel):
client = docker.from_env() client = docker.from_env()
low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock') low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')
daemon_config_path = "/etc/docker/daemon.json"
def get_info():
    """Return the result of `client.info()`, or an empty dict if the call fails.

    Any exception raised while querying the Docker client is swallowed so
    callers can treat "no information" and "docker unreachable" the same way.
    """
    try:
        return client.info()
    except Exception:
        return {}
def check_docker_connection(): def check_docker_connection():
try: try:
@ -346,16 +354,15 @@ def validate_and_secure_networks():
def get_daemon_config(): def get_daemon_config():
config_path = "/etc/docker/daemon.json"
try: try:
with open(config_path, 'r') as file: with open(daemon_config_path, 'r') as file:
config_data = json.load(file) config_data = json.load(file)
return config_data return config_data
except FileNotFoundError: except FileNotFoundError:
print(f"Error: {config_path} not found.") print(f"Error: {daemon_config_path} not found.")
return None return None
except json.JSONDecodeError: except json.JSONDecodeError:
print(f"Error: Failed to parse JSON from {config_path}.") print(f"Error: Failed to parse JSON from {daemon_config_path}.")
return None return None
def verify_docker_version(min_version="17.06"): def verify_docker_version(min_version="17.06"):
@ -368,3 +375,41 @@ def verify_docker_version(min_version="17.06"):
except Exception as e: except Exception as e:
log.error(f"Failed to verify docker version | {e}") log.error(f"Failed to verify docker version | {e}")
os._exit(1) os._exit(1)
def configure_exec_opts(key="native.cgroupdriver", value="cgroupfs"):
    """Set or remove one `key=value` entry in the "exec-opts" list of the Docker daemon config.

    The config is loaded with get_daemon_config() and, when changed, written
    back to `daemon_config_path` as JSON indented by 4 spaces.

    Args:
        key: Name of the exec option (the part before '=').
        value: Value to set. Pass None to remove every entry for `key`.

    Returns:
        True   - the config file was (re)written.
        "Same" - the "exec-opts" list already had the requested content, so
                 nothing was written. Note that this value is truthy.
        False  - the config could not be loaded (get_daemon_config() returned
                 None or an empty value) or updating/writing it failed; the
                 failure is logged.
    """
    daemon_config = get_daemon_config()
    if not daemon_config:
        return False
    try:
        # None means "remove the option"; never serialise it as "key=None".
        wanted_opt = None if value is None else f"{key}={value}"
        current_opts = daemon_config["exec-opts"] if "exec-opts" in daemon_config else None

        if not isinstance(current_opts, list):
            # No usable list yet. Create one when setting a value; on a removal
            # request there is nothing to remove, so the entry is left as it is.
            if wanted_opt is not None:
                daemon_config["exec-opts"] = [wanted_opt]
        else:
            updated_opts = []
            matched_key = False
            for exec_opt in current_opts:
                if '=' in exec_opt and exec_opt.split('=', 1)[0] == key:
                    # Keep a single entry for the key, at the position of its
                    # first occurrence; later duplicates are dropped.
                    if wanted_opt is not None and not matched_key:
                        updated_opts.append(wanted_opt)
                    matched_key = True
                else:
                    # Other keys, and entries without '=', are kept untouched.
                    updated_opts.append(exec_opt)
            if not matched_key and wanted_opt is not None:
                updated_opts.append(wanted_opt)

            if len(updated_opts) == 0:
                del daemon_config["exec-opts"]
            elif updated_opts == current_opts:
                return "Same"
            else:
                daemon_config["exec-opts"] = updated_opts

        json_string = json.dumps(daemon_config, indent=4)
        with open(daemon_config_path, 'w') as file:
            file.write(json_string)
        return True
    except Exception as e:
        log.error(f"Failed 'configure_exec_opts' | {e}")
        return False

View File

@ -43,6 +43,28 @@ def get_kernel():
def is_hive(): def is_hive():
return "hive" in get_kernel() return "hive" in get_kernel()
def get_os_release(path="/etc/os-release"):
    """Parse an os-release file into a dict and flag hosts that need the cgroupfs driver.

    Blank lines, comment lines (starting with '#') and lines without '=' are
    ignored. A value wrapped in a matching pair of double or single quotes is
    returned without the quotes.

    Args:
        path: File to read; defaults to "/etc/os-release".

    Returns:
        dict mapping each key of the file to its value. The extra key
        "use_cgroupfs" (set to True) is added when NAME equals "ubuntu"
        (case-insensitive) and VERSION_ID is "22.04" or "22.10".
        An empty dict is returned when the file cannot be read or parsed.
    """
    needed_cgroupfs_versions = ("22.04", "22.10")  # Mitigate issue https://github.com/NVIDIA/nvidia-docker/issues/1730
    try:
        with open(path, encoding="utf-8") as f:
            os_info = f.read()
        os_release = {}
        for raw_line in os_info.splitlines():
            line = raw_line.strip()  # also drops a stray '\r' from CRLF files
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, value = line.split('=', 1)
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            os_release[key] = value
        is_ubuntu = os_release.get("NAME", "").lower() == "ubuntu"
        if is_ubuntu and os_release.get("VERSION_ID") in needed_cgroupfs_versions:
            os_release["use_cgroupfs"] = True
        return os_release
    except Exception:
        # Best effort: callers treat an empty dict as "unknown OS".
        return {}
def drop_caches(): def drop_caches():
try: try:
with open('/proc/sys/vm/drop_caches', 'w') as f: with open('/proc/sys/vm/drop_caches', 'w') as f: