V5.2.2 - container hostnames; fix "Failed to initialize NVML" on Ubuntu 22 hosts
This commit is contained in:
parent
73d19b5cd7
commit
7e63ca5218
|
@ -4,6 +4,7 @@ from lib import custom_entrypoint
|
|||
from lib import networking
|
||||
from lib import wireguard
|
||||
from lib import logging as logging_lib
|
||||
from clore_hosting import utils as hosting_utils
|
||||
import shutil
|
||||
import os
|
||||
import re
|
||||
|
@ -53,9 +54,12 @@ def configure(containers):
|
|||
|
||||
for index, container in enumerate(containers):
|
||||
ok_custom_entrypoint = False
|
||||
invalid_hostname = False
|
||||
if index < len(custom_entrypoint_state):
|
||||
ok_custom_entrypoint = custom_entrypoint_state[index]
|
||||
startup_script_name = f"{container['name']}.sh"
|
||||
if "hostname" in container and not hosting_utils.validate_hostname(container["hostname"]):
|
||||
invalid_hostname = True
|
||||
if "ip" in container and len(container["ip"])>6 and type(container["ip"])==str:
|
||||
if container["ip"][:8] == "; echo '":
|
||||
last_occurrence, text_after_last_ip = get_last_ip_occurrence_and_text(container["ip"])
|
||||
|
@ -95,14 +99,14 @@ def configure(containers):
|
|||
newly_created_networks.append(container["network"])
|
||||
else:
|
||||
any_fail=True
|
||||
if not any_fail and ok_custom_entrypoint:
|
||||
if not any_fail and ok_custom_entrypoint and not invalid_hostname:
|
||||
valid_containers.append(container)
|
||||
elif "network" in container and container["network"][:len(config.clore_network_name_prefix)]==config.clore_network_name_prefix: # Subnet & gateway not defined, must be some of default networks, otherwise dump it
|
||||
if container["network"] in default_network_names:
|
||||
for docker_network in docker_networks:
|
||||
if docker_network["Name"]==container["network"]:
|
||||
for ipam in docker_network["IPAM"]:
|
||||
if not ok_custom_entrypoint:
|
||||
if not ok_custom_entrypoint or invalid_hostname:
|
||||
break
|
||||
elif not "ip" in container:
|
||||
valid_containers.append(container)
|
||||
|
|
|
@ -105,7 +105,16 @@ class CloreClient:
|
|||
self.ws_peers[str(config.debug_ws_peer)]={
|
||||
"expiration":"immune"
|
||||
}
|
||||
|
||||
|
||||
self.os_release = get_specs.get_os_release()
|
||||
self.restart_docker = False
|
||||
if "use_cgroupfs" in self.os_release:
|
||||
self.updated_exec_opts = True if docker_interface.configure_exec_opts("native.cgroupdriver","cgroupfs") else False
|
||||
if self.updated_exec_opts:
|
||||
docker_info = docker_interface.get_info()
|
||||
if "CgroupDriver" in docker_info and docker_info["CgroupDriver"]=="systemd":
|
||||
self.restart_docker = True # Restart docker when it's loaded under systemd (actual restart will happen only if no orders are running, to not disrupt workload)
|
||||
|
||||
docker_interface.verify_docker_version()
|
||||
nvml.init()
|
||||
|
||||
|
@ -333,6 +342,7 @@ class CloreClient:
|
|||
print("STEP",step,'|',self.containers_set, self.containers if config.log_containers_strings else '')
|
||||
|
||||
tasks = []
|
||||
running_order = False
|
||||
|
||||
container_conf = WebSocketClient.get_containers()
|
||||
|
||||
|
@ -345,6 +355,7 @@ class CloreClient:
|
|||
log_pull = False
|
||||
if "name" in container:
|
||||
if "-order-" in container["name"]:
|
||||
running_order=True
|
||||
log_pull=True
|
||||
image_config = {
|
||||
"image":container["image"],
|
||||
|
@ -362,6 +373,12 @@ class CloreClient:
|
|||
|
||||
if not image_config in tmp_images:
|
||||
tmp_images.append(image_config)
|
||||
|
||||
if self.restart_docker and not running_order and len(self.containers)>0:
|
||||
log.debug("Sending docker restart command")
|
||||
utils.run_command_v2("systemctl restart docker")
|
||||
self.restart_docker=False
|
||||
|
||||
if tmp_images!=self.needed_images:
|
||||
self.needed_images=tmp_images
|
||||
await pull_list.put(self.needed_images)
|
||||
|
@ -426,7 +443,7 @@ class CloreClient:
|
|||
async def submit_specs(self, current_specs):
|
||||
try:
|
||||
if type(current_specs) == dict:
|
||||
current_specs["backend_version"]=11
|
||||
current_specs["backend_version"]=12
|
||||
current_specs["update_hw"]=True
|
||||
smallest_pcie_width = 999
|
||||
for gpu in current_specs["gpus"]["nvidia"]:
|
||||
|
|
|
@ -10,5 +10,13 @@ def is_valid_websocket_url(url):
|
|||
return True
|
||||
return False
|
||||
|
||||
def validate_hostname(hostname):
    """Return True when *hostname* is an acceptable container hostname.

    A hostname is accepted when it is a string of 1 to 63 characters drawn
    only from ASCII letters, digits, ``.``, ``_`` and ``-``.

    Args:
        hostname: Candidate value; anything that is not a ``str`` is rejected.

    Returns:
        bool: True if the whole value matches the allowed pattern, else False.
    """
    # Non-string input (None, int, list, ...) is invalid rather than an error.
    if not isinstance(hostname, str):
        return False
    # fullmatch() is used instead of match() with a trailing `$`, because `$`
    # also matches just before a final newline and would let "host\n" through.
    return re.fullmatch(r'[a-zA-Z0-9._-]{1,63}', hostname) is not None
|
||||
|
||||
def unix_timestamp():
|
||||
return int(time.time())
|
|
@ -21,6 +21,9 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
|
|||
if "network_mode" in container_options:
|
||||
command.extend(["--network", container_options["network_mode"]])
|
||||
|
||||
if "hostname" in container_options:
|
||||
command.extend(["--hostname", container_options["hostname"]])
|
||||
|
||||
if "cap_add" in container_options:
|
||||
for cap in container_options["cap_add"]:
|
||||
command.extend(["--cap-add", cap])
|
||||
|
|
|
@ -76,6 +76,13 @@ def deploy(validated_containers):
|
|||
)
|
||||
}
|
||||
|
||||
if "hostname" in validated_container:
|
||||
container_options["hostname"]=validated_container["hostname"]
|
||||
elif "clore-order-" in validated_container["name"]:
|
||||
try:
|
||||
container_options["hostname"] = f"O-{int(validated_container["name"][12:])}"
|
||||
except Exception as eon:
|
||||
pass
|
||||
if "network" in validated_container:
|
||||
container_options["network_mode"]=validated_container["network"]
|
||||
if "ip" in validated_container and config.creation_engine=="sdk":
|
||||
|
|
|
@ -50,6 +50,14 @@ class DockerNetwork(BaseModel):
|
|||
|
||||
client = docker.from_env()
|
||||
low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')
|
||||
daemon_config_path = "/etc/docker/daemon.json"
|
||||
|
||||
def get_info():
    """Return the result of ``client.info()``, or an empty dict on any failure."""
    try:
        return client.info()
    except Exception:
        # Best effort: callers receive {} when the info call cannot be made.
        return {}
|
||||
|
||||
def check_docker_connection():
|
||||
try:
|
||||
|
@ -346,16 +354,15 @@ def validate_and_secure_networks():
|
|||
|
||||
|
||||
def get_daemon_config(config_path=None):
    """Load a JSON daemon configuration file.

    Args:
        config_path: Optional path of the file to read. When omitted (None),
            the module-level ``daemon_config_path`` is used, so existing
            zero-argument callers behave as before.

    Returns:
        The parsed JSON content, or None when the file does not exist or does
        not contain valid JSON (a message is printed in both cases).
    """
    path = daemon_config_path if config_path is None else config_path
    try:
        with open(path, 'r') as file:
            config_data = json.load(file)
        return config_data
    except FileNotFoundError:
        print(f"Error: {path} not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error: Failed to parse JSON from {path}.")
        return None
|
||||
|
||||
def verify_docker_version(min_version="17.06"):
|
||||
|
@ -367,4 +374,42 @@ def verify_docker_version(min_version="17.06"):
|
|||
os._exit(1)
|
||||
except Exception as e:
|
||||
log.error(f"Failed to verify docker version | {e}")
|
||||
os._exit(1)
|
||||
os._exit(1)
|
||||
|
||||
def _merge_exec_opt(exec_opts, key, value):
    """Return a new exec-opts list with *key* set to *value*, or removed when *value* is None."""
    merged = []
    matched_key = False
    for exec_opt in exec_opts:
        is_target = (
            isinstance(exec_opt, str)
            and '=' in exec_opt
            and exec_opt.split('=', 1)[0] == key
        )
        if is_target:
            matched_key = True
            if value is not None:
                merged.append(f"{key}={value}")
        else:
            # Unrelated or malformed entries are carried over untouched.
            merged.append(exec_opt)
    # Only add the option when there is a value to set; with value=None the
    # request means "remove", so a missing key needs no entry at all.
    if not matched_key and value is not None:
        merged.append(f"{key}={value}")
    return merged


def configure_exec_opts(key="native.cgroupdriver", value="cgroupfs"):
    """Set or remove one ``key=value`` entry in the daemon config's "exec-opts".

    The configuration is obtained from ``get_daemon_config()`` and, when it
    changes, written back to ``daemon_config_path`` as indented JSON.

    Args:
        key: Name of the exec option to set.
        value: Value to assign; ``None`` removes every entry for *key*.

    Returns:
        True when the file was rewritten, the string "Same" when the
        configuration already had the requested state (no write happens),
        and False when the configuration could not be loaded or updating
        it failed (the failure is logged).
    """
    deamon_config = get_daemon_config()
    # An empty JSON object is still a usable config; only a missing or
    # unparsable file (None) or a non-object document is rejected.
    if not isinstance(deamon_config, dict):
        return False
    try:
        current = deamon_config.get("exec-opts")
        # A missing or non-list "exec-opts" is treated as having no options.
        existing = current if isinstance(current, list) else []
        new_exec_opts = _merge_exec_opt(existing, key, value)

        nothing_to_remove = current is None and not new_exec_opts
        if nothing_to_remove or current == new_exec_opts:
            return "Same"

        if new_exec_opts:
            deamon_config["exec-opts"] = new_exec_opts
        else:
            deamon_config.pop("exec-opts", None)

        json_string = json.dumps(deamon_config, indent=4)
        with open(daemon_config_path, 'w') as file:
            file.write(json_string)
        return True
    except Exception as e:
        log.error(f"Failed 'configure_exec_opts' | {e}")
        return False
|
|
@ -43,6 +43,28 @@ def get_kernel():
|
|||
def is_hive():
    """Return True when the string returned by get_kernel() contains "hive"."""
    kernel = get_kernel()
    return "hive" in kernel
|
||||
|
||||
def get_os_release(path="/etc/os-release"):
    """Parse an os-release style file into a dict.

    Each ``KEY=value`` line becomes one entry; surrounding single or double
    quotes around the value are removed. Blank lines, comment lines starting
    with ``#`` and lines without ``=`` are ignored.

    When ``NAME`` is "ubuntu" (case-insensitive) and ``VERSION_ID`` is one of
    the listed releases, the extra key ``use_cgroupfs`` is set to True.

    Args:
        path: File to read; defaults to ``/etc/os-release``.

    Returns:
        dict: Parsed entries, or an empty dict if reading or parsing fails.
    """
    try:
        with open(path) as f:
            os_info = f.read()

        os_release = {}
        for raw_line in os_info.splitlines():
            line = raw_line.strip()
            # Skip blanks and comments so "# FOO=bar" is not taken as a key.
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, value = line.split('=', 1)
            value = value.strip()
            # Drop one pair of matching quotes, single or double.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            os_release[key.strip()] = value

        needed_cgroupfs_versions = ("22.04", "22.10")  # Mitigate issue https://github.com/NVIDIA/nvidia-docker/issues/1730

        is_ubuntu = os_release.get("NAME", "").lower() == "ubuntu"
        if is_ubuntu and os_release.get("VERSION_ID") in needed_cgroupfs_versions:
            os_release["use_cgroupfs"] = True

        return os_release
    except Exception:
        # Best effort: any failure yields an empty result.
        return {}
|
||||
|
||||
def drop_caches():
|
||||
try:
|
||||
with open('/proc/sys/vm/drop_caches', 'w') as f:
|
||||
|
|
Loading…
Reference in New Issue