V5.2.2 - hostnames, "Failed to initialize NVML" fix for Ubuntu 22 hosts
parent 73d19b5cd7
commit 7e63ca5218
@@ -4,6 +4,7 @@ from lib import custom_entrypoint
 from lib import networking
 from lib import wireguard
 from lib import logging as logging_lib
+from clore_hosting import utils as hosting_utils
 import shutil
 import os
 import re
@@ -53,9 +54,12 @@ def configure(containers):
 
     for index, container in enumerate(containers):
         ok_custom_entrypoint = False
+        invalid_hostname = False
         if index < len(custom_entrypoint_state):
             ok_custom_entrypoint = custom_entrypoint_state[index]
         startup_script_name = f"{container['name']}.sh"
+        if "hostname" in container and not hosting_utils.validate_hostname(container["hostname"]):
+            invalid_hostname = True
         if "ip" in container and len(container["ip"])>6 and type(container["ip"])==str:
             if container["ip"][:8] == "; echo '":
                 last_occurrence, text_after_last_ip = get_last_ip_occurrence_and_text(container["ip"])
@@ -95,14 +99,14 @@ def configure(containers):
                         newly_created_networks.append(container["network"])
                     else:
                         any_fail=True
-                if not any_fail and ok_custom_entrypoint:
+                if not any_fail and ok_custom_entrypoint and not invalid_hostname:
                     valid_containers.append(container)
             elif "network" in container and container["network"][:len(config.clore_network_name_prefix)]==config.clore_network_name_prefix: # Subnet & gateway not defined, must be some of default networks, otherwise dump it
                 if container["network"] in default_network_names:
                     for docker_network in docker_networks:
                         if docker_network["Name"]==container["network"]:
                             for ipam in docker_network["IPAM"]:
-                                if not ok_custom_entrypoint:
+                                if not ok_custom_entrypoint or invalid_hostname:
                                     break
         elif not "ip" in container:
             valid_containers.append(container)
@@ -105,7 +105,16 @@ class CloreClient:
             self.ws_peers[str(config.debug_ws_peer)]={
                 "expiration":"immune"
             }
 
+        self.os_release = get_specs.get_os_release()
+        self.restart_docker = False
+        if "use_cgroupfs" in self.os_release:
+            self.updated_exec_opts = True if docker_interface.configure_exec_opts("native.cgroupdriver","cgroupfs") else False
+            if self.updated_exec_opts:
+                docker_info = docker_interface.get_info()
+                if "CgroupDriver" in docker_info and docker_info["CgroupDriver"]=="systemd":
+                    self.restart_docker = True # Restart docker when it's loaded under systemd (actual restart happens only when no orders are running, to not disrupt workloads)
+
         docker_interface.verify_docker_version()
         nvml.init()
 
@@ -333,6 +342,7 @@ class CloreClient:
         print("STEP",step,'|',self.containers_set, self.containers if config.log_containers_strings else '')
 
         tasks = []
+        running_order = False
 
         container_conf = WebSocketClient.get_containers()
 
@@ -345,6 +355,7 @@ class CloreClient:
             log_pull = False
             if "name" in container:
                 if "-order-" in container["name"]:
+                    running_order=True
                     log_pull=True
             image_config = {
                 "image":container["image"],
@@ -362,6 +373,12 @@ class CloreClient:
 
             if not image_config in tmp_images:
                 tmp_images.append(image_config)
+
+        if self.restart_docker and not running_order and len(self.containers)>0:
+            log.debug("Sending docker restart command")
+            utils.run_command_v2("systemctl restart docker")
+            self.restart_docker=False
+
         if tmp_images!=self.needed_images:
             self.needed_images=tmp_images
             await pull_list.put(self.needed_images)
@@ -426,7 +443,7 @@ class CloreClient:
     async def submit_specs(self, current_specs):
         try:
             if type(current_specs) == dict:
-                current_specs["backend_version"]=11
+                current_specs["backend_version"]=12
                 current_specs["update_hw"]=True
                 smallest_pcie_width = 999
                 for gpu in current_specs["gpus"]["nvidia"]:
@@ -10,5 +10,13 @@ def is_valid_websocket_url(url):
         return True
     return False
 
+def validate_hostname(hostname):
+    # Define the regular expression pattern for a valid hostname
+    pattern = re.compile(r'^[a-zA-Z0-9._-]{1,63}$')
+    if pattern.match(hostname):
+        return True
+    else:
+        return False
+
 def unix_timestamp():
     return int(time.time())
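A quick sanity check of the new validator (example hostnames are illustrative, not from the commit; the import mirrors the one added to the configurator above):

    from clore_hosting import utils as hosting_utils

    hosting_utils.validate_hostname("order-1234")   # only [a-zA-Z0-9._-], 1-63 chars -> True
    hosting_utils.validate_hostname("my host!")     # space and '!' are rejected -> False
    hosting_utils.validate_hostname("a" * 64)       # longer than 63 characters -> False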
@@ -21,6 +21,9 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
     if "network_mode" in container_options:
         command.extend(["--network", container_options["network_mode"]])
 
+    if "hostname" in container_options:
+        command.extend(["--hostname", container_options["hostname"]])
+
     if "cap_add" in container_options:
         for cap in container_options["cap_add"]:
             command.extend(["--cap-add", cap])
@@ -76,6 +76,13 @@ def deploy(validated_containers):
             )
         }
 
+        if "hostname" in validated_container:
+            container_options["hostname"]=validated_container["hostname"]
+        elif "clore-order-" in validated_container["name"]:
+            try:
+                container_options["hostname"] = f"O-{int(validated_container['name'][12:])}"
+            except Exception as eon:
+                pass
         if "network" in validated_container:
             container_options["network_mode"]=validated_container["network"]
         if "ip" in validated_container and config.creation_engine=="sdk":
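Worked example for the fallback branch above (the container name is hypothetical): an order container without an explicit hostname gets one derived from its name, since len("clore-order-") is 12 and the remainder is the numeric order id; create_container() then passes it to Docker via --hostname.

    validated_container = {"name": "clore-order-12345"}   # no "hostname" key supplied
    f"O-{int(validated_container['name'][12:])}"          # -> "O-12345"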
@@ -50,6 +50,14 @@ class DockerNetwork(BaseModel):
 
 client = docker.from_env()
 low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')
+daemon_config_path = "/etc/docker/daemon.json"
+
+def get_info():
+    try:
+        client_info = client.info()
+        return client_info
+    except Exception as e:
+        return {}
 
 def check_docker_connection():
     try:
@@ -346,16 +354,15 @@ def validate_and_secure_networks():
 
 
 def get_daemon_config():
-    config_path = "/etc/docker/daemon.json"
     try:
-        with open(config_path, 'r') as file:
+        with open(daemon_config_path, 'r') as file:
             config_data = json.load(file)
         return config_data
     except FileNotFoundError:
-        print(f"Error: {config_path} not found.")
+        print(f"Error: {daemon_config_path} not found.")
         return None
     except json.JSONDecodeError:
-        print(f"Error: Failed to parse JSON from {config_path}.")
+        print(f"Error: Failed to parse JSON from {daemon_config_path}.")
         return None
 
 def verify_docker_version(min_version="17.06"):
@@ -367,4 +374,42 @@ def verify_docker_version(min_version="17.06"):
             os._exit(1)
     except Exception as e:
         log.error(f"Failed to verify docker version | {e}")
        os._exit(1)
+
+def configure_exec_opts(key="native.cgroupdriver", value="cgroupfs"):
+    deamon_config = get_daemon_config()
+    if deamon_config:
+        try:
+            if (not "exec-opts" in deamon_config or type(deamon_config["exec-opts"])!=list) and value!=None:
+                deamon_config["exec-opts"]=[f"{key}={value}"]
+            elif "exec-opts" in deamon_config:
+                new_exec_opts=[]
+                matched_key=False
+                for exec_opt in deamon_config["exec-opts"]:
+                    if '=' in exec_opt:
+                        exec_opt_key, exec_opt_value = exec_opt.split('=',1)
+                        if exec_opt_key==key:
+                            matched_key=True
+                            if value!=None:
+                                new_exec_opts.append(f"{key}={value}")
+                        else:
+                            new_exec_opts.append(exec_opt)
+                    else:
+                        new_exec_opts.append(exec_opt)
+                if not matched_key:
+                    new_exec_opts.append(f"{key}={value}")
+                if len(new_exec_opts)==0:
+                    del deamon_config["exec-opts"]
+                else:
+                    if deamon_config["exec-opts"] == new_exec_opts:
+                        return "Same"
+                    deamon_config["exec-opts"]=new_exec_opts
+            json_string = json.dumps(deamon_config, indent=4)
+            with open(daemon_config_path, 'w') as file:
+                file.write(json_string)
+            return True
+        except Exception as e:
+            log.error(f"Failed 'configure_exec_opts' | {e}")
+            return False
+    else:
+        return False
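A minimal usage sketch of the new helper (the module path is assumed from the calls in clore_hosting; the return values follow the code above):

    from lib import docker_interface  # assumed import path

    # First run on an Ubuntu 22.04/22.10 host: daemon.json gains
    # "exec-opts": ["native.cgroupdriver=cgroupfs"] and True is returned;
    # CloreClient will later restart docker if it is still running under the systemd cgroup driver.
    updated = docker_interface.configure_exec_opts("native.cgroupdriver", "cgroupfs")

    # A second run finds an identical entry and returns "Same";
    # if daemon.json is missing or unparseable, get_daemon_config() yields None
    # and the helper returns False.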
@@ -43,6 +43,28 @@ def get_kernel():
 def is_hive():
     return "hive" in get_kernel()
 
+def get_os_release():
+    try:
+        with open("/etc/os-release") as f:
+            os_info = f.read()
+        os_release = {}
+        for line in os_info.split('\n'):
+            if '=' in line:
+                key, value = line.split('=', 1)
+                if value[:1]=='"' and value.endswith('"'):
+                    value = value[1:len(value)-1]
+                os_release[key]=value
+
+        needed_cgroupfs_versions = ["22.04", "22.10"] # Mitigate issue https://github.com/NVIDIA/nvidia-docker/issues/1730
+
+        if "NAME" in os_release and "VERSION_ID" in os_release:
+            if os_release["NAME"].lower() == "ubuntu" and os_release["VERSION_ID"] in needed_cgroupfs_versions:
+                os_release["use_cgroupfs"]=True
+
+        return os_release
+    except Exception as e:
+        return {}
+
 def drop_caches():
     try:
         with open('/proc/sys/vm/drop_caches', 'w') as f:
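For example, on a stock Ubuntu 22.04 host /etc/os-release contains NAME="Ubuntu" and VERSION_ID="22.04", so the parsed dict carries the flag that triggers the cgroupfs switch in CloreClient (sketch, other keys omitted):

    get_specs.get_os_release()
    # -> {"NAME": "Ubuntu", "VERSION_ID": "22.04", ..., "use_cgroupfs": True}
    # On releases outside 22.04/22.10 the "use_cgroupfs" key is simply absent.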