From d5620c64c4cd4534244ddb3cd6b3de093dcbe0ee Mon Sep 17 00:00:00 2001 From: clore Date: Thu, 17 Oct 2024 17:01:41 +0000 Subject: [PATCH] allocate /dev/shm towards instances - V5.2.6 --- clore_hosting/main.py | 2 +- lib/docker_cli_wrapper.py | 6 +++++- lib/docker_deploy.py | 10 +++++++++- lib/get_specs.py | 4 ++++ lib/nvml.py | 15 ++++++++++++++- lib/utils.py | 28 +++++++++++++++++++++++++++- 6 files changed, 60 insertions(+), 5 deletions(-) diff --git a/clore_hosting/main.py b/clore_hosting/main.py index 12ec8b1..26390a6 100644 --- a/clore_hosting/main.py +++ b/clore_hosting/main.py @@ -455,7 +455,7 @@ class CloreClient: async def submit_specs(self, current_specs): try: if type(current_specs) == dict: - current_specs["backend_version"]=15 + current_specs["backend_version"]=16 current_specs["update_hw"]=True smallest_pcie_width = 999 for gpu in current_specs["gpus"]["nvidia"]: diff --git a/lib/docker_cli_wrapper.py b/lib/docker_cli_wrapper.py index 762459c..1e1f4f5 100644 --- a/lib/docker_cli_wrapper.py +++ b/lib/docker_cli_wrapper.py @@ -9,7 +9,7 @@ import docker config = config_module.config log = logging_lib.log -def create_container(container_options, ip=None, docker_gpus=False, timeout=30): +def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30): # Sanitize and validate input container_options = sanitize_input(container_options) @@ -55,6 +55,10 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30): if "runtime" in container_options: command.extend(["--runtime", container_options["runtime"]]) + + if shm_size != 64: + command.extend(["--shm-size", f"{shm_size}m"]) + if docker_gpus: if type(docker_gpus)==list: command.extend(['--gpus', '"device=' + ','.join(str(gpu_id) for gpu_id in docker_gpus) + '"']) diff --git a/lib/docker_deploy.py b/lib/docker_deploy.py index 16b7f1b..c696066 100644 --- a/lib/docker_deploy.py +++ b/lib/docker_deploy.py @@ -3,10 +3,13 @@ from lib import logging as logging_lib from lib import docker_cli_wrapper from lib import docker_interface from lib import get_specs +from lib import utils import docker from docker.types import EndpointConfig, NetworkingConfig import os +shm_calculator = utils.shm_calculator(get_specs.get_total_ram_mb()) + client = docker_interface.client config = config_module.config log = logging_lib.log @@ -43,6 +46,7 @@ def deploy(validated_containers, allowed_running_containers=[]): for validated_container in validated_containers: try: + SHM_SIZE = 64 # MB - default image_ready = False docker_gpus = None @@ -89,6 +93,8 @@ def deploy(validated_containers, allowed_running_containers=[]): del container_options["network_mode"] if "gpus" in validated_container and type(validated_container["gpus"])==bool: + if "clore-order-" in validated_container["name"]: + SHM_SIZE = shm_calculator.calculate('*') container_options["runtime"]="nvidia" docker_gpus=True container_options["device_requests"].append(docker.types.DeviceRequest(count=-1, capabilities=[['gpu']])) @@ -128,9 +134,11 @@ def deploy(validated_containers, allowed_running_containers=[]): elif "entrypoint_command" in validated_container and type(validated_container["entrypoint_command"])==str and len(validated_container["entrypoint_command"])>0: container_options["entrypoint"]=validated_container["entrypoint_command"] + container_options["shm_size"] = f"{SHM_SIZE}m" + if not validated_container["name"] in created_container_names and image_ready: if config.creation_engine == "wrapper": - docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), docker_gpus=docker_gpus) + docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus) else: container = client.containers.create(**container_options) if "ip" in validated_container: diff --git a/lib/get_specs.py b/lib/get_specs.py index 673e3ac..6b6fac1 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -43,6 +43,10 @@ def get_kernel(): def is_hive(): return "hive" in get_kernel() +def get_total_ram_mb(): + total_ram = psutil.virtual_memory().total + return total_ram / (1024 ** 2) + def get_os_release(): try: with open("/etc/os-release") as f: diff --git a/lib/nvml.py b/lib/nvml.py index 8b0a567..cc3fea6 100644 --- a/lib/nvml.py +++ b/lib/nvml.py @@ -383,4 +383,17 @@ def get_hive_clock_range(is_hive, gpu_index, part): except Exception as e: return False else: - return False \ No newline at end of file + return False + +def get_vram_per_gpu(): + vram_per_gpu = [] + try: + gpu_count = pynvml.nvmlDeviceGetCount() + for i in range(0,gpu_count): + gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle) + vram_per_gpu.append(mem_info.total / 1024 ** 2) + except Exception as e: + log.error(f"Failed loading get_vram_per_gpu() | {e}") + pass + return vram_per_gpu \ No newline at end of file diff --git a/lib/utils.py b/lib/utils.py index 91b0497..42c2176 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1,11 +1,13 @@ from lib import config as config_module from lib import logging as logging_lib +from lib import nvml import subprocess import hashlib import random import string import shlex import time +import math import json import os @@ -141,4 +143,28 @@ def get_extra_allowed_images(): log.error(f"get_extra_allowed_images() | error: {e}") return [] else: - return [] \ No newline at end of file + return [] + +class shm_calculator: + def __init__(self, total_ram): + self.total_ram = total_ram + self.gpu_vram_sizes = [] + + def calculate(self, used_gpu_ids): + assume_ram_utilised = 2500 #MB + default_shm_size = 64 #MB + + if len(self.gpu_vram_sizes) == 0: + self.gpu_vram_sizes = nvml.get_vram_per_gpu() + + instance_vram_total = 0 + total_vram_size = sum(self.gpu_vram_sizes) + for idx, value in enumerate(self.gpu_vram_sizes): + if used_gpu_ids == '*' or idx in used_gpu_ids: + instance_vram_total += value + if instance_vram_total == 0 or total_vram_size == 0: + return default_shm_size + shm_size = instance_vram_total * 1.5 if instance_vram_total * 1.5 < self.total_ram - assume_ram_utilised else ( + instance_vram_total/total_vram_size * (self.total_ram - assume_ram_utilised) + ) + return math.floor(shm_size if shm_size > default_shm_size else default_shm_size) \ No newline at end of file