allocate /dev/shm towards instances - V5.2.6

clore 2024-10-17 17:01:41 +00:00
parent d6f90ab497
commit d5620c64c4
6 changed files with 60 additions and 5 deletions

View File

@@ -455,7 +455,7 @@ class CloreClient:
    async def submit_specs(self, current_specs):
        try:
            if type(current_specs) == dict:
                current_specs["backend_version"]=15
                current_specs["backend_version"]=16
                current_specs["update_hw"]=True
                smallest_pcie_width = 999
                for gpu in current_specs["gpus"]["nvidia"]:

View File

@@ -9,7 +9,7 @@ import docker
config = config_module.config
log = logging_lib.log
def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30):
    # Sanitize and validate input
    container_options = sanitize_input(container_options)
@@ -55,6 +55,10 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
    if "runtime" in container_options:
        command.extend(["--runtime", container_options["runtime"]])
    if shm_size != 64:
        command.extend(["--shm-size", f"{shm_size}m"])
    if docker_gpus:
        if type(docker_gpus)==list:
            command.extend(['--gpus', '"device=' + ','.join(str(gpu_id) for gpu_id in docker_gpus) + '"'])

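Note: with this change the wrapper only emits the flag when a non-default size is requested. A minimal sketch of how the flag lands on the generated docker run command line (the image name, container name, and 2048 MB value below are illustrative, not from this commit):

# Illustrative only: shows where --shm-size ends up in the assembled command.
command = ["docker", "run", "-d", "--name", "clore-order-123"]
shm_size = 2048  # MB, e.g. as computed by shm_calculator
if shm_size != 64:
    command.extend(["--shm-size", f"{shm_size}m"])
command.append("ubuntu:22.04")
print(" ".join(command))
# docker run -d --name clore-order-123 --shm-size 2048m ubuntu:22.04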
View File

@@ -3,10 +3,13 @@ from lib import logging as logging_lib
from lib import docker_cli_wrapper
from lib import docker_interface
from lib import get_specs
from lib import utils
import docker
from docker.types import EndpointConfig, NetworkingConfig
import os
shm_calculator = utils.shm_calculator(get_specs.get_total_ram_mb())
client = docker_interface.client
config = config_module.config
log = logging_lib.log
@@ -43,6 +46,7 @@ def deploy(validated_containers, allowed_running_containers=[]):
    for validated_container in validated_containers:
        try:
            SHM_SIZE = 64 # MB - default
            image_ready = False
            docker_gpus = None
@@ -89,6 +93,8 @@ def deploy(validated_containers, allowed_running_containers=[]):
                del container_options["network_mode"]
            if "gpus" in validated_container and type(validated_container["gpus"])==bool:
                if "clore-order-" in validated_container["name"]:
                    SHM_SIZE = shm_calculator.calculate('*')
                container_options["runtime"]="nvidia"
                docker_gpus=True
                container_options["device_requests"].append(docker.types.DeviceRequest(count=-1, capabilities=[['gpu']]))
@@ -128,9 +134,11 @@ def deploy(validated_containers, allowed_running_containers=[]):
            elif "entrypoint_command" in validated_container and type(validated_container["entrypoint_command"])==str and len(validated_container["entrypoint_command"])>0:
                container_options["entrypoint"]=validated_container["entrypoint_command"]
            container_options["shm_size"] = f"{SHM_SIZE}m"
            if not validated_container["name"] in created_container_names and image_ready:
                if config.creation_engine == "wrapper":
                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), docker_gpus=docker_gpus)
                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus)
                else:
                    container = client.containers.create(**container_options)
                    if "ip" in validated_container:

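Note: SHM_SIZE stays at the 64 MB default unless the container is a GPU order (name containing "clore-order-"), in which case shm_calculator sizes it from VRAM; the value is then passed as container_options["shm_size"] for the SDK path and as the shm_size argument for the wrapper path. A rough sketch of the SDK path in isolation (image and container name are illustrative assumptions, not from this commit):

# Illustrative only: docker-py accepts shm_size as a string like "2048m".
import docker

client = docker.from_env()
SHM_SIZE = 2048  # MB, e.g. the result of shm_calculator.calculate('*')
container = client.containers.create(
    image="ubuntu:22.04",
    name="clore-order-123",
    shm_size=f"{SHM_SIZE}m",  # mirrors container_options["shm_size"] above
)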
View File

@@ -43,6 +43,10 @@ def get_kernel():
def is_hive():
    return "hive" in get_kernel()
def get_total_ram_mb():
    total_ram = psutil.virtual_memory().total
    return total_ram / (1024 ** 2)
def get_os_release():
    try:
        with open("/etc/os-release") as f:

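Note: psutil reports bytes, so the helper returns mebibytes as a float. A quick standalone check of the same conversion:

# Same conversion as get_total_ram_mb(): bytes -> MiB.
import psutil

total_ram_mb = psutil.virtual_memory().total / (1024 ** 2)
print(f"{total_ram_mb:.0f} MB")  # e.g. ~64265 on a 64 GiB host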
View File

@@ -383,4 +383,17 @@ def get_hive_clock_range(is_hive, gpu_index, part):
    except Exception as e:
        return False
    else:
        return False
        return False
def get_vram_per_gpu():
    vram_per_gpu = []
    try:
        gpu_count = pynvml.nvmlDeviceGetCount()
        for i in range(0,gpu_count):
            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
            vram_per_gpu.append(mem_info.total / 1024 ** 2)
    except Exception as e:
        log.error(f"Failed loading get_vram_per_gpu() | {e}")
        pass
    return vram_per_gpu

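Note: get_vram_per_gpu() relies on pynvml already being initialised elsewhere in this module. A self-contained sketch of the same lookup, with explicit init/shutdown, would look roughly like this:

# Standalone equivalent (illustrative): reports total VRAM per GPU in MiB.
import pynvml

pynvml.nvmlInit()
vram_per_gpu = []
for i in range(pynvml.nvmlDeviceGetCount()):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    vram_per_gpu.append(pynvml.nvmlDeviceGetMemoryInfo(handle).total / 1024 ** 2)
pynvml.nvmlShutdown()
print(vram_per_gpu)  # e.g. [24576.0, 24576.0] for two 24 GB cards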
View File

@@ -1,11 +1,13 @@
from lib import config as config_module
from lib import logging as logging_lib
from lib import nvml
import subprocess
import hashlib
import random
import string
import shlex
import time
import math
import json
import os
@@ -141,4 +143,28 @@ def get_extra_allowed_images():
        log.error(f"get_extra_allowed_images() | error: {e}")
        return []
    else:
        return []
        return []
class shm_calculator:
    def __init__(self, total_ram):
        self.total_ram = total_ram
        self.gpu_vram_sizes = []

    def calculate(self, used_gpu_ids):
        assume_ram_utilised = 2500 #MB
        default_shm_size = 64 #MB

        if len(self.gpu_vram_sizes) == 0:
            self.gpu_vram_sizes = nvml.get_vram_per_gpu()

        instance_vram_total = 0
        total_vram_size = sum(self.gpu_vram_sizes)
        for idx, value in enumerate(self.gpu_vram_sizes):
            if used_gpu_ids == '*' or idx in used_gpu_ids:
                instance_vram_total += value
        if instance_vram_total == 0 or total_vram_size == 0:
            return default_shm_size
        shm_size = instance_vram_total * 1.5 if instance_vram_total * 1.5 < self.total_ram - assume_ram_utilised else (
            instance_vram_total/total_vram_size * (self.total_ram - assume_ram_utilised)
        )
        return math.floor(shm_size if shm_size > default_shm_size else default_shm_size)
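Note: the rule implemented above is roughly 1.5x the VRAM assigned to the instance, capped by the instance's proportional share of host RAM minus an assumed 2500 MB of system usage, and never below 64 MB. A worked usage sketch, assuming the class is importable as utils.shm_calculator and using illustrative hardware numbers:

# Illustrative only: host with 2 x 24576 MB GPUs and 49152 MB RAM.
from lib import utils

calc = utils.shm_calculator(49152)    # total_ram in MB
calc.gpu_vram_sizes = [24576, 24576]  # pre-filled to skip the NVML lookup

# Instance on GPU 0: 1.5 * 24576 = 36864 MB, below 49152 - 2500 = 46652 MB,
# so the instance gets 36864 MB of /dev/shm.
print(calc.calculate([0]))            # 36864

# Instance on all GPUs: 1.5 * 49152 = 73728 MB exceeds 46652 MB, so it falls
# back to the proportional share: 49152/49152 * 46652 = 46652 MB.
print(calc.calculate('*'))            # 46652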