From d5620c64c4cd4534244ddb3cd6b3de093dcbe0ee Mon Sep 17 00:00:00 2001
From: clore <support@clore.ai>
Date: Thu, 17 Oct 2024 17:01:41 +0000
Subject: [PATCH] allocate /dev/shm towards instances - V5.2.6

---
 clore_hosting/main.py     |  2 +-
 lib/docker_cli_wrapper.py |  6 +++++-
 lib/docker_deploy.py      | 10 +++++++++-
 lib/get_specs.py          |  4 ++++
 lib/nvml.py               | 15 ++++++++++++++-
 lib/utils.py              | 28 +++++++++++++++++++++++++++-
 6 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/clore_hosting/main.py b/clore_hosting/main.py
index 12ec8b1..26390a6 100644
--- a/clore_hosting/main.py
+++ b/clore_hosting/main.py
@@ -455,7 +455,7 @@ class CloreClient:
     async def submit_specs(self, current_specs):
         try:
             if type(current_specs) == dict:
-                current_specs["backend_version"]=15
+                current_specs["backend_version"]=16
                 current_specs["update_hw"]=True
                 smallest_pcie_width = 999
                 for gpu in current_specs["gpus"]["nvidia"]:
diff --git a/lib/docker_cli_wrapper.py b/lib/docker_cli_wrapper.py
index 762459c..1e1f4f5 100644
--- a/lib/docker_cli_wrapper.py
+++ b/lib/docker_cli_wrapper.py
@@ -9,7 +9,7 @@ import docker
 config = config_module.config
 log = logging_lib.log
 
-def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
+def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30):
     # Sanitize and validate input
     container_options = sanitize_input(container_options)
 
@@ -55,6 +55,10 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
 
     if "runtime" in container_options:
         command.extend(["--runtime", container_options["runtime"]])
+    
+    if shm_size != 64:
+        command.extend(["--shm-size", f"{shm_size}m"])
+
     if docker_gpus:
         if type(docker_gpus)==list:
             command.extend(['--gpus', '"device=' + ','.join(str(gpu_id) for gpu_id in docker_gpus) + '"'])
diff --git a/lib/docker_deploy.py b/lib/docker_deploy.py
index 16b7f1b..c696066 100644
--- a/lib/docker_deploy.py
+++ b/lib/docker_deploy.py
@@ -3,10 +3,13 @@ from lib import logging as logging_lib
 from lib import docker_cli_wrapper
 from lib import docker_interface
 from lib import get_specs
+from lib import utils
 import docker
 from docker.types import EndpointConfig, NetworkingConfig
 import os
 
+shm_calculator = utils.shm_calculator(get_specs.get_total_ram_mb())
+
 client = docker_interface.client
 config = config_module.config
 log = logging_lib.log
@@ -43,6 +46,7 @@ def deploy(validated_containers, allowed_running_containers=[]):
 
     for validated_container in validated_containers:
         try:
+            SHM_SIZE = 64 # MB - default
 
             image_ready = False
             docker_gpus = None
@@ -89,6 +93,8 @@ def deploy(validated_containers, allowed_running_containers=[]):
                 del container_options["network_mode"]
 
             if "gpus" in validated_container and type(validated_container["gpus"])==bool:
+                if "clore-order-" in validated_container["name"]:
+                    SHM_SIZE = shm_calculator.calculate('*')
                 container_options["runtime"]="nvidia"
                 docker_gpus=True
                 container_options["device_requests"].append(docker.types.DeviceRequest(count=-1, capabilities=[['gpu']]))
@@ -128,9 +134,11 @@ def deploy(validated_containers, allowed_running_containers=[]):
             elif "entrypoint_command" in validated_container and type(validated_container["entrypoint_command"])==str and len(validated_container["entrypoint_command"])>0:
                 container_options["entrypoint"]=validated_container["entrypoint_command"]
 
+            container_options["shm_size"] = f"{SHM_SIZE}m"
+
             if not validated_container["name"] in created_container_names and image_ready:
                 if config.creation_engine == "wrapper":
-                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), docker_gpus=docker_gpus)
+                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus)
                 else:
                     container = client.containers.create(**container_options)
                     if "ip" in validated_container:
diff --git a/lib/get_specs.py b/lib/get_specs.py
index 673e3ac..6b6fac1 100644
--- a/lib/get_specs.py
+++ b/lib/get_specs.py
@@ -43,6 +43,10 @@ def get_kernel():
 def is_hive():
     return "hive" in get_kernel()
 
+def get_total_ram_mb():
+    """Return the host's total RAM as a float number of MiB (psutil byte count / 1024**2)."""
+    return psutil.virtual_memory().total / (1024 ** 2)
+
 def get_os_release():
     try:
         with open("/etc/os-release") as f:
diff --git a/lib/nvml.py b/lib/nvml.py
index 8b0a567..cc3fea6 100644
--- a/lib/nvml.py
+++ b/lib/nvml.py
@@ -383,4 +383,17 @@ def get_hive_clock_range(is_hive, gpu_index, part):
         except Exception as e:
             return False
     else:
-        return False
\ No newline at end of file
+        return False
+
+def get_vram_per_gpu():
+    """Return each GPU's total memory divided by 1024**2, ordered by NVML index (partial/empty on error)."""
+    sizes = []
+    try:
+        for gpu_idx in range(pynvml.nvmlDeviceGetCount()):
+            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
+            # Scale the reported total down by 1024**2 before storing it.
+            sizes.append(pynvml.nvmlDeviceGetMemoryInfo(handle).total / 1024 ** 2)
+    except Exception as e:
+        # Best effort: log the failure and return whatever was collected so far.
+        log.error(f"Failed loading get_vram_per_gpu() | {e}")
+    return sizes
\ No newline at end of file
diff --git a/lib/utils.py b/lib/utils.py
index 91b0497..42c2176 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -1,11 +1,13 @@
 from lib import config as config_module
 from lib import logging as logging_lib
+from lib import nvml
 import subprocess
 import hashlib
 import random
 import string
 import shlex
 import time
+import math
 import json
 import os
 
@@ -141,4 +143,28 @@ def get_extra_allowed_images():
             log.error(f"get_extra_allowed_images() | error: {e}")
             return []
     else:
-        return []
\ No newline at end of file
+        return []
+    
+class shm_calculator:
+    """Derives a shared-memory size (MB) for an instance from host RAM and GPU VRAM."""
+    def __init__(self, total_ram):
+        self.total_ram = total_ram  # host RAM, same unit as the VRAM sizes (MB)
+        self.gpu_vram_sizes = []  # per-GPU VRAM, filled lazily by calculate()
+
+    def calculate(self, used_gpu_ids):
+        """Return whole MB (at least 64) for the GPU indexes in used_gpu_ids; '*' selects every GPU."""
+        reserved_ram, floor_mb = 2500, 64  # MB assumed already utilised / default size
+        if not self.gpu_vram_sizes:
+            # Nothing cached yet; an empty result is simply re-queried on the next call.
+            self.gpu_vram_sizes = nvml.get_vram_per_gpu()
+        vram_all = sum(self.gpu_vram_sizes)
+        vram_used = 0
+        for gpu_idx, vram in enumerate(self.gpu_vram_sizes):
+            if used_gpu_ids == '*' or gpu_idx in used_gpu_ids:
+                vram_used += vram
+        if vram_used == 0 or vram_all == 0:
+            return floor_mb
+        ram_budget = self.total_ram - reserved_ram
+        # Take 1.5x the instance VRAM if it fits the budget, else the VRAM-proportional share.
+        wanted = vram_used * 1.5 if vram_used * 1.5 < ram_budget else vram_used / vram_all * ram_budget
+        return math.floor(wanted if wanted > floor_mb else floor_mb)
\ No newline at end of file