allocate /dev/shm towards instances - V5.2.6

clore 2024-10-17 17:01:41 +00:00
parent d6f90ab497
commit d5620c64c4
6 changed files with 60 additions and 5 deletions

View File

@@ -455,7 +455,7 @@ class CloreClient:
    async def submit_specs(self, current_specs):
        try:
            if type(current_specs) == dict:
                current_specs["backend_version"]=15
                current_specs["backend_version"]=16
                current_specs["update_hw"]=True
                smallest_pcie_width = 999
                for gpu in current_specs["gpus"]["nvidia"]:

View File

@@ -9,7 +9,7 @@ import docker
config = config_module.config
log = logging_lib.log
def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30):
    # Sanitize and validate input
    container_options = sanitize_input(container_options)
@@ -55,6 +55,10 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
    if "runtime" in container_options:
        command.extend(["--runtime", container_options["runtime"]])
    if shm_size != 64:
        command.extend(["--shm-size", f"{shm_size}m"])
    if docker_gpus:
        if type(docker_gpus)==list:
            command.extend(['--gpus', '"device=' + ','.join(str(gpu_id) for gpu_id in docker_gpus) + '"'])

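Note: with this change the wrapper only emits the flag when a non-default size is requested. A minimal sketch of how the flag lands on the generated docker run command line (the image name, container name, and 2048 MB value below are illustrative, not from this commit):

# Illustrative only: shows where --shm-size ends up in the assembled command.
command = ["docker", "run", "-d", "--name", "clore-order-123"]
shm_size = 2048  # MB, e.g. as computed by shm_calculator
if shm_size != 64:
    command.extend(["--shm-size", f"{shm_size}m"])
command.append("ubuntu:22.04")
print(" ".join(command))
# docker run -d --name clore-order-123 --shm-size 2048m ubuntu:22.04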
View File

@@ -3,10 +3,13 @@ from lib import logging as logging_lib
from lib import docker_cli_wrapper
from lib import docker_interface
from lib import get_specs
from lib import utils
import docker
from docker.types import EndpointConfig, NetworkingConfig
import os
shm_calculator = utils.shm_calculator(get_specs.get_total_ram_mb())
client = docker_interface.client
config = config_module.config
log = logging_lib.log
@@ -43,6 +46,7 @@ def deploy(validated_containers, allowed_running_containers=[]):
    for validated_container in validated_containers:
        try:
            SHM_SIZE = 64 # MB - default
            image_ready = False
            docker_gpus = None
@@ -89,6 +93,8 @@ def deploy(validated_containers, allowed_running_containers=[]):
                del container_options["network_mode"]
            if "gpus" in validated_container and type(validated_container["gpus"])==bool:
                if "clore-order-" in validated_container["name"]:
                    SHM_SIZE = shm_calculator.calculate('*')
                container_options["runtime"]="nvidia"
                docker_gpus=True
                container_options["device_requests"].append(docker.types.DeviceRequest(count=-1, capabilities=[['gpu']]))
@@ -128,9 +134,11 @@ def deploy(validated_containers, allowed_running_containers=[]):
            elif "entrypoint_command" in validated_container and type(validated_container["entrypoint_command"])==str and len(validated_container["entrypoint_command"])>0:
                container_options["entrypoint"]=validated_container["entrypoint_command"]
            container_options["shm_size"] = f"{SHM_SIZE}m"
            if not validated_container["name"] in created_container_names and image_ready:
                if config.creation_engine == "wrapper":
                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), docker_gpus=docker_gpus)
                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus)
                else:
                    container = client.containers.create(**container_options)
                    if "ip" in validated_container:

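Note: SHM_SIZE stays at the 64 MB default unless the container is a GPU order (name containing "clore-order-"), in which case shm_calculator sizes it from VRAM; the value is then passed as container_options["shm_size"] for the SDK path and as the shm_size argument for the wrapper path. A rough sketch of the SDK path in isolation (image and container name are illustrative assumptions, not from this commit):

# Illustrative only: docker-py accepts shm_size as a string like "2048m".
import docker

client = docker.from_env()
SHM_SIZE = 2048  # MB, e.g. the result of shm_calculator.calculate('*')
container = client.containers.create(
    image="ubuntu:22.04",
    name="clore-order-123",
    shm_size=f"{SHM_SIZE}m",  # mirrors container_options["shm_size"] above
)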
View File

@@ -43,6 +43,10 @@ def get_kernel():
def is_hive():
    return "hive" in get_kernel()
def get_total_ram_mb():
    total_ram = psutil.virtual_memory().total
    return total_ram / (1024 ** 2)
def get_os_release():
    try:
        with open("/etc/os-release") as f:

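Note: psutil reports bytes, so the helper returns mebibytes as a float. A quick standalone check of the same conversion:

# Same conversion as get_total_ram_mb(): bytes -> MiB.
import psutil

total_ram_mb = psutil.virtual_memory().total / (1024 ** 2)
print(f"{total_ram_mb:.0f} MB")  # e.g. ~64265 on a 64 GiB host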
View File

@@ -383,4 +383,17 @@ def get_hive_clock_range(is_hive, gpu_index, part):
    except Exception as e:
        return False
    else:
        return False
        return False
def get_vram_per_gpu():
    vram_per_gpu = []
    try:
        gpu_count = pynvml.nvmlDeviceGetCount()
        for i in range(0,gpu_count):
            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
            vram_per_gpu.append(mem_info.total / 1024 ** 2)
    except Exception as e:
        log.error(f"Failed loading get_vram_per_gpu() | {e}")
        pass
    return vram_per_gpu

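Note: get_vram_per_gpu() relies on pynvml already being initialised elsewhere in this module. A self-contained sketch of the same lookup, with explicit init/shutdown, would look roughly like this:

# Standalone equivalent (illustrative): reports total VRAM per GPU in MiB.
import pynvml

pynvml.nvmlInit()
vram_per_gpu = []
for i in range(pynvml.nvmlDeviceGetCount()):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    vram_per_gpu.append(pynvml.nvmlDeviceGetMemoryInfo(handle).total / 1024 ** 2)
pynvml.nvmlShutdown()
print(vram_per_gpu)  # e.g. [24576.0, 24576.0] for two 24 GB cards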
View File

@@ -1,11 +1,13 @@
from lib import config as config_module
from lib import logging as logging_lib
from lib import nvml
import subprocess
import hashlib
import random
import string
import shlex
import time
import math
import json
import os
@@ -141,4 +143,28 @@ def get_extra_allowed_images():
        log.error(f"get_extra_allowed_images() | error: {e}")
        return []
    else:
        return []
        return []
class shm_calculator:
    def __init__(self, total_ram):
        self.total_ram = total_ram
        self.gpu_vram_sizes = []

    def calculate(self, used_gpu_ids):
        assume_ram_utilised = 2500 #MB
        default_shm_size = 64 #MB

        if len(self.gpu_vram_sizes) == 0:
            self.gpu_vram_sizes = nvml.get_vram_per_gpu()

        instance_vram_total = 0
        total_vram_size = sum(self.gpu_vram_sizes)
        for idx, value in enumerate(self.gpu_vram_sizes):
            if used_gpu_ids == '*' or idx in used_gpu_ids:
                instance_vram_total += value
        if instance_vram_total == 0 or total_vram_size == 0:
            return default_shm_size
        shm_size = instance_vram_total * 1.5 if instance_vram_total * 1.5 < self.total_ram - assume_ram_utilised else (
            instance_vram_total/total_vram_size * (self.total_ram - assume_ram_utilised)
        )
        return math.floor(shm_size if shm_size > default_shm_size else default_shm_size)
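Note: the rule implemented above is roughly 1.5x the VRAM assigned to the instance, capped by the instance's proportional share of host RAM minus an assumed 2500 MB of system usage, and never below 64 MB. A worked usage sketch, assuming the class is importable as utils.shm_calculator and using illustrative hardware numbers:

# Illustrative only: host with 2 x 24576 MB GPUs and 49152 MB RAM.
from lib import utils

calc = utils.shm_calculator(49152)    # total_ram in MB
calc.gpu_vram_sizes = [24576, 24576]  # pre-filled to skip the NVML lookup

# Instance on GPU 0: 1.5 * 24576 = 36864 MB, below 49152 - 2500 = 46652 MB,
# so the instance gets 36864 MB of /dev/shm.
print(calc.calculate([0]))            # 36864

# Instance on all GPUs: 1.5 * 49152 = 73728 MB exceeds 46652 MB, so it falls
# back to the proportional share: 49152/49152 * 46652 = 46652 MB.
print(calc.calculate('*'))            # 46652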