allocate /dev/shm towards instances - V5.2.6

clore 2024-10-17 17:01:41 +00:00
parent d6f90ab497
commit d5620c64c4
6 changed files with 60 additions and 5 deletions

View File

@@ -455,7 +455,7 @@ class CloreClient:
     async def submit_specs(self, current_specs):
         try:
             if type(current_specs) == dict:
-                current_specs["backend_version"]=15
+                current_specs["backend_version"]=16
                 current_specs["update_hw"]=True
                 smallest_pcie_width = 999
                 for gpu in current_specs["gpus"]["nvidia"]:

View File

@@ -9,7 +9,7 @@ import docker
 config = config_module.config
 log = logging_lib.log

-def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
+def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30):
     # Sanitize and validate input
     container_options = sanitize_input(container_options)
@@ -55,6 +55,10 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
     if "runtime" in container_options:
         command.extend(["--runtime", container_options["runtime"]])

+    if shm_size != 64:
+        command.extend(["--shm-size", f"{shm_size}m"])
+
     if docker_gpus:
         if type(docker_gpus)==list:
             command.extend(['--gpus', '"device=' + ','.join(str(gpu_id) for gpu_id in docker_gpus) + '"'])
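
For orientation, a minimal sketch of what the new parameter does to the generated CLI call (the container name and size are hypothetical; 64 MB is Docker's stock /dev/shm size, which is why the wrapper only emits the flag when the value differs):

command = ["docker", "create", "--name", "clore-order-123"]  # assumed prefix of the real command
shm_size = 4096  # hypothetical value handed in by the deploy loop
if shm_size != 64:
    command.extend(["--shm-size", f"{shm_size}m"])
# resulting invocation: docker create --name clore-order-123 --shm-size 4096m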

View File

@@ -3,10 +3,13 @@ from lib import logging as logging_lib
 from lib import docker_cli_wrapper
 from lib import docker_interface
 from lib import get_specs
+from lib import utils
 import docker
 from docker.types import EndpointConfig, NetworkingConfig
 import os

+shm_calculator = utils.shm_calculator(get_specs.get_total_ram_mb())
+
 client = docker_interface.client
 config = config_module.config
 log = logging_lib.log
@@ -43,6 +46,7 @@ def deploy(validated_containers, allowed_running_containers=[]):
     for validated_container in validated_containers:
         try:
+            SHM_SIZE = 64 # MB - default
             image_ready = False
             docker_gpus = None
@@ -89,6 +93,8 @@ def deploy(validated_containers, allowed_running_containers=[]):
                 del container_options["network_mode"]
             if "gpus" in validated_container and type(validated_container["gpus"])==bool:
+                if "clore-order-" in validated_container["name"]:
+                    SHM_SIZE = shm_calculator.calculate('*')
                 container_options["runtime"]="nvidia"
                 docker_gpus=True
                 container_options["device_requests"].append(docker.types.DeviceRequest(count=-1, capabilities=[['gpu']]))
@@ -128,9 +134,11 @@ def deploy(validated_containers, allowed_running_containers=[]):
             elif "entrypoint_command" in validated_container and type(validated_container["entrypoint_command"])==str and len(validated_container["entrypoint_command"])>0:
                 container_options["entrypoint"]=validated_container["entrypoint_command"]

+            container_options["shm_size"] = f"{SHM_SIZE}m"
+
             if not validated_container["name"] in created_container_names and image_ready:
                 if config.creation_engine == "wrapper":
-                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), docker_gpus=docker_gpus)
+                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus)
                 else:
                     container = client.containers.create(**container_options)
                     if "ip" in validated_container:

View File

@@ -43,6 +43,10 @@ def get_kernel():
 def is_hive():
     return "hive" in get_kernel()

+def get_total_ram_mb():
+    total_ram = psutil.virtual_memory().total
+    return total_ram / (1024 ** 2)
+
 def get_os_release():
     try:
         with open("/etc/os-release") as f:
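
A quick sanity check on the units: psutil reports bytes, so the helper returns binary megabytes (MiB) as a float.

import psutil

# e.g. 68719476736 bytes on a 64 GiB host -> 68719476736 / 1024**2 == 65536.0
print(psutil.virtual_memory().total / (1024 ** 2))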

View File

@@ -383,4 +383,17 @@ def get_hive_clock_range(is_hive, gpu_index, part):
         except Exception as e:
             return False
     else:
         return False
+
+def get_vram_per_gpu():
+    vram_per_gpu = []
+    try:
+        gpu_count = pynvml.nvmlDeviceGetCount()
+        for i in range(0, gpu_count):
+            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+            mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
+            vram_per_gpu.append(mem_info.total / 1024 ** 2)
+    except Exception as e:
+        log.error(f"Failed loading get_vram_per_gpu() | {e}")
+    return vram_per_gpu
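
Standalone usage sketch for the helper above. It assumes NVML is already initialised; inside lib/nvml that presumably happens elsewhere, but in isolation nvmlInit() must be called first:

import pynvml

pynvml.nvmlInit()
print(get_vram_per_gpu())  # e.g. [24576.0, 24576.0] for two 24 GiB GPUs
pynvml.nvmlShutdown()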

View File

@@ -1,11 +1,13 @@
 from lib import config as config_module
 from lib import logging as logging_lib
+from lib import nvml
 import subprocess
 import hashlib
 import random
 import string
 import shlex
 import time
+import math
 import json
 import os
@@ -141,4 +143,28 @@ def get_extra_allowed_images():
         log.error(f"get_extra_allowed_images() | error: {e}")
         return []
     else:
         return []
+
+class shm_calculator:
+    def __init__(self, total_ram):
+        self.total_ram = total_ram
+        self.gpu_vram_sizes = []
+
+    def calculate(self, used_gpu_ids):
+        assume_ram_utilised = 2500 # MB
+        default_shm_size = 64 # MB
+
+        if len(self.gpu_vram_sizes) == 0:
+            self.gpu_vram_sizes = nvml.get_vram_per_gpu()
+
+        instance_vram_total = 0
+        total_vram_size = sum(self.gpu_vram_sizes)
+        for idx, value in enumerate(self.gpu_vram_sizes):
+            if used_gpu_ids == '*' or idx in used_gpu_ids:
+                instance_vram_total += value
+        if instance_vram_total == 0 or total_vram_size == 0:
+            return default_shm_size
+        shm_size = instance_vram_total * 1.5 if instance_vram_total * 1.5 < self.total_ram - assume_ram_utilised else (
+            instance_vram_total / total_vram_size * (self.total_ram - assume_ram_utilised)
+        )
+        return math.floor(shm_size if shm_size > default_shm_size else default_shm_size)
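
To make the sizing rule concrete, a worked example with assumed hardware (a 64 GiB host with two 24 GiB GPUs): the instance gets 1.5x its allocated VRAM as /dev/shm when that fits into total RAM minus the ~2.5 GB assumed for the host, otherwise its VRAM share of the remaining RAM, and never less than the 64 MB default.

calc = shm_calculator(65536)          # total RAM in MB
calc.gpu_vram_sizes = [24576, 24576]  # pre-filled so the sketch skips the NVML query
calc.calculate([0])  # one GPU:   24576 * 1.5 = 36864 < 65536 - 2500  -> 36864
calc.calculate('*')  # both GPUs: 49152 * 1.5 = 73728 >= 63036, so
                     #            49152 / 49152 * (65536 - 2500)      -> 63036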