allocate /dev/shm towards instances - V5.2.6
This commit is contained in:
parent d6f90ab497
commit d5620c64c4
@@ -455,7 +455,7 @@ class CloreClient:
     async def submit_specs(self, current_specs):
         try:
             if type(current_specs) == dict:
-                current_specs["backend_version"]=15
+                current_specs["backend_version"]=16
                 current_specs["update_hw"]=True
                 smallest_pcie_width = 999
                 for gpu in current_specs["gpus"]["nvidia"]:
@@ -9,7 +9,7 @@ import docker
 config = config_module.config
 log = logging_lib.log
 
-def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
+def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30):
     # Sanitize and validate input
     container_options = sanitize_input(container_options)
@@ -55,6 +55,10 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
     if "runtime" in container_options:
         command.extend(["--runtime", container_options["runtime"]])
 
+    if shm_size != 64:
+        command.extend(["--shm-size", f"{shm_size}m"])
+
     if docker_gpus:
         if type(docker_gpus)==list:
             command.extend(['--gpus', '"device=' + ','.join(str(gpu_id) for gpu_id in docker_gpus) + '"'])
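Docker's built-in /dev/shm size is 64 MB, so the wrapper only emits the flag for non-default values. A minimal sketch of the flag assembly above (the 2048 MB value and the bare `command` list are hypothetical; only the `--shm-size` handling mirrors the hunk):

    # hypothetical inputs; mirrors the wrapper's flag assembly
    shm_size = 2048  # MB, e.g. as computed by shm_calculator
    command = ["docker", "create"]
    if shm_size != 64:  # 64 MB is Docker's own default, so no flag is needed then
        command.extend(["--shm-size", f"{shm_size}m"])
    print(" ".join(command))  # docker create --shm-size 2048m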
@@ -3,10 +3,13 @@ from lib import logging as logging_lib
 from lib import docker_cli_wrapper
 from lib import docker_interface
+from lib import get_specs
+from lib import utils
 import docker
 from docker.types import EndpointConfig, NetworkingConfig
 import os
 
+shm_calculator = utils.shm_calculator(get_specs.get_total_ram_mb())
 
 client = docker_interface.client
 config = config_module.config
 log = logging_lib.log
@@ -43,6 +46,7 @@ def deploy(validated_containers, allowed_running_containers=[]):
 
     for validated_container in validated_containers:
         try:
+            SHM_SIZE = 64 # MB - default
 
             image_ready = False
             docker_gpus = None
@@ -89,6 +93,8 @@ def deploy(validated_containers, allowed_running_containers=[]):
                 del container_options["network_mode"]
 
             if "gpus" in validated_container and type(validated_container["gpus"])==bool:
+                if "clore-order-" in validated_container["name"]:
+                    SHM_SIZE = shm_calculator.calculate('*')
                 container_options["runtime"]="nvidia"
                 docker_gpus=True
                 container_options["device_requests"].append(docker.types.DeviceRequest(count=-1, capabilities=[['gpu']]))
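Containers whose name carries the `clore-order-` prefix appear to take every GPU on the host, so the deploy path passes the `'*'` wildcard; judging by the class added to lib/utils.py below, `calculate` also accepts a collection of GPU indices (the `[0, 1]` subset here is hypothetical):

    shm_calculator.calculate('*')     # count all detected GPUs toward the instance
    shm_calculator.calculate([0, 1])  # hypothetical: only GPUs 0 and 1 count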
@@ -128,9 +134,11 @@ def deploy(validated_containers, allowed_running_containers=[]):
             elif "entrypoint_command" in validated_container and type(validated_container["entrypoint_command"])==str and len(validated_container["entrypoint_command"])>0:
                 container_options["entrypoint"]=validated_container["entrypoint_command"]
 
+            container_options["shm_size"] = f"{SHM_SIZE}m"
+
             if not validated_container["name"] in created_container_names and image_ready:
                 if config.creation_engine == "wrapper":
-                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), docker_gpus=docker_gpus)
+                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus)
                 else:
                     container = client.containers.create(**container_options)
                 if "ip" in validated_container:
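The computed size reaches Docker by two routes depending on `config.creation_engine`: the SDK path reads `shm_size` out of `container_options` (docker-py accepts a string such as "2048m" for this key), while the wrapper path takes the integer keyword and renders the `--shm-size` flag itself. A sketch under those assumptions, with hypothetical option values:

    SHM_SIZE = 2048  # MB, hypothetical result of shm_calculator.calculate('*')
    container_options = {"image": "some/image", "name": "clore-order-1"}  # hypothetical
    container_options["shm_size"] = f"{SHM_SIZE}m"  # consumed by docker-py

    # wrapper engine: the integer travels separately so the CLI flag can be built
    # docker_cli_wrapper.create_container(container_options, shm_size=SHM_SIZE, ...)
    # sdk engine: docker-py reads shm_size from the options dict
    # container = client.containers.create(**container_options)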
@@ -43,6 +43,10 @@ def get_kernel():
 def is_hive():
     return "hive" in get_kernel()
 
+def get_total_ram_mb():
+    total_ram = psutil.virtual_memory().total
+    return total_ram / (1024 ** 2)
+
 def get_os_release():
     try:
         with open("/etc/os-release") as f:
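`psutil.virtual_memory().total` reports bytes, so the division by 1024² yields MB. A quick check of the arithmetic (the 64 GiB host is hypothetical):

    import psutil
    total_ram_mb = psutil.virtual_memory().total / (1024 ** 2)
    # e.g. 68719476736 / 1048576 = 65536.0 MB on a 64 GiB host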
lib/nvml.py (15 lines changed)
@@ -383,4 +383,17 @@ def get_hive_clock_range(is_hive, gpu_index, part):
     except Exception as e:
         return False
     else:
         return False
     return False
+
+def get_vram_per_gpu():
+    vram_per_gpu = []
+    try:
+        gpu_count = pynvml.nvmlDeviceGetCount()
+        for i in range(0, gpu_count):
+            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+            mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
+            vram_per_gpu.append(mem_info.total / 1024 ** 2)
+    except Exception as e:
+        log.error(f"Failed loading get_vram_per_gpu() | {e}")
+        pass
+    return vram_per_gpu
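A usage sketch for the new helper; it assumes NVML is already initialized via `pynvml.nvmlInit()`, which lib/nvml.py presumably does during its own setup (the two-GPU output is hypothetical):

    import pynvml

    pynvml.nvmlInit()          # must precede any NVML query; assumed done at module setup
    sizes = get_vram_per_gpu()
    print(sizes)               # e.g. [24576.0, 24576.0] for two 24 GiB cards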
lib/utils.py (28 lines changed)
@@ -1,11 +1,13 @@
 from lib import config as config_module
 from lib import logging as logging_lib
+from lib import nvml
 import subprocess
 import hashlib
 import random
 import string
 import shlex
 import time
+import math
 import json
 import os
 
@@ -141,4 +143,28 @@ def get_extra_allowed_images():
         log.error(f"get_extra_allowed_images() | error: {e}")
         return []
     else:
         return []
     return []
+
+class shm_calculator:
+    def __init__(self, total_ram):
+        self.total_ram = total_ram
+        self.gpu_vram_sizes = []
+
+    def calculate(self, used_gpu_ids):
+        assume_ram_utilised = 2500 #MB
+        default_shm_size = 64 #MB
+
+        if len(self.gpu_vram_sizes) == 0:
+            self.gpu_vram_sizes = nvml.get_vram_per_gpu()
+
+        instance_vram_total = 0
+        total_vram_size = sum(self.gpu_vram_sizes)
+        for idx, value in enumerate(self.gpu_vram_sizes):
+            if used_gpu_ids == '*' or idx in used_gpu_ids:
+                instance_vram_total += value
+        if instance_vram_total == 0 or total_vram_size == 0:
+            return default_shm_size
+        shm_size = instance_vram_total * 1.5 if instance_vram_total * 1.5 < self.total_ram - assume_ram_utilised else (
+            instance_vram_total / total_vram_size * (self.total_ram - assume_ram_utilised)
+        )
+        return math.floor(shm_size if shm_size > default_shm_size else default_shm_size)
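In effect, an instance's /dev/shm gets 1.5x its allotted VRAM when that fits inside total RAM minus a 2500 MB headroom allowance, and otherwise its VRAM-proportional share of that remainder, floored and never below the 64 MB default. A worked sketch with hypothetical hardware, pre-seeding the VRAM list to skip the NVML query:

    calc = shm_calculator(65536)              # hypothetical 64 GiB host
    calc.gpu_vram_sizes = [24576.0, 24576.0]  # two hypothetical 24 GiB GPUs

    calc.calculate([0])  # 24576 * 1.5 = 36864 < 65536 - 2500 = 63036 -> 36864
    calc.calculate('*')  # 49152 * 1.5 = 73728 >= 63036 -> 49152/49152 * 63036 -> 63036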