allocate /dev/shm towards instances - V5.2.6
This commit is contained in:
parent
d6f90ab497
commit
d5620c64c4
|
@ -455,7 +455,7 @@ class CloreClient:
|
||||||
async def submit_specs(self, current_specs):
|
async def submit_specs(self, current_specs):
|
||||||
try:
|
try:
|
||||||
if type(current_specs) == dict:
|
if type(current_specs) == dict:
|
||||||
current_specs["backend_version"]=15
|
current_specs["backend_version"]=16
|
||||||
current_specs["update_hw"]=True
|
current_specs["update_hw"]=True
|
||||||
smallest_pcie_width = 999
|
smallest_pcie_width = 999
|
||||||
for gpu in current_specs["gpus"]["nvidia"]:
|
for gpu in current_specs["gpus"]["nvidia"]:
|
||||||
|
|
|
@ -9,7 +9,7 @@ import docker
|
||||||
config = config_module.config
|
config = config_module.config
|
||||||
log = logging_lib.log
|
log = logging_lib.log
|
||||||
|
|
||||||
def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
|
def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30):
|
||||||
# Sanitize and validate input
|
# Sanitize and validate input
|
||||||
container_options = sanitize_input(container_options)
|
container_options = sanitize_input(container_options)
|
||||||
|
|
||||||
|
@ -55,6 +55,10 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
|
||||||
|
|
||||||
if "runtime" in container_options:
|
if "runtime" in container_options:
|
||||||
command.extend(["--runtime", container_options["runtime"]])
|
command.extend(["--runtime", container_options["runtime"]])
|
||||||
|
|
||||||
|
if shm_size != 64:
|
||||||
|
command.extend(["--shm-size", f"{shm_size}m"])
|
||||||
|
|
||||||
if docker_gpus:
|
if docker_gpus:
|
||||||
if type(docker_gpus)==list:
|
if type(docker_gpus)==list:
|
||||||
command.extend(['--gpus', '"device=' + ','.join(str(gpu_id) for gpu_id in docker_gpus) + '"'])
|
command.extend(['--gpus', '"device=' + ','.join(str(gpu_id) for gpu_id in docker_gpus) + '"'])
|
||||||
|
|
|
@ -3,10 +3,13 @@ from lib import logging as logging_lib
|
||||||
from lib import docker_cli_wrapper
|
from lib import docker_cli_wrapper
|
||||||
from lib import docker_interface
|
from lib import docker_interface
|
||||||
from lib import get_specs
|
from lib import get_specs
|
||||||
|
from lib import utils
|
||||||
import docker
|
import docker
|
||||||
from docker.types import EndpointConfig, NetworkingConfig
|
from docker.types import EndpointConfig, NetworkingConfig
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
shm_calculator = utils.shm_calculator(get_specs.get_total_ram_mb())
|
||||||
|
|
||||||
client = docker_interface.client
|
client = docker_interface.client
|
||||||
config = config_module.config
|
config = config_module.config
|
||||||
log = logging_lib.log
|
log = logging_lib.log
|
||||||
|
@ -43,6 +46,7 @@ def deploy(validated_containers, allowed_running_containers=[]):
|
||||||
|
|
||||||
for validated_container in validated_containers:
|
for validated_container in validated_containers:
|
||||||
try:
|
try:
|
||||||
|
SHM_SIZE = 64 # MB - default
|
||||||
|
|
||||||
image_ready = False
|
image_ready = False
|
||||||
docker_gpus = None
|
docker_gpus = None
|
||||||
|
@ -89,6 +93,8 @@ def deploy(validated_containers, allowed_running_containers=[]):
|
||||||
del container_options["network_mode"]
|
del container_options["network_mode"]
|
||||||
|
|
||||||
if "gpus" in validated_container and type(validated_container["gpus"])==bool:
|
if "gpus" in validated_container and type(validated_container["gpus"])==bool:
|
||||||
|
if "clore-order-" in validated_container["name"]:
|
||||||
|
SHM_SIZE = shm_calculator.calculate('*')
|
||||||
container_options["runtime"]="nvidia"
|
container_options["runtime"]="nvidia"
|
||||||
docker_gpus=True
|
docker_gpus=True
|
||||||
container_options["device_requests"].append(docker.types.DeviceRequest(count=-1, capabilities=[['gpu']]))
|
container_options["device_requests"].append(docker.types.DeviceRequest(count=-1, capabilities=[['gpu']]))
|
||||||
|
@ -128,9 +134,11 @@ def deploy(validated_containers, allowed_running_containers=[]):
|
||||||
elif "entrypoint_command" in validated_container and type(validated_container["entrypoint_command"])==str and len(validated_container["entrypoint_command"])>0:
|
elif "entrypoint_command" in validated_container and type(validated_container["entrypoint_command"])==str and len(validated_container["entrypoint_command"])>0:
|
||||||
container_options["entrypoint"]=validated_container["entrypoint_command"]
|
container_options["entrypoint"]=validated_container["entrypoint_command"]
|
||||||
|
|
||||||
|
container_options["shm_size"] = f"{SHM_SIZE}m"
|
||||||
|
|
||||||
if not validated_container["name"] in created_container_names and image_ready:
|
if not validated_container["name"] in created_container_names and image_ready:
|
||||||
if config.creation_engine == "wrapper":
|
if config.creation_engine == "wrapper":
|
||||||
docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), docker_gpus=docker_gpus)
|
docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus)
|
||||||
else:
|
else:
|
||||||
container = client.containers.create(**container_options)
|
container = client.containers.create(**container_options)
|
||||||
if "ip" in validated_container:
|
if "ip" in validated_container:
|
||||||
|
|
|
@ -43,6 +43,10 @@ def get_kernel():
|
||||||
def is_hive():
|
def is_hive():
|
||||||
return "hive" in get_kernel()
|
return "hive" in get_kernel()
|
||||||
|
|
||||||
|
def get_total_ram_mb():
    """Return the host's total physical RAM expressed in megabytes (float)."""
    # psutil reports bytes; convert to MB for the shm sizing math downstream.
    bytes_total = psutil.virtual_memory().total
    return bytes_total / (1024 * 1024)
|
||||||
|
|
||||||
def get_os_release():
|
def get_os_release():
|
||||||
try:
|
try:
|
||||||
with open("/etc/os-release") as f:
|
with open("/etc/os-release") as f:
|
||||||
|
|
15
lib/nvml.py
15
lib/nvml.py
|
@ -383,4 +383,17 @@ def get_hive_clock_range(is_hive, gpu_index, part):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def get_vram_per_gpu():
    """Query NVML and return a list with each visible GPU's total VRAM in MB.

    Best-effort: on any NVML failure the error is logged and whatever was
    collected so far (possibly an empty list) is returned.
    """
    sizes = []
    try:
        for idx in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            sizes.append(mem.total / 1024 ** 2)
    except Exception as e:
        log.error(f"Failed loading get_vram_per_gpu() | {e}")
    return sizes
|
28
lib/utils.py
28
lib/utils.py
|
@ -1,11 +1,13 @@
|
||||||
from lib import config as config_module
|
from lib import config as config_module
|
||||||
from lib import logging as logging_lib
|
from lib import logging as logging_lib
|
||||||
|
from lib import nvml
|
||||||
import subprocess
|
import subprocess
|
||||||
import hashlib
|
import hashlib
|
||||||
import random
|
import random
|
||||||
import string
|
import string
|
||||||
import shlex
|
import shlex
|
||||||
import time
|
import time
|
||||||
|
import math
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
@ -141,4 +143,28 @@ def get_extra_allowed_images():
|
||||||
log.error(f"get_extra_allowed_images() | error: {e}")
|
log.error(f"get_extra_allowed_images() | error: {e}")
|
||||||
return []
|
return []
|
||||||
else:
|
else:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
class shm_calculator:
    """Compute the /dev/shm size (MB) to grant a container instance.

    Sizing is driven by how much of the machine's total GPU VRAM the
    instance was allocated, capped so the host keeps RAM for itself.
    """

    def __init__(self, total_ram):
        # Total host RAM in MB.
        self.total_ram = total_ram
        # Per-GPU VRAM sizes (MB); lazily populated on first calculate().
        self.gpu_vram_sizes = []

    def calculate(self, used_gpu_ids):
        """Return the shm size in MB for an instance using *used_gpu_ids*.

        used_gpu_ids is '*' (all GPUs) or an iterable of GPU indices.
        Falls back to Docker's 64 MB default when VRAM info is unavailable.
        """
        assume_ram_utilised = 2500  # MB assumed reserved for the host itself
        default_shm_size = 64       # MB — Docker's stock default

        # Fetch VRAM sizes once and memoize them on the instance.
        if not self.gpu_vram_sizes:
            self.gpu_vram_sizes = nvml.get_vram_per_gpu()

        total_vram_size = sum(self.gpu_vram_sizes)
        instance_vram_total = sum(
            size
            for idx, size in enumerate(self.gpu_vram_sizes)
            if used_gpu_ids == '*' or idx in used_gpu_ids
        )

        if instance_vram_total == 0 or total_vram_size == 0:
            return default_shm_size

        usable_ram = self.total_ram - assume_ram_utilised
        if instance_vram_total * 1.5 < usable_ram:
            # Plenty of headroom: give the instance 1.5x its VRAM.
            shm_size = instance_vram_total * 1.5
        else:
            # Otherwise hand out a VRAM-proportional share of usable RAM.
            shm_size = instance_vram_total / total_vram_size * usable_ram

        return math.floor(max(shm_size, default_shm_size))
|
Loading…
Reference in New Issue