gigaspot
This commit is contained in:
parent
d2c4bb6044
commit
4e1e72da25
|
@ -151,6 +151,9 @@ class CloreClient:
|
||||||
|
|
||||||
self.runned_pull_selftest = False
|
self.runned_pull_selftest = False
|
||||||
|
|
||||||
|
WebSocketClient.set_gpu_list(nvml.get_gpu_name_list())
|
||||||
|
WebSocketClient.set_is_hive(self.is_hive)
|
||||||
|
|
||||||
async def service(self):
|
async def service(self):
|
||||||
global container_log_broken
|
global container_log_broken
|
||||||
|
|
||||||
|
|
|
@ -55,7 +55,16 @@ class WebSocketClient:
|
||||||
|
|
||||||
self.clore_partner_config = None
|
self.clore_partner_config = None
|
||||||
self.forwarding_latency_measurment = None
|
self.forwarding_latency_measurment = None
|
||||||
|
|
||||||
|
self.gpu_list = []
|
||||||
|
self.is_hive = False
|
||||||
|
|
||||||
|
def set_gpu_list(self, gpu_list):
|
||||||
|
self.gpu_list = gpu_list
|
||||||
|
|
||||||
|
def set_is_hive(self, is_hive):
|
||||||
|
self.is_hive = is_hive
|
||||||
|
|
||||||
def get_last_heartbeat(self):
|
def get_last_heartbeat(self):
|
||||||
return self.last_heartbeat
|
return self.last_heartbeat
|
||||||
|
|
||||||
|
@ -113,7 +122,9 @@ class WebSocketClient:
|
||||||
"login":str(self.auth),
|
"login":str(self.auth),
|
||||||
"xfs_state": self.xfs_state,
|
"xfs_state": self.xfs_state,
|
||||||
"type":"python",
|
"type":"python",
|
||||||
"clore_partner_support": True
|
"clore_partner_support": True,
|
||||||
|
"gpu_list": self.gpu_list,
|
||||||
|
"is_hive": self.is_hive
|
||||||
}))
|
}))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.debug(f"CLOREWS | Connection to {random_ws_peer} failed: {e} ❌")
|
log.debug(f"CLOREWS | Connection to {random_ws_peer} failed: {e} ❌")
|
||||||
|
|
|
@ -9,11 +9,11 @@ import docker
|
||||||
config = config_module.config
|
config = config_module.config
|
||||||
log = logging_lib.log
|
log = logging_lib.log
|
||||||
|
|
||||||
def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30):
|
def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30, paused=False):
|
||||||
# Sanitize and validate input
|
# Sanitize and validate input
|
||||||
container_options = sanitize_input(container_options)
|
container_options = sanitize_input(container_options)
|
||||||
|
|
||||||
command = ["docker", "run", "--detach", "--tty"]
|
command = ["docker", ("create" if paused else "run"), "--detach", "--tty"]
|
||||||
|
|
||||||
if "name" in container_options:
|
if "name" in container_options:
|
||||||
command.extend(["--name", container_options["name"]])
|
command.extend(["--name", container_options["name"]])
|
||||||
|
|
|
@ -151,7 +151,7 @@ def deploy(validated_containers, allowed_running_containers=[], can_run_partner_
|
||||||
|
|
||||||
if not validated_container["name"] in created_container_names and image_ready and not (not background_job.is_enabled() and background_job.is_background_job_container_name(validated_container["name"])):
|
if not validated_container["name"] in created_container_names and image_ready and not (not background_job.is_enabled() and background_job.is_background_job_container_name(validated_container["name"])):
|
||||||
if config.creation_engine == "wrapper":
|
if config.creation_engine == "wrapper":
|
||||||
docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus)
|
docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus, paused="paused" in validated_container)
|
||||||
else:
|
else:
|
||||||
container = client.containers.create(**container_options)
|
container = client.containers.create(**container_options)
|
||||||
if "ip" in validated_container:
|
if "ip" in validated_container:
|
||||||
|
|
|
@ -68,10 +68,11 @@ GPU_CORE_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
|
||||||
|
|
||||||
is_hive = False
|
is_hive = False
|
||||||
all_gpus_data_list=[]
|
all_gpus_data_list=[]
|
||||||
|
gpu_name_list=[]
|
||||||
get_data_fail=False
|
get_data_fail=False
|
||||||
|
|
||||||
def init(gpu_specs_file=None, allow_hive_binaries=True):
|
def init(gpu_specs_file=None, allow_hive_binaries=True):
|
||||||
global is_hive, all_gpus_data_list, get_data_fail
|
global is_hive, all_gpus_data_list, get_data_fail, gpu_name_list
|
||||||
log.info("Loading GPU OC specs [ working ]")
|
log.info("Loading GPU OC specs [ working ]")
|
||||||
try:
|
try:
|
||||||
pynvml.nvmlInit()
|
pynvml.nvmlInit()
|
||||||
|
@ -96,6 +97,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
|
||||||
break
|
break
|
||||||
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
|
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
|
||||||
gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
|
gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
|
||||||
|
gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle))
|
||||||
if not f"{i}-{gpu_uuid}" in parsed_specs_keys:
|
if not f"{i}-{gpu_uuid}" in parsed_specs_keys:
|
||||||
parsed_specs={}
|
parsed_specs={}
|
||||||
regenerate_specs=True
|
regenerate_specs=True
|
||||||
|
@ -117,6 +119,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
|
||||||
gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
|
gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
|
||||||
gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
|
gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
|
||||||
gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
|
gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
|
||||||
|
gpu_name_list.append(gpu_spec["name"])
|
||||||
gpu_spec["locks"] = mem_to_core_allowed_locks
|
gpu_spec["locks"] = mem_to_core_allowed_locks
|
||||||
|
|
||||||
pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
|
pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
|
||||||
|
@ -201,6 +204,10 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
|
||||||
print(all_gpus_data_list)
|
print(all_gpus_data_list)
|
||||||
# Load GPU specs
|
# Load GPU specs
|
||||||
|
|
||||||
|
def get_gpu_name_list():
|
||||||
|
global gpu_name_list
|
||||||
|
return gpu_name_list
|
||||||
|
|
||||||
def get_gpu_oc_specs():
|
def get_gpu_oc_specs():
|
||||||
global get_data_fail
|
global get_data_fail
|
||||||
if get_data_fail:
|
if get_data_fail:
|
||||||
|
|
Loading…
Reference in New Issue