diff --git a/clore_hosting/main.py b/clore_hosting/main.py
index 988b965..34ed3a2 100644
--- a/clore_hosting/main.py
+++ b/clore_hosting/main.py
@@ -554,6 +554,7 @@ class CloreClient:
             try:
                 await monitoring.put("specs_service")
                 current_specs = await specs.get()
+
                 if self.last_hw_specs_submit < (utils.unix_timestamp()-1800):
                     self.last_hw_specs_submit=utils.unix_timestamp()
                     await self.submit_specs(current_specs)
diff --git a/lib/constants.py b/lib/constants.py
new file mode 100644
index 0000000..e5b87fa
--- /dev/null
+++ b/lib/constants.py
@@ -0,0 +1,4 @@
+GPU_ID_TO_NAME = {
+    "0x20c210de": "NVIDIA CMP 170HX",
+    "0x208210de": "NVIDIA CMP 170HX"
+}
diff --git a/lib/get_specs.py b/lib/get_specs.py
index 5092333..7a0b39f 100644
--- a/lib/get_specs.py
+++ b/lib/get_specs.py
@@ -4,6 +4,7 @@ import xml.etree.ElementTree as ET
 from lib import docker_interface
 from typing import Dict, List, Optional
 from lib import utils
+from lib import constants
 import subprocess
 import speedtest
 import platform
@@ -256,8 +257,8 @@ def get_gpu_info():
         except Exception as e:
             pass
 
-        nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total --format=csv")
-        nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv")
+        nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total,pci.device_id --format=csv")
+        nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,pci.device_id --format=csv")
 
         if "Failed to initialize NVML" in nvidia_smi_stdout or "Failed to initialize NVML" in nvidia_smi_stderr or "Failed to initialize NVML" in nvidia_smi_xl_stdout or "Failed to initialize NVML" in nvidia_smi_xl_stderr:
            nvml_err=True
@@ -267,10 +268,15 @@ def get_gpu_info():
             for index, line in enumerate(lines_xl):
                 parts = [s.strip() for s in line.split(',')]
                 if len(parts)>12 and index>0:
+                    gpu_name_xl = parts[1]
+                    gpu_id_xl = parts[13].lower()
+                    if gpu_name_xl == "NVIDIA Graphics Device" and gpu_id_xl in constants.GPU_ID_TO_NAME:
+                        gpu_name_xl = constants.GPU_ID_TO_NAME[gpu_id_xl]
+
                     xl_gpu_info={
                         "id":index-1,
                         "timestamp": parts[0],
-                        "name": parts[1],
+                        "name": gpu_name_xl,
                         "pcie_bus": parts[2].split(':', 1)[1],
                         "driver": parts[3],
                         "pstate": parts[4],
@@ -281,6 +287,7 @@ def get_gpu_info():
                         "mem_free": parts[11],
                         "mem_used": parts[12]
                     }
+
                     try:
                         pci_query = parts[2][parts[2].find(':')+1:]
                         for index, valid_pci_dev in enumerate(valid_pci_dev_list):
@@ -296,7 +303,13 @@ def get_gpu_info():
             for line in lines:
                 parts = line.split(',')
                 if bool(re.match(r'^[0-9]+$', parts[0])):
-                    gpu_str = f"{len(lines)-1}x {parts[1].strip()}"
+
+                    gpu_name = parts[1].strip()
+                    gpu_id = parts[5].strip().lower()
+                    if gpu_name == "NVIDIA Graphics Device" and gpu_id in constants.GPU_ID_TO_NAME:
+                        gpu_name = constants.GPU_ID_TO_NAME[gpu_id]
+
+                    gpu_str = f"{len(lines)-1}x {gpu_name}"
                     gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2)
     except Exception as e:
         nvml_err=True
diff --git a/lib/nvml.py b/lib/nvml.py
index 59dade3..4120fa1 100644
--- a/lib/nvml.py
+++ b/lib/nvml.py
@@ -1,6 +1,7 @@
 from lib import config as config_module
 from lib import logging as logging_lib
 from lib import get_specs
+from lib import constants
 
 config = config_module.config
 log = logging_lib.log
@@ -97,7 +98,13 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                 break
             gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
             gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
-            gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle))
+
+            gpu_name = pynvml.nvmlDeviceGetName(gpu_handle)
+            gpu_device_id = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower()
+            if gpu_name == "NVIDIA Graphics Device" and gpu_device_id in constants.GPU_ID_TO_NAME:
+                gpu_name = constants.GPU_ID_TO_NAME[gpu_device_id]
+
+            gpu_name_list.append(gpu_name)
             if not f"{i}-{gpu_uuid}" in parsed_specs_keys:
                 parsed_specs={}
                 regenerate_specs=True
@@ -118,7 +125,13 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
             max_power_limit = int(power_limits[1] / 1000.0)
             gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
             gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
-            gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
+
+            gpu_name_regen = pynvml.nvmlDeviceGetName(gpu_handle)
+            gpu_device_id_regen = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower()
+            if gpu_name_regen == "NVIDIA Graphics Device" and gpu_device_id_regen in constants.GPU_ID_TO_NAME:
+                gpu_name_regen = constants.GPU_ID_TO_NAME[gpu_device_id_regen]
+
+            gpu_spec["name"] = gpu_name_regen
             gpu_name_list.append(gpu_spec["name"])
             gpu_spec["locks"] = mem_to_core_allowed_locks