Merge pull request 'dev: fixed an issue with CMP170' (#6) from dev into main

Reviewed-on: #6
clore 2025-10-15 19:13:45 +00:00
commit 3772e33c97
4 changed files with 37 additions and 6 deletions


@@ -554,6 +554,7 @@ class CloreClient:
             try:
                 await monitoring.put("specs_service")
                 current_specs = await specs.get()
                 if self.last_hw_specs_submit < (utils.unix_timestamp()-1800):
                     self.last_hw_specs_submit=utils.unix_timestamp()
                     await self.submit_specs(current_specs)
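For context, a minimal synchronous sketch of the 30-minute re-submit throttle visible in the hunk above. SpecsSubmitter, maybe_submit, and the module-level unix_timestamp are hypothetical stand-ins for the client's async helpers, not code from this commit.

# Hypothetical, simplified sketch of the specs re-submit throttle shown above.
import time

RESUBMIT_INTERVAL = 1800  # seconds; matches the 1800 used in the diff

def unix_timestamp() -> int:
    # stand-in for utils.unix_timestamp()
    return int(time.time())

class SpecsSubmitter:
    def __init__(self):
        self.last_hw_specs_submit = 0

    def maybe_submit(self, current_specs: dict) -> bool:
        # Re-submit hardware specs at most once every 30 minutes.
        if self.last_hw_specs_submit < (unix_timestamp() - RESUBMIT_INTERVAL):
            self.last_hw_specs_submit = unix_timestamp()
            self.submit_specs(current_specs)
            return True
        return False

    def submit_specs(self, specs: dict) -> None:
        print("submitting specs:", specs)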

lib/constants.py (new file)

@@ -0,0 +1,4 @@
+GPU_ID_TO_NAME = {
+    "0x20c210de": "NVIDIA CMP 170HX",
+    "0x208210de": "NVIDIA CMP 170HX"
+}
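The keys above are lowercase pci.device_id strings (PCI device id in the upper 16 bits, NVIDIA's vendor id 0x10de in the lower 16). Below is a small self-contained sketch of how the table is consulted throughout this PR; resolve_gpu_name is a hypothetical helper, not part of the commit.

# Hypothetical helper illustrating how GPU_ID_TO_NAME is consulted in this PR.
GPU_ID_TO_NAME = {
    "0x20c210de": "NVIDIA CMP 170HX",
    "0x208210de": "NVIDIA CMP 170HX"
}

def resolve_gpu_name(reported_name: str, device_id: str) -> str:
    # nvidia-smi / NVML report ids like "0x20C210DE"; the commit lowercases
    # them before the lookup, so do the same here.
    device_id = device_id.strip().lower()
    if reported_name == "NVIDIA Graphics Device" and device_id in GPU_ID_TO_NAME:
        return GPU_ID_TO_NAME[device_id]
    return reported_name

print(resolve_gpu_name("NVIDIA Graphics Device", "0x20C210DE"))   # NVIDIA CMP 170HX
print(resolve_gpu_name("NVIDIA GeForce RTX 3090", "0x220410DE"))  # name passed through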


@@ -4,6 +4,7 @@ import xml.etree.ElementTree as ET
 from lib import docker_interface
 from typing import Dict, List, Optional
 from lib import utils
+from lib import constants
 import subprocess
 import speedtest
 import platform
@@ -256,8 +257,8 @@ def get_gpu_info():
     except Exception as e:
         pass
-    nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total --format=csv")
+    nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total,pci.device_id --format=csv")
-    nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv")
+    nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,pci.device_id --format=csv")
     if "Failed to initialize NVML" in nvidia_smi_stdout or "Failed to initialize NVML" in nvidia_smi_stderr or "Failed to initialize NVML" in nvidia_smi_xl_stdout or "Failed to initialize NVML" in nvidia_smi_xl_stderr:
         nvml_err=True
@@ -267,10 +268,15 @@ def get_gpu_info():
         for index, line in enumerate(lines_xl):
             parts = [s.strip() for s in line.split(',')]
             if len(parts)>12 and index>0:
+                gpu_name_xl = parts[1]
+                gpu_id_xl = parts[13].lower()
+                if gpu_name_xl == "NVIDIA Graphics Device" and gpu_id_xl in constants.GPU_ID_TO_NAME:
+                    gpu_name_xl = constants.GPU_ID_TO_NAME[gpu_id_xl]
                 xl_gpu_info={
                     "id":index-1,
                     "timestamp": parts[0],
-                    "name": parts[1],
+                    "name": gpu_name_xl,
                     "pcie_bus": parts[2].split(':', 1)[1],
                     "driver": parts[3],
                     "pstate": parts[4],
@@ -281,6 +287,7 @@ def get_gpu_info():
                     "mem_free": parts[11],
                     "mem_used": parts[12]
                 }
                 try:
                     pci_query = parts[2][parts[2].find(':')+1:]
                     for index, valid_pci_dev in enumerate(valid_pci_dev_list):
@@ -296,7 +303,13 @@ def get_gpu_info():
         for line in lines:
             parts = line.split(',')
             if bool(re.match(r'^[0-9]+$', parts[0])):
-                gpu_str = f"{len(lines)-1}x {parts[1].strip()}"
+                gpu_name = parts[1].strip()
+                gpu_id = parts[5].strip().lower()
+                if gpu_name == "NVIDIA Graphics Device" and gpu_id in constants.GPU_ID_TO_NAME:
+                    gpu_name = constants.GPU_ID_TO_NAME[gpu_id]
+                gpu_str = f"{len(lines)-1}x {gpu_name}"
                 gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2)
     except Exception as e:
         nvml_err=True
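A standalone sketch of the extended nvidia-smi query introduced in the hunks above, applying the same device-id based renaming. It assumes nvidia-smi is on PATH and uses csv,noheader instead of the header-skipping regex used in get_gpu_info; list_gpus is a hypothetical helper, not the commit's code.

# Standalone sketch: run the same extended nvidia-smi query as the diff and
# rename "NVIDIA Graphics Device" entries via the pci.device_id mapping.
import subprocess

GPU_ID_TO_NAME = {
    "0x20c210de": "NVIDIA CMP 170HX",
    "0x208210de": "NVIDIA CMP 170HX"
}

def list_gpus():
    cmd = ["nvidia-smi",
           "--query-gpu=index,name,uuid,serial,memory.total,pci.device_id",
           "--format=csv,noheader"]
    out = subprocess.run(cmd, capture_output=True, text=True)
    gpus = []
    for line in out.stdout.splitlines():
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 6:
            continue
        name, device_id = parts[1], parts[5].lower()
        if name == "NVIDIA Graphics Device" and device_id in GPU_ID_TO_NAME:
            name = GPU_ID_TO_NAME[device_id]
        gpus.append({"index": int(parts[0]), "name": name, "mem_total": parts[4]})
    return gpus

if __name__ == "__main__":
    print(list_gpus())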


@@ -1,6 +1,7 @@
 from lib import config as config_module
 from lib import logging as logging_lib
 from lib import get_specs
+from lib import constants
 config = config_module.config
 log = logging_lib.log
@@ -97,7 +98,13 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                 break
             gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
             gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
-            gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle))
+            gpu_name = pynvml.nvmlDeviceGetName(gpu_handle)
+            gpu_device_id = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower()
+            if gpu_name == "NVIDIA Graphics Device" and gpu_device_id in constants.GPU_ID_TO_NAME:
+                gpu_name = constants.GPU_ID_TO_NAME[gpu_device_id]
+            gpu_name_list.append(gpu_name)
             if not f"{i}-{gpu_uuid}" in parsed_specs_keys:
                 parsed_specs={}
                 regenerate_specs=True
@@ -118,7 +125,13 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
             max_power_limit = int(power_limits[1] / 1000.0)
             gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
             gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
-            gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
+            gpu_name_regen = pynvml.nvmlDeviceGetName(gpu_handle)
+            gpu_device_id_regen = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower()
+            if gpu_name_regen == "NVIDIA Graphics Device" and gpu_device_id_regen in constants.GPU_ID_TO_NAME:
+                gpu_name_regen = constants.GPU_ID_TO_NAME[gpu_device_id_regen]
+            gpu_spec["name"] = gpu_name_regen
             gpu_name_list.append(gpu_spec["name"])
             gpu_spec["locks"] = mem_to_core_allowed_locks