Compare commits

..

17 Commits

Author SHA1 Message Date
clore 3772e33c97 Merge pull request 'dev: fixed an issue with CMP170' (#6) from dev into main
Reviewed-on: #6
2025-10-15 19:13:45 +00:00
empresa 9d61280f7a dev: finished debug 2025-10-16 02:12:13 +07:00
empresa 02918f9c2a dev: upper -> lower 2025-10-16 02:03:03 +07:00
empresa f68e176340 dev 2025-10-16 01:57:30 +07:00
empresa bae3d395f9 dev: more debug 2025-10-16 01:50:41 +07:00
empresa d908fb043d dev: debug hex 2025-10-16 01:45:45 +07:00
empresa 260ee6f18f dev: fixing oc 2025-10-16 01:43:56 +07:00
empresa cc9941db02 dev: debug 2025-10-16 00:56:55 +07:00
empresa 96bd92e5f0 dev: debug 2025-10-16 00:49:59 +07:00
empresa daa634bfa9 dev: debug... 2025-10-16 00:41:20 +07:00
empresa 9012bc2b2a dev: more debug 2025-10-16 00:37:28 +07:00
empresa 351bc269d1 dev: CMP170 debug 2025-10-16 00:32:53 +07:00
empresa bbd4c669a2 Debug CMP170 2025-10-16 00:30:27 +07:00
empresa 6170c98f7b Debug CMP170 2025-10-16 00:25:35 +07:00
empresa 35104eff6a Debug CMP170 p.2 2025-10-16 00:20:00 +07:00
empresa 73b9bacbd8 Debug CMP170 2025-10-16 00:05:17 +07:00
empresa b8ef86c020 dev: fixed an issue with CMP170 2025-10-15 17:39:38 +07:00
4 changed files with 37 additions and 6 deletions

View File

@ -554,6 +554,7 @@ class CloreClient:
try: try:
await monitoring.put("specs_service") await monitoring.put("specs_service")
current_specs = await specs.get() current_specs = await specs.get()
if self.last_hw_specs_submit < (utils.unix_timestamp()-1800): if self.last_hw_specs_submit < (utils.unix_timestamp()-1800):
self.last_hw_specs_submit=utils.unix_timestamp() self.last_hw_specs_submit=utils.unix_timestamp()
await self.submit_specs(current_specs) await self.submit_specs(current_specs)

4
lib/constants.py Normal file
View File

@ -0,0 +1,4 @@
# Lookup table: lowercase hex PCI device-id string -> GPU model name.
# Keys are compared against the lowercased `pci.device_id` column reported by
# nvidia-smi and against `hex(pciDeviceId).lower()` obtained through NVML.
# Callers only consult it when the reported name is "NVIDIA Graphics Device".
# NOTE(review): the trailing "10de" looks like the NVIDIA PCI vendor id
# appended to the device id -- confirm against the NVML nvmlPciInfo_t docs.
_CMP_170HX = "NVIDIA CMP 170HX"

GPU_ID_TO_NAME = {
    "0x20c210de": _CMP_170HX,
    "0x208210de": _CMP_170HX,
}

View File

@ -4,6 +4,7 @@ import xml.etree.ElementTree as ET
from lib import docker_interface from lib import docker_interface
from typing import Dict, List, Optional from typing import Dict, List, Optional
from lib import utils from lib import utils
from lib import constants
import subprocess import subprocess
import speedtest import speedtest
import platform import platform
@ -256,8 +257,8 @@ def get_gpu_info():
except Exception as e: except Exception as e:
pass pass
nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total --format=csv") nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total,pci.device_id --format=csv")
nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv") nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,pci.device_id --format=csv")
if "Failed to initialize NVML" in nvidia_smi_stdout or "Failed to initialize NVML" in nvidia_smi_stderr or "Failed to initialize NVML" in nvidia_smi_xl_stdout or "Failed to initialize NVML" in nvidia_smi_xl_stderr: if "Failed to initialize NVML" in nvidia_smi_stdout or "Failed to initialize NVML" in nvidia_smi_stderr or "Failed to initialize NVML" in nvidia_smi_xl_stdout or "Failed to initialize NVML" in nvidia_smi_xl_stderr:
nvml_err=True nvml_err=True
@ -267,10 +268,15 @@ def get_gpu_info():
for index, line in enumerate(lines_xl): for index, line in enumerate(lines_xl):
parts = [s.strip() for s in line.split(',')] parts = [s.strip() for s in line.split(',')]
if len(parts)>12 and index>0: if len(parts)>12 and index>0:
gpu_name_xl = parts[1]
gpu_id_xl = parts[13].lower()
if gpu_name_xl == "NVIDIA Graphics Device" and gpu_id_xl in constants.GPU_ID_TO_NAME:
gpu_name_xl = constants.GPU_ID_TO_NAME[gpu_id_xl]
xl_gpu_info={ xl_gpu_info={
"id":index-1, "id":index-1,
"timestamp": parts[0], "timestamp": parts[0],
"name": parts[1], "name": gpu_name_xl,
"pcie_bus": parts[2].split(':', 1)[1], "pcie_bus": parts[2].split(':', 1)[1],
"driver": parts[3], "driver": parts[3],
"pstate": parts[4], "pstate": parts[4],
@ -281,6 +287,7 @@ def get_gpu_info():
"mem_free": parts[11], "mem_free": parts[11],
"mem_used": parts[12] "mem_used": parts[12]
} }
try: try:
pci_query = parts[2][parts[2].find(':')+1:] pci_query = parts[2][parts[2].find(':')+1:]
for index, valid_pci_dev in enumerate(valid_pci_dev_list): for index, valid_pci_dev in enumerate(valid_pci_dev_list):
@ -296,7 +303,13 @@ def get_gpu_info():
for line in lines: for line in lines:
parts = line.split(',') parts = line.split(',')
if bool(re.match(r'^[0-9]+$', parts[0])): if bool(re.match(r'^[0-9]+$', parts[0])):
gpu_str = f"{len(lines)-1}x {parts[1].strip()}"
gpu_name = parts[1].strip()
gpu_id = parts[5].strip().lower()
if gpu_name == "NVIDIA Graphics Device" and gpu_id in constants.GPU_ID_TO_NAME:
gpu_name = constants.GPU_ID_TO_NAME[gpu_id]
gpu_str = f"{len(lines)-1}x {gpu_name}"
gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2) gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2)
except Exception as e: except Exception as e:
nvml_err=True nvml_err=True

View File

@ -1,6 +1,7 @@
from lib import config as config_module from lib import config as config_module
from lib import logging as logging_lib from lib import logging as logging_lib
from lib import get_specs from lib import get_specs
from lib import constants
config = config_module.config config = config_module.config
log = logging_lib.log log = logging_lib.log
@ -97,7 +98,13 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
break break
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i) gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle) gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle))
gpu_name = pynvml.nvmlDeviceGetName(gpu_handle)
gpu_device_id = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower()
if gpu_name == "NVIDIA Graphics Device" and gpu_device_id in constants.GPU_ID_TO_NAME:
gpu_name = constants.GPU_ID_TO_NAME[gpu_device_id]
gpu_name_list.append(gpu_name)
if not f"{i}-{gpu_uuid}" in parsed_specs_keys: if not f"{i}-{gpu_uuid}" in parsed_specs_keys:
parsed_specs={} parsed_specs={}
regenerate_specs=True regenerate_specs=True
@ -118,7 +125,13 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
max_power_limit = int(power_limits[1] / 1000.0) max_power_limit = int(power_limits[1] / 1000.0)
gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0) gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
gpu_spec["power_limits"] = [min_power_limit, max_power_limit] gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
gpu_name_regen = pynvml.nvmlDeviceGetName(gpu_handle)
gpu_device_id_regen = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower()
if gpu_name_regen == "NVIDIA Graphics Device" and gpu_device_id_regen in constants.GPU_ID_TO_NAME:
gpu_name_regen = constants.GPU_ID_TO_NAME[gpu_device_id_regen]
gpu_spec["name"] = gpu_name_regen
gpu_name_list.append(gpu_spec["name"]) gpu_name_list.append(gpu_spec["name"])
gpu_spec["locks"] = mem_to_core_allowed_locks gpu_spec["locks"] = mem_to_core_allowed_locks