From b8ef86c020a45013a8b1d1c93dfe1b3403894a66 Mon Sep 17 00:00:00 2001 From: empresa Date: Wed, 15 Oct 2025 17:39:38 +0700 Subject: [PATCH 01/16] dev: fixed an issues with CMP170 --- lib/get_specs.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/get_specs.py b/lib/get_specs.py index 5092333..a162201 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -241,6 +241,11 @@ def get_bus_spec(bus_id): return PCIBusInfo() def get_gpu_info(): + GPU_ID_TO_NAME = { + "0x20C210DE": "NVIDIA CMP 170HX", + "0x208210DE": "NVIDIA CMP 170HX" + } + gpu_str = "0x Unknown" nvml_err = False gpu_mem = 0 @@ -257,7 +262,7 @@ def get_gpu_info(): pass nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total --format=csv") - nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv") + nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,pci.device_id --format=csv") if "Failed to initialize NVML" in nvidia_smi_stdout or "Failed to initialize NVML" in nvidia_smi_stderr or "Failed to initialize NVML" in nvidia_smi_xl_stdout or "Failed to initialize NVML" in nvidia_smi_xl_stderr: nvml_err=True @@ -267,10 +272,14 @@ def get_gpu_info(): for index, line in enumerate(lines_xl): parts = [s.strip() for s in line.split(',')] if len(parts)>12 and index>0: + gpu_name = parts[1] + if gpu_name == "NVIDIA Graphics Device" and parts[13] in GPU_ID_TO_NAME: + gpu_name = GPU_ID_TO_NAME[parts[13]] + xl_gpu_info={ "id":index-1, "timestamp": parts[0], - "name": parts[1], + "name": gpu_name, "pcie_bus": parts[2].split(':', 1)[1], "driver": parts[3], "pstate": parts[4], -- 2.34.1 From 73b9bacbd812f12e8ff3c4a0946edbadc21d457c Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 00:05:17 +0700 Subject: [PATCH 02/16] Debug CMP170 --- lib/get_specs.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/get_specs.py b/lib/get_specs.py index a162201..7ac2a50 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -245,7 +245,7 @@ def get_gpu_info(): "0x20C210DE": "NVIDIA CMP 170HX", "0x208210DE": "NVIDIA CMP 170HX" } - + gpu_str = "0x Unknown" nvml_err = False gpu_mem = 0 @@ -273,8 +273,14 @@ def get_gpu_info(): parts = [s.strip() for s in line.split(',')] if len(parts)>12 and index>0: gpu_name = parts[1] + print("gpu name 1") + print(gpu_name) + print("part 13") + print(parts[13]) if gpu_name == "NVIDIA Graphics Device" and parts[13] in GPU_ID_TO_NAME: gpu_name = GPU_ID_TO_NAME[parts[13]] + print("gpu name 2") + print(gpu_name) xl_gpu_info={ "id":index-1, -- 2.34.1 From 35104eff6a069960a2a00d180b550de7ef68b89c Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 00:20:00 +0700 Subject: [PATCH 03/16] Debug CMP170 p.2 --- lib/get_specs.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/get_specs.py b/lib/get_specs.py index 7ac2a50..176b67d 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -261,7 +261,7 @@ def get_gpu_info(): except Exception as e: pass - nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total --format=csv") + nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total,pci.device_id --format=csv") nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,pci.device_id --format=csv") if "Failed to initialize NVML" in nvidia_smi_stdout or "Failed to initialize NVML" in nvidia_smi_stderr or "Failed to initialize NVML" in nvidia_smi_xl_stdout or "Failed to initialize NVML" in nvidia_smi_xl_stderr: @@ -272,20 +272,14 @@ def get_gpu_info(): for index, line in enumerate(lines_xl): parts = [s.strip() for s in line.split(',')] if len(parts)>12 and index>0: - gpu_name = parts[1] - print("gpu name 1") - print(gpu_name) - print("part 13") - print(parts[13]) - if gpu_name == "NVIDIA Graphics Device" and parts[13] in GPU_ID_TO_NAME: - gpu_name = GPU_ID_TO_NAME[parts[13]] - print("gpu name 2") - print(gpu_name) + gpu_name_xl = parts[1] + if gpu_name_xl == "NVIDIA Graphics Device" and parts[13] in GPU_ID_TO_NAME: + gpu_name_xl = GPU_ID_TO_NAME[parts[13]] xl_gpu_info={ "id":index-1, "timestamp": parts[0], - "name": gpu_name, + "name": gpu_name_xl, "pcie_bus": parts[2].split(':', 1)[1], "driver": parts[3], "pstate": parts[4], @@ -296,6 +290,7 @@ def get_gpu_info(): "mem_free": parts[11], "mem_used": parts[12] } + print(xl_gpu_info) try: pci_query = parts[2][parts[2].find(':')+1:] for index, valid_pci_dev in enumerate(valid_pci_dev_list): @@ -311,7 +306,12 @@ def get_gpu_info(): for line in lines: parts = line.split(',') if bool(re.match(r'^[0-9]+$', parts[0])): - gpu_str = f"{len(lines)-1}x {parts[1].strip()}" + + gpu_name = parts[1] + if gpu_name == "NVIDIA Graphics Device" and parts[5] in GPU_ID_TO_NAME: + gpu_name = GPU_ID_TO_NAME[parts[5]] + + gpu_str = f"{len(lines)-1}x {gpu_name.strip()}" gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2) except Exception as e: nvml_err=True -- 2.34.1 From 6170c98f7b3c856d1140b5774162808e4367988c Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 00:25:35 +0700 Subject: [PATCH 04/16] Debug CMP170 --- lib/get_specs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/get_specs.py b/lib/get_specs.py index 176b67d..ec3ea87 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -290,7 +290,7 @@ def get_gpu_info(): "mem_free": parts[11], "mem_used": parts[12] } - print(xl_gpu_info) + try: pci_query = parts[2][parts[2].find(':')+1:] for index, valid_pci_dev in enumerate(valid_pci_dev_list): @@ -308,10 +308,13 @@ def get_gpu_info(): if bool(re.match(r'^[0-9]+$', parts[0])): gpu_name = parts[1] + print("p1" + gpu_name) if gpu_name == "NVIDIA Graphics Device" and parts[5] in GPU_ID_TO_NAME: gpu_name = GPU_ID_TO_NAME[parts[5]] + print("p1" + gpu_name) gpu_str = f"{len(lines)-1}x {gpu_name.strip()}" + print(gpu_str) gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2) except Exception as e: nvml_err=True -- 2.34.1 From bbd4c669a2d3c9f6d5ca91c6d3f2b1212ebc9a99 Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 00:30:27 +0700 Subject: [PATCH 05/16] Debug CMP170 --- lib/get_specs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/get_specs.py b/lib/get_specs.py index ec3ea87..fbfd48b 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -300,6 +300,8 @@ def get_gpu_info(): xl_gpu_info["pcie_width"]=bus_spec.width xl_gpu_info["pcie_revision"]=bus_spec.revision except Exception as e: + print("err") + print(e) pass gpus["nvidia"].append(xl_gpu_info) lines = nvidia_smi_stdout.split('\n') -- 2.34.1 From 351bc269d1b0ce82ada31c6d7ac19ca00239cc9d Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 00:32:53 +0700 Subject: [PATCH 06/16] dev: CMP170 debug --- lib/get_specs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/get_specs.py b/lib/get_specs.py index fbfd48b..4a64182 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -291,6 +291,8 @@ def get_gpu_info(): "mem_used": parts[12] } + print(xl_gpu_info) + try: pci_query = parts[2][parts[2].find(':')+1:] for index, valid_pci_dev in enumerate(valid_pci_dev_list): @@ -320,6 +322,8 @@ def get_gpu_info(): gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2) except Exception as e: nvml_err=True + print("err2") + print(e) pass else: nvml_err=True -- 2.34.1 From 9012bc2b2a45fe9477d452518c3ab6966dda3041 Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 00:37:28 +0700 Subject: [PATCH 07/16] dev: more debug --- lib/get_specs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/get_specs.py b/lib/get_specs.py index 4a64182..3057f1c 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -312,10 +312,12 @@ def get_gpu_info(): if bool(re.match(r'^[0-9]+$', parts[0])): gpu_name = parts[1] + print("parts") + print(parts) print("p1" + gpu_name) if gpu_name == "NVIDIA Graphics Device" and parts[5] in GPU_ID_TO_NAME: gpu_name = GPU_ID_TO_NAME[parts[5]] - print("p1" + gpu_name) + print("p2" + gpu_name) gpu_str = f"{len(lines)-1}x {gpu_name.strip()}" print(gpu_str) -- 2.34.1 From daa634bfa95972473439aa11b53943a91ab9c49d Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 00:41:20 +0700 Subject: [PATCH 08/16] dev: debug... --- lib/get_specs.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/get_specs.py b/lib/get_specs.py index 3057f1c..8170d79 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -311,15 +311,17 @@ def get_gpu_info(): parts = line.split(',') if bool(re.match(r'^[0-9]+$', parts[0])): - gpu_name = parts[1] + gpu_name = parts[1].strip() + gpu_id = parts[5].strip(); print("parts") print(parts) - print("p1" + gpu_name) - if gpu_name == "NVIDIA Graphics Device" and parts[5] in GPU_ID_TO_NAME: - gpu_name = GPU_ID_TO_NAME[parts[5]] - print("p2" + gpu_name) + print("p1:" + gpu_name) + print("p1:" + gpu_id) + if gpu_name == "NVIDIA Graphics Device" and gpu_id in GPU_ID_TO_NAME: + gpu_name = GPU_ID_TO_NAME[gpu_id] + print("p2:" + gpu_name) - gpu_str = f"{len(lines)-1}x {gpu_name.strip()}" + gpu_str = f"{len(lines)-1}x {gpu_name}" print(gpu_str) gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2) except Exception as e: -- 2.34.1 From 96bd92e5f0df95dbaed2430c5f0875359ab3e4ed Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 00:49:59 +0700 Subject: [PATCH 09/16] dev: debug --- clore_hosting/main.py | 3 +++ lib/get_specs.py | 12 ------------ 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/clore_hosting/main.py b/clore_hosting/main.py index 988b965..e00e2c2 100644 --- a/clore_hosting/main.py +++ b/clore_hosting/main.py @@ -554,10 +554,13 @@ class CloreClient: try: await monitoring.put("specs_service") current_specs = await specs.get() + print(current_specs) if self.last_hw_specs_submit < (utils.unix_timestamp()-1800): self.last_hw_specs_submit=utils.unix_timestamp() await self.submit_specs(current_specs) + print("submit specs") await self.update_realtime_data(current_specs) + print("update realtime") try: if self.xfs_state == "active" and len(current_specs["gpus"]["nvidia"]) > 0 and not self.runned_pull_selftest: await clore_partner.check_to_pull_selftest(current_specs) diff --git a/lib/get_specs.py b/lib/get_specs.py index 8170d79..ff05d74 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -291,8 +291,6 @@ def get_gpu_info(): "mem_used": parts[12] } - print(xl_gpu_info) - try: pci_query = parts[2][parts[2].find(':')+1:] for index, valid_pci_dev in enumerate(valid_pci_dev_list): @@ -302,8 +300,6 @@ def get_gpu_info(): xl_gpu_info["pcie_width"]=bus_spec.width xl_gpu_info["pcie_revision"]=bus_spec.revision except Exception as e: - print("err") - print(e) pass gpus["nvidia"].append(xl_gpu_info) lines = nvidia_smi_stdout.split('\n') @@ -313,21 +309,13 @@ def get_gpu_info(): gpu_name = parts[1].strip() gpu_id = parts[5].strip(); - print("parts") - print(parts) - print("p1:" + gpu_name) - print("p1:" + gpu_id) if gpu_name == "NVIDIA Graphics Device" and gpu_id in GPU_ID_TO_NAME: gpu_name = GPU_ID_TO_NAME[gpu_id] - print("p2:" + gpu_name) gpu_str = f"{len(lines)-1}x {gpu_name}" - print(gpu_str) gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2) except Exception as e: nvml_err=True - print("err2") - print(e) pass else: nvml_err=True -- 2.34.1 From cc9941db0223695963af79ee10d538aa5451c757 Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 00:56:55 +0700 Subject: [PATCH 10/16] dev: debug --- clore_hosting/main.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/clore_hosting/main.py b/clore_hosting/main.py index e00e2c2..6e78ad5 100644 --- a/clore_hosting/main.py +++ b/clore_hosting/main.py @@ -537,6 +537,8 @@ class CloreClient: cpu_usage = await get_specs.get_cpu_usage() ram_usage = await get_specs.get_ram_usage() gpu_list = current_specs["gpus"]["nvidia"]+current_specs["gpus"]["amd"] + print("realtime gpus") + print(gpu_list) submit_document = { "update_realtime_data":True, "gpus": gpu_list, @@ -557,10 +559,12 @@ class CloreClient: print(current_specs) if self.last_hw_specs_submit < (utils.unix_timestamp()-1800): self.last_hw_specs_submit=utils.unix_timestamp() + print("submit specs start") await self.submit_specs(current_specs) - print("submit specs") + print("submit specs end") + print("update realtime start") await self.update_realtime_data(current_specs) - print("update realtime") + print("update realtime end") try: if self.xfs_state == "active" and len(current_specs["gpus"]["nvidia"]) > 0 and not self.runned_pull_selftest: await clore_partner.check_to_pull_selftest(current_specs) -- 2.34.1 From 260ee6f18fecb49e76c4a646020de7ef614c25e0 Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 01:43:56 +0700 Subject: [PATCH 11/16] dev: fixing oc --- clore_hosting/main.py | 6 ------ lib/constants.py | 4 ++++ lib/get_specs.py | 14 +++++--------- lib/nvml.py | 10 ++++++++++ 4 files changed, 19 insertions(+), 15 deletions(-) create mode 100644 lib/constants.py diff --git a/clore_hosting/main.py b/clore_hosting/main.py index 6e78ad5..0e05e24 100644 --- a/clore_hosting/main.py +++ b/clore_hosting/main.py @@ -537,8 +537,6 @@ class CloreClient: cpu_usage = await get_specs.get_cpu_usage() ram_usage = await get_specs.get_ram_usage() gpu_list = current_specs["gpus"]["nvidia"]+current_specs["gpus"]["amd"] - print("realtime gpus") - print(gpu_list) submit_document = { "update_realtime_data":True, "gpus": gpu_list, @@ -559,12 +557,8 @@ class CloreClient: print(current_specs) if self.last_hw_specs_submit < (utils.unix_timestamp()-1800): self.last_hw_specs_submit=utils.unix_timestamp() - print("submit specs start") await self.submit_specs(current_specs) - print("submit specs end") - print("update realtime start") await self.update_realtime_data(current_specs) - print("update realtime end") try: if self.xfs_state == "active" and len(current_specs["gpus"]["nvidia"]) > 0 and not self.runned_pull_selftest: await clore_partner.check_to_pull_selftest(current_specs) diff --git a/lib/constants.py b/lib/constants.py new file mode 100644 index 0000000..dc317f0 --- /dev/null +++ b/lib/constants.py @@ -0,0 +1,4 @@ +GPU_ID_TO_NAME = { + "0x20C210DE": "NVIDIA CMP 170HX", + "0x208210DE": "NVIDIA CMP 170HX" +} diff --git a/lib/get_specs.py b/lib/get_specs.py index ff05d74..fae4ce9 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -4,6 +4,7 @@ import xml.etree.ElementTree as ET from lib import docker_interface from typing import Dict, List, Optional from lib import utils +from lib import constants import subprocess import speedtest import platform @@ -241,11 +242,6 @@ def get_bus_spec(bus_id): return PCIBusInfo() def get_gpu_info(): - GPU_ID_TO_NAME = { - "0x20C210DE": "NVIDIA CMP 170HX", - "0x208210DE": "NVIDIA CMP 170HX" - } - gpu_str = "0x Unknown" nvml_err = False gpu_mem = 0 @@ -273,8 +269,8 @@ def get_gpu_info(): parts = [s.strip() for s in line.split(',')] if len(parts)>12 and index>0: gpu_name_xl = parts[1] - if gpu_name_xl == "NVIDIA Graphics Device" and parts[13] in GPU_ID_TO_NAME: - gpu_name_xl = GPU_ID_TO_NAME[parts[13]] + if gpu_name_xl == "NVIDIA Graphics Device" and parts[13] in constants.GPU_ID_TO_NAME: + gpu_name_xl = constants.GPU_ID_TO_NAME[parts[13]] xl_gpu_info={ "id":index-1, @@ -309,8 +305,8 @@ def get_gpu_info(): gpu_name = parts[1].strip() gpu_id = parts[5].strip(); - if gpu_name == "NVIDIA Graphics Device" and gpu_id in GPU_ID_TO_NAME: - gpu_name = GPU_ID_TO_NAME[gpu_id] + if gpu_name == "NVIDIA Graphics Device" and gpu_id in constants.GPU_ID_TO_NAME: + gpu_name = constants.GPU_ID_TO_NAME[gpu_id] gpu_str = f"{len(lines)-1}x {gpu_name}" gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2) diff --git a/lib/nvml.py b/lib/nvml.py index 59dade3..b44b792 100644 --- a/lib/nvml.py +++ b/lib/nvml.py @@ -1,6 +1,7 @@ from lib import config as config_module from lib import logging as logging_lib from lib import get_specs +from lib import constants config = config_module.config log = logging_lib.log @@ -97,6 +98,15 @@ def init(gpu_specs_file=None, allow_hive_binaries=True): break gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i) gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle) + + + print("name") + print(pynvml.nvmlDeviceGetName(gpu_handle)) + print("device_id") + pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle) + print(pci_info.pciDeviceId) + + gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle)) if not f"{i}-{gpu_uuid}" in parsed_specs_keys: parsed_specs={} -- 2.34.1 From d908fb043d7de6302e441ff7ec425c3b6c36d1ab Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 01:45:45 +0700 Subject: [PATCH 12/16] dev: debug hex --- lib/nvml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/nvml.py b/lib/nvml.py index b44b792..1606767 100644 --- a/lib/nvml.py +++ b/lib/nvml.py @@ -104,8 +104,8 @@ def init(gpu_specs_file=None, allow_hive_binaries=True): print(pynvml.nvmlDeviceGetName(gpu_handle)) print("device_id") pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle) - print(pci_info.pciDeviceId) - + print(hex(pci_info.pciDeviceId)) + gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle)) if not f"{i}-{gpu_uuid}" in parsed_specs_keys: -- 2.34.1 From bae3d395f9918be1c334c8e105e763e16589e4f1 Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 01:50:41 +0700 Subject: [PATCH 13/16] dev: more debug --- lib/nvml.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/lib/nvml.py b/lib/nvml.py index 1606767..bcb1d31 100644 --- a/lib/nvml.py +++ b/lib/nvml.py @@ -99,15 +99,13 @@ def init(gpu_specs_file=None, allow_hive_binaries=True): gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i) gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle) - - print("name") - print(pynvml.nvmlDeviceGetName(gpu_handle)) - print("device_id") - pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle) - print(hex(pci_info.pciDeviceId)) + gpu_name = pynvml.nvmlDeviceGetName(gpu_handle) + gpu_device_id = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).upper() + if gpu_name == "NVIDIA Graphics Device" and gpu_device_id in constants.GPU_ID_TO_NAME: + gpu_name = constants.GPU_ID_TO_NAME[gpu_device_id] - gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle)) + gpu_name_list.append(gpu_name) if not f"{i}-{gpu_uuid}" in parsed_specs_keys: parsed_specs={} regenerate_specs=True @@ -128,7 +126,13 @@ def init(gpu_specs_file=None, allow_hive_binaries=True): max_power_limit = int(power_limits[1] / 1000.0) gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0) gpu_spec["power_limits"] = [min_power_limit, max_power_limit] - gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle) + + gpu_name_regen = pynvml.nvmlDeviceGetName(gpu_handle) + gpu_device_id_regen = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).upper() + if gpu_name_regen == "NVIDIA Graphics Device" and gpu_device_id_regen in constants.GPU_ID_TO_NAME: + gpu_name_regen = constants.GPU_ID_TO_NAME[gpu_device_id_regen] + + gpu_spec["name"] = gpu_name_regen gpu_name_list.append(gpu_spec["name"]) gpu_spec["locks"] = mem_to_core_allowed_locks -- 2.34.1 From f68e17634066fe51833c73a25fce4fe885a0afc5 Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 01:57:30 +0700 Subject: [PATCH 14/16] dev --- lib/nvml.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/nvml.py b/lib/nvml.py index bcb1d31..1eca64d 100644 --- a/lib/nvml.py +++ b/lib/nvml.py @@ -100,9 +100,13 @@ def init(gpu_specs_file=None, allow_hive_binaries=True): gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle) gpu_name = pynvml.nvmlDeviceGetName(gpu_handle) + print(gpu_name) gpu_device_id = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).upper() + print(gpu_device_id) + print(constants.GPU_ID_TO_NAME) if gpu_name == "NVIDIA Graphics Device" and gpu_device_id in constants.GPU_ID_TO_NAME: gpu_name = constants.GPU_ID_TO_NAME[gpu_device_id] + print(gpu_name) gpu_name_list.append(gpu_name) -- 2.34.1 From 02918f9c2ae5ec706632b21c0b3d5aa402b9a858 Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 02:03:03 +0700 Subject: [PATCH 15/16] dev: upper -> lower --- lib/constants.py | 4 ++-- lib/get_specs.py | 7 ++++--- lib/nvml.py | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/constants.py b/lib/constants.py index dc317f0..e5b87fa 100644 --- a/lib/constants.py +++ b/lib/constants.py @@ -1,4 +1,4 @@ GPU_ID_TO_NAME = { - "0x20C210DE": "NVIDIA CMP 170HX", - "0x208210DE": "NVIDIA CMP 170HX" + "0x20c210de": "NVIDIA CMP 170HX", + "0x208210de": "NVIDIA CMP 170HX" } diff --git a/lib/get_specs.py b/lib/get_specs.py index fae4ce9..7a0b39f 100644 --- a/lib/get_specs.py +++ b/lib/get_specs.py @@ -269,8 +269,9 @@ def get_gpu_info(): parts = [s.strip() for s in line.split(',')] if len(parts)>12 and index>0: gpu_name_xl = parts[1] - if gpu_name_xl == "NVIDIA Graphics Device" and parts[13] in constants.GPU_ID_TO_NAME: - gpu_name_xl = constants.GPU_ID_TO_NAME[parts[13]] + gpu_id_xl = parts[13].lower() + if gpu_name_xl == "NVIDIA Graphics Device" and gpu_id_xl in constants.GPU_ID_TO_NAME: + gpu_name_xl = constants.GPU_ID_TO_NAME[gpu_id_xl] xl_gpu_info={ "id":index-1, @@ -304,7 +305,7 @@ def get_gpu_info(): if bool(re.match(r'^[0-9]+$', parts[0])): gpu_name = parts[1].strip() - gpu_id = parts[5].strip(); + gpu_id = parts[5].strip().lower() if gpu_name == "NVIDIA Graphics Device" and gpu_id in constants.GPU_ID_TO_NAME: gpu_name = constants.GPU_ID_TO_NAME[gpu_id] diff --git a/lib/nvml.py b/lib/nvml.py index 1eca64d..66b95f0 100644 --- a/lib/nvml.py +++ b/lib/nvml.py @@ -101,7 +101,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True): gpu_name = pynvml.nvmlDeviceGetName(gpu_handle) print(gpu_name) - gpu_device_id = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).upper() + gpu_device_id = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower() print(gpu_device_id) print(constants.GPU_ID_TO_NAME) if gpu_name == "NVIDIA Graphics Device" and gpu_device_id in constants.GPU_ID_TO_NAME: @@ -132,7 +132,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True): gpu_spec["power_limits"] = [min_power_limit, max_power_limit] gpu_name_regen = pynvml.nvmlDeviceGetName(gpu_handle) - gpu_device_id_regen = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).upper() + gpu_device_id_regen = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower() if gpu_name_regen == "NVIDIA Graphics Device" and gpu_device_id_regen in constants.GPU_ID_TO_NAME: gpu_name_regen = constants.GPU_ID_TO_NAME[gpu_device_id_regen] -- 2.34.1 From 9d61280f7a3287164fcfaaa7d376805cbe4ce8bc Mon Sep 17 00:00:00 2001 From: empresa Date: Thu, 16 Oct 2025 02:12:13 +0700 Subject: [PATCH 16/16] dev: finished debug --- clore_hosting/main.py | 2 +- lib/nvml.py | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/clore_hosting/main.py b/clore_hosting/main.py index 0e05e24..34ed3a2 100644 --- a/clore_hosting/main.py +++ b/clore_hosting/main.py @@ -554,7 +554,7 @@ class CloreClient: try: await monitoring.put("specs_service") current_specs = await specs.get() - print(current_specs) + if self.last_hw_specs_submit < (utils.unix_timestamp()-1800): self.last_hw_specs_submit=utils.unix_timestamp() await self.submit_specs(current_specs) diff --git a/lib/nvml.py b/lib/nvml.py index 66b95f0..4120fa1 100644 --- a/lib/nvml.py +++ b/lib/nvml.py @@ -100,14 +100,9 @@ def init(gpu_specs_file=None, allow_hive_binaries=True): gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle) gpu_name = pynvml.nvmlDeviceGetName(gpu_handle) - print(gpu_name) gpu_device_id = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower() - print(gpu_device_id) - print(constants.GPU_ID_TO_NAME) if gpu_name == "NVIDIA Graphics Device" and gpu_device_id in constants.GPU_ID_TO_NAME: gpu_name = constants.GPU_ID_TO_NAME[gpu_device_id] - print(gpu_name) - gpu_name_list.append(gpu_name) if not f"{i}-{gpu_uuid}" in parsed_specs_keys: -- 2.34.1