Merge pull request 'dev: fixed an issue with CMP170' (#6) from dev into main
Reviewed-on: #6
This commit is contained in:
commit
3772e33c97
|
@ -554,6 +554,7 @@ class CloreClient:
|
||||||
try:
|
try:
|
||||||
await monitoring.put("specs_service")
|
await monitoring.put("specs_service")
|
||||||
current_specs = await specs.get()
|
current_specs = await specs.get()
|
||||||
|
|
||||||
if self.last_hw_specs_submit < (utils.unix_timestamp()-1800):
|
if self.last_hw_specs_submit < (utils.unix_timestamp()-1800):
|
||||||
self.last_hw_specs_submit=utils.unix_timestamp()
|
self.last_hw_specs_submit=utils.unix_timestamp()
|
||||||
await self.submit_specs(current_specs)
|
await self.submit_specs(current_specs)
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
# Lookup table from a lowercase PCI device ID string to the GPU's model name.
# The code that uses it consults the table only when the driver reports the
# generic name "NVIDIA Graphics Device"; the key is either nvidia-smi's
# pci.device_id column (lowercased) or hex() of NVML's pciDeviceId.
GPU_ID_TO_NAME = {
|
||||||
|
"0x20c210de": "NVIDIA CMP 170HX",
|
||||||
|
"0x208210de": "NVIDIA CMP 170HX"
|
||||||
|
}
|
|
@ -4,6 +4,7 @@ import xml.etree.ElementTree as ET
|
||||||
from lib import docker_interface
|
from lib import docker_interface
|
||||||
from typing import Dict, List, Optional
|
from typing import Dict, List, Optional
|
||||||
from lib import utils
|
from lib import utils
|
||||||
|
from lib import constants
|
||||||
import subprocess
|
import subprocess
|
||||||
import speedtest
|
import speedtest
|
||||||
import platform
|
import platform
|
||||||
|
@ -256,8 +257,8 @@ def get_gpu_info():
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total --format=csv")
|
nvidia_smi_return_code, nvidia_smi_stdout, nvidia_smi_stderr = utils.run_command(f"nvidia-smi --query-gpu=index,name,uuid,serial,memory.total,pci.device_id --format=csv")
|
||||||
nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv")
|
nvidia_smi_xl_return_code, nvidia_smi_xl_stdout, nvidia_smi_xl_stderr = utils.run_command("nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,pci.device_id --format=csv")
|
||||||
|
|
||||||
if "Failed to initialize NVML" in nvidia_smi_stdout or "Failed to initialize NVML" in nvidia_smi_stderr or "Failed to initialize NVML" in nvidia_smi_xl_stdout or "Failed to initialize NVML" in nvidia_smi_xl_stderr:
|
if "Failed to initialize NVML" in nvidia_smi_stdout or "Failed to initialize NVML" in nvidia_smi_stderr or "Failed to initialize NVML" in nvidia_smi_xl_stdout or "Failed to initialize NVML" in nvidia_smi_xl_stderr:
|
||||||
nvml_err=True
|
nvml_err=True
|
||||||
|
@ -267,10 +268,15 @@ def get_gpu_info():
|
||||||
for index, line in enumerate(lines_xl):
|
for index, line in enumerate(lines_xl):
|
||||||
parts = [s.strip() for s in line.split(',')]
|
parts = [s.strip() for s in line.split(',')]
|
||||||
if len(parts)>12 and index>0:
|
if len(parts)>12 and index>0:
|
||||||
|
gpu_name_xl = parts[1]
|
||||||
|
gpu_id_xl = parts[13].lower()
|
||||||
|
if gpu_name_xl == "NVIDIA Graphics Device" and gpu_id_xl in constants.GPU_ID_TO_NAME:
|
||||||
|
gpu_name_xl = constants.GPU_ID_TO_NAME[gpu_id_xl]
|
||||||
|
|
||||||
xl_gpu_info={
|
xl_gpu_info={
|
||||||
"id":index-1,
|
"id":index-1,
|
||||||
"timestamp": parts[0],
|
"timestamp": parts[0],
|
||||||
"name": parts[1],
|
"name": gpu_name_xl,
|
||||||
"pcie_bus": parts[2].split(':', 1)[1],
|
"pcie_bus": parts[2].split(':', 1)[1],
|
||||||
"driver": parts[3],
|
"driver": parts[3],
|
||||||
"pstate": parts[4],
|
"pstate": parts[4],
|
||||||
|
@ -281,6 +287,7 @@ def get_gpu_info():
|
||||||
"mem_free": parts[11],
|
"mem_free": parts[11],
|
||||||
"mem_used": parts[12]
|
"mem_used": parts[12]
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
pci_query = parts[2][parts[2].find(':')+1:]
|
pci_query = parts[2][parts[2].find(':')+1:]
|
||||||
for index, valid_pci_dev in enumerate(valid_pci_dev_list):
|
for index, valid_pci_dev in enumerate(valid_pci_dev_list):
|
||||||
|
@ -296,7 +303,13 @@ def get_gpu_info():
|
||||||
for line in lines:
|
for line in lines:
|
||||||
parts = line.split(',')
|
parts = line.split(',')
|
||||||
if bool(re.match(r'^[0-9]+$', parts[0])):
|
if bool(re.match(r'^[0-9]+$', parts[0])):
|
||||||
gpu_str = f"{len(lines)-1}x {parts[1].strip()}"
|
|
||||||
|
gpu_name = parts[1].strip()
|
||||||
|
gpu_id = parts[5].strip().lower()
|
||||||
|
if gpu_name == "NVIDIA Graphics Device" and gpu_id in constants.GPU_ID_TO_NAME:
|
||||||
|
gpu_name = constants.GPU_ID_TO_NAME[gpu_id]
|
||||||
|
|
||||||
|
gpu_str = f"{len(lines)-1}x {gpu_name}"
|
||||||
gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2)
|
gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
nvml_err=True
|
nvml_err=True
|
||||||
|
|
17
lib/nvml.py
17
lib/nvml.py
|
@ -1,6 +1,7 @@
|
||||||
from lib import config as config_module
|
from lib import config as config_module
|
||||||
from lib import logging as logging_lib
|
from lib import logging as logging_lib
|
||||||
from lib import get_specs
|
from lib import get_specs
|
||||||
|
from lib import constants
|
||||||
|
|
||||||
config = config_module.config
|
config = config_module.config
|
||||||
log = logging_lib.log
|
log = logging_lib.log
|
||||||
|
@ -97,7 +98,13 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
|
||||||
break
|
break
|
||||||
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
|
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
|
||||||
gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
|
gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
|
||||||
gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle))
|
|
||||||
|
gpu_name = pynvml.nvmlDeviceGetName(gpu_handle)
|
||||||
|
gpu_device_id = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower()
|
||||||
|
if gpu_name == "NVIDIA Graphics Device" and gpu_device_id in constants.GPU_ID_TO_NAME:
|
||||||
|
gpu_name = constants.GPU_ID_TO_NAME[gpu_device_id]
|
||||||
|
|
||||||
|
gpu_name_list.append(gpu_name)
|
||||||
if not f"{i}-{gpu_uuid}" in parsed_specs_keys:
|
if not f"{i}-{gpu_uuid}" in parsed_specs_keys:
|
||||||
parsed_specs={}
|
parsed_specs={}
|
||||||
regenerate_specs=True
|
regenerate_specs=True
|
||||||
|
@ -118,7 +125,13 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
|
||||||
max_power_limit = int(power_limits[1] / 1000.0)
|
max_power_limit = int(power_limits[1] / 1000.0)
|
||||||
gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
|
gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
|
||||||
gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
|
gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
|
||||||
gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
|
|
||||||
|
gpu_name_regen = pynvml.nvmlDeviceGetName(gpu_handle)
|
||||||
|
gpu_device_id_regen = hex(pynvml.nvmlDeviceGetPciInfo(gpu_handle).pciDeviceId).lower()
|
||||||
|
if gpu_name_regen == "NVIDIA Graphics Device" and gpu_device_id_regen in constants.GPU_ID_TO_NAME:
|
||||||
|
gpu_name_regen = constants.GPU_ID_TO_NAME[gpu_device_id_regen]
|
||||||
|
|
||||||
|
gpu_spec["name"] = gpu_name_regen
|
||||||
gpu_name_list.append(gpu_spec["name"])
|
gpu_name_list.append(gpu_spec["name"])
|
||||||
gpu_spec["locks"] = mem_to_core_allowed_locks
|
gpu_spec["locks"] = mem_to_core_allowed_locks
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue