V5.2.1 | fix reading OC specs for older GPUs
This commit is contained in:
parent
36d3026d5d
commit
5e733fd0d6
|
@ -426,7 +426,7 @@ class CloreClient:
|
|||
async def submit_specs(self, current_specs):
|
||||
try:
|
||||
if type(current_specs) == dict:
|
||||
current_specs["backend_version"]=10
|
||||
current_specs["backend_version"]=11
|
||||
current_specs["update_hw"]=True
|
||||
smallest_pcie_width = 999
|
||||
for gpu in current_specs["gpus"]["nvidia"]:
|
||||
|
|
91
lib/nvml.py
91
lib/nvml.py
|
@ -6,8 +6,38 @@ config = config_module.config
|
|||
log = logging_lib.log
|
||||
|
||||
import subprocess
|
||||
import pynvml
|
||||
import clore_pynvml as pynvml
|
||||
import json
|
||||
import math
|
||||
|
||||
HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
||||
|
||||
# Fallback memory-offset ranges for specific GPU models, keyed by the GPU
# name string that is compared against gpu_spec["name"]. Each value is a
# two-element list [min, max].
# Where this table is read in the code shown, a listed GPU gets its entry as
# mem_range when nvmlDeviceGetMemClkMinMaxVfOffset reports 0 for both the min
# and the max offset; GPUs that are not listed get [0, 0] in that case.
# NOTE(review): the entry is assigned by reference (no copy) and then stored
# in gpu_spec["mem"], so mutating that value would also change this table.
# NOTE(review): units are presumably MHz offsets — TODO confirm.
GPU_MEM_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
|
||||
"NVIDIA P102-100": [-2000, 2000],
|
||||
"NVIDIA P104-100": [-2000, 2000],
|
||||
"NVIDIA P106-090": [-2000, 2000],
|
||||
"NVIDIA P106-100": [-2000, 2000],
|
||||
"NVIDIA GeForce GTX 1050 Ti": [-2000, 2000],
|
||||
"NVIDIA GeForce GTX 1060 3GB": [-2000, 2000],
|
||||
"NVIDIA GeForce GTX 1060 6GB": [-2000, 2000],
|
||||
"NVIDIA GeForce GTX 1070": [-2000, 2000],
|
||||
"NVIDIA GeForce GTX 1070 Ti": [-2000, 2000],
|
||||
"NVIDIA GeForce GTX 1080": [-2000, 2000],
|
||||
"NVIDIA GeForce GTX 1080 Ti":[-2000, 2000],
|
||||
"NVIDIA CMP 30HX": [-2000, 6000],
|
||||
"NVIDIA CMP 40HX": [-2000, 6000],
|
||||
"NVIDIA CMP 50HX": [-2000, 6000],
|
||||
"NVIDIA CMP 90HX": [-2000, 6000],
|
||||
"NVIDIA GeForce GTX 1650": [-2000, 6000],
|
||||
"NVIDIA GeForce GTX 1660 SUPER": [-2000, 6000],
|
||||
"NVIDIA GeForce GTX 1660 Ti": [-2000, 6000],
|
||||
"NVIDIA GeForce RTX 2060": [-2000, 6000],
|
||||
"NVIDIA GeForce RTX 2060 SUPER": [-2000, 6000],
|
||||
"NVIDIA GeForce RTX 2070": [-2000, 6000],
|
||||
"NVIDIA GeForce RTX 2070 SUPER": [-2000, 6000],
|
||||
"NVIDIA GeForce RTX 2080": [-2000, 6000],
|
||||
"NVIDIA GeForce RTX 2080 Ti": [-2000, 6000]
|
||||
}
|
||||
|
||||
is_hive = False
|
||||
all_gpus_data_list=[]
|
||||
|
@ -64,22 +94,47 @@ def init(gpu_specs_file=None):
|
|||
|
||||
mem_range = get_hive_clock_range(is_hive, i, "mem")
|
||||
core_range = get_hive_clock_range(is_hive, i, "core")
|
||||
if type(mem_range) != list:
|
||||
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
|
||||
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
|
||||
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
|
||||
if (not failure_min) and (not failure_max):
|
||||
mem_range=[min_oc_solution, max_oc_solution]
|
||||
pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
|
||||
pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
|
||||
if type(core_range) != list:
|
||||
pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
|
||||
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
|
||||
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
|
||||
if (not failure_min) and (not failure_max):
|
||||
core_range=[min_oc_solution, max_oc_solution]
|
||||
pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
|
||||
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
|
||||
try:
|
||||
if type(mem_range) != list:
|
||||
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
|
||||
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
|
||||
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
|
||||
if (not failure_min) and (not failure_max):
|
||||
mem_range=[min_oc_solution, max_oc_solution]
|
||||
pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
|
||||
pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
|
||||
if type(core_range) != list:
|
||||
pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
|
||||
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
|
||||
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
|
||||
if (not failure_min) and (not failure_max):
|
||||
core_range=[min_oc_solution, max_oc_solution]
|
||||
pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
|
||||
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
|
||||
except Exception as e_pinpointing:
|
||||
if "not supported" in str(e_pinpointing).lower():
|
||||
try:
|
||||
min_core_offset, max_core_offset = pynvml.nvmlDeviceGetGpcClkMinMaxVfOffset(gpu_handle)
|
||||
if min_core_offset>0:
|
||||
min_core_offset = min_core_offset - math.floor((2**32)/1000)
|
||||
if min_core_offset > -20000 and min_core_offset <= 0 and max_core_offset>=0 and min_core_offset < 20000:
|
||||
core_range=[min_core_offset, max_core_offset]
|
||||
else:
|
||||
core_range=[0,0]
|
||||
min_mem_offset, max_mem_offset = pynvml.nvmlDeviceGetMemClkMinMaxVfOffset(gpu_handle)
|
||||
if min_mem_offset>0:
|
||||
min_mem_offset = min_mem_offset - math.floor((2**32)/1000)
|
||||
if min_mem_offset==0 and max_mem_offset==0:
|
||||
if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
|
||||
mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
|
||||
else:
|
||||
mem_range = [0,0]
|
||||
elif min_mem_offset > -20000 and min_mem_offset <= 0 and max_mem_offset>=0 and max_mem_offset < 20000:
|
||||
mem_range=[min_mem_offset, max_mem_offset]
|
||||
else:
|
||||
mem_range=[0,0]
|
||||
except Exception as e2:
|
||||
get_data_fail=True
|
||||
if type(mem_range) == list and type(core_range) == list and len(mem_range)==2 and len(core_range)==2:
|
||||
gpu_spec["mem"]=mem_range
|
||||
gpu_spec["core"]=core_range
|
||||
|
@ -267,7 +322,7 @@ def get_hive_clock_range(is_hive, gpu_index, part):
|
|||
if is_hive:
|
||||
try:
|
||||
flag = "--setmemoffset" if part=="mem" else "--setcoreoffset"
|
||||
cmd = ["bash",'-c',f"nvtool -i 0 {flag} -100000"]
|
||||
cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo nvtool -i {gpu_index} {flag} -100000"]
|
||||
|
||||
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
lines = result.stdout.decode().splitlines()
|
||||
|
|
|
@ -7,5 +7,5 @@ psutil==5.9.0
|
|||
python-iptables==1.0.1
|
||||
websockets==12.0
|
||||
packaging==23.2
|
||||
pynvml==11.5.0
|
||||
clore-pynvml==11.5.4
|
||||
requests==2.31.0
|
Loading…
Reference in New Issue