V5.2.1 | fix reading OC specs for older GPUs

This commit is contained in:
clore 2024-05-28 00:25:58 +00:00
parent 36d3026d5d
commit 5e733fd0d6
3 changed files with 75 additions and 20 deletions

View File

@ -426,7 +426,7 @@ class CloreClient:
async def submit_specs(self, current_specs):
try:
if type(current_specs) == dict:
current_specs["backend_version"]=10
current_specs["backend_version"]=11
current_specs["update_hw"]=True
smallest_pcie_width = 999
for gpu in current_specs["gpus"]["nvidia"]:

View File

@ -6,8 +6,38 @@ config = config_module.config
log = logging_lib.log
import subprocess
import pynvml
import clore_pynvml as pynvml
import json
import math
HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
# Memory offset ranges, stored as [min, max] and keyed by GPU name, for GPUs
# known to be problematic. The table is consulted when the min and max memory
# offsets read back for a GPU are both 0; the matching entry then becomes that
# GPU's mem_range.
# Names are grouped by the range they share; every key gets its own list object.
GPU_MEM_ALLOWED_OC_RANGES = {
    gpu_name: [lowest_offset, highest_offset]
    for (lowest_offset, highest_offset), gpu_names in (
        (
            (-2000, 2000),
            (
                "NVIDIA P102-100",
                "NVIDIA P104-100",
                "NVIDIA P106-090",
                "NVIDIA P106-100",
                "NVIDIA GeForce GTX 1050 Ti",
                "NVIDIA GeForce GTX 1060 3GB",
                "NVIDIA GeForce GTX 1060 6GB",
                "NVIDIA GeForce GTX 1070",
                "NVIDIA GeForce GTX 1070 Ti",
                "NVIDIA GeForce GTX 1080",
                "NVIDIA GeForce GTX 1080 Ti",
            ),
        ),
        (
            (-2000, 6000),
            (
                "NVIDIA CMP 30HX",
                "NVIDIA CMP 40HX",
                "NVIDIA CMP 50HX",
                "NVIDIA CMP 90HX",
                "NVIDIA GeForce GTX 1650",
                "NVIDIA GeForce GTX 1660 SUPER",
                "NVIDIA GeForce GTX 1660 Ti",
                "NVIDIA GeForce RTX 2060",
                "NVIDIA GeForce RTX 2060 SUPER",
                "NVIDIA GeForce RTX 2070",
                "NVIDIA GeForce RTX 2070 SUPER",
                "NVIDIA GeForce RTX 2080",
                "NVIDIA GeForce RTX 2080 Ti",
            ),
        ),
    )
    for gpu_name in gpu_names
}
is_hive = False
all_gpus_data_list=[]
@ -64,22 +94,47 @@ def init(gpu_specs_file=None):
mem_range = get_hive_clock_range(is_hive, i, "mem")
core_range = get_hive_clock_range(is_hive, i, "core")
if type(mem_range) != list:
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
if (not failure_min) and (not failure_max):
mem_range=[min_oc_solution, max_oc_solution]
pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
if type(core_range) != list:
pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
if (not failure_min) and (not failure_max):
core_range=[min_oc_solution, max_oc_solution]
pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
try:
if type(mem_range) != list:
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
if (not failure_min) and (not failure_max):
mem_range=[min_oc_solution, max_oc_solution]
pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
if type(core_range) != list:
pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
if (not failure_min) and (not failure_max):
core_range=[min_oc_solution, max_oc_solution]
pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
except Exception as e_pinpointing:
if "not supported" in str(e_pinpointing).lower():
try:
min_core_offset, max_core_offset = pynvml.nvmlDeviceGetGpcClkMinMaxVfOffset(gpu_handle)
if min_core_offset>0:
min_core_offset = min_core_offset - math.floor((2**32)/1000)
if min_core_offset > -20000 and min_core_offset <= 0 and max_core_offset>=0 and min_core_offset < 20000:
core_range=[min_core_offset, max_core_offset]
else:
core_range=[0,0]
min_mem_offset, max_mem_offset = pynvml.nvmlDeviceGetMemClkMinMaxVfOffset(gpu_handle)
if min_mem_offset>0:
min_mem_offset = min_mem_offset - math.floor((2**32)/1000)
if min_mem_offset==0 and max_mem_offset==0:
if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
else:
mem_range = [0,0]
elif min_mem_offset > -20000 and min_mem_offset <= 0 and max_mem_offset>=0 and max_mem_offset < 20000:
mem_range=[min_mem_offset, max_mem_offset]
else:
mem_range=[0,0]
except Exception as e2:
get_data_fail=True
if type(mem_range) == list and type(core_range) == list and len(mem_range)==2 and len(core_range)==2:
gpu_spec["mem"]=mem_range
gpu_spec["core"]=core_range
@ -267,7 +322,7 @@ def get_hive_clock_range(is_hive, gpu_index, part):
if is_hive:
try:
flag = "--setmemoffset" if part=="mem" else "--setcoreoffset"
cmd = ["bash",'-c',f"nvtool -i 0 {flag} -100000"]
cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo nvtool -i {gpu_index} {flag} -100000"]
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
lines = result.stdout.decode().splitlines()

View File

@ -7,5 +7,5 @@ psutil==5.9.0
python-iptables==1.0.1
websockets==12.0
packaging==23.2
pynvml==11.5.0
clore-pynvml==11.5.4
requests==2.31.0