V5.2.1 | fix reading OC specs for older GPUs
This commit is contained in:
parent
36d3026d5d
commit
5e733fd0d6
|
@ -426,7 +426,7 @@ class CloreClient:
|
||||||
async def submit_specs(self, current_specs):
|
async def submit_specs(self, current_specs):
|
||||||
try:
|
try:
|
||||||
if type(current_specs) == dict:
|
if type(current_specs) == dict:
|
||||||
current_specs["backend_version"]=10
|
current_specs["backend_version"]=11
|
||||||
current_specs["update_hw"]=True
|
current_specs["update_hw"]=True
|
||||||
smallest_pcie_width = 999
|
smallest_pcie_width = 999
|
||||||
for gpu in current_specs["gpus"]["nvidia"]:
|
for gpu in current_specs["gpus"]["nvidia"]:
|
||||||
|
|
59
lib/nvml.py
59
lib/nvml.py
|
@ -6,8 +6,38 @@ config = config_module.config
|
||||||
log = logging_lib.log
|
log = logging_lib.log
|
||||||
|
|
||||||
import subprocess
|
import subprocess
|
||||||
import pynvml
|
import clore_pynvml as pynvml
|
||||||
import json
|
import json
|
||||||
|
import math
|
||||||
|
|
||||||
|
HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
||||||
|
|
||||||
|
GPU_MEM_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
|
||||||
|
"NVIDIA P102-100": [-2000, 2000],
|
||||||
|
"NVIDIA P104-100": [-2000, 2000],
|
||||||
|
"NVIDIA P106-090": [-2000, 2000],
|
||||||
|
"NVIDIA P106-100": [-2000, 2000],
|
||||||
|
"NVIDIA GeForce GTX 1050 Ti": [-2000, 2000],
|
||||||
|
"NVIDIA GeForce GTX 1060 3GB": [-2000, 2000],
|
||||||
|
"NVIDIA GeForce GTX 1060 6GB": [-2000, 2000],
|
||||||
|
"NVIDIA GeForce GTX 1070": [-2000, 2000],
|
||||||
|
"NVIDIA GeForce GTX 1070 Ti": [-2000, 2000],
|
||||||
|
"NVIDIA GeForce GTX 1080": [-2000, 2000],
|
||||||
|
"NVIDIA GeForce GTX 1080 Ti":[-2000, 2000],
|
||||||
|
"NVIDIA CMP 30HX": [-2000, 6000],
|
||||||
|
"NVIDIA CMP 40HX": [-2000, 6000],
|
||||||
|
"NVIDIA CMP 50HX": [-2000, 6000],
|
||||||
|
"NVIDIA CMP 90HX": [-2000, 6000],
|
||||||
|
"NVIDIA GeForce GTX 1650": [-2000, 6000],
|
||||||
|
"NVIDIA GeForce GTX 1660 SUPER": [-2000, 6000],
|
||||||
|
"NVIDIA GeForce GTX 1660 Ti": [-2000, 6000],
|
||||||
|
"NVIDIA GeForce RTX 2060": [-2000, 6000],
|
||||||
|
"NVIDIA GeForce RTX 2060 SUPER": [-2000, 6000],
|
||||||
|
"NVIDIA GeForce RTX 2070": [-2000, 6000],
|
||||||
|
"NVIDIA GeForce RTX 2070 SUPER": [-2000, 6000],
|
||||||
|
"NVIDIA GeForce RTX 2080": [-2000, 6000],
|
||||||
|
"NVIDIA GeForce RTX 2080 Ti": [-2000, 6000]
|
||||||
|
}
|
||||||
|
|
||||||
is_hive = False
|
is_hive = False
|
||||||
all_gpus_data_list=[]
|
all_gpus_data_list=[]
|
||||||
|
@ -64,6 +94,7 @@ def init(gpu_specs_file=None):
|
||||||
|
|
||||||
mem_range = get_hive_clock_range(is_hive, i, "mem")
|
mem_range = get_hive_clock_range(is_hive, i, "mem")
|
||||||
core_range = get_hive_clock_range(is_hive, i, "core")
|
core_range = get_hive_clock_range(is_hive, i, "core")
|
||||||
|
try:
|
||||||
if type(mem_range) != list:
|
if type(mem_range) != list:
|
||||||
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
|
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
|
||||||
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
|
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
|
||||||
|
@ -80,6 +111,30 @@ def init(gpu_specs_file=None):
|
||||||
core_range=[min_oc_solution, max_oc_solution]
|
core_range=[min_oc_solution, max_oc_solution]
|
||||||
pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
|
pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
|
||||||
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
|
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
|
||||||
|
except Exception as e_pinpointing:
|
||||||
|
if "not supported" in str(e_pinpointing).lower():
|
||||||
|
try:
|
||||||
|
min_core_offset, max_core_offset = pynvml.nvmlDeviceGetGpcClkMinMaxVfOffset(gpu_handle)
|
||||||
|
if min_core_offset>0:
|
||||||
|
min_core_offset = min_core_offset - math.floor((2**32)/1000)
|
||||||
|
# Accept the driver-reported core clock offset limits only when they look
# sane: the minimum must lie in (-20000, 0] and the maximum in [0, 20000).
# The upper bound has to be applied to max_core_offset (the previous code
# re-tested min_core_offset, which is always < 20000 once it is <= 0, so an
# absurdly large maximum was accepted). This mirrors the memory-offset
# validation performed further down in the same fallback.
# NOTE(review): the 20000 limit is taken as-is from the original code; its
# unit is presumably the same as the NVML offset values - confirm.
if -20000 < min_core_offset <= 0 and 0 <= max_core_offset < 20000:
    core_range = [min_core_offset, max_core_offset]
else:
    # Out-of-range values are treated as "no overclocking range available".
    core_range = [0, 0]
|
||||||
|
min_mem_offset, max_mem_offset = pynvml.nvmlDeviceGetMemClkMinMaxVfOffset(gpu_handle)
|
||||||
|
if min_mem_offset>0:
|
||||||
|
min_mem_offset = min_mem_offset - math.floor((2**32)/1000)
|
||||||
|
if min_mem_offset==0 and max_mem_offset==0:
|
||||||
|
if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
|
||||||
|
mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
|
||||||
|
else:
|
||||||
|
mem_range = [0,0]
|
||||||
|
elif min_mem_offset > -20000 and min_mem_offset <= 0 and max_mem_offset>=0 and max_mem_offset < 20000:
|
||||||
|
mem_range=[min_mem_offset, max_mem_offset]
|
||||||
|
else:
|
||||||
|
mem_range=[0,0]
|
||||||
|
except Exception as e2:
|
||||||
|
get_data_fail=True
|
||||||
if type(mem_range) == list and type(core_range) == list and len(mem_range)==2 and len(core_range)==2:
|
if type(mem_range) == list and type(core_range) == list and len(mem_range)==2 and len(core_range)==2:
|
||||||
gpu_spec["mem"]=mem_range
|
gpu_spec["mem"]=mem_range
|
||||||
gpu_spec["core"]=core_range
|
gpu_spec["core"]=core_range
|
||||||
|
@ -267,7 +322,7 @@ def get_hive_clock_range(is_hive, gpu_index, part):
|
||||||
if is_hive:
|
if is_hive:
|
||||||
try:
|
try:
|
||||||
flag = "--setmemoffset" if part=="mem" else "--setcoreoffset"
|
flag = "--setmemoffset" if part=="mem" else "--setcoreoffset"
|
||||||
cmd = ["bash",'-c',f"nvtool -i 0 {flag} -100000"]
|
cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo nvtool -i {gpu_index} {flag} -100000"]
|
||||||
|
|
||||||
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
lines = result.stdout.decode().splitlines()
|
lines = result.stdout.decode().splitlines()
|
||||||
|
|
|
@ -7,5 +7,5 @@ psutil==5.9.0
|
||||||
python-iptables==1.0.1
|
python-iptables==1.0.1
|
||||||
websockets==12.0
|
websockets==12.0
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
pynvml==11.5.0
|
clore-pynvml==11.5.4
|
||||||
requests==2.31.0
|
requests==2.31.0
|
Loading…
Reference in New Issue