V5.2.8 - when failing to set clock locks on HiveOS, fall back to nvtool

clore 2024-11-03 23:28:03 +00:00
parent 6c4995e19f
commit 68e7dc215d
2 changed files with 41 additions and 6 deletions
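
In short: set_oc still applies clock locks through NVML first, but it now records the requested core/memory lock for every GPU and whether any NVML call failed; on HiveOS rigs with at least one failure it replays the same locks through Hive's nvtool utility in a single shell call. The sketch below illustrates that flow under simplifying assumptions (NVML already initialised, settings passed as a plain list, helper and parameter names invented); the real diff keys settings by GPU-index strings and treats a missing "core_lock"/"mem_lock" key, rather than a zero value, as "reset".

import subprocess
import clore_pynvml as pynvml  # pynvml fork imported by the real module

def apply_clock_locks(per_gpu_settings, is_hive, hive_path):
    # Illustrative sketch only. per_gpu_settings: list of dicts that may
    # contain "core_lock" / "mem_lock" in MHz (0 or missing means "reset").
    core_locks, mem_locks, any_lock_failure = [], [], False
    for idx, cfg in enumerate(per_gpu_settings):
        handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
        core = int(cfg.get("core_lock", 0))
        mem = int(cfg.get("mem_lock", 0))
        core_locks.append(str(core))
        mem_locks.append(str(mem))
        try:
            if core:
                pynvml.nvmlDeviceSetGpuLockedClocks(handle, core, core)
            else:
                pynvml.nvmlDeviceResetGpuLockedClocks(handle)
            if mem:
                pynvml.nvmlDeviceSetMemoryLockedClocks(handle, mem, mem)
            else:
                pynvml.nvmlDeviceResetMemoryLockedClocks(handle)
        except Exception:
            any_lock_failure = True  # remember the failure, keep going
    if is_hive and any_lock_failure:
        # Replay every GPU's recorded lock values through nvtool in one shell call.
        cmds = [f"nvtool -i {i} --setmem {m} --setcore {c}"
                for i, (m, c) in enumerate(zip(mem_locks, core_locks))]
        subprocess.run(["bash", "-c", f"PATH={hive_path} && sudo {' && '.join(cmds)}"],
                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)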


@@ -455,7 +455,7 @@ class CloreClient:
    async def submit_specs(self, current_specs):
        try:
            if type(current_specs) == dict:
                current_specs["backend_version"]=17
                current_specs["backend_version"]=18
                current_specs["update_hw"]=True
                smallest_pcie_width = 999
                for gpu in current_specs["gpus"]["nvidia"]:

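The first file only bumps the reported backend_version from 17 to 18 (presumably so the backend can tell which agents support the nvtool fallback) and sets update_hw=True to force the hardware info to be resubmitted. For orientation, a rough, partly hypothetical shape of the dict submit_specs() works on; only backend_version, update_hw and the gpus/nvidia list are taken from the code above, the remaining fields are invented:

# Partly hypothetical sketch of current_specs
current_specs = {
    "backend_version": 18,   # bumped from 17 by this commit
    "update_hw": True,       # set by this commit; asks the backend to refresh stored hardware info
    "gpus": {
        "nvidia": [
            {"name": "NVIDIA GeForce RTX 3090", "pcie_width": 16},  # invented per-GPU fields
        ]
    },
}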

@@ -10,7 +10,7 @@ import clore_pynvml as pynvml
import json
import math
HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./"
GPU_MEM_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
"NVIDIA P102-100": [-2000, 2000],
@@ -329,6 +329,7 @@ def pinpoint_oc_limits_positive(gpu_handle, core=False):
    return failure, found_solution
def set_oc(settings):
    global is_hive
    try:
        gpu_count = pynvml.nvmlDeviceGetCount()
        settings_keys = settings.keys()
@@ -342,6 +343,10 @@ def set_oc(settings):
            }
            settings_keys = settings.keys()
        log.debug(f"Rewriting settings with: {json.dumps(settings)}")
        core_locks = []
        mem_locks = []
        any_lock_failure = False
        for oc_gpu_index in settings_keys:
            if oc_gpu_index.isdigit():
                oc_gpu_index=int(oc_gpu_index)
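For reference, set_oc() iterates a settings dict keyed by GPU-index strings; the per-GPU keys visible in this diff are "core_lock", "mem_lock", "core" (offset) and "pl". An invented example:

# Invented example of the settings argument handled by set_oc()
settings = {
    "0": {"core_lock": 1410, "mem_lock": 5001, "core": 100, "pl": 220},
    "1": {"core": -100},  # no *_lock keys: locked clocks are reset and '0' is recorded for the fallback
}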
@@ -349,16 +354,35 @@ def set_oc(settings):
                gpu_oc_config = settings[str(oc_gpu_index)]
                gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
                gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)
                if "core_lock" in gpu_oc_config:
                    core_lock = int(gpu_oc_config["core_lock"])
                    pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
                    core_locks.append(str(core_lock))
                    try:
                        pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
                    except Exception as core_lock_exception:
                        any_lock_failure=True
                else:
                    pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
                    core_locks.append('0')
                    try:
                        pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
                    except Exception as core_lock_exception:
                        any_lock_failure=True
                if "mem_lock" in gpu_oc_config:
                    mem_lock = int(gpu_oc_config["mem_lock"])
                    pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
                    mem_locks.append(str(mem_lock))
                    try:
                        pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
                    except Exception as mem_lock_exception:
                        any_lock_failure=True
                else:
                    pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
                    mem_locks.append('0')
                    try:
                        pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
                    except Exception as mem_lock_exception:
                        any_lock_failure=True
                if "core" in gpu_oc_config: # Core offset
                    wanted_core_clock = int(round(gpu_oc_config["core"]*2))
                    if gpu_possible_ranges["core"][0] <= wanted_core_clock and wanted_core_clock <= gpu_possible_ranges["core"][1]:
@@ -377,6 +401,17 @@ def set_oc(settings):
                        pynvml.nvmlDeviceSetPowerManagementLimit(gpu_handle, wanted_power_limit_milliwatts)
                    else:
                        log.error(f"Requested OC for GPU:{oc_gpu_index} (POWER LIMIT) out of bound | {gpu_oc_config["pl"]} | [{gpu_possible_ranges["power_limits"][0]}, {gpu_possible_ranges["power_limits"][1]}]")
        if is_hive and any_lock_failure and len(mem_locks)==len(core_locks):
            try:
                nvtool_commands = []
                for idx, mem_lock in enumerate(mem_locks):
                    core_lock = core_locks[idx]
                    nvtool_commands.append(f"nvtool -i {str(idx)} --setmem {mem_lock} --setcore {core_lock}")
                cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo {' && '.join(nvtool_commands)}"]
                #print(cmd)
                subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            except Exception as hive_oc_settings:
                pass
        return True
    except Exception as e:
        log.error(f"set_oc | ERROR | {e}")