V5.2.8 - when setting clock locks fails on HiveOS, fall back to nvtool
This commit is contained in:
parent
6c4995e19f
commit
68e7dc215d
|
@ -455,7 +455,7 @@ class CloreClient:
|
|||
async def submit_specs(self, current_specs):
|
||||
try:
|
||||
if type(current_specs) == dict:
|
||||
current_specs["backend_version"]=17
|
||||
current_specs["backend_version"]=18
|
||||
current_specs["update_hw"]=True
|
||||
smallest_pcie_width = 999
|
||||
for gpu in current_specs["gpus"]["nvidia"]:
|
||||
|
|
45
lib/nvml.py
45
lib/nvml.py
|
@ -10,7 +10,7 @@ import clore_pynvml as pynvml
|
|||
import json
|
||||
import math
|
||||
|
||||
HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
||||
HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./"
|
||||
|
||||
GPU_MEM_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
|
||||
"NVIDIA P102-100": [-2000, 2000],
|
||||
|
@ -329,6 +329,7 @@ def pinpoint_oc_limits_positive(gpu_handle, core=False):
|
|||
return failure, found_solution
|
||||
|
||||
def set_oc(settings):
|
||||
global is_hive
|
||||
try:
|
||||
gpu_count = pynvml.nvmlDeviceGetCount()
|
||||
settings_keys = settings.keys()
|
||||
|
@ -342,6 +343,10 @@ def set_oc(settings):
|
|||
}
|
||||
settings_keys = settings.keys()
|
||||
log.debug(f"Rewriting settings with: {json.dumps(settings)}")
|
||||
|
||||
core_locks = []
|
||||
mem_locks = []
|
||||
any_lock_failure = False
|
||||
for oc_gpu_index in settings_keys:
|
||||
if oc_gpu_index.isdigit():
|
||||
oc_gpu_index=int(oc_gpu_index)
|
||||
|
@ -349,16 +354,35 @@ def set_oc(settings):
|
|||
gpu_oc_config = settings[str(oc_gpu_index)]
|
||||
gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
|
||||
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)
|
||||
|
||||
if "core_lock" in gpu_oc_config:
|
||||
core_lock = int(gpu_oc_config["core_lock"])
|
||||
pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
|
||||
core_locks.append(str(core_lock))
|
||||
try:
|
||||
pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
|
||||
except Exception as core_lock_exception:
|
||||
any_lock_failure=True
|
||||
else:
|
||||
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
|
||||
core_locks.append('0')
|
||||
try:
|
||||
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
|
||||
except Exception as core_lock_exception:
|
||||
any_lock_failure=True
|
||||
|
||||
if "mem_lock" in gpu_oc_config:
|
||||
mem_lock = int(gpu_oc_config["mem_lock"])
|
||||
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
|
||||
mem_locks.append(str(mem_lock))
|
||||
try:
|
||||
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
|
||||
except Exception as mem_lock_exception:
|
||||
any_lock_failure=True
|
||||
else:
|
||||
pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
|
||||
mem_locks.append('0')
|
||||
try:
|
||||
pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
|
||||
except Exception as mem_lock_exception:
|
||||
any_lock_failure=True
|
||||
|
||||
if "core" in gpu_oc_config: # Core offset
|
||||
wanted_core_clock = int(round(gpu_oc_config["core"]*2))
|
||||
if gpu_possible_ranges["core"][0] <= wanted_core_clock and wanted_core_clock <= gpu_possible_ranges["core"][1]:
|
||||
|
@ -377,6 +401,17 @@ def set_oc(settings):
|
|||
pynvml.nvmlDeviceSetPowerManagementLimit(gpu_handle, wanted_power_limit_milliwatts)
|
||||
else:
|
||||
log.error(f"Requested OC for GPU:{oc_gpu_index} (POWER LIMIT) out of bound | {gpu_oc_config["pl"]} | [{gpu_possible_ranges["power_limits"][0]}, {gpu_possible_ranges["power_limits"][1]}]")
|
||||
if is_hive and any_lock_failure and len(mem_locks)==len(core_locks):
|
||||
try:
|
||||
nvtool_commands = []
|
||||
for idx, mem_lock in enumerate(mem_locks):
|
||||
core_lock = core_locks[idx]
|
||||
nvtool_commands.append(f"nvtool -i {str(idx)} --setmem {mem_lock} --setcore {core_lock}")
|
||||
cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo {' && '.join(nvtool_commands)}"]
|
||||
#print(cmd)
|
||||
subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
except Exception as hive_oc_settings:
|
||||
pass
|
||||
return True
|
||||
except Exception as e:
|
||||
log.error(f"set_oc | ERROR | {e}")
|
||||
|
|
Loading…
Reference in New Issue