V5.2.8 - when setting clock locks fails on HiveOS, fall back to nvtool
This commit is contained in:
parent
6c4995e19f
commit
68e7dc215d
|
@ -455,7 +455,7 @@ class CloreClient:
|
||||||
async def submit_specs(self, current_specs):
|
async def submit_specs(self, current_specs):
|
||||||
try:
|
try:
|
||||||
if type(current_specs) == dict:
|
if type(current_specs) == dict:
|
||||||
current_specs["backend_version"]=17
|
current_specs["backend_version"]=18
|
||||||
current_specs["update_hw"]=True
|
current_specs["update_hw"]=True
|
||||||
smallest_pcie_width = 999
|
smallest_pcie_width = 999
|
||||||
for gpu in current_specs["gpus"]["nvidia"]:
|
for gpu in current_specs["gpus"]["nvidia"]:
|
||||||
|
|
45
lib/nvml.py
45
lib/nvml.py
|
@ -10,7 +10,7 @@ import clore_pynvml as pynvml
|
||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
|
|
||||||
HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./"
|
||||||
|
|
||||||
GPU_MEM_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
|
GPU_MEM_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
|
||||||
"NVIDIA P102-100": [-2000, 2000],
|
"NVIDIA P102-100": [-2000, 2000],
|
||||||
|
@ -329,6 +329,7 @@ def pinpoint_oc_limits_positive(gpu_handle, core=False):
|
||||||
return failure, found_solution
|
return failure, found_solution
|
||||||
|
|
||||||
def set_oc(settings):
|
def set_oc(settings):
|
||||||
|
global is_hive
|
||||||
try:
|
try:
|
||||||
gpu_count = pynvml.nvmlDeviceGetCount()
|
gpu_count = pynvml.nvmlDeviceGetCount()
|
||||||
settings_keys = settings.keys()
|
settings_keys = settings.keys()
|
||||||
|
@ -342,6 +343,10 @@ def set_oc(settings):
|
||||||
}
|
}
|
||||||
settings_keys = settings.keys()
|
settings_keys = settings.keys()
|
||||||
log.debug(f"Rewriting settings with: {json.dumps(settings)}")
|
log.debug(f"Rewriting settings with: {json.dumps(settings)}")
|
||||||
|
|
||||||
|
core_locks = []
|
||||||
|
mem_locks = []
|
||||||
|
any_lock_failure = False
|
||||||
for oc_gpu_index in settings_keys:
|
for oc_gpu_index in settings_keys:
|
||||||
if oc_gpu_index.isdigit():
|
if oc_gpu_index.isdigit():
|
||||||
oc_gpu_index=int(oc_gpu_index)
|
oc_gpu_index=int(oc_gpu_index)
|
||||||
|
@ -349,16 +354,35 @@ def set_oc(settings):
|
||||||
gpu_oc_config = settings[str(oc_gpu_index)]
|
gpu_oc_config = settings[str(oc_gpu_index)]
|
||||||
gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
|
gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
|
||||||
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)
|
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)
|
||||||
|
|
||||||
if "core_lock" in gpu_oc_config:
|
if "core_lock" in gpu_oc_config:
|
||||||
core_lock = int(gpu_oc_config["core_lock"])
|
core_lock = int(gpu_oc_config["core_lock"])
|
||||||
pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
|
core_locks.append(str(core_lock))
|
||||||
|
try:
|
||||||
|
pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
|
||||||
|
except Exception as core_lock_exception:
|
||||||
|
any_lock_failure=True
|
||||||
else:
|
else:
|
||||||
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
|
core_locks.append('0')
|
||||||
|
try:
|
||||||
|
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
|
||||||
|
except Exception as core_lock_exception:
|
||||||
|
any_lock_failure=True
|
||||||
|
|
||||||
if "mem_lock" in gpu_oc_config:
|
if "mem_lock" in gpu_oc_config:
|
||||||
mem_lock = int(gpu_oc_config["mem_lock"])
|
mem_lock = int(gpu_oc_config["mem_lock"])
|
||||||
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
|
mem_locks.append(str(mem_lock))
|
||||||
|
try:
|
||||||
|
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
|
||||||
|
except Exception as mem_lock_exception:
|
||||||
|
any_lock_failure=True
|
||||||
else:
|
else:
|
||||||
pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
|
mem_locks.append('0')
|
||||||
|
try:
|
||||||
|
pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
|
||||||
|
except Exception as mem_lock_exception:
|
||||||
|
any_lock_failure=True
|
||||||
|
|
||||||
if "core" in gpu_oc_config: # Core offset
|
if "core" in gpu_oc_config: # Core offset
|
||||||
wanted_core_clock = int(round(gpu_oc_config["core"]*2))
|
wanted_core_clock = int(round(gpu_oc_config["core"]*2))
|
||||||
if gpu_possible_ranges["core"][0] <= wanted_core_clock and wanted_core_clock <= gpu_possible_ranges["core"][1]:
|
if gpu_possible_ranges["core"][0] <= wanted_core_clock and wanted_core_clock <= gpu_possible_ranges["core"][1]:
|
||||||
|
@ -377,6 +401,17 @@ def set_oc(settings):
|
||||||
pynvml.nvmlDeviceSetPowerManagementLimit(gpu_handle, wanted_power_limit_milliwatts)
|
pynvml.nvmlDeviceSetPowerManagementLimit(gpu_handle, wanted_power_limit_milliwatts)
|
||||||
else:
|
else:
|
||||||
log.error(f"Requested OC for GPU:{oc_gpu_index} (POWER LIMIT) out of bound | {gpu_oc_config["pl"]} | [{gpu_possible_ranges["power_limits"][0]}, {gpu_possible_ranges["power_limits"][1]}]")
|
log.error(f"Requested OC for GPU:{oc_gpu_index} (POWER LIMIT) out of bound | {gpu_oc_config["pl"]} | [{gpu_possible_ranges["power_limits"][0]}, {gpu_possible_ranges["power_limits"][1]}]")
|
||||||
|
if is_hive and any_lock_failure and len(mem_locks)==len(core_locks):
|
||||||
|
try:
|
||||||
|
nvtool_commands = []
|
||||||
|
for idx, mem_lock in enumerate(mem_locks):
|
||||||
|
core_lock = core_locks[idx]
|
||||||
|
nvtool_commands.append(f"nvtool -i {str(idx)} --setmem {mem_lock} --setcore {core_lock}")
|
||||||
|
cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo {' && '.join(nvtool_commands)}"]
|
||||||
|
#print(cmd)
|
||||||
|
subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
|
except Exception as hive_oc_settings:
|
||||||
|
pass
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"set_oc | ERROR | {e}")
|
log.error(f"set_oc | ERROR | {e}")
|
||||||
|
|
Loading…
Reference in New Issue