From 68e7dc215d55316758070756c95cc9de11d36d83 Mon Sep 17 00:00:00 2001 From: clore Date: Sun, 3 Nov 2024 23:28:03 +0000 Subject: [PATCH] V5.2.8 - when failing to set clock locks on HiveOS fallback on nvtool --- clore_hosting/main.py | 2 +- lib/nvml.py | 45 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/clore_hosting/main.py b/clore_hosting/main.py index 5066e5a..9859a5d 100644 --- a/clore_hosting/main.py +++ b/clore_hosting/main.py @@ -455,7 +455,7 @@ class CloreClient: async def submit_specs(self, current_specs): try: if type(current_specs) == dict: - current_specs["backend_version"]=17 + current_specs["backend_version"]=18 current_specs["update_hw"]=True smallest_pcie_width = 999 for gpu in current_specs["gpus"]["nvidia"]: diff --git a/lib/nvml.py b/lib/nvml.py index e5d1049..8049af6 100644 --- a/lib/nvml.py +++ b/lib/nvml.py @@ -10,7 +10,7 @@ import clore_pynvml as pynvml import json import math -HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" +HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./" GPU_MEM_ALLOWED_OC_RANGES = { # Known to be problematic GPUs "NVIDIA P102-100": [-2000, 2000], @@ -329,6 +329,7 @@ def pinpoint_oc_limits_positive(gpu_handle, core=False): return failure, found_solution def set_oc(settings): + global is_hive try: gpu_count = pynvml.nvmlDeviceGetCount() settings_keys = settings.keys() @@ -342,6 +343,10 @@ def set_oc(settings): } settings_keys = settings.keys() log.debug(f"Rewriting settings with: {json.dumps(settings)}") + + core_locks = [] + mem_locks = [] + any_lock_failure = False for oc_gpu_index in settings_keys: if oc_gpu_index.isdigit(): oc_gpu_index=int(oc_gpu_index) @@ -349,16 +354,35 @@ def set_oc(settings): gpu_oc_config = settings[str(oc_gpu_index)] gpu_possible_ranges = all_gpus_data_list[oc_gpu_index] gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index) + if "core_lock" in gpu_oc_config: core_lock = int(gpu_oc_config["core_lock"]) - pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock) + core_locks.append(str(core_lock)) + try: + pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock) + except Exception as core_lock_exception: + any_lock_failure=True else: - pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle) + core_locks.append('0') + try: + pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle) + except Exception as core_lock_exception: + any_lock_failure=True + if "mem_lock" in gpu_oc_config: mem_lock = int(gpu_oc_config["mem_lock"]) - pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock) + mem_locks.append(str(mem_lock)) + try: + pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock) + except Exception as mem_lock_exception: + any_lock_failure=True else: - pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle) + mem_locks.append('0') + try: + pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle) + except Exception as mem_lock_exception: + any_lock_failure=True + if "core" in gpu_oc_config: # Core offset wanted_core_clock = int(round(gpu_oc_config["core"]*2)) if gpu_possible_ranges["core"][0] <= wanted_core_clock and wanted_core_clock <= gpu_possible_ranges["core"][1]: @@ -377,6 +401,17 @@ def set_oc(settings): pynvml.nvmlDeviceSetPowerManagementLimit(gpu_handle, wanted_power_limit_milliwatts) else: log.error(f"Requested OC for GPU:{oc_gpu_index} (POWER LIMIT) out of bound | {gpu_oc_config["pl"]} | [{gpu_possible_ranges["power_limits"][0]}, {gpu_possible_ranges["power_limits"][1]}]") + if is_hive and any_lock_failure and len(mem_locks)==len(core_locks): + try: + nvtool_commands = [] + for idx, mem_lock in enumerate(mem_locks): + core_lock = core_locks[idx] + nvtool_commands.append(f"nvtool -i {str(idx)} --setmem {mem_lock} --setcore {core_lock}") + cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo {' && '.join(nvtool_commands)}"] + #print(cmd) + subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except Exception as hive_oc_settings: + pass return True except Exception as e: log.error(f"set_oc | ERROR | {e}")