V5.2.7 - core, mem lock

2024-10-31 02:32:47 +00:00 · 2024-10-31 02:32:47 +00:00 · 7faadc76ea
parent 1375d5599e
commit 7faadc76ea
2 changed files with 33 additions and 4 deletions
--- a/clore_hosting/main.py
+++ b/clore_hosting/main.py
@ -455,7 +455,7 @@ class CloreClient:
    async def submit_specs(self, current_specs):
        try:
            if type(current_specs) == dict:
-                current_specs["backend_version"]=16
+                current_specs["backend_version"]=17
                current_specs["update_hw"]=True
                smallest_pcie_width = 999
                for gpu in current_specs["gpus"]["nvidia"]:
--- a/lib/nvml.py
+++ b/lib/nvml.py
@ -100,10 +100,15 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                parsed_specs={}
                regenerate_specs=True
                break
-        
+            elif not "locks" in parsed_specs[f"{i}-{gpu_uuid}"]:
+                parsed_specs={}
+                regenerate_specs=True
+                break
+
        if regenerate_specs:
            for i in range(0,gpu_count):
                gpu_spec={}
+                mem_to_core_allowed_locks = get_gpu_locked_clocks(i)
                gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
                power_limits = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(gpu_handle)
@ -112,6 +117,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
                gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
                gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
+                gpu_spec["locks"] = mem_to_core_allowed_locks

                pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
                pci_bus_id = pci_info.bus
@ -205,6 +211,19 @@ def get_gpu_oc_specs():
 def shutdown():
    pynvml.nvmlShutdown()

+def get_gpu_locked_clocks(gpu_index):
+    """Collect the graphics-clock range available for a sample of memory clocks.
+
+    Asks NVML for the memory clocks supported by the GPU at ``gpu_index`` and,
+    for the first 12 entries of that list plus its final entry, records the
+    lowest and highest supported graphics clock.
+
+    Args:
+        gpu_index: NVML device index of the GPU to query.
+
+    Returns:
+        dict mapping ``str(mem_clock)`` to ``[min_graphics, max_graphics]``.
+        Memory clocks that report no graphics clocks are left out. An empty
+        dict is returned when an NVML query fails; the failure is logged.
+    """
+    # Only this many leading entries are inspected, plus the last entry.
+    # NOTE(review): presumably this keeps the stored specs small - confirm.
+    max_leading_mem_clocks = 12
+    try:
+        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
+        mem_clocks = pynvml.nvmlDeviceGetSupportedMemoryClocks(handle)
+        last_idx = len(mem_clocks) - 1
+        mem_to_core = {}
+        for idx, mem_clock in enumerate(mem_clocks):
+            if idx >= max_leading_mem_clocks and idx != last_idx:
+                continue
+            graphics_clocks = pynvml.nvmlDeviceGetSupportedGraphicsClocks(handle, mem_clock)
+            if not graphics_clocks:
+                # min()/max() raise on an empty list, which would throw away
+                # every entry gathered so far; skip just this memory clock.
+                continue
+            mem_to_core[str(mem_clock)] = [min(graphics_clocks), max(graphics_clocks)]
+        return mem_to_core
+    except Exception as e:
+        # Still best-effort: callers get an empty mapping, but the reason is
+        # now recorded instead of being dropped.
+        log.error(f"Failed to read supported clocks for GPU:{gpu_index} | {e}")
+        return {}
+
 def handle_nn(input_int):
    if abs(4293967-input_int) < 10000:
        return input_int-4293967
@ -330,13 +349,23 @@ def set_oc(settings):
                    gpu_oc_config = settings[str(oc_gpu_index)]
                    gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
                    gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)
-                    if "core" in gpu_oc_config:
+                    if "core_lock" in gpu_oc_config:
+                        core_lock = int(gpu_oc_config["core_lock"])
+                        pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
+                    else:
+                        pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
+                    if "mem_lock" in gpu_oc_config:
+                        mem_lock = int(gpu_oc_config["mem_lock"])
+                        pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
+                    else:
+                        pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
+                    if "core" in gpu_oc_config: # Core offset
                        wanted_core_clock = int(round(gpu_oc_config["core"]*2))
                        if gpu_possible_ranges["core"][0] <= wanted_core_clock and wanted_core_clock <= gpu_possible_ranges["core"][1]:
                            pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, wanted_core_clock)
                        else:
                            log.error(f"Requested OC for GPU:{oc_gpu_index} (CORE) out of bound | {wanted_core_clock} | [{gpu_possible_ranges["core"][0]}, {gpu_possible_ranges["core"][1]}]")
-                    if "mem" in gpu_oc_config:
+                    if "mem" in gpu_oc_config: # Memory offset
                        wanted_mem_clock = int(round(gpu_oc_config["mem"]*2))
                        if gpu_possible_ranges["mem"][0] <= wanted_mem_clock and wanted_mem_clock <= gpu_possible_ranges["mem"][1]:
                            pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, wanted_mem_clock)