diff --git a/clore_hosting/main.py b/clore_hosting/main.py
index 26390a6..eea6fa8 100644
--- a/clore_hosting/main.py
+++ b/clore_hosting/main.py
@@ -455,7 +455,7 @@ class CloreClient:
     async def submit_specs(self, current_specs):
         try:
             if type(current_specs) == dict:
-                current_specs["backend_version"]=16
+                current_specs["backend_version"]=17
                 current_specs["update_hw"]=True
                 smallest_pcie_width = 999
                 for gpu in current_specs["gpus"]["nvidia"]:
diff --git a/lib/nvml.py b/lib/nvml.py
index cc3fea6..e5d1049 100644
--- a/lib/nvml.py
+++ b/lib/nvml.py
@@ -100,10 +100,15 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                 parsed_specs={}
                 regenerate_specs=True
                 break
-
+            elif not "locks" in parsed_specs[f"{i}-{gpu_uuid}"]:
+                parsed_specs={}
+                regenerate_specs=True
+                break
+
         if regenerate_specs:
             for i in range(0,gpu_count):
                 gpu_spec={}
+                mem_to_core_allowed_locks = get_gpu_locked_clocks(i)
                 gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                 gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
                 power_limits = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(gpu_handle)
@@ -112,6 +117,7 @@ def init(gpu_specs_file=None, allow_hive_binaries=True):
                 gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
                 gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
                 gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
+                gpu_spec["locks"] = mem_to_core_allowed_locks
 
                 pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
                 pci_bus_id = pci_info.bus
@@ -205,6 +211,25 @@ def get_gpu_oc_specs():
 def shutdown():
     pynvml.nvmlShutdown()
 
+def get_gpu_locked_clocks(gpu_index):
+    """Map sampled supported memory clocks to their [min, max] graphics clocks.
+
+    Samples the first 12 memory clocks NVML reports plus the last one.
+    Returns a dict keyed by str(memory clock), or {} if anything fails.
+    """
+    try:
+        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
+        mem_clocks = pynvml.nvmlDeviceGetSupportedMemoryClocks(handle)
+        mem_to_core = {}
+        for idx, mem_clock in enumerate(mem_clocks):
+            if idx < 12 or idx == len(mem_clocks)-1:
+                graphics_clocks = pynvml.nvmlDeviceGetSupportedGraphicsClocks(handle, mem_clock)
+                mem_to_core[str(mem_clock)] = [min(graphics_clocks), max(graphics_clocks)]
+        return mem_to_core
+    except Exception as e:
+        # Best-effort, but leave a trace: callers only ever see the empty mapping
+        log.error(f"Failed to read supported clocks for GPU:{gpu_index} | {e}")
+        return {}
+
 def handle_nn(input_int):
     if abs(4293967-input_int) < 10000:
         return input_int-4293967
@@ -330,13 +356,29 @@ def set_oc(settings):
                     gpu_oc_config = settings[str(oc_gpu_index)]
                     gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
                     gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)
-                    if "core" in gpu_oc_config:
+                    if "core_lock" in gpu_oc_config:
+                        core_lock = int(gpu_oc_config["core_lock"])
+                        pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
+                    else:
+                        try: # Nothing to reset on GPUs without locked clocks; the offsets below must still apply
+                            pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
+                        except pynvml.NVMLError_NotSupported:
+                            pass
+                    if "mem_lock" in gpu_oc_config:
+                        mem_lock = int(gpu_oc_config["mem_lock"])
+                        pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
+                    else:
+                        try: # Same best-effort reset for memory locked clocks
+                            pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
+                        except pynvml.NVMLError_NotSupported:
+                            pass
+                    if "core" in gpu_oc_config: # Core offset
                         wanted_core_clock = int(round(gpu_oc_config["core"]*2))
                         if gpu_possible_ranges["core"][0] <= wanted_core_clock and wanted_core_clock <= gpu_possible_ranges["core"][1]:
                             pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, wanted_core_clock)
                         else:
                             log.error(f"Requested OC for GPU:{oc_gpu_index} (CORE) out of bound | {wanted_core_clock} | [{gpu_possible_ranges["core"][0]}, {gpu_possible_ranges["core"][1]}]")
-                    if "mem" in gpu_oc_config:
+                    if "mem" in gpu_oc_config: # Memory offset
                         wanted_mem_clock = int(round(gpu_oc_config["mem"]*2))
                         if gpu_possible_ranges["mem"][0] <= wanted_mem_clock and wanted_mem_clock <= gpu_possible_ranges["mem"][1]:
                             pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, wanted_mem_clock)