from lib import config as config_module
from lib import logging as logging_lib
from lib import get_specs

config = config_module.config
log = logging_lib.log

import subprocess
import pynvml
import json

is_hive = False
all_gpus_data_list = []
get_data_fail = False

def init(gpu_specs_file=None):
    global is_hive, all_gpus_data_list, get_data_fail
    log.info("Loading GPU OC specs [ working ]")
    try:
        pynvml.nvmlInit()
        kernel = get_specs.get_kernel()
        if "hive" in kernel:
            is_hive = True

        specs_file_loc = gpu_specs_file if gpu_specs_file else config.gpu_specs_file

        regenerate_specs = False
        parsed_specs = {}
        try:
            with open(specs_file_loc, "r") as specs_file:
                parsed_specs = json.loads(specs_file.read())
        except Exception as specs_load_fail:
            log.error(f"Failed loading gpu_specs_file ({specs_load_fail}) | regenerating...")
            regenerate_specs = True

        parsed_specs_keys = parsed_specs.keys()
        gpu_count = pynvml.nvmlDeviceGetCount()

        # Validate the cached specs: every installed GPU must have an entry keyed "<index>-<uuid>"
        for i in range(0, gpu_count):
            if regenerate_specs:
                break
            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
            if f"{i}-{gpu_uuid}" not in parsed_specs_keys:
                parsed_specs = {}
                regenerate_specs = True
                break

        if regenerate_specs:
            for i in range(0, gpu_count):
                gpu_spec = {}
                gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)

                power_limits = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(gpu_handle)
                min_power_limit = int(power_limits[0] / 1000.0)  # mW -> W
                max_power_limit = int(power_limits[1] / 1000.0)
                gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
                gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
                gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)

                pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
                pci_bus_id = pci_info.bus
                pci_device_id = pci_info.device
                pci_domain_id = pci_info.domain
                gpu_spec["pci_core"] = f"{pci_domain_id}:{pci_bus_id:02d}:{pci_device_id:02d}.0"

                mem_range = get_hive_clock_range(is_hive, i, "mem")
                core_range = get_hive_clock_range(is_hive, i, "core")

                if type(mem_range) != list:
                    pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300)  # Force low clocks, so the GPU can't crash during testing if it is under load
                    failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
                    failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
                    if (not failure_min) and (not failure_max):
                        mem_range = [min_oc_solution, max_oc_solution]
                        pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
                    pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)

                if type(core_range) != list:
                    pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350)  # Force low clocks, so the GPU can't crash during testing if it is under load
                    failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
                    failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
                    if (not failure_min) and (not failure_max):
                        core_range = [min_oc_solution, max_oc_solution]
                        pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
                    pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)

                if type(mem_range) == list and type(core_range) == list and len(mem_range) == 2 and len(core_range) == 2:
                    gpu_spec["mem"] = mem_range
                    gpu_spec["core"] = core_range
                else:
                    get_data_fail = True

                parsed_specs[f"{i}-{gpu_uuid}"] = gpu_spec

            with open(specs_file_loc, "w") as specs_file:
                json.dump(parsed_specs, specs_file)

        if not get_data_fail:
            parsed_specs_keys = parsed_specs.keys()
            for key in parsed_specs_keys:
                all_gpus_data_list.append(parsed_specs[key])
    except Exception as e:
        get_data_fail = True
        log.error("Loading GPU OC specs [ fail ]")

    if not get_data_fail:
        log.success("Loading GPU OC specs [ success ]")
        print(all_gpus_data_list)

# Load GPU specs
def get_gpu_oc_specs():
    global get_data_fail
    if get_data_fail:
        return False
    else:
        return all_gpus_data_list

def shutdown():
    pynvml.nvmlShutdown()

def handle_nn(input_int):
    # Readings near these magic values are treated as wrapped-around negative offsets
    # and shifted back to their signed equivalents; anything else is returned unchanged.
    if abs(4293967 - input_int) < 10000:
        return input_int - 4293967
    elif abs(8589934 - input_int) < 10000:
        return input_int - 8589934
    else:
        return input_int

def pinpoint_find_dicts_negative(data):
    # Split probe results by success and locate the boundary between failing and passing offsets
    false_success_items = [d for d in data if not d['success']]
    true_success_items = [d for d in data if d['success']]
    highest_false_success = max(false_success_items, key=lambda x: x['offset'], default=None)
    lowest_true_success = min(true_success_items, key=lambda x: x['offset'], default=None)
    return highest_false_success, lowest_true_success

def pinpoint_find_dicts_positive(data):
    false_success_items = [d for d in data if not d['success']]
    true_success_items = [d for d in data if d['success']]
    lowest_false_success = min(false_success_items, key=lambda x: x['offset'], default=None)
    highest_true_success = max(true_success_items, key=lambda x: x['offset'], default=None)
    return highest_true_success, lowest_false_success

def pinpoint_oc_limits_negative(gpu_handle, core=False):
    step_cnt = 0
    found_solution = None
    init_negative_max = -19855  # Probably
    history_info = [{"offset": init_negative_max * 2, "success": False}]
    failure = False
    max_step_cnt = 20
    try:
        while found_solution == None and step_cnt