from lib import config as config_module
from lib import logging as logging_lib
from lib import get_specs

config = config_module.config
log = logging_lib.log

import subprocess
import clore_pynvml as pynvml
import json
import math

HIVE_PATH = "/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

GPU_MEM_ALLOWED_OC_RANGES = {  # Known to be problematic GPUs
    "NVIDIA P102-100": [-2000, 2000],
    "NVIDIA P104-100": [-2000, 2000],
    "NVIDIA P106-090": [-2000, 2000],
    "NVIDIA P106-100": [-2000, 2000],
    "NVIDIA GeForce GTX 1050 Ti": [-2000, 2000],
    "NVIDIA GeForce GTX 1060 3GB": [-2000, 2000],
    "NVIDIA GeForce GTX 1060 6GB": [-2000, 2000],
    "NVIDIA GeForce GTX 1070": [-2000, 2000],
    "NVIDIA GeForce GTX 1070 Ti": [-2000, 2000],
    "NVIDIA GeForce GTX 1080": [-2000, 2000],
    "NVIDIA GeForce GTX 1080 Ti": [-2000, 2000],
    "NVIDIA CMP 30HX": [-2000, 6000],
    "NVIDIA CMP 40HX": [-2000, 6000],
    "NVIDIA CMP 50HX": [-2000, 6000],
    "NVIDIA CMP 90HX": [-2000, 6000],
    "NVIDIA GeForce GTX 1650": [-2000, 6000],
    "NVIDIA GeForce GTX 1660 SUPER": [-2000, 6000],
    "NVIDIA GeForce GTX 1660 Ti": [-2000, 6000],
    "NVIDIA GeForce RTX 2060": [-2000, 6000],
    "NVIDIA GeForce RTX 2060 SUPER": [-2000, 6000],
    "NVIDIA GeForce RTX 2070": [-2000, 6000],
    "NVIDIA GeForce RTX 2070 SUPER": [-2000, 6000],
    "NVIDIA GeForce RTX 2080": [-2000, 6000],
    "NVIDIA GeForce RTX 2080 Ti": [-2000, 6000]
}

GPU_CORE_ALLOWED_OC_RANGES = {  # Known to be problematic GPUs
    "NVIDIA P102-100": [-200, 1200],
    "NVIDIA P104-100": [-200, 1200],
    "NVIDIA P106-090": [-200, 1200],
    "NVIDIA P106-100": [-200, 1200],
    "NVIDIA GeForce GTX 1050 Ti": [-200, 1200],
    "NVIDIA GeForce GTX 1060 3GB": [-200, 1200],
    "NVIDIA GeForce GTX 1060 6GB": [-200, 1200],
    "NVIDIA GeForce GTX 1070": [-200, 1200],
    "NVIDIA GeForce GTX 1070 Ti": [-200, 1200],
    "NVIDIA GeForce GTX 1080": [-200, 1200],
    "NVIDIA GeForce GTX 1080 Ti": [-200, 1200],
    "NVIDIA CMP 30HX": [-1000, 1000],
    "NVIDIA CMP 40HX": [-1000, 1000],
    "NVIDIA CMP 50HX": [-1000, 1000],
    "NVIDIA CMP 90HX": [-1000, 1000],
    "NVIDIA GeForce GTX 1650": [-1000, 1000],
    "NVIDIA GeForce GTX 1660 SUPER": [-1000, 1000],
    "NVIDIA GeForce GTX 1660 Ti": [-1000, 1000],
    "NVIDIA GeForce RTX 2060": [-1000, 1000],
    "NVIDIA GeForce RTX 2060 SUPER": [-1000, 1000],
    "NVIDIA GeForce RTX 2070": [-1000, 1000],
    "NVIDIA GeForce RTX 2070 SUPER": [-1000, 1000],
    "NVIDIA GeForce RTX 2080": [-1000, 1000],
    "NVIDIA GeForce RTX 2080 Ti": [-1000, 1000]
}

is_hive = False
all_gpus_data_list = []
get_data_fail = False

def init(gpu_specs_file=None, allow_hive_binaries=True):
    global is_hive, all_gpus_data_list, get_data_fail
    log.info("Loading GPU OC specs [ working ]")
    try:
        pynvml.nvmlInit()
        kernel = get_specs.get_kernel()
        if "hive" in kernel and allow_hive_binaries:
            is_hive = True
        specs_file_loc = gpu_specs_file if gpu_specs_file else config.gpu_specs_file
        regenerate_specs = False
        parsed_specs = {}
        try:
            with open(specs_file_loc, "r") as specs_file:
                parsed_specs = json.loads(specs_file.read())
        except Exception as specs_load_fail:
            log.error(f"Failed loading gpu_specs_file ({specs_load_fail}) | regenerating...")
            regenerate_specs = True
        parsed_specs_keys = parsed_specs.keys()
        gpu_count = pynvml.nvmlDeviceGetCount()
        for i in range(0, gpu_count):
            if regenerate_specs:
                break
            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
            if f"{i}-{gpu_uuid}" not in parsed_specs_keys:
                parsed_specs = {}
                regenerate_specs = True
                break
        if regenerate_specs:
            for i in range(0, gpu_count):
                gpu_spec = {}
                gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
                power_limits = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(gpu_handle)
                min_power_limit = int(power_limits[0] / 1000.0)
                max_power_limit = int(power_limits[1] / 1000.0)
                gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
                gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
                gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)

                pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
                pci_bus_id = pci_info.bus
                pci_device_id = pci_info.device
                pci_domain_id = pci_info.domain
                gpu_spec["pci_core"] = f"{pci_domain_id}:{pci_bus_id:02d}:{pci_device_id:02d}.0"

                mem_range = get_hive_clock_range(is_hive, i, "mem")
                core_range = get_hive_clock_range(is_hive, i, "core")
                try:
                    if type(mem_range) != list:
                        pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300)  # Force low clocks so the GPU can't crash while being probed under load
                        failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
                        failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
                        if (not failure_min) and (not failure_max):
                            mem_range = [min_oc_solution, max_oc_solution]
                        pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
                        pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
                    if type(core_range) != list:
                        pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350)  # Force low clocks so the GPU can't crash while being probed under load
                        failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
                        failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
                        if (not failure_min) and (not failure_max):
                            core_range = [min_oc_solution, max_oc_solution]
                        pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
                        pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
                except Exception as e_pinpointing:
                    if "not supported" in str(e_pinpointing).lower():
                        try:
                            min_core_offset, max_core_offset = pynvml.nvmlDeviceGetGpcClkMinMaxVfOffset(gpu_handle)
                            if min_core_offset > 0:  # A positive minimum is likely a wrapped 32-bit value; shift it back into the negative range
                                min_core_offset = min_core_offset - math.floor((2**32) / 1000)
                            if min_core_offset > -20000 and min_core_offset <= 0 and max_core_offset >= 0 and max_core_offset < 20000:
                                core_range = [min_core_offset, max_core_offset]
                            else:
                                core_range = [0, 0]
                            min_mem_offset, max_mem_offset = pynvml.nvmlDeviceGetMemClkMinMaxVfOffset(gpu_handle)
                            if min_mem_offset > 0:
                                min_mem_offset = min_mem_offset - math.floor((2**32) / 1000)
                            if min_mem_offset == 0 and max_mem_offset == 0:
                                if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
                                    mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
                                else:
                                    mem_range = [0, 0]
                            elif min_mem_offset > -20000 and min_mem_offset <= 0 and max_mem_offset >= 0 and max_mem_offset < 20000:
                                mem_range = [min_mem_offset, max_mem_offset]
                            else:
                                mem_range = [0, 0]
                        except Exception as e2:
                            if "function not found" in str(e2).lower():
                                if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
                                    mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
                                else:
                                    mem_range = [0, 0]
                                if gpu_spec["name"] in GPU_CORE_ALLOWED_OC_RANGES:
                                    core_range = GPU_CORE_ALLOWED_OC_RANGES[gpu_spec["name"]]
                                else:
                                    core_range = [0, 0]
                            else:
                                get_data_fail = True
                if type(mem_range) == list and type(core_range) == list and len(mem_range) == 2 and len(core_range) == 2:
                    gpu_spec["mem"] = mem_range
                    gpu_spec["core"] = core_range
                else:
                    get_data_fail = True

                parsed_specs[f"{i}-{gpu_uuid}"] = gpu_spec
            with open(specs_file_loc, "w") as specs_file:
                json.dump(parsed_specs, specs_file)

        if not get_data_fail:
            parsed_specs_keys = parsed_specs.keys()
            for key in parsed_specs_keys:
                all_gpus_data_list.append(parsed_specs[key])
    except Exception as e:
        get_data_fail = True
log.error("Loading GPU OC specs [ fail ]") if not get_data_fail: log.success("Loading GPU OC specs [ success ]") print(all_gpus_data_list) # Load GPU specs def get_gpu_oc_specs(): global get_data_fail if get_data_fail: return False else: return all_gpus_data_list def shutdown(): pynvml.nvmlShutdown() def handle_nn(input_int): if abs(4293967-input_int) < 10000: return input_int-4293967 elif abs(8589934-input_int) < 10000: return input_int-8589934 else: return input_int def pinpoint_find_dicts_negative(data): false_success_items = [d for d in data if not d['success']] true_success_items = [d for d in data if d['success']] highest_false_success = max(false_success_items, key=lambda x: x['offset'], default=None) lowest_true_success = min(true_success_items, key=lambda x: x['offset'], default=None) return highest_false_success, lowest_true_success def pinpoint_find_dicts_positive(data): false_success_items = [d for d in data if not d['success']] true_success_items = [d for d in data if d['success']] lowest_false_success = min(false_success_items, key=lambda x: x['offset'], default=None) highest_true_success = max(true_success_items, key=lambda x: x['offset'], default=None) return highest_true_success, lowest_false_success def pinpoint_oc_limits_negative(gpu_handle, core=False): step_cnt = 0 found_solution = None init_negative_max = -19855 # Probably history_info = [{"offset": init_negative_max*2, "success":False}] failure = False max_step_cnt = 20 try: while found_solution == None and step_cnt