diff --git a/clore_hosting/main.py b/clore_hosting/main.py
index 2b43b32..8065a9f 100644
--- a/clore_hosting/main.py
+++ b/clore_hosting/main.py
@@ -426,7 +426,7 @@ class CloreClient:
     async def submit_specs(self, current_specs):
         try:
             if type(current_specs) == dict:
-                current_specs["backend_version"]=10
+                current_specs["backend_version"]=11
                 current_specs["update_hw"]=True
                 smallest_pcie_width = 999
                 for gpu in current_specs["gpus"]["nvidia"]:
diff --git a/lib/nvml.py b/lib/nvml.py
index 6ecb307..94fb47e 100644
--- a/lib/nvml.py
+++ b/lib/nvml.py
@@ -6,8 +6,38 @@ config = config_module.config
 log = logging_lib.log
 
 import subprocess
-import pynvml
+import clore_pynvml as pynvml
 import json
+import math
+
+HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+
+GPU_MEM_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
+    "NVIDIA P102-100": [-2000, 2000],
+    "NVIDIA P104-100": [-2000, 2000],
+    "NVIDIA P106-090": [-2000, 2000],
+    "NVIDIA P106-100": [-2000, 2000],
+    "NVIDIA GeForce GTX 1050 Ti": [-2000, 2000],
+    "NVIDIA GeForce GTX 1060 3GB": [-2000, 2000],
+    "NVIDIA GeForce GTX 1060 6GB": [-2000, 2000],
+    "NVIDIA GeForce GTX 1070": [-2000, 2000],
+    "NVIDIA GeForce GTX 1070 Ti": [-2000, 2000],
+    "NVIDIA GeForce GTX 1080": [-2000, 2000],
+    "NVIDIA GeForce GTX 1080 Ti": [-2000, 2000],
+    "NVIDIA CMP 30HX": [-2000, 6000],
+    "NVIDIA CMP 40HX": [-2000, 6000],
+    "NVIDIA CMP 50HX": [-2000, 6000],
+    "NVIDIA CMP 90HX": [-2000, 6000],
+    "NVIDIA GeForce GTX 1650": [-2000, 6000],
+    "NVIDIA GeForce GTX 1660 SUPER": [-2000, 6000],
+    "NVIDIA GeForce GTX 1660 Ti": [-2000, 6000],
+    "NVIDIA GeForce RTX 2060": [-2000, 6000],
+    "NVIDIA GeForce RTX 2060 SUPER": [-2000, 6000],
+    "NVIDIA GeForce RTX 2070": [-2000, 6000],
+    "NVIDIA GeForce RTX 2070 SUPER": [-2000, 6000],
+    "NVIDIA GeForce RTX 2080": [-2000, 6000],
+    "NVIDIA GeForce RTX 2080 Ti": [-2000, 6000]
+}
 
 is_hive = False
 all_gpus_data_list=[]
@@ -64,22 +94,47 @@ def init(gpu_specs_file=None):
             mem_range = get_hive_clock_range(is_hive, i, "mem")
             core_range = get_hive_clock_range(is_hive, i, "core")
-            if type(mem_range) != list:
-                pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
-                failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
-                failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
-                if (not failure_min) and (not failure_max):
-                    mem_range=[min_oc_solution, max_oc_solution]
-                    pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
-                pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
-            if type(core_range) != list:
-                pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
-                failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
-                failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
-                if (not failure_min) and (not failure_max):
-                    core_range=[min_oc_solution, max_oc_solution]
-                    pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
-                pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
+            try:
+                if type(mem_range) != list:
+                    pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
+                    failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
+                    failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
+                    if (not failure_min) and (not failure_max):
+                        mem_range=[min_oc_solution, max_oc_solution]
+                        pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
+                    pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
+                if type(core_range) != list:
+                    pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
+                    failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
+                    failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
+                    if (not failure_min) and (not failure_max):
+                        core_range=[min_oc_solution, max_oc_solution]
+                        pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
+                    pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
+            except Exception as e_pinpointing:
+                if "not supported" in str(e_pinpointing).lower():
+                    try:
+                        min_core_offset, max_core_offset = pynvml.nvmlDeviceGetGpcClkMinMaxVfOffset(gpu_handle)
+                        if min_core_offset>0:
+                            min_core_offset = min_core_offset - math.floor((2**32)/1000)
+                        if min_core_offset > -20000 and min_core_offset <= 0 and max_core_offset>=0 and max_core_offset < 20000:
+                            core_range=[min_core_offset, max_core_offset]
+                        else:
+                            core_range=[0,0]
+                        min_mem_offset, max_mem_offset = pynvml.nvmlDeviceGetMemClkMinMaxVfOffset(gpu_handle)
+                        if min_mem_offset>0:
+                            min_mem_offset = min_mem_offset - math.floor((2**32)/1000)
+                        if min_mem_offset==0 and max_mem_offset==0:
+                            if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
+                                mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
+                            else:
+                                mem_range = [0,0]
+                        elif min_mem_offset > -20000 and min_mem_offset <= 0 and max_mem_offset>=0 and max_mem_offset < 20000:
+                            mem_range=[min_mem_offset, max_mem_offset]
+                        else:
+                            mem_range=[0,0]
+                    except Exception as e2:
+                        get_data_fail=True
 
             if type(mem_range) == list and type(core_range) == list and len(mem_range)==2 and len(core_range)==2:
                 gpu_spec["mem"]=mem_range
                 gpu_spec["core"]=core_range
@@ -267,7 +322,7 @@ def get_hive_clock_range(is_hive, gpu_index, part):
     if is_hive:
         try:
             flag = "--setmemoffset" if part=="mem" else "--setcoreoffset"
-            cmd = ["bash",'-c',f"nvtool -i 0 {flag} -100000"]
+            cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo nvtool -i {gpu_index} {flag} -100000"]
 
             result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             lines = result.stdout.decode().splitlines()
diff --git a/requirements.txt b/requirements.txt
index 3763287..75ae456 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,5 +7,5 @@ psutil==5.9.0
 python-iptables==1.0.1
 websockets==12.0
 packaging==23.2
-pynvml==11.5.0
+clore-pynvml==11.5.4
 requests==2.31.0
\ No newline at end of file
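
For reference, the fallback branch added to `init()` in lib/nvml.py reads the allowed VF offsets with `nvmlDeviceGetGpcClkMinMaxVfOffset` / `nvmlDeviceGetMemClkMinMaxVfOffset` and undoes an unsigned-integer wrap: a negative minimum offset comes back as a large positive number, so the patch subtracts `floor(2**32 / 1000)` and then sanity-checks the range against ±20000. The helper below is a hypothetical sketch (not part of the patch) that mirrors that arithmetic in isolation; the subtraction and the 20000 limit come from the diff, while the function name and return convention are illustrative.

```python
import math

def normalize_vf_offset_range(min_offset, max_offset, limit=20000):
    """Hypothetical helper mirroring the fallback arithmetic in lib/nvml.py.

    The minimum VF offset is reported as an unsigned value, so a negative
    minimum wraps around to a huge positive number; subtracting
    floor(2**32 / 1000) restores the sign (the /1000 matches the scaling
    used by the patched code).
    """
    if min_offset > 0:
        min_offset -= math.floor((2 ** 32) / 1000)
    # Accept the range only if it looks sane: min <= 0 <= max, both within |limit|
    if -limit < min_offset <= 0 <= max_offset < limit:
        return [min_offset, max_offset]
    return [0, 0]

# A wrapped minimum of 4294767 with a maximum of 1000 decodes to [-200, 1000]
print(normalize_vf_offset_range(4294767, 1000))  # -> [-200, 1000]
# For memory offsets, a [0, 0] result makes the patch fall back to
# GPU_MEM_ALLOWED_OC_RANGES for the known problematic GPUs.
print(normalize_vf_offset_range(0, 0))           # -> [0, 0]
```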
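The change to `get_hive_clock_range()` only touches the command line: it pins a known-good `PATH` (`HIVE_PATH`), runs `nvtool` under `sudo`, and targets the actual GPU index instead of the hard-coded `-i 0`. Below is a minimal standalone sketch of that probe, assuming a Hive OS host with `nvtool` available; only the command string and `HIVE_PATH` come from the patch, and the helper name and raw-line return value are illustrative (the real function goes on to parse the reported range from the output).

```python
import subprocess

HIVE_PATH = "/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

def probe_hive_offset_range(gpu_index, part="mem"):
    """Sketch of the nvtool probe used by get_hive_clock_range().

    Requesting a deliberately out-of-range offset (-100000) prompts nvtool
    to report the allowed range for that GPU; the caller parses it from
    the command output.
    """
    flag = "--setmemoffset" if part == "mem" else "--setcoreoffset"
    cmd = ["bash", "-c", f"PATH={HIVE_PATH} && sudo nvtool -i {gpu_index} {flag} -100000"]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return result.stdout.decode().splitlines()
```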