V5.2.1 | fix reading OC specs for older GPUs

2024-05-28 00:25:58 +00:00 · 2024-05-28 00:25:58 +00:00 · 5e733fd0d6
parent 36d3026d5d
commit 5e733fd0d6
3 changed files with 75 additions and 20 deletions
--- a/clore_hosting/main.py
+++ b/clore_hosting/main.py
@ -426,7 +426,7 @@ class CloreClient:
    async def submit_specs(self, current_specs):
        try:
            if type(current_specs) == dict:
-                current_specs["backend_version"]=10
+                current_specs["backend_version"]=11
                current_specs["update_hw"]=True
                smallest_pcie_width = 999
                for gpu in current_specs["gpus"]["nvidia"]:
--- a/lib/nvml.py
+++ b/lib/nvml.py
@ -6,8 +6,38 @@ config = config_module.config
 log = logging_lib.log

 import subprocess
-import pynvml
+import clore_pynvml as pynvml
 import json
+import math
+
+HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+
+GPU_MEM_ALLOWED_OC_RANGES = { # GPU name -> [min, max] memory offset range used as a fallback when the driver reports a 0/0 range (known to be problematic GPUs)
+    "NVIDIA P102-100": [-2000, 2000],
+    "NVIDIA P104-100": [-2000, 2000],
+    "NVIDIA P106-090": [-2000, 2000],
+    "NVIDIA P106-100": [-2000, 2000],
+    "NVIDIA GeForce GTX 1050 Ti": [-2000, 2000],
+    "NVIDIA GeForce GTX 1060 3GB": [-2000, 2000],
+    "NVIDIA GeForce GTX 1060 6GB": [-2000, 2000],
+    "NVIDIA GeForce GTX 1070": [-2000, 2000],
+    "NVIDIA GeForce GTX 1070 Ti": [-2000, 2000],
+    "NVIDIA GeForce GTX 1080": [-2000, 2000],
+    "NVIDIA GeForce GTX 1080 Ti":[-2000, 2000],
+    "NVIDIA CMP 30HX": [-2000, 6000],
+    "NVIDIA CMP 40HX": [-2000, 6000],
+    "NVIDIA CMP 50HX": [-2000, 6000],
+    "NVIDIA CMP 90HX": [-2000, 6000],
+    "NVIDIA GeForce GTX 1650": [-2000, 6000],
+    "NVIDIA GeForce GTX 1660 SUPER": [-2000, 6000],
+    "NVIDIA GeForce GTX 1660 Ti": [-2000, 6000],
+    "NVIDIA GeForce RTX 2060": [-2000, 6000],
+    "NVIDIA GeForce RTX 2060 SUPER": [-2000, 6000],
+    "NVIDIA GeForce RTX 2070": [-2000, 6000],
+    "NVIDIA GeForce RTX 2070 SUPER": [-2000, 6000],
+    "NVIDIA GeForce RTX 2080": [-2000, 6000],
+    "NVIDIA GeForce RTX 2080 Ti": [-2000, 6000]
+}

 is_hive = False
 all_gpus_data_list=[]
@ -64,22 +94,47 @@ def init(gpu_specs_file=None):

                mem_range = get_hive_clock_range(is_hive, i, "mem")
                core_range = get_hive_clock_range(is_hive, i, "core")
-                if type(mem_range) != list:
-                    pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
-                    failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
-                    failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
-                    if (not failure_min) and (not failure_max):
-                        mem_range=[min_oc_solution, max_oc_solution]
-                    pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
-                    pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
-                if type(core_range) != list:
-                    pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
-                    failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
-                    failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
-                    if (not failure_min) and (not failure_max):
-                        core_range=[min_oc_solution, max_oc_solution]
-                    pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
-                    pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
+                try: # Probe OC limits directly for any range the Hive tool did not return as a list
+                    if type(mem_range) != list:
+                        pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
+                        failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
+                        failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
+                        if (not failure_min) and (not failure_max):
+                            mem_range=[min_oc_solution, max_oc_solution]
+                        pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
+                        pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
+                    if type(core_range) != list:
+                        pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
+                        failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
+                        failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
+                        if (not failure_min) and (not failure_max):
+                            core_range=[min_oc_solution, max_oc_solution]
+                        pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
+                        pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
+                except Exception as e_pinpointing: # NOTE(review): errors other than "not supported" are ignored here, and locked clocks set above may be left unreset on this path
+                    if "not supported" in str(e_pinpointing).lower(): # Probing unavailable on this GPU; fall back to the ranges the driver reports itself
+                        try:
+                            min_core_offset, max_core_offset = pynvml.nvmlDeviceGetGpcClkMinMaxVfOffset(gpu_handle)
+                            if min_core_offset>0: # presumably a negative minimum that wrapped around as an unsigned 32-bit value - TODO confirm against clore_pynvml
+                                min_core_offset = min_core_offset - math.floor((2**32)/1000)
+                            if min_core_offset > -20000 and min_core_offset <= 0 and max_core_offset>=0 and max_core_offset < 20000: # Sanity check: min in (-20000, 0], max in [0, 20000), same bounds as the memory check below
+                                core_range=[min_core_offset, max_core_offset]
+                            else:
+                                core_range=[0,0]
+                            min_mem_offset, max_mem_offset = pynvml.nvmlDeviceGetMemClkMinMaxVfOffset(gpu_handle)
+                            if min_mem_offset>0: # same wrap-around correction as for the core minimum
+                                min_mem_offset = min_mem_offset - math.floor((2**32)/1000)
+                            if min_mem_offset==0 and max_mem_offset==0: # Driver reports no usable memory range; use the per-model fallback table when the GPU name is listed
+                                if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
+                                    mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
+                                else:
+                                    mem_range = [0,0]
+                            elif min_mem_offset > -20000 and min_mem_offset <= 0 and max_mem_offset>=0 and max_mem_offset < 20000:
+                                mem_range=[min_mem_offset, max_mem_offset]
+                            else:
+                                mem_range=[0,0]
+                        except Exception as e2:
+                            get_data_fail=True # flag is read outside this hunk (not visible here)
                if type(mem_range) == list and type(core_range) == list and len(mem_range)==2 and len(core_range)==2:
                    gpu_spec["mem"]=mem_range
                    gpu_spec["core"]=core_range
@ -267,7 +322,7 @@ def get_hive_clock_range(is_hive, gpu_index, part):
    if is_hive:
        try:
            flag = "--setmemoffset" if part=="mem" else "--setcoreoffset"
-            cmd = ["bash",'-c',f"nvtool -i 0 {flag} -100000"]
+            cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo nvtool -i {gpu_index} {flag} -100000"]

            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            lines = result.stdout.decode().splitlines()
--- a/requirements.txt
+++ b/requirements.txt
@ -7,5 +7,5 @@ psutil==5.9.0
 python-iptables==1.0.1
 websockets==12.0
 packaging==23.2
-pynvml==11.5.0
+clore-pynvml==11.5.4
 requests==2.31.0