470 lines
22 KiB
Python
470 lines
22 KiB
Python
from lib import config as config_module
|
|
from lib import logging as logging_lib
|
|
from lib import get_specs
|
|
|
|
config = config_module.config
|
|
log = logging_lib.log
|
|
|
|
import subprocess
|
|
import clore_pynvml as pynvml
|
|
import json
|
|
import math
|
|
|
|
# PATH value exported before invoking HiveOS command-line tools (nvtool) through bash.
HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./"

# Known to be problematic GPUs. The models fall into two groups; every model
# inside a group shares the same allowed memory range and the same allowed
# core range.
_OC_TABLE_GROUP_A = (
    "NVIDIA P102-100",
    "NVIDIA P104-100",
    "NVIDIA P106-090",
    "NVIDIA P106-100",
    "NVIDIA GeForce GTX 1050 Ti",
    "NVIDIA GeForce GTX 1060 3GB",
    "NVIDIA GeForce GTX 1060 6GB",
    "NVIDIA GeForce GTX 1070",
    "NVIDIA GeForce GTX 1070 Ti",
    "NVIDIA GeForce GTX 1080",
    "NVIDIA GeForce GTX 1080 Ti",
)
_OC_TABLE_GROUP_B = (
    "NVIDIA CMP 30HX",
    "NVIDIA CMP 40HX",
    "NVIDIA CMP 50HX",
    "NVIDIA CMP 90HX",
    "NVIDIA GeForce GTX 1650",
    "NVIDIA GeForce GTX 1660 SUPER",
    "NVIDIA GeForce GTX 1660 Ti",
    "NVIDIA GeForce RTX 2060",
    "NVIDIA GeForce RTX 2060 SUPER",
    "NVIDIA GeForce RTX 2070",
    "NVIDIA GeForce RTX 2070 SUPER",
    "NVIDIA GeForce RTX 2080",
    "NVIDIA GeForce RTX 2080 Ti",
)

# GPU name -> [min, max] memory clock offset. init() uses this table when the
# driver cannot report a usable memory offset range for the GPU.
GPU_MEM_ALLOWED_OC_RANGES = {
    **{gpu_name: [-2000, 2000] for gpu_name in _OC_TABLE_GROUP_A},
    **{gpu_name: [-2000, 6000] for gpu_name in _OC_TABLE_GROUP_B},
}

# GPU name -> [min, max] core clock offset. init() uses this table when the
# driver lacks the offset-range query function.
GPU_CORE_ALLOWED_OC_RANGES = {
    **{gpu_name: [-200, 1200] for gpu_name in _OC_TABLE_GROUP_A},
    **{gpu_name: [-1000, 1000] for gpu_name in _OC_TABLE_GROUP_B},
}

# Module state filled in by init().
is_hive = False            # True once a "hive" kernel is detected and hive binaries are allowed
all_gpus_data_list = []    # per-GPU spec dicts, in the order they appear in the specs file
gpu_name_list = []         # GPU names as reported by NVML
get_data_fail = False      # set when the GPU specs could not be loaded or generated
|
|
|
|
def _unwrap_min_vf_offset(raw_min_offset):
    """Shift a positive minimum offset down by floor(2**32 / 1000)."""
    # NOTE(review): presumably the driver reports a negative minimum as a
    # wrapped unsigned 32-bit quantity in thousandths — confirm.
    if raw_min_offset > 0:
        return raw_min_offset - math.floor((2**32) / 1000)
    return raw_min_offset


def _probe_oc_ranges(gpu_handle, gpu_name, mem_range, core_range):
    """Determine the memory/core offset ranges of one GPU.

    Ranges that already are lists (supplied by the HiveOS probe) are kept.
    Missing ones are searched with the pinpoint_* helpers; when that raises
    a "not supported" error the driver min/max query is used, and when that
    query raises "function not found" the static per-model tables are used.

    Returns:
        (mem_range, core_range, hard_failure); hard_failure is True when the
        driver query failed with an error other than "function not found".
    """
    hard_failure = False
    try:
        if not isinstance(mem_range, list):
            # Force low clocks, so the GPU can't crash when testing if under load
            pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300)
            failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
            failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
            # A search may end without a solution; never store None as a bound.
            if not failure_min and not failure_max and None not in (min_oc_solution, max_oc_solution):
                mem_range = [min_oc_solution, max_oc_solution]
            pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
            pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
        if not isinstance(core_range, list):
            # Force low clocks, so the GPU can't crash when testing if under load
            pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350)
            failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
            failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
            if not failure_min and not failure_max and None not in (min_oc_solution, max_oc_solution):
                core_range = [min_oc_solution, max_oc_solution]
            pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
            pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
    except Exception as e_pinpointing:
        if "not supported" in str(e_pinpointing).lower():
            try:
                min_core_offset, max_core_offset = pynvml.nvmlDeviceGetGpcClkMinMaxVfOffset(gpu_handle)
                min_core_offset = _unwrap_min_vf_offset(min_core_offset)
                # Accept only a sane window: min in (-20000, 0], max in [0, 20000).
                if -20000 < min_core_offset <= 0 <= max_core_offset < 20000:
                    core_range = [min_core_offset, max_core_offset]
                else:
                    core_range = [0, 0]

                min_mem_offset, max_mem_offset = pynvml.nvmlDeviceGetMemClkMinMaxVfOffset(gpu_handle)
                min_mem_offset = _unwrap_min_vf_offset(min_mem_offset)
                if min_mem_offset == 0 and max_mem_offset == 0:
                    # Driver reports no adjustable range: use the static table.
                    mem_range = GPU_MEM_ALLOWED_OC_RANGES.get(gpu_name, [0, 0])
                elif -20000 < min_mem_offset <= 0 <= max_mem_offset < 20000:
                    mem_range = [min_mem_offset, max_mem_offset]
                else:
                    mem_range = [0, 0]
            except Exception as e2:
                if "function not found" in str(e2).lower():
                    mem_range = GPU_MEM_ALLOWED_OC_RANGES.get(gpu_name, [0, 0])
                    core_range = GPU_CORE_ALLOWED_OC_RANGES.get(gpu_name, [0, 0])
                else:
                    # NOTE(review): pairing of this branch with the
                    # "function not found" check was reconstructed from a
                    # source that lost its indentation — confirm.
                    hard_failure = True
    return mem_range, core_range, hard_failure


def _build_gpu_spec(gpu_index):
    """Collect the spec dict of one GPU.

    Returns:
        (spec_key, gpu_spec, ok); ok is False when the offset ranges could
        not be determined, in which case gpu_spec has no "mem"/"core" keys.
    """
    gpu_spec = {}
    mem_to_core_allowed_locks = get_gpu_locked_clocks(gpu_index)
    gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
    gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)

    # Power values are divided by 1000 and truncated; set_oc() multiplies the
    # requested limit by 1000 again before handing it to NVML.
    power_limits = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(gpu_handle)
    gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
    gpu_spec["power_limits"] = [int(power_limits[0] / 1000.0), int(power_limits[1] / 1000.0)]
    gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
    gpu_name_list.append(gpu_spec["name"])
    gpu_spec["locks"] = mem_to_core_allowed_locks

    pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
    # NOTE(review): bus/device are rendered in decimal; PCI addresses are
    # conventionally hexadecimal — confirm what consumers of "pci_core" expect.
    gpu_spec["pci_core"] = f"{pci_info.domain}:{pci_info.bus:02d}:{pci_info.device:02d}.0"

    mem_range = get_hive_clock_range(is_hive, gpu_index, "mem")
    core_range = get_hive_clock_range(is_hive, gpu_index, "core")
    mem_range, core_range, hard_failure = _probe_oc_ranges(gpu_handle, gpu_spec["name"], mem_range, core_range)

    ranges_ok = (
        isinstance(mem_range, list) and isinstance(core_range, list)
        and len(mem_range) == 2 and len(core_range) == 2
    )
    if ranges_ok:
        gpu_spec["mem"] = mem_range
        gpu_spec["core"] = core_range
    return f"{gpu_index}-{gpu_uuid}", gpu_spec, ranges_ok and not hard_failure


def init(gpu_specs_file=None, allow_hive_binaries=True):
    """Initialise NVML and load (or regenerate) the per-GPU overclocking specs.

    The specs are read from a JSON file keyed by "<index>-<uuid>". When the
    file cannot be read, or an installed GPU has no entry containing "locks",
    the specs of every GPU are regenerated and written back to the file.
    On success the spec dicts are appended to all_gpus_data_list; on any
    failure get_data_fail is set. The resulting list is printed to stdout.

    Args:
        gpu_specs_file: path of the specs file; config.gpu_specs_file is
            used when this is falsy.
        allow_hive_binaries: when False, HiveOS tools are never used even if
            the kernel string contains "hive".
    """
    global is_hive, all_gpus_data_list, get_data_fail, gpu_name_list
    log.info("Loading GPU OC specs [ working ]")
    try:
        pynvml.nvmlInit()
        kernel = get_specs.get_kernel()
        if "hive" in kernel and allow_hive_binaries:
            is_hive = True

        specs_file_loc = gpu_specs_file if gpu_specs_file else config.gpu_specs_file
        regenerate_specs = False
        parsed_specs = {}
        try:
            with open(specs_file_loc, "r") as specs_file:
                parsed_specs = json.loads(specs_file.read())
        except Exception as specs_load_fail:
            log.error(f"Failed loading gpu_specs_file ({specs_load_fail}) | regenerating...")
            regenerate_specs = True
        if not isinstance(parsed_specs, dict):
            # Valid JSON of the wrong shape is as useless as an unreadable file.
            parsed_specs = {}
            regenerate_specs = True

        gpu_count = pynvml.nvmlDeviceGetCount()
        if not regenerate_specs:
            # Every installed GPU must have a cached entry with "locks".
            for i in range(gpu_count):
                gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
                gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle))
                cached_spec = parsed_specs.get(f"{i}-{gpu_uuid}")
                if not isinstance(cached_spec, dict) or "locks" not in cached_spec:
                    parsed_specs = {}
                    regenerate_specs = True
                    break

        if regenerate_specs:
            # Names gathered during the aborted validation pass would otherwise
            # be listed twice; clear in place so existing references stay valid.
            gpu_name_list.clear()
            for i in range(gpu_count):
                spec_key, gpu_spec, spec_ok = _build_gpu_spec(i)
                if not spec_ok:
                    get_data_fail = True
                parsed_specs[spec_key] = gpu_spec
            # NOTE(review): the file is written even when detection failed, so
            # an incomplete spec is accepted on the next start — confirm intent.
            with open(specs_file_loc, "w") as specs_file:
                json.dump(parsed_specs, specs_file)

        if not get_data_fail:
            all_gpus_data_list.extend(parsed_specs.values())
    except Exception as e:
        get_data_fail = True
        log.error("Loading GPU OC specs [ fail ]")
        log.debug(f"init | ERROR | {e}")
    if not get_data_fail:
        log.success("Loading GPU OC specs [ success ]")

    print(all_gpus_data_list)
|
|
|
|
def get_gpu_name_list():
    """Return the module-level list of GPU names (the list object itself, not a copy)."""
    return gpu_name_list
|
|
|
|
def get_gpu_oc_specs():
    """Return the loaded per-GPU spec list, or False when loading failed.

    The failure state is read from the module-level get_data_fail flag.
    """
    return False if get_data_fail else all_gpus_data_list
|
|
|
|
def shutdown():
    """Shut down the NVML library; errors raised by NVML propagate to the caller."""
    pynvml.nvmlShutdown()
|
|
|
|
def get_gpu_locked_clocks(gpu_index):
    """Map supported memory clocks of a GPU to their graphics clock span.

    Only the first 12 memory clocks reported by NVML plus the last one are
    considered. Each is stored as str(memory_clock) -> [min, max] of the
    graphics clocks supported at that memory clock.

    Returns:
        The mapping, or an empty dict when anything raises.
    """
    try:
        device = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
        supported_mem_clocks = pynvml.nvmlDeviceGetSupportedMemoryClocks(device)
        locks_by_mem_clock = {}
        for position, mem_clock_value in enumerate(supported_mem_clocks):
            is_last = position == len(supported_mem_clocks) - 1
            if position >= 12 and not is_last:
                continue
            core_options = pynvml.nvmlDeviceGetSupportedGraphicsClocks(device, mem_clock_value)
            locks_by_mem_clock[str(mem_clock_value)] = [min(core_options), max(core_options)]
        return locks_by_mem_clock
    except Exception:
        return {}
|
|
|
|
def handle_nn(input_int):
    """Re-base a value that lies within 10000 of one of two fixed constants.

    When input_int is closer than 10000 to 4293967 (checked first) or to
    8589934, the difference to that constant is returned; otherwise the
    value is returned unchanged.
    """
    # NOTE(review): 8589934 equals floor(2**33 / 1000), but 4293967 is 1000
    # below floor(2**32 / 1000) = 4294967 — confirm whether that is intended.
    for wrap_base in (4293967, 8589934):
        distance = input_int - wrap_base
        if -10000 < distance < 10000:
            return distance
    return input_int
|
|
|
|
def pinpoint_find_dicts_negative(data):
    """Pick the bracketing entries for the negative-offset search.

    Args:
        data: iterable of dicts with 'offset' and 'success' keys.

    Returns:
        (failed entry with the highest offset, successful entry with the
        lowest offset); either element is None when no such entry exists.
    """
    def offset_of(entry):
        return entry['offset']

    failed_entries = []
    passed_entries = []
    for entry in data:
        if entry['success']:
            passed_entries.append(entry)
        else:
            failed_entries.append(entry)

    highest_failed = max(failed_entries, key=offset_of) if failed_entries else None
    lowest_passed = min(passed_entries, key=offset_of) if passed_entries else None
    return highest_failed, lowest_passed
|
|
|
|
def pinpoint_find_dicts_positive(data):
    """Pick the bracketing entries for the positive-offset search.

    Args:
        data: iterable of dicts with 'offset' and 'success' keys.

    Returns:
        (successful entry with the highest offset, failed entry with the
        lowest offset); either element is None when no such entry exists.
    """
    def offset_of(entry):
        return entry['offset']

    failed_entries = []
    passed_entries = []
    for entry in data:
        if entry['success']:
            passed_entries.append(entry)
        else:
            failed_entries.append(entry)

    lowest_failed = min(failed_entries, key=offset_of) if failed_entries else None
    highest_passed = max(passed_entries, key=offset_of) if passed_entries else None
    return highest_passed, lowest_failed
|
|
|
|
def pinpoint_oc_limits_negative(gpu_handle, core=False):
    """Search for the most negative clock offset the driver accepts.

    Candidate offsets are applied to the GPU one after another. The history
    starts with an assumed-rejected offset of 2 * -19855. Until an offset is
    accepted the last rejected offset is halved; afterwards the search
    bisects between the closest rejected and the most negative accepted
    offset. It stops when the midpoint equals the best accepted offset, or
    after 20 steps, in which case the best accepted offset is taken.

    Args:
        gpu_handle: NVML device handle.
        core: probe the core (GPC) clock offset when True, otherwise the
            memory clock offset.

    Returns:
        (failure, found_solution). failure is True when an unexpected error
        occurred or no solution was found; found_solution is then not
        usable (None unless it was set before the error).
    """
    step_cnt = 0
    found_solution = None
    init_negative_max = -19855  # Probably
    history_info = [{"offset": init_negative_max * 2, "success": False}]
    failure = False
    max_step_cnt = 20
    try:
        while found_solution is None and step_cnt < max_step_cnt and not failure:
            step_cnt += 1
            highest_false_success, lowest_true_success = pinpoint_find_dicts_negative(history_info)
            test_offset = None
            if lowest_true_success is None:
                # Nothing accepted yet: move half-way towards zero.
                test_offset = int(highest_false_success["offset"] / 2)
            elif highest_false_success is not None:
                # int() truncates towards zero, i.e. towards the accepted side.
                test_offset = int((highest_false_success["offset"] + lowest_true_success["offset"]) / 2)
                if not step_cnt < max_step_cnt:
                    # Out of steps: settle for the best accepted offset.
                    found_solution = lowest_true_success["offset"]
                    test_offset = None
                elif test_offset == lowest_true_success["offset"]:
                    # Bracket cannot shrink any further.
                    found_solution = test_offset
                    test_offset = None

            if test_offset is not None:
                any_exception = False
                try:
                    if core:
                        pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, test_offset)
                    else:
                        pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, test_offset)
                except Exception as e:
                    any_exception = True
                    # "Unknown Error" is treated as "offset rejected"; any
                    # other error aborts the whole search.
                    if "Unknown Error" not in str(e):
                        failure = True
                history_info.append({"offset": test_offset, "success": not any_exception})
    except Exception:
        failure = True
    if found_solution is None:
        # Ending without a solution must not look like success: the caller
        # would otherwise store None as a range bound.
        failure = True
    return failure, found_solution
|
|
|
|
def pinpoint_oc_limits_positive(gpu_handle, core=False):
    """Search for the most positive clock offset the driver accepts.

    Candidate offsets are applied to the GPU one after another. The history
    starts with an assumed-rejected offset of 2 * 20000. Until an offset is
    accepted the last rejected offset is halved; afterwards the search
    bisects between the highest accepted and the lowest rejected offset.
    It stops when the midpoint equals the best accepted offset, or after
    20 steps, in which case the best accepted offset is taken.

    Args:
        gpu_handle: NVML device handle.
        core: probe the core (GPC) clock offset when True, otherwise the
            memory clock offset.

    Returns:
        (failure, found_solution). failure is True when an unexpected error
        occurred or no solution was found; found_solution is then not
        usable (None unless it was set before the error).
    """
    step_cnt = 0
    found_solution = None
    init_positive_max = 20000  # Probably
    history_info = [{"offset": init_positive_max * 2, "success": False}]
    failure = False
    max_step_cnt = 20
    try:
        while found_solution is None and step_cnt < max_step_cnt and not failure:
            step_cnt += 1
            highest_true_success, lowest_false_success = pinpoint_find_dicts_positive(history_info)
            test_offset = None
            if highest_true_success is None:
                # Nothing accepted yet: move half-way towards zero.
                test_offset = int(lowest_false_success["offset"] / 2)
            elif lowest_false_success is not None:
                # int() truncates towards zero, i.e. towards the accepted side.
                test_offset = int((highest_true_success["offset"] + lowest_false_success["offset"]) / 2)
                if not step_cnt < max_step_cnt:
                    # Out of steps: settle for the best accepted offset.
                    found_solution = highest_true_success["offset"]
                    test_offset = None
                elif test_offset == highest_true_success["offset"]:
                    # Bracket cannot shrink any further.
                    found_solution = test_offset
                    test_offset = None

            if test_offset is not None:
                any_exception = False
                try:
                    if core:
                        pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, test_offset)
                    else:
                        pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, test_offset)
                except Exception as e:
                    any_exception = True
                    # "Unknown Error" is treated as "offset rejected"; any
                    # other error aborts the whole search.
                    if "Unknown Error" not in str(e):
                        failure = True
                history_info.append({"offset": test_offset, "success": not any_exception})
    except Exception:
        failure = True
    if found_solution is None:
        # Ending without a solution must not look like success: the caller
        # would otherwise store None as a range bound.
        failure = True
    return failure, found_solution
|
|
|
|
def set_oc(settings):
    """Apply clock locks, clock offsets and power limits to the GPUs.

    Args:
        settings: dict keyed by the GPU index as a decimal string. Each
            value is a dict that may contain:
              "core_lock" / "mem_lock": clock to lock the core / memory to
                  (the lock is reset when the key is absent),
              "core" / "mem": clock offset; the value is doubled before it
                  is range-checked and applied,
              "pl": power limit in W.
            An empty dict resets every GPU to offset 0 and its default
            power limit. Keys that are not digits, indexes beyond the GPU
            count and non-dict values are ignored.

    Returns:
        True when the settings were processed, False when an unexpected
        error occurred (it is logged). Out-of-range requests are logged and
        skipped; lock failures are tolerated and, on HiveOS, retried through
        nvtool.
    """
    try:
        gpu_count = pynvml.nvmlDeviceGetCount()
        if len(settings) == 0:  # Configure default clocks/pl
            settings = {}
            for i in range(gpu_count):
                settings[str(i)] = {
                    "core": 0,
                    "mem": 0,
                    "pl": all_gpus_data_list[i]["default_power_limit"],
                }
            log.debug(f"Rewriting settings with: {json.dumps(settings)}")

        # (gpu index, core lock, mem lock) for every processed GPU; "0" stands
        # for "no lock requested". Used by the HiveOS fallback below.
        requested_locks = []
        any_lock_failure = False
        for settings_key in settings.keys():
            if not settings_key.isdigit():
                continue
            oc_gpu_index = int(settings_key)
            if oc_gpu_index >= gpu_count:
                continue
            # Look up with the original key so "01" style keys keep working.
            gpu_oc_config = settings[settings_key]
            if not isinstance(gpu_oc_config, dict):
                continue

            gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)

            if "core_lock" in gpu_oc_config:
                core_lock = int(gpu_oc_config["core_lock"])
                try:
                    pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
                except Exception:
                    any_lock_failure = True
            else:
                core_lock = 0
                try:
                    pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
                except Exception:
                    any_lock_failure = True

            if "mem_lock" in gpu_oc_config:
                mem_lock = int(gpu_oc_config["mem_lock"])
                try:
                    pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
                except Exception:
                    any_lock_failure = True
            else:
                mem_lock = 0
                try:
                    pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
                except Exception:
                    any_lock_failure = True

            requested_locks.append((oc_gpu_index, str(core_lock), str(mem_lock)))

            if "core" in gpu_oc_config:  # Core offset
                wanted_core_clock = int(round(gpu_oc_config["core"] * 2))
                core_min = gpu_possible_ranges["core"][0]
                core_max = gpu_possible_ranges["core"][1]
                if core_min <= wanted_core_clock <= core_max:
                    pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, wanted_core_clock)
                else:
                    log.error(f"Requested OC for GPU:{oc_gpu_index} (CORE) out of bound | {wanted_core_clock} | [{core_min}, {core_max}]")

            if "mem" in gpu_oc_config:  # Memory offset
                wanted_mem_clock = int(round(gpu_oc_config["mem"] * 2))
                mem_min = gpu_possible_ranges["mem"][0]
                mem_max = gpu_possible_ranges["mem"][1]
                if mem_min <= wanted_mem_clock <= mem_max:
                    pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, wanted_mem_clock)
                else:
                    log.error(f"Requested OC for GPU:{oc_gpu_index} (MEMORY) out of bound | {wanted_mem_clock} | [{mem_min}, {mem_max}]")

            if "pl" in gpu_oc_config:
                requested_pl = gpu_oc_config["pl"]
                # convert W to mW; NVML needs an integer
                wanted_power_limit_milliwatts = int(round(requested_pl * 1000))
                pl_min = gpu_possible_ranges["power_limits"][0]
                pl_max = gpu_possible_ranges["power_limits"][1]
                if pl_min <= requested_pl <= pl_max:
                    pynvml.nvmlDeviceSetPowerManagementLimit(gpu_handle, wanted_power_limit_milliwatts)
                else:
                    log.error(f"Requested OC for GPU:{oc_gpu_index} (POWER LIMIT) out of bound | {requested_pl} | [{pl_min}, {pl_max}]")

        if is_hive and any_lock_failure and requested_locks:
            # NVML refused at least one lock: replay all locks through nvtool.
            try:
                # Every command gets its own sudo and targets the real GPU
                # index, not the position in the settings.
                nvtool_commands = [
                    f"sudo nvtool -i {gpu_index} --setmem {mem_lock} --setcore {core_lock}"
                    for gpu_index, core_lock, mem_lock in requested_locks
                ]
                chained_commands = " && ".join(nvtool_commands)
                cmd = ["bash", "-c", f"PATH={HIVE_PATH} && {chained_commands}"]
                subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            except Exception as hive_oc_settings:
                # Best effort only; keep the failure visible in the debug log.
                log.debug(f"set_oc | nvtool fallback failed | {hive_oc_settings}")
        return True
    except Exception as e:
        log.error(f"set_oc | ERROR | {e}")
        return False
|
|
|
|
|
|
|
|
def get_hive_clock_range(is_hive, gpu_index, part):
    """Ask HiveOS nvtool for the allowed clock offset range of one GPU.

    nvtool is invoked with an offset of -100000 and its standard output is
    scanned for a "... is not in range of <min> <word> <max>" line that
    follows the "DEVICE #<gpu_index>" header.

    Args:
        is_hive: nothing is executed when this is falsy.
        gpu_index: GPU index passed to nvtool and matched in its output.
        part: "mem" probes the memory offset; anything else the core offset.

    Returns:
        [min, max] as ints, or False when not on HiveOS, when no range line
        was found, or when anything raised.
    """
    if not is_hive:
        return False
    try:
        offset_flag = "--setmemoffset" if part == "mem" else "--setcoreoffset"
        probe_cmd = ["bash", '-c', f"PATH={HIVE_PATH} && sudo nvtool -i {gpu_index} {offset_flag} -100000"]
        completed = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        current_device = None
        clock_range = []
        for raw_line in completed.stdout.decode().splitlines():
            text = raw_line.strip()
            if not text:
                continue
            if text[:8] == "DEVICE #":
                current_device = int(text[8:].replace(':', ''))
                continue
            belongs_to_gpu = current_device is not None and current_device == gpu_index
            if " is not in range of " in text and belongs_to_gpu:
                fields = text.split(" is not in range of ", 1)[1].split(' ', 4)
                lower_bound = int(fields[0])
                upper_bound = int(fields[2])
                # A later matching line overrides an earlier one.
                clock_range = [lower_bound, upper_bound]
        return clock_range if len(clock_range) != 0 else False
    except Exception:
        return False
|
|
|
|
def get_vram_per_gpu():
    """Return the total memory of every GPU, in NVML order.

    Each value is NVML's reported total divided by 1024 ** 2 (MiB, assuming
    NVML reports bytes). When a query raises, the error is logged and the
    values gathered so far are returned.
    """
    totals = []
    try:
        for gpu_index in range(pynvml.nvmlDeviceGetCount()):
            device = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
            memory = pynvml.nvmlDeviceGetMemoryInfo(device)
            totals.append(memory.total / 1024 ** 2)
    except Exception as e:
        log.error(f"Failed loading get_vram_per_gpu() | {e}")
    return totals