2024-05-09 23:32:41 +00:00
from lib import config as config_module
from lib import logging as logging_lib
from lib import get_specs
config = config_module . config
log = logging_lib . log
import subprocess
2024-05-28 00:25:58 +00:00
import clore_pynvml as pynvml
2024-05-09 23:32:41 +00:00
import json
2024-05-28 00:25:58 +00:00
import math
2024-11-03 23:28:03 +00:00
# PATH value used when shelling out to Hive OS tools such as nvtool.
HIVE_PATH = "/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./"

# Known to be problematic GPUs. Both tables below cover exactly the same models,
# split into two groups that share the same limits within each table.
_OC_TABLE_FIRST_GROUP = (
    "NVIDIA P102-100",
    "NVIDIA P104-100",
    "NVIDIA P106-090",
    "NVIDIA P106-100",
    "NVIDIA GeForce GTX 1050 Ti",
    "NVIDIA GeForce GTX 1060 3GB",
    "NVIDIA GeForce GTX 1060 6GB",
    "NVIDIA GeForce GTX 1070",
    "NVIDIA GeForce GTX 1070 Ti",
    "NVIDIA GeForce GTX 1080",
    "NVIDIA GeForce GTX 1080 Ti",
)
_OC_TABLE_SECOND_GROUP = (
    "NVIDIA CMP 30HX",
    "NVIDIA CMP 40HX",
    "NVIDIA CMP 50HX",
    "NVIDIA CMP 90HX",
    "NVIDIA GeForce GTX 1650",
    "NVIDIA GeForce GTX 1660 SUPER",
    "NVIDIA GeForce GTX 1660 Ti",
    "NVIDIA GeForce RTX 2060",
    "NVIDIA GeForce RTX 2060 SUPER",
    "NVIDIA GeForce RTX 2070",
    "NVIDIA GeForce RTX 2070 SUPER",
    "NVIDIA GeForce RTX 2080",
    "NVIDIA GeForce RTX 2080 Ti",
)


def _oc_range_table(first_group_range, second_group_range):
    """Map every known GPU name to its own ``[min, max]`` offset list."""
    table = {}
    for gpu_name in _OC_TABLE_FIRST_GROUP:
        # list(...) gives each entry a distinct list object, like separate literals would.
        table[gpu_name] = list(first_group_range)
    for gpu_name in _OC_TABLE_SECOND_GROUP:
        table[gpu_name] = list(second_group_range)
    return table


# Memory offset limits used when the driver cannot report them itself.
GPU_MEM_ALLOWED_OC_RANGES = _oc_range_table((-2000, 2000), (-2000, 6000))

# Core offset limits used when the driver cannot report them itself.
GPU_CORE_ALLOWED_OC_RANGES = _oc_range_table((-200, 1200), (-1000, 1000))
2024-05-09 23:32:41 +00:00
# Module-level state shared by the functions below; populated by init().

# True once init() saw "hive" in the kernel string and Hive binaries were allowed.
is_hive: bool = False

# One spec dict per GPU (power limits, name, locks, pci_core, mem/core ranges).
all_gpus_data_list: list = []

# GPU names as reported by NVML, collected during init().
gpu_name_list: list = []

# True when the OC specs could not be loaded or generated.
get_data_fail: bool = False
2024-09-03 23:17:46 +00:00
def _load_cached_specs(specs_file_loc):
    """Read the cached per-GPU spec mapping from ``specs_file_loc``.

    Raises if the file cannot be read, is not valid JSON, or does not hold a
    JSON object at the top level.
    """
    with open(specs_file_loc, "r") as specs_file:
        parsed_specs = json.loads(specs_file.read())
    if not isinstance(parsed_specs, dict):
        raise ValueError("gpu specs file does not contain a JSON object")
    return parsed_specs


def _sane_vf_offset_range(min_offset, max_offset):
    """Return ``[min, max]`` when the driver-reported offsets look plausible, else ``None``."""
    if min_offset > 0:
        # NOTE(review): a positive minimum is presumably a negative value that
        # wrapped around as an unsigned 32-bit number scaled by 1/1000 - confirm.
        min_offset = min_offset - math.floor((2 ** 32) / 1000)
    # Both ends are bounded; the minimum must not be positive and the maximum not negative.
    if -20000 < min_offset <= 0 and 0 <= max_offset < 20000:
        return [min_offset, max_offset]
    return None


def _fallback_oc_ranges(gpu_handle, gpu_name, mem_range, core_range):
    """Get offset ranges from the driver's min/max query or the static tables.

    Used when probing raised a "not supported" error.

    Returns ``(mem_range, core_range, failed)``. ``failed`` is True when the
    min/max query raised something other than "function not found".
    """
    try:
        min_core_offset, max_core_offset = pynvml.nvmlDeviceGetGpcClkMinMaxVfOffset(gpu_handle)
        core_range = _sane_vf_offset_range(min_core_offset, max_core_offset) or [0, 0]

        min_mem_offset, max_mem_offset = pynvml.nvmlDeviceGetMemClkMinMaxVfOffset(gpu_handle)
        reported_mem_range = _sane_vf_offset_range(min_mem_offset, max_mem_offset)
        if reported_mem_range == [0, 0]:
            # The driver reports no memory headroom at all: use the table for known models.
            mem_range = GPU_MEM_ALLOWED_OC_RANGES.get(gpu_name, [0, 0])
        else:
            mem_range = reported_mem_range or [0, 0]
    except Exception as e2:
        if "function not found" in str(e2).lower():
            # The min/max query is unavailable: rely entirely on the static tables.
            mem_range = GPU_MEM_ALLOWED_OC_RANGES.get(gpu_name, [0, 0])
            core_range = GPU_CORE_ALLOWED_OC_RANGES.get(gpu_name, [0, 0])
        else:
            return mem_range, core_range, True
    return mem_range, core_range, False


def _detect_oc_ranges(gpu_index, gpu_handle, gpu_name):
    """Determine the allowed memory and core offset ranges for one GPU.

    Order of attempts: Hive nvtool (only when ``is_hive``), then probing the
    driver by trial offsets, then the fallback path on "not supported".

    Returns ``(mem_range, core_range, failed)``; a range that could not be
    determined is left as a non-list value.
    """
    mem_range = get_hive_clock_range(is_hive, gpu_index, "mem")
    core_range = get_hive_clock_range(is_hive, gpu_index, "core")
    failed = False
    try:
        if not isinstance(mem_range, list):
            # Force low clocks, so the GPU can't crash when testing if under load
            pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300)
            failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
            failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
            if (not failure_min) and (not failure_max):
                mem_range = [min_oc_solution, max_oc_solution]
            pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
            pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
        if not isinstance(core_range, list):
            # Force low clocks, so the GPU can't crash when testing if under load
            pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350)
            failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
            failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
            if (not failure_min) and (not failure_max):
                core_range = [min_oc_solution, max_oc_solution]
            pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
            pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
    except Exception as e_pinpointing:
        if "not supported" in str(e_pinpointing).lower():
            mem_range, core_range, failed = _fallback_oc_ranges(
                gpu_handle, gpu_name, mem_range, core_range
            )
    return mem_range, core_range, failed


def _build_gpu_spec(gpu_index):
    """Query NVML for one GPU and assemble its spec dict.

    Returns ``(spec_key, gpu_spec, failed)`` where ``spec_key`` is
    ``"<index>-<uuid>"``. When ``failed`` is True the spec has no "mem"/"core"
    entries or the fallback query failed.
    """
    gpu_spec = {}
    mem_to_core_allowed_locks = get_gpu_locked_clocks(gpu_index)
    gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
    gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)

    # NVML power values are divided by 1000 before being stored.
    power_limits = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(gpu_handle)
    min_power_limit = int(power_limits[0] / 1000.0)
    max_power_limit = int(power_limits[1] / 1000.0)
    gpu_spec["default_power_limit"] = int(
        pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0
    )
    gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
    gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
    gpu_spec["locks"] = mem_to_core_allowed_locks

    pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
    gpu_spec["pci_core"] = f"{pci_info.domain}:{pci_info.bus:02d}:{pci_info.device:02d}.0"

    mem_range, core_range, failed = _detect_oc_ranges(gpu_index, gpu_handle, gpu_spec["name"])
    both_valid = (
        isinstance(mem_range, list)
        and isinstance(core_range, list)
        and len(mem_range) == 2
        and len(core_range) == 2
    )
    if both_valid:
        gpu_spec["mem"] = mem_range
        gpu_spec["core"] = core_range
    else:
        failed = True
    return f"{gpu_index}-{gpu_uuid}", gpu_spec, failed


def init(gpu_specs_file=None, allow_hive_binaries=True):
    """Initialise NVML and load (or regenerate) the per-GPU overclock specs.

    The specs are cached as JSON keyed by ``"<index>-<uuid>"``. The cache is
    regenerated when it cannot be read, when a present GPU has no entry, or
    when an entry has no "locks" field. Results are published through the
    module globals ``all_gpus_data_list``, ``gpu_name_list``, ``is_hive`` and
    ``get_data_fail``.

    Args:
        gpu_specs_file: Path of the cache file; falls back to
            ``config.gpu_specs_file`` when falsy.
        allow_hive_binaries: When False, Hive tools are not used even if the
            kernel string contains "hive".

    Returns:
        None. Failures are reported through ``get_data_fail`` and the log.
    """
    global is_hive, get_data_fail
    log.info("Loading GPU OC specs [ working ]")

    # Start clean so a repeated init() neither accumulates entries nor keeps a
    # stale failure flag. The lists are cleared in place to keep their identity.
    all_gpus_data_list.clear()
    gpu_name_list.clear()
    get_data_fail = False

    try:
        pynvml.nvmlInit()
        kernel = get_specs.get_kernel()
        if "hive" in kernel and allow_hive_binaries:
            is_hive = True

        specs_file_loc = gpu_specs_file if gpu_specs_file else config.gpu_specs_file
        regenerate_specs = False
        parsed_specs = {}
        try:
            parsed_specs = _load_cached_specs(specs_file_loc)
        except Exception as specs_load_fail:
            log.error(f"Failed loading gpu_specs_file ({specs_load_fail}) | regenerating...")
            regenerate_specs = True

        gpu_count = pynvml.nvmlDeviceGetCount()
        ordered_specs = []  # spec dicts in GPU index order

        if not regenerate_specs:
            # Check that the cache describes every GPU that is present right now.
            for i in range(gpu_count):
                gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
                gpu_name_list.append(pynvml.nvmlDeviceGetName(gpu_handle))
                cached_spec = parsed_specs.get(f"{i}-{gpu_uuid}")
                if not isinstance(cached_spec, dict) or "locks" not in cached_spec:
                    regenerate_specs = True
                    break
                ordered_specs.append(cached_spec)

        if regenerate_specs:
            parsed_specs = {}
            ordered_specs = []
            # Drop names collected by a partially completed validation pass,
            # otherwise they would be listed twice.
            gpu_name_list.clear()
            for i in range(gpu_count):
                spec_key, gpu_spec, spec_failed = _build_gpu_spec(i)
                gpu_name_list.append(gpu_spec["name"])
                if spec_failed:
                    get_data_fail = True
                parsed_specs[spec_key] = gpu_spec
                ordered_specs.append(gpu_spec)
            with open(specs_file_loc, "w") as specs_file:
                json.dump(parsed_specs, specs_file)

        if not get_data_fail:
            # Position N always describes GPU N, regardless of key order in the file.
            all_gpus_data_list.extend(ordered_specs)
    except Exception as e:
        get_data_fail = True
        log.error("Loading GPU OC specs [ fail ]")
        log.debug(f"Loading GPU OC specs failure reason: {e}")

    if not get_data_fail:
        log.success("Loading GPU OC specs [ success ]")
    print(all_gpus_data_list)
# Load GPU specs
2024-12-27 01:31:59 +00:00
def get_gpu_name_list():
    """Return the module-level list of GPU names collected by init()."""
    # Reading a module global needs no `global` declaration.
    names = gpu_name_list
    return names
2024-05-09 23:32:41 +00:00
def get_gpu_oc_specs():
    """Return the per-GPU OC spec list, or ``False`` when loading the specs failed."""
    return False if get_data_fail else all_gpus_data_list
def shutdown():
    """Shut down NVML (counterpart of the ``nvmlInit()`` call made in init())."""
    pynvml.nvmlShutdown()
2024-10-31 02:32:47 +00:00
def get_gpu_locked_clocks(gpu_index):
    """Map supported memory clocks of a GPU to their ``[min, max]`` graphics clocks.

    Only the first 12 entries of the supported-memory-clock list and its last
    entry are considered. Keys are the memory clocks as strings.

    Returns an empty dict when any NVML call fails.
    """
    try:
        device = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
        supported_mem_clocks = pynvml.nvmlDeviceGetSupportedMemoryClocks(device)
        final_position = len(supported_mem_clocks) - 1
        clocks_by_mem = {}
        for position, mem_clock in enumerate(supported_mem_clocks):
            if position >= 12 and position != final_position:
                continue
            core_clocks = pynvml.nvmlDeviceGetSupportedGraphicsClocks(device, mem_clock)
            clocks_by_mem[str(mem_clock)] = [min(core_clocks), max(core_clocks)]
        return clocks_by_mem
    except Exception:
        # Best effort: any failure simply yields no lock information.
        return {}
2024-05-09 23:32:41 +00:00
def handle_nn(input_int):
    """Undo unsigned wrap-around on a value that was meant to be negative.

    A value within 10000 of ``2**32 // 1000`` (4294967) or ``2**33 // 1000``
    (8589934) is returned relative to that constant; anything else is
    returned unchanged.

    The first constant used to be written as 4293967, which did not match the
    ``2**32 / 1000`` correction used elsewhere in this module and shifted every
    corrected value by +1000.

    Args:
        input_int: The raw integer value.

    Returns:
        The corrected integer, or ``input_int`` itself when it is not near
        either wrap point.
    """
    for wrap_point in (2 ** 32 // 1000, 2 ** 33 // 1000):
        if abs(wrap_point - input_int) < 10000:
            return input_int - wrap_point
    return input_int
def pinpoint_find_dicts_negative(data):
    """Pick the two history entries that bracket the negative offset limit.

    Args:
        data: Iterable of dicts with an 'offset' and a 'success' entry.

    Returns:
        ``(highest_failed, lowest_succeeded)``: the failed entry with the
        largest offset and the successful entry with the smallest offset.
        Either is ``None`` when there is no entry of that kind.
    """
    def offset_of(entry):
        return entry['offset']

    failed_entries = []
    succeeded_entries = []
    for entry in data:
        bucket = succeeded_entries if entry['success'] else failed_entries
        bucket.append(entry)

    highest_failed = max(failed_entries, key=offset_of, default=None)
    lowest_succeeded = min(succeeded_entries, key=offset_of, default=None)
    return highest_failed, lowest_succeeded
def pinpoint_find_dicts_positive(data):
    """Pick the two history entries that bracket the positive offset limit.

    Args:
        data: Iterable of dicts with an 'offset' and a 'success' entry.

    Returns:
        ``(highest_succeeded, lowest_failed)``: the successful entry with the
        largest offset and the failed entry with the smallest offset. Either
        is ``None`` when there is no entry of that kind.
    """
    def offset_of(entry):
        return entry['offset']

    failed_entries = []
    succeeded_entries = []
    for entry in data:
        bucket = succeeded_entries if entry['success'] else failed_entries
        bucket.append(entry)

    lowest_failed = min(failed_entries, key=offset_of, default=None)
    highest_succeeded = max(succeeded_entries, key=offset_of, default=None)
    return highest_succeeded, lowest_failed
def _pinpoint_oc_limit(gpu_handle, core, first_rejected_offset, max_step_cnt=20):
    """Search for the accepted clock offset furthest from zero in one direction.

    The search starts from an offset assumed to be rejected and halves it
    until the driver accepts a value, then bisects between the accepted and
    the rejected offset until they are adjacent or the step budget runs out.

    Args:
        gpu_handle: NVML device handle.
        core: True to test the core (GPC) offset, False for the memory offset.
        first_rejected_offset: Starting offset treated as rejected; its sign
            selects the search direction.
        max_step_cnt: Maximum number of loop iterations.

    Returns:
        ``(failure, found_solution)``. ``failure`` is True when the driver
        raised anything other than "Unknown Error", or when no offset was
        accepted at all; ``found_solution`` is then ``None``.
    """
    rejected_offset = first_rejected_offset  # rejected offset closest to zero so far
    accepted_offset = None                   # accepted offset furthest from zero so far
    found_solution = None
    failure = False
    step_cnt = 0
    try:
        while found_solution is None and step_cnt < max_step_cnt and not failure:
            step_cnt += 1
            # int(x / 2) truncates toward zero for both signs, which `//` would not,
            # so the midpoint collapses onto the accepted offset once the bounds are adjacent.
            if accepted_offset is None:
                test_offset = int(rejected_offset / 2)
            else:
                test_offset = int((rejected_offset + accepted_offset) / 2)
                if step_cnt >= max_step_cnt or test_offset == accepted_offset:
                    # Converged or out of budget: settle for the best accepted value.
                    found_solution = accepted_offset
                    continue

            was_rejected = False
            try:
                if core:
                    pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, test_offset)
                else:
                    pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, test_offset)
            except Exception as e:
                was_rejected = True
                # "Unknown Error" is counted as "offset rejected"; anything else aborts the search.
                if "Unknown Error" not in str(e):
                    failure = True

            if was_rejected:
                rejected_offset = test_offset
            else:
                accepted_offset = test_offset
    except Exception:
        failure = True

    if found_solution is None and not failure:
        if accepted_offset is not None:
            # The first accepted offset arrived on the very last step; use it.
            found_solution = accepted_offset
        else:
            # Nothing was accepted: report failure so callers never store a None limit.
            failure = True
    return failure, found_solution


def pinpoint_oc_limits_negative(gpu_handle, core=False):
    """Find the lowest (most negative) clock offset the driver accepts.

    Args:
        gpu_handle: NVML device handle.
        core: True for the core (GPC) offset, False for the memory offset.

    Returns:
        ``(failure, found_solution)``; see ``_pinpoint_oc_limit``.
    """
    init_negative_max = -19855  # Probably
    return _pinpoint_oc_limit(gpu_handle, core, init_negative_max * 2)


def pinpoint_oc_limits_positive(gpu_handle, core=False):
    """Find the highest positive clock offset the driver accepts.

    Args:
        gpu_handle: NVML device handle.
        core: True for the core (GPC) offset, False for the memory offset.

    Returns:
        ``(failure, found_solution)``; see ``_pinpoint_oc_limit``.
    """
    init_positive_max = 20000  # Probably
    return _pinpoint_oc_limit(gpu_handle, core, init_positive_max * 2)
def _apply_clock_locks(gpu_handle, gpu_oc_config):
    """Set or reset the locked core and memory clocks of one GPU.

    A lock is set when "core_lock" / "mem_lock" is present in
    ``gpu_oc_config`` and reset otherwise. NVML errors are not raised.

    Returns:
        ``(core_lock, mem_lock, lock_failed)``; a lock value of 0 means the
        lock was reset.
    """
    lock_failed = False

    core_lock = 0
    if "core_lock" in gpu_oc_config:
        core_lock = int(gpu_oc_config["core_lock"])
        try:
            pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
        except Exception:
            lock_failed = True
    else:
        try:
            pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
        except Exception:
            lock_failed = True

    mem_lock = 0
    if "mem_lock" in gpu_oc_config:
        mem_lock = int(gpu_oc_config["mem_lock"])
        try:
            pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
        except Exception:
            lock_failed = True
    else:
        try:
            pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
        except Exception:
            lock_failed = True

    return core_lock, mem_lock, lock_failed


def set_oc(settings):
    """Apply overclock settings to the GPUs.

    Args:
        settings: Dict mapping a GPU index (as a digit string) to a dict with
            any of "core", "mem", "pl", "core_lock", "mem_lock". An empty dict
            restores zero offsets and the default power limit on every GPU.

    Returns:
        True when processing completed, False when an exception occurred.
        Out-of-range requests are logged and skipped without failing the call.
    """
    try:
        gpu_count = pynvml.nvmlDeviceGetCount()
        if len(settings) == 0:  # Configure default clocks/pl
            settings = {
                str(i): {
                    "core": 0,
                    "mem": 0,
                    "pl": all_gpus_data_list[i]["default_power_limit"],
                }
                for i in range(gpu_count)
            }
            log.debug(f"Rewriting settings with: {json.dumps(settings)}")

        lock_requests = []  # (gpu index, mem lock, core lock) for the nvtool fallback
        any_lock_failure = False

        for settings_key, gpu_oc_config in settings.items():
            if not settings_key.isdigit():
                continue
            oc_gpu_index = int(settings_key)
            # The config is taken from the key as given, so "01" works as well as "1".
            if oc_gpu_index >= gpu_count or not isinstance(gpu_oc_config, dict):
                continue

            gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)

            core_lock, mem_lock, lock_failed = _apply_clock_locks(gpu_handle, gpu_oc_config)
            lock_requests.append((oc_gpu_index, mem_lock, core_lock))
            any_lock_failure = any_lock_failure or lock_failed

            if "core" in gpu_oc_config:  # Core offset
                # NOTE(review): the requested value is doubled before use,
                # presumably a unit conversion - confirm.
                wanted_core_clock = int(round(gpu_oc_config["core"] * 2))
                core_min, core_max = gpu_possible_ranges["core"][0], gpu_possible_ranges["core"][1]
                if core_min <= wanted_core_clock <= core_max:
                    pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, wanted_core_clock)
                else:
                    log.error(f"Requested OC for GPU:{oc_gpu_index} (CORE) out of bound | {wanted_core_clock} | [{core_min}, {core_max}]")

            if "mem" in gpu_oc_config:  # Memory offset
                wanted_mem_clock = int(round(gpu_oc_config["mem"] * 2))
                mem_min, mem_max = gpu_possible_ranges["mem"][0], gpu_possible_ranges["mem"][1]
                if mem_min <= wanted_mem_clock <= mem_max:
                    pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, wanted_mem_clock)
                else:
                    log.error(f"Requested OC for GPU:{oc_gpu_index} (MEMORY) out of bound | {wanted_mem_clock} | [{mem_min}, {mem_max}]")

            if "pl" in gpu_oc_config:
                wanted_pl = gpu_oc_config["pl"]
                pl_min, pl_max = gpu_possible_ranges["power_limits"][0], gpu_possible_ranges["power_limits"][1]
                if pl_min <= wanted_pl <= pl_max:
                    # Convert W to mW; NVML needs an integer even when "pl" is a float.
                    wanted_power_limit_milliwatts = int(round(wanted_pl * 1000))
                    pynvml.nvmlDeviceSetPowerManagementLimit(gpu_handle, wanted_power_limit_milliwatts)
                else:
                    log.error(f"Requested OC for GPU:{oc_gpu_index} (POWER LIMIT) out of bound | {wanted_pl} | [{pl_min}, {pl_max}]")

        if is_hive and any_lock_failure and lock_requests:
            # NVML refused at least one lock: retry all of them through Hive's nvtool.
            try:
                # Each command names its real GPU index and runs under sudo itself;
                # in "sudo a && b" only the first command would be elevated.
                nvtool_commands = [
                    f"sudo nvtool -i {gpu_index} --setmem {mem_lock} --setcore {core_lock}"
                    for gpu_index, mem_lock, core_lock in lock_requests
                ]
                joined_commands = " && ".join(nvtool_commands)
                cmd = ["bash", "-c", f"PATH={HIVE_PATH} && {joined_commands}"]
                subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            except Exception as hive_oc_settings:
                # Best-effort fallback: record the reason but do not fail set_oc.
                log.debug(f"set_oc | nvtool fallback failed | {hive_oc_settings}")
        return True
    except Exception as e:
        log.error(f"set_oc | ERROR | {e}")
        return False
def get_hive_clock_range(is_hive, gpu_index, part):
    """Ask Hive's nvtool which offset range it accepts for one GPU.

    nvtool is called with an offset of -100000, and the "is not in range of"
    message it prints for the requested device is parsed.

    Args:
        is_hive: When falsy, nothing is executed and ``False`` is returned.
        gpu_index: Index of the GPU to query.
        part: "mem" selects the memory offset; any other value selects core.

    Returns:
        ``[min, max]`` on success, ``False`` when not on Hive, when no range
        line was found for the device, or when anything raised.
    """
    if not is_hive:
        return False
    try:
        offset_flag = "--setcoreoffset"
        if part == "mem":
            offset_flag = "--setmemoffset"
        shell_line = f"PATH={HIVE_PATH} && sudo nvtool -i {gpu_index} {offset_flag} -100000"
        completed = subprocess.run(
            ["bash", '-c', shell_line],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        current_device = None
        clock_range = []
        for raw_line in completed.stdout.decode().splitlines():
            text = raw_line.strip()
            if not text:
                continue
            if text[:8] == "DEVICE #":
                # Header line announcing which device the following lines refer to.
                current_device = int(text[8:].replace(':', ''))
            elif (
                " is not in range of " in text
                and current_device is not None
                and current_device == gpu_index
            ):
                fields = text.split(" is not in range of ", 1)[1].split(' ', 4)
                lower = int(fields[0])
                upper = int(fields[2])
                clock_range = [lower, upper]

        return clock_range if len(clock_range) != 0 else False
    except Exception:
        return False
def get_vram_per_gpu():
    """Return the total memory of every GPU, each divided by 1024**2.

    On an NVML error the problem is logged and the values collected so far
    (possibly none) are returned.
    """
    vram_per_gpu = []
    try:
        for gpu_index in range(pynvml.nvmlDeviceGetCount()):
            device = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
            total_memory = pynvml.nvmlDeviceGetMemoryInfo(device).total
            vram_per_gpu.append(total_memory / 1024 ** 2)
    except Exception as e:
        log.error(f"Failed loading get_vram_per_gpu() | {e}")
    return vram_per_gpu