from lib import config as config_module
from lib import logging as logging_lib
from lib import get_specs

import subprocess
import json

import pynvml

config = config_module.config
log = logging_lib.log

is_hive = False
all_gpus_data_list = []
get_data_fail = False
def init(gpu_specs_file=None):
    global is_hive, all_gpus_data_list, get_data_fail
    log.info("Loading GPU OC specs [working]")
    try:
        pynvml.nvmlInit()
        kernel = get_specs.get_kernel()
        if "hive" in kernel:
            is_hive = True

        specs_file_loc = gpu_specs_file if gpu_specs_file else config.gpu_specs_file
        regenerate_specs = False
        parsed_specs = {}
        try:
            with open(specs_file_loc, "r") as specs_file:
                parsed_specs = json.loads(specs_file.read())
        except Exception as specs_load_fail:
            log.error(f"Failed loading gpu_specs_file ({specs_load_fail}) | regenerating...")
            regenerate_specs = True

        parsed_specs_keys = parsed_specs.keys()
        gpu_count = pynvml.nvmlDeviceGetCount()

        # Regenerate when the cached file does not cover every attached GPU
        for i in range(gpu_count):
            if regenerate_specs:
                break
            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
            if f"{i}-{gpu_uuid}" not in parsed_specs_keys:
                parsed_specs = {}
                regenerate_specs = True
                break

        if regenerate_specs:
            for i in range(gpu_count):
                gpu_spec = {}
                gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
                power_limits = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(gpu_handle)
                min_power_limit = int(power_limits[0] / 1000.0)  # mW -> W
                max_power_limit = int(power_limits[1] / 1000.0)  # mW -> W
                gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
                gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
                gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)

                pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
                pci_bus_id = pci_info.bus
                pci_device_id = pci_info.device
                pci_domain_id = pci_info.domain
                gpu_spec["pci_core"] = f"{pci_domain_id}:{pci_bus_id:02d}:{pci_device_id:02d}.0"

                # On HiveOS the valid ranges come from nvtool; otherwise probe them via NVML
                mem_range = get_hive_clock_range(is_hive, i, "mem")
                core_range = get_hive_clock_range(is_hive, i, "core")
                if not isinstance(mem_range, list):
                    pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300)  # Force low clocks, so the GPU can't crash when testing if under load
                    failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
                    failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
                    if (not failure_min) and (not failure_max):
                        mem_range = [min_oc_solution, max_oc_solution]
                    pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
                    pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
                if not isinstance(core_range, list):
                    pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350)  # Force low clocks, so the GPU can't crash when testing if under load
                    failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
                    failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
                    if (not failure_min) and (not failure_max):
                        core_range = [min_oc_solution, max_oc_solution]
                    pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
                    pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
                if isinstance(mem_range, list) and isinstance(core_range, list) and len(mem_range) == 2 and len(core_range) == 2:
                    gpu_spec["mem"] = mem_range
                    gpu_spec["core"] = core_range
                else:
                    get_data_fail = True
                parsed_specs[f"{i}-{gpu_uuid}"] = gpu_spec
            with open(specs_file_loc, "w") as specs_file:
                json.dump(parsed_specs, specs_file)

        if not get_data_fail:
            parsed_specs_keys = parsed_specs.keys()
            for key in parsed_specs_keys:
                all_gpus_data_list.append(parsed_specs[key])
    except Exception as e:
        get_data_fail = True
        log.error("Loading GPU OC specs [fail]")
    if not get_data_fail:
        log.success("Loading GPU OC specs [success]")
        print(all_gpus_data_list)
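
# A hedged sketch of the on-disk specs file init() maintains: keys follow the
# "<index>-<uuid>" format and the fields mirror gpu_spec above, but the
# concrete values below are illustrative, not measurements from real hardware.
#
# {
#     "0-GPU-8f655263-0000-0000-0000-000000000000": {
#         "default_power_limit": 220,
#         "power_limits": [100, 250],
#         "name": "NVIDIA GeForce RTX 3070",
#         "pci_core": "0:01:00.0",
#         "mem": [-2000, 3000],
#         "core": [-1000, 1000]
#     }
# }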
# Return the loaded GPU specs, or False when loading them failed
def get_gpu_oc_specs():
    global get_data_fail
    if get_data_fail:
        return False
    return all_gpus_data_list
def shutdown():
    pynvml.nvmlShutdown()
def handle_nn(input_int):
    # Offsets read back through NVML can wrap around an unsigned encoding; the
    # magic constants (close to 2^32/1000 and 2^33/1000) map such wrapped
    # values back to their signed form
    if abs(4293967 - input_int) < 10000:
        return input_int - 4293967
    elif abs(8589934 - input_int) < 10000:
        return input_int - 8589934
    else:
        return input_int
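
# Hedged examples of the wrap-around handling (inputs are illustrative):
#   handle_nn(4290000) -> -3967  (within 10000 of 4293967, treated as wrapped)
#   handle_nn(500)     -> 500    (a plain offset, returned unchanged)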
def pinpoint_find_dicts_negative(data):
    # Return the highest failing and the lowest succeeding offset entries
    false_success_items = [d for d in data if not d["success"]]
    true_success_items = [d for d in data if d["success"]]
    highest_false_success = max(false_success_items, key=lambda x: x["offset"], default=None)
    lowest_true_success = min(true_success_items, key=lambda x: x["offset"], default=None)
    return highest_false_success, lowest_true_success

def pinpoint_find_dicts_positive(data):
    # Return the highest succeeding and the lowest failing offset entries
    false_success_items = [d for d in data if not d["success"]]
    true_success_items = [d for d in data if d["success"]]
    lowest_false_success = min(false_success_items, key=lambda x: x["offset"], default=None)
    highest_true_success = max(true_success_items, key=lambda x: x["offset"], default=None)
    return highest_true_success, lowest_false_success
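
# Hedged example for the two helpers above, on a made-up probe history:
#   data = [{"offset": -40000, "success": False},
#           {"offset": -10000, "success": True}]
#   pinpoint_find_dicts_negative(data)
#   -> ({"offset": -40000, "success": False}, {"offset": -10000, "success": True})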
def pinpoint_oc_limits_negative(gpu_handle, core=False):
    # Bisect the most negative stable clock offset (memory by default, core when core=True)
    step_cnt = 0
    found_solution = None
    init_negative_max = -19855  # Probably
    history_info = [{"offset": init_negative_max * 2, "success": False}]
    failure = False
    max_step_cnt = 20
    try:
        while found_solution is None and step_cnt < max_step_cnt and not failure:
            step_cnt += 1
            highest_false_success, lowest_true_success = pinpoint_find_dicts_negative(history_info)
            test_offset = None
            if lowest_true_success is None:
                test_offset = int(highest_false_success["offset"] / 2)
            elif highest_false_success is not None:
                test_offset = int((highest_false_success["offset"] + lowest_true_success["offset"]) / 2)
                if not step_cnt < max_step_cnt:
                    found_solution = lowest_true_success["offset"]
                    test_offset = None
                elif test_offset == lowest_true_success["offset"]:
                    found_solution = test_offset
                    test_offset = None
            if test_offset is not None:
                any_exception = False
                try:
                    if core:
                        pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, test_offset)
                    else:
                        pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, test_offset)
                except Exception as e:
                    any_exception = True
                    if "Unknown Error" not in str(e):
                        failure = True
                history_info.append({"offset": test_offset, "success": not any_exception})
    except Exception:
        failure = True
    return failure, found_solution
def pinpoint_oc_limits_positive(gpu_handle, core=False):
    # Bisect the highest stable clock offset (memory by default, core when core=True)
    step_cnt = 0
    found_solution = None
    init_positive_max = 20000  # Probably
    history_info = [{"offset": init_positive_max * 2, "success": False}]
    failure = False
    max_step_cnt = 20
    try:
        while found_solution is None and step_cnt < max_step_cnt and not failure:
            step_cnt += 1
            highest_true_success, lowest_false_success = pinpoint_find_dicts_positive(history_info)
            test_offset = None
            if highest_true_success is None:
                test_offset = int(lowest_false_success["offset"] / 2)
            elif lowest_false_success is not None:
                test_offset = int((highest_true_success["offset"] + lowest_false_success["offset"]) / 2)
                if not step_cnt < max_step_cnt:
                    found_solution = highest_true_success["offset"]
                    test_offset = None
                elif test_offset == highest_true_success["offset"]:
                    found_solution = test_offset
                    test_offset = None
            if test_offset is not None:
                any_exception = False
                try:
                    if core:
                        pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, test_offset)
                    else:
                        pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, test_offset)
                except Exception as e:
                    any_exception = True
                    if "Unknown Error" not in str(e):
                        failure = True
                history_info.append({"offset": test_offset, "success": not any_exception})
    except Exception:
        failure = True
    return failure, found_solution
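
# Illustrative trace of the bisection above, assuming a GPU whose highest
# stable offset is +15000: starting from history {40000: fail}, it tests
# 20000 (fail), 10000 (ok), 15000 (ok), 17500 (fail), 16250 (fail), ...
# narrowing until the midpoint equals the best succeeding offset, which is
# then returned as found_solution.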
def set_oc(settings):
    try:
        gpu_count = pynvml.nvmlDeviceGetCount()
        settings_keys = settings.keys()
        if len(settings_keys) == 0:  # Configure default clocks/pl
            settings = {}
            for i in range(gpu_count):
                settings[str(i)] = {
                    "core": 0,
                    "mem": 0,
                    "pl": all_gpus_data_list[i]["default_power_limit"]
                }
            settings_keys = settings.keys()
            log.debug(f"Rewriting settings with: {json.dumps(settings)}")
        for oc_gpu_index in settings_keys:
            if oc_gpu_index.isdigit():
                oc_gpu_index = int(oc_gpu_index)
                if oc_gpu_index < gpu_count and isinstance(settings[str(oc_gpu_index)], dict):
                    gpu_oc_config = settings[str(oc_gpu_index)]
                    gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
                    gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)
                    if "core" in gpu_oc_config:
                        wanted_core_clock = gpu_oc_config["core"]
                        if gpu_possible_ranges["core"][0] <= wanted_core_clock <= gpu_possible_ranges["core"][1]:
                            pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, wanted_core_clock)
                        else:
                            log.error(f"Requested OC for GPU:{oc_gpu_index} (CORE) out of bound | {wanted_core_clock} | [{gpu_possible_ranges['core'][0]}, {gpu_possible_ranges['core'][1]}]")
                    if "mem" in gpu_oc_config:
                        wanted_mem_clock = gpu_oc_config["mem"]
                        if gpu_possible_ranges["mem"][0] <= wanted_mem_clock <= gpu_possible_ranges["mem"][1]:
                            pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, wanted_mem_clock)
                        else:
                            log.error(f"Requested OC for GPU:{oc_gpu_index} (MEMORY) out of bound | {wanted_mem_clock} | [{gpu_possible_ranges['mem'][0]}, {gpu_possible_ranges['mem'][1]}]")
                    if "pl" in gpu_oc_config:
                        wanted_power_limit_milliwatts = gpu_oc_config["pl"] * 1000  # convert W to mW
                        if gpu_possible_ranges["power_limits"][0] <= gpu_oc_config["pl"] <= gpu_possible_ranges["power_limits"][1]:
                            pynvml.nvmlDeviceSetPowerManagementLimit(gpu_handle, wanted_power_limit_milliwatts)
                        else:
                            log.error(f"Requested OC for GPU:{oc_gpu_index} (POWER LIMIT) out of bound | {gpu_oc_config['pl']} | [{gpu_possible_ranges['power_limits'][0]}, {gpu_possible_ranges['power_limits'][1]}]")
        return True
    except Exception as e:
        log.error(f"set_oc | ERROR | {e}")
        return False
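
# A minimal usage sketch with hypothetical values: apply a +100 MHz core
# offset, +500 MHz memory offset and a 200 W power limit to GPU 0; an empty
# dict resets every GPU to defaults, as handled above.
#
# init()
# set_oc({"0": {"core": 100, "mem": 500, "pl": 200}})
# set_oc({})   # back to default clocks/power limit
# shutdown()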
def get_hive_clock_range(is_hive, gpu_index, part):
    # On HiveOS, request an intentionally out-of-range offset via nvtool and
    # parse the allowed range from its error output; returns False elsewhere
    if is_hive:
        try:
            flag = "--setmemoffset" if part == "mem" else "--setcoreoffset"
            cmd = ["bash", "-c", f"nvtool -i 0 {flag} -100000"]
            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            lines = result.stdout.decode().splitlines()
            stripped_lines = [line.strip() for line in lines]
            non_empty_lines = [line for line in stripped_lines if line]

            device_id = None
            oc_range = []
            for non_empty_line in non_empty_lines:
                if non_empty_line[:8] == "DEVICE #":
                    device_id = int(non_empty_line[8:].replace(":", ""))
                elif " is not in range of " in non_empty_line and device_id is not None and device_id == gpu_index:
                    split_line = non_empty_line.split(" is not in range of ", 1)[1].split(" ", 4)
                    min_val = int(split_line[0])
                    max_val = int(split_line[2])
                    oc_range = [min_val, max_val]
            if len(oc_range) == 0:
                return False
            return oc_range
        except Exception:
            return False
    else:
        return False
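
# Hedged sketch of the nvtool output this parser expects (format inferred
# from the string matching above, not from nvtool documentation):
#
#   DEVICE #0:
#   Specified clock offset -100000 is not in range of -2000 ~ 6000
#
# which would yield [-2000, 6000] for gpu_index 0.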