Compare commits
7 Commits
ef779233b9
...
9d64eaf051
Author | SHA1 | Date |
---|---|---|
clore | 9d64eaf051 | |
clore | 6a6872d32d | |
clore | cde36bd746 | |
clore | 9e41226603 | |
clore | 97c6c549c1 | |
clore | 9ce9f8eb09 | |
clore | 4d2510eb2b |
|
@ -10,4 +10,5 @@ wireguard/
|
||||||
log-mon.ipynb
|
log-mon.ipynb
|
||||||
t.py
|
t.py
|
||||||
tests.ipynb
|
tests.ipynb
|
||||||
network-tests.ipynb
|
network-tests.ipynb
|
||||||
|
gpu_specs.json
|
|
@ -43,12 +43,13 @@ def configure(containers):
|
||||||
used_startup_files=[]
|
used_startup_files=[]
|
||||||
used_wireguard_configs=[]
|
used_wireguard_configs=[]
|
||||||
startup_sctipt_creation_fail = False
|
startup_sctipt_creation_fail = False
|
||||||
|
use_hive_flightsheet = False
|
||||||
|
|
||||||
if type(containers) == list:
|
if type(containers) == list:
|
||||||
custom_entrypoint_state = custom_entrypoint.cache_entrypoints(containers)
|
custom_entrypoint_state = custom_entrypoint.cache_entrypoints(containers)
|
||||||
|
|
||||||
if type(custom_entrypoint_state)!=list:
|
if type(custom_entrypoint_state)!=list:
|
||||||
return False, valid_containers
|
return False, valid_containers, use_hive_flightsheet
|
||||||
|
|
||||||
for index, container in enumerate(containers):
|
for index, container in enumerate(containers):
|
||||||
ok_custom_entrypoint = False
|
ok_custom_entrypoint = False
|
||||||
|
@ -76,7 +77,9 @@ def configure(containers):
|
||||||
used_startup_files.append(startup_script_name)
|
used_startup_files.append(startup_script_name)
|
||||||
used_startup_files.append(f"{container['name']}.finished")
|
used_startup_files.append(f"{container['name']}.finished")
|
||||||
|
|
||||||
if "network" in container and "network_subnet" in container and "network_gateway" in container and container["network"][:len(config.clore_network_name_prefix)]==config.clore_network_name_prefix:
|
if "image" in container and container["image"]=="cloreai/hive-use-flightsheet":
|
||||||
|
use_hive_flightsheet=True
|
||||||
|
elif "network" in container and "network_subnet" in container and "network_gateway" in container and container["network"][:len(config.clore_network_name_prefix)]==config.clore_network_name_prefix:
|
||||||
if not container["network"] in containers_required_networks:
|
if not container["network"] in containers_required_networks:
|
||||||
containers_required_networks.append(container["network"])
|
containers_required_networks.append(container["network"])
|
||||||
if not container["network"] in default_network_names:
|
if not container["network"] in default_network_names:
|
||||||
|
@ -137,4 +140,4 @@ def configure(containers):
|
||||||
validation_and_security = docker_interface.validate_and_secure_networks()
|
validation_and_security = docker_interface.validate_and_secure_networks()
|
||||||
if startup_sctipt_creation_fail:
|
if startup_sctipt_creation_fail:
|
||||||
validation_and_security=False
|
validation_and_security=False
|
||||||
return validation_and_security, valid_containers
|
return validation_and_security, valid_containers, use_hive_flightsheet
|
|
@ -7,6 +7,7 @@ from lib import docker_deploy
|
||||||
from lib import docker_pull
|
from lib import docker_pull
|
||||||
from lib import get_specs
|
from lib import get_specs
|
||||||
from lib import utils
|
from lib import utils
|
||||||
|
from lib import nvml
|
||||||
log = logging_lib.log
|
log = logging_lib.log
|
||||||
|
|
||||||
from clore_hosting import docker_configurator
|
from clore_hosting import docker_configurator
|
||||||
|
@ -35,7 +36,7 @@ WebSocketClient = ws_interface.WebSocketClient(log_message_broker=container_log_
|
||||||
async def configure_networks(containers):
|
async def configure_networks(containers):
|
||||||
res = await asyncio.to_thread(docker_configurator.configure, containers)
|
res = await asyncio.to_thread(docker_configurator.configure, containers)
|
||||||
try:
|
try:
|
||||||
fin_res = types.DockerConfiguratorRes(validation_and_security=res[0], valid_containers=res[1])
|
fin_res = types.DockerConfiguratorRes(validation_and_security=res[0], valid_containers=res[1], use_hive_flightsheet=res[2])
|
||||||
return fin_res
|
return fin_res
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return False
|
return False
|
||||||
|
@ -51,6 +52,22 @@ async def get_local_images(no_latest_tag = False):
|
||||||
res = await asyncio.to_thread(docker_interface.get_local_images, no_latest_tag)
|
res = await asyncio.to_thread(docker_interface.get_local_images, no_latest_tag)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
async def set_oc(settings):
|
||||||
|
try:
|
||||||
|
result = await asyncio.to_thread(nvml.set_oc, settings)
|
||||||
|
return result
|
||||||
|
except Exception as e:
|
||||||
|
log.error(f"set_oc() | error | {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def set_hive_miner_status(enabled=False):
|
||||||
|
try:
|
||||||
|
result = await asyncio.to_thread(utils.hive_set_miner_status, enabled)
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
log.error(f"set_hive_miner_status() | error | {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
class CloreClient:
|
class CloreClient:
|
||||||
def __init__(self, auth_key):
|
def __init__(self, auth_key):
|
||||||
self.auth_key=auth_key
|
self.auth_key=auth_key
|
||||||
|
@ -79,7 +96,8 @@ class CloreClient:
|
||||||
"startup_script_runner": utils.unix_timestamp(),
|
"startup_script_runner": utils.unix_timestamp(),
|
||||||
"log_streaming_task": utils.unix_timestamp(),
|
"log_streaming_task": utils.unix_timestamp(),
|
||||||
"container_log_streaming_service": utils.unix_timestamp(),
|
"container_log_streaming_service": utils.unix_timestamp(),
|
||||||
"specs_service": utils.unix_timestamp()
|
"specs_service": utils.unix_timestamp(),
|
||||||
|
"oc_service": utils.unix_timestamp()
|
||||||
}
|
}
|
||||||
self.max_service_inactivity = 600 # seconds
|
self.max_service_inactivity = 600 # seconds
|
||||||
|
|
||||||
|
@ -87,7 +105,17 @@ class CloreClient:
|
||||||
self.ws_peers[str(config.debug_ws_peer)]={
|
self.ws_peers[str(config.debug_ws_peer)]={
|
||||||
"expiration":"immune"
|
"expiration":"immune"
|
||||||
}
|
}
|
||||||
|
|
||||||
docker_interface.verify_docker_version()
|
docker_interface.verify_docker_version()
|
||||||
|
nvml.init()
|
||||||
|
|
||||||
|
self.gpu_oc_specs = nvml.get_gpu_oc_specs()
|
||||||
|
self.last_oc_service_submit = 0
|
||||||
|
self.last_applied_oc = {}
|
||||||
|
self.last_oc_apply_time = 0
|
||||||
|
|
||||||
|
self.is_hive = get_specs.is_hive()
|
||||||
|
self.use_hive_flightsheet = False
|
||||||
|
|
||||||
async def service(self):
|
async def service(self):
|
||||||
global container_log_broken
|
global container_log_broken
|
||||||
|
@ -101,10 +129,11 @@ class CloreClient:
|
||||||
task4 = asyncio.create_task(log_streaming_task.log_streaming_task(container_log_broken, monitoring))
|
task4 = asyncio.create_task(log_streaming_task.log_streaming_task(container_log_broken, monitoring))
|
||||||
task5 = asyncio.create_task(self.container_log_streaming_service(monitoring))
|
task5 = asyncio.create_task(self.container_log_streaming_service(monitoring))
|
||||||
task6 = asyncio.create_task(self.specs_service(monitoring))
|
task6 = asyncio.create_task(self.specs_service(monitoring))
|
||||||
|
task7 = asyncio.create_task(self.oc_service(monitoring))
|
||||||
monitoring_task = asyncio.create_task(self.monitoring_service(monitoring))
|
monitoring_task = asyncio.create_task(self.monitoring_service(monitoring))
|
||||||
|
|
||||||
# Wait for both tasks to complete (they won't in this case)
|
# Wait for both tasks to complete (they won't in this case)
|
||||||
await asyncio.gather(task1, task2, task3, task4, task5, task6, monitoring_task)
|
await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, monitoring_task)
|
||||||
|
|
||||||
async def monitoring_service(self, monitoring):
|
async def monitoring_service(self, monitoring):
|
||||||
while True:
|
while True:
|
||||||
|
@ -381,6 +410,7 @@ class CloreClient:
|
||||||
if result.validation_and_security:
|
if result.validation_and_security:
|
||||||
self.validated_containers_set=True
|
self.validated_containers_set=True
|
||||||
self.validated_containers = result.valid_containers
|
self.validated_containers = result.valid_containers
|
||||||
|
self.use_hive_flightsheet = result.use_hive_flightsheet
|
||||||
elif type(result)==types.DeployContainersRes:
|
elif type(result)==types.DeployContainersRes:
|
||||||
try:
|
try:
|
||||||
self.all_running_container_names = result.all_running_container_names
|
self.all_running_container_names = result.all_running_container_names
|
||||||
|
@ -395,7 +425,7 @@ class CloreClient:
|
||||||
async def submit_specs(self, current_specs):
|
async def submit_specs(self, current_specs):
|
||||||
try:
|
try:
|
||||||
if type(current_specs) == dict:
|
if type(current_specs) == dict:
|
||||||
current_specs["backend_version"]=8
|
current_specs["backend_version"]=9
|
||||||
current_specs["update_hw"]=True
|
current_specs["update_hw"]=True
|
||||||
smallest_pcie_width = 999
|
smallest_pcie_width = 999
|
||||||
for gpu in current_specs["gpus"]["nvidia"]:
|
for gpu in current_specs["gpus"]["nvidia"]:
|
||||||
|
@ -438,6 +468,36 @@ class CloreClient:
|
||||||
log.debug(f"FAIL | specs_service() | {e}")
|
log.debug(f"FAIL | specs_service() | {e}")
|
||||||
await asyncio.sleep(7)
|
await asyncio.sleep(7)
|
||||||
|
|
||||||
|
async def oc_service(self, monitoring):
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
await monitoring.put("oc_service")
|
||||||
|
oc_apply_allowed = True
|
||||||
|
### OC Service should also hande Hive stuff
|
||||||
|
if self.use_hive_flightsheet and self.is_hive:
|
||||||
|
await set_hive_miner_status(True)
|
||||||
|
oc_apply_allowed = False # Don't apply any OC when running HiveOS miner
|
||||||
|
elif self.is_hive:
|
||||||
|
await set_hive_miner_status(False)
|
||||||
|
### Run OC tasks
|
||||||
|
oc_conf = WebSocketClient.get_oc()
|
||||||
|
if oc_conf[0] and type(self.gpu_oc_specs)==list and oc_conf[1]!=self.gpu_oc_specs and self.last_oc_service_submit+240 < utils.unix_timestamp():
|
||||||
|
log.debug("Submitting \"gpu_oc_specs\"")
|
||||||
|
self.last_oc_service_submit = utils.unix_timestamp()
|
||||||
|
await WebSocketClient.send({
|
||||||
|
"set_gpu_info":self.gpu_oc_specs,
|
||||||
|
"xorg_valid": True
|
||||||
|
})
|
||||||
|
if oc_conf[0] and type(oc_conf[2])==dict and oc_apply_allowed:
|
||||||
|
if utils.normalize_rule(self.last_applied_oc)!=utils.normalize_rule(oc_conf[2]) or (self.last_oc_apply_time < utils.unix_timestamp()-300):
|
||||||
|
self.last_oc_apply_time = utils.unix_timestamp()
|
||||||
|
log.debug(f"Applying OC | {json.dumps(oc_conf[2], separators=(',',':'))}")
|
||||||
|
await set_oc(oc_conf[2])
|
||||||
|
self.last_applied_oc=oc_conf[2]
|
||||||
|
except Exception as e:
|
||||||
|
log.debug(f"FAIL | oc_service() | {e}")
|
||||||
|
await asyncio.sleep(2)
|
||||||
|
|
||||||
def expire_ws_peers(self):
|
def expire_ws_peers(self):
|
||||||
for ws_peer_address in list(self.ws_peers.keys()):
|
for ws_peer_address in list(self.ws_peers.keys()):
|
||||||
ws_peer_info = self.ws_peers[ws_peer_address]
|
ws_peer_info = self.ws_peers[ws_peer_address]
|
||||||
|
|
|
@ -9,6 +9,7 @@ class ServerConfig(BaseModel):
|
||||||
class DockerConfiguratorRes(BaseModel):
|
class DockerConfiguratorRes(BaseModel):
|
||||||
validation_and_security: bool
|
validation_and_security: bool
|
||||||
valid_containers: List[Any]
|
valid_containers: List[Any]
|
||||||
|
use_hive_flightsheet: bool
|
||||||
|
|
||||||
class DeployContainersRes(BaseModel):
|
class DeployContainersRes(BaseModel):
|
||||||
all_running_container_names: List[str]
|
all_running_container_names: List[str]
|
||||||
|
|
|
@ -46,12 +46,19 @@ class WebSocketClient:
|
||||||
self.current_container_logs = {}
|
self.current_container_logs = {}
|
||||||
|
|
||||||
self.last_bash_rnd = ''
|
self.last_bash_rnd = ''
|
||||||
|
|
||||||
|
self.oc_enabled = False
|
||||||
|
self.last_gpu_oc_specs = []
|
||||||
|
self.last_set_oc = {}
|
||||||
|
|
||||||
def get_last_heartbeat(self):
|
def get_last_heartbeat(self):
|
||||||
return self.last_heartbeat
|
return self.last_heartbeat
|
||||||
|
|
||||||
def get_containers(self):
|
def get_containers(self):
|
||||||
return self.containers_set, self.containers
|
return self.containers_set, self.containers
|
||||||
|
|
||||||
|
def get_oc(self):
|
||||||
|
return self.oc_enabled, self.last_gpu_oc_specs, self.last_set_oc
|
||||||
|
|
||||||
def set_ws_peers(self, ws_peers):
|
def set_ws_peers(self, ws_peers):
|
||||||
tmp_ws_peers=[]
|
tmp_ws_peers=[]
|
||||||
|
@ -150,8 +157,12 @@ class WebSocketClient:
|
||||||
self.containers=parsed_json["new_containers"]
|
self.containers=parsed_json["new_containers"]
|
||||||
#log.success(container_str)
|
#log.success(container_str)
|
||||||
elif "allow_oc" in parsed_json: # Enable OC
|
elif "allow_oc" in parsed_json: # Enable OC
|
||||||
|
self.oc_enabled=True
|
||||||
await self.send(json.dumps({"allow_oc":True}))
|
await self.send(json.dumps({"allow_oc":True}))
|
||||||
|
elif "gpu_oc_info" in parsed_json:
|
||||||
|
self.last_gpu_oc_specs = parsed_json["gpu_oc_info"]
|
||||||
elif "set_oc" in parsed_json: # Set specific OC
|
elif "set_oc" in parsed_json: # Set specific OC
|
||||||
|
self.last_set_oc=parsed_json["set_oc"]
|
||||||
back_oc_str = json.dumps({"current_oc":json.dumps(parsed_json["set_oc"], separators=(',',':'))})
|
back_oc_str = json.dumps({"current_oc":json.dumps(parsed_json["set_oc"], separators=(',',':'))})
|
||||||
await self.send(back_oc_str)
|
await self.send(back_oc_str)
|
||||||
elif "bash_cmd" in parsed_json and type(parsed_json["bash_cmd"])==str and "bash_rnd" in parsed_json:
|
elif "bash_cmd" in parsed_json and type(parsed_json["bash_cmd"])==str and "bash_rnd" in parsed_json:
|
||||||
|
|
|
@ -32,7 +32,8 @@ hard_config = {
|
||||||
"container_log_streaming_interval": 2, # Seconds
|
"container_log_streaming_interval": 2, # Seconds
|
||||||
"maximum_service_loop_time": 900, # Seconds, failsafe variable - if service is stuck processing longer than this timeframe it will lead into restarting the app
|
"maximum_service_loop_time": 900, # Seconds, failsafe variable - if service is stuck processing longer than this timeframe it will lead into restarting the app
|
||||||
"maximum_pull_service_loop_time": 14400, # Exception for image pulling
|
"maximum_pull_service_loop_time": 14400, # Exception for image pulling
|
||||||
"creation_engine": "wrapper" # "wrapper" or "sdk" | Wrapper - wrapped docker cli, SDK - docker sdk
|
"creation_engine": "wrapper", # "wrapper" or "sdk" | Wrapper - wrapped docker cli, SDK - docker sdk
|
||||||
|
"allow_mixed_gpus": False
|
||||||
}
|
}
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Example argparse usage')
|
parser = argparse.ArgumentParser(description='Example argparse usage')
|
||||||
|
@ -47,6 +48,7 @@ parser.add_argument('--startup-scripts-folder', type=str, default='/opt/clore-ho
|
||||||
parser.add_argument('--wireguard-config-folder', type=str, default='/opt/clore-hosting/wireguard/configs', help='Folder with wireguard configs')
|
parser.add_argument('--wireguard-config-folder', type=str, default='/opt/clore-hosting/wireguard/configs', help='Folder with wireguard configs')
|
||||||
parser.add_argument('--entrypoints-folder', type=str, default='/opt/clore-hosting/entrypoints', help='Folder with custom entrypoints')
|
parser.add_argument('--entrypoints-folder', type=str, default='/opt/clore-hosting/entrypoints', help='Folder with custom entrypoints')
|
||||||
parser.add_argument('--debug-ws-peer', type=str, help="Specific ws peer to connect to (for debugging only)")
|
parser.add_argument('--debug-ws-peer', type=str, help="Specific ws peer to connect to (for debugging only)")
|
||||||
|
parser.add_argument('--gpu-specs-file', type=str, default='/opt/clore-hosting/client/gpu_specs.json' ,help="Cache with specs of GPU possible OC/Power limit changes")
|
||||||
|
|
||||||
# Parse arguments, ignoring any non-defined arguments
|
# Parse arguments, ignoring any non-defined arguments
|
||||||
args, _ = parser.parse_known_args()
|
args, _ = parser.parse_known_args()
|
||||||
|
|
102
lib/get_specs.py
102
lib/get_specs.py
|
@ -12,6 +12,7 @@ import aiohttp
|
||||||
import asyncio
|
import asyncio
|
||||||
import shutil
|
import shutil
|
||||||
import psutil
|
import psutil
|
||||||
|
import time
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
@ -39,6 +40,80 @@ async def get_ram_usage():
|
||||||
def get_kernel():
|
def get_kernel():
|
||||||
return platform.uname().release
|
return platform.uname().release
|
||||||
|
|
||||||
|
def is_hive():
|
||||||
|
return "hive" in get_kernel()
|
||||||
|
|
||||||
|
def drop_caches():
|
||||||
|
try:
|
||||||
|
with open('/proc/sys/vm/drop_caches', 'w') as f:
|
||||||
|
f.write('3\n')
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def write_test(file_path, block_size, num_blocks):
|
||||||
|
data = os.urandom(block_size)
|
||||||
|
total_bytes = block_size * num_blocks
|
||||||
|
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
with open(file_path, 'wb') as f:
|
||||||
|
for _ in range(num_blocks):
|
||||||
|
f.write(data)
|
||||||
|
f.flush()
|
||||||
|
os.fsync(f.fileno())
|
||||||
|
|
||||||
|
elapsed_time = time.time() - start_time
|
||||||
|
write_speed = total_bytes / elapsed_time / (1024 * 1024)
|
||||||
|
|
||||||
|
return write_speed, elapsed_time
|
||||||
|
|
||||||
|
def read_test(file_path, block_size, num_blocks):
|
||||||
|
total_bytes = block_size * num_blocks
|
||||||
|
|
||||||
|
# Drop caches to avoid OS-level caching effects
|
||||||
|
drop_caches()
|
||||||
|
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
with open(file_path, 'rb') as f:
|
||||||
|
for _ in range(num_blocks):
|
||||||
|
data = f.read(block_size)
|
||||||
|
if not data:
|
||||||
|
break
|
||||||
|
|
||||||
|
elapsed_time = time.time() - start_time
|
||||||
|
read_speed = total_bytes / elapsed_time / (1024 * 1024)
|
||||||
|
|
||||||
|
return read_speed, elapsed_time
|
||||||
|
|
||||||
|
def disk_benchmark():
|
||||||
|
total, used, free = shutil.disk_usage("/")
|
||||||
|
|
||||||
|
free_gb = free/1024/1024/1024
|
||||||
|
|
||||||
|
if free_gb<1:
|
||||||
|
return 0,0
|
||||||
|
|
||||||
|
block_size = 1024*1024
|
||||||
|
num_blocks = 250 if free_gb < 3 else 1500
|
||||||
|
|
||||||
|
file_path="/tmp/output"
|
||||||
|
|
||||||
|
print("Running disk benchmark...")
|
||||||
|
print(f"Block Size: {block_size} bytes, Number of Blocks: {num_blocks}")
|
||||||
|
|
||||||
|
# Run write test
|
||||||
|
write_speed, write_time = write_test(file_path, block_size, num_blocks)
|
||||||
|
print(f"Write Speed: {write_speed:.2f} MB/s, Time: {write_time:.2f} seconds")
|
||||||
|
|
||||||
|
# Run read test
|
||||||
|
read_speed, read_time = read_test(file_path, block_size, num_blocks)
|
||||||
|
print(f"Read Speed: {read_speed:.2f} MB/s, Time: {read_time:.2f} seconds")
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
os.remove(file_path)
|
||||||
|
return float(round(write_speed,2)), float(round(read_speed,2))
|
||||||
|
|
||||||
def get_nvidia_version():
|
def get_nvidia_version():
|
||||||
try:
|
try:
|
||||||
output = subprocess.check_output(['nvidia-smi', '-x', '-q'], encoding='utf-8')
|
output = subprocess.check_output(['nvidia-smi', '-x', '-q'], encoding='utf-8')
|
||||||
|
@ -66,6 +141,15 @@ async def measure_internet_speed():
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return '',0, 0
|
return '',0, 0
|
||||||
|
|
||||||
|
async def disk_speed():
|
||||||
|
try:
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
write_speed, read_speed = await loop.run_in_executor(None, disk_benchmark)
|
||||||
|
return write_speed, read_speed
|
||||||
|
except Exception as e:
|
||||||
|
print("disk benchmark exception",e)
|
||||||
|
return 0, 0
|
||||||
|
|
||||||
async def get_country_code():
|
async def get_country_code():
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
try:
|
try:
|
||||||
|
@ -207,9 +291,17 @@ class Specs:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.motherboard_name_file = "/sys/devices/virtual/dmi/id/board_name"
|
self.motherboard_name_file = "/sys/devices/virtual/dmi/id/board_name"
|
||||||
|
|
||||||
async def get(self, benchmark_internet=False):
|
async def get(self, benchmark_internet=False, benchmark_disk=False, require_same_gpus=False):
|
||||||
total_threads, total_cores, model_name = self.get_cpu_info()
|
total_threads, total_cores, model_name = self.get_cpu_info()
|
||||||
gpu_str, gpu_mem, gpus, nvml_err = get_gpu_info()
|
gpu_str, gpu_mem, gpus, nvml_err = get_gpu_info()
|
||||||
|
if require_same_gpus:
|
||||||
|
last_gpu_name=''
|
||||||
|
for gpu in gpus:
|
||||||
|
if not last_gpu_name:
|
||||||
|
last_gpu_name=gpu["name"]
|
||||||
|
elif last_gpu_name!=gpu["name"]:
|
||||||
|
print("\033[31mMixed GPU machines are not allowed\033[0m")
|
||||||
|
sys.exit(1)
|
||||||
docker_daemon_config = docker_interface.get_daemon_config()
|
docker_daemon_config = docker_interface.get_daemon_config()
|
||||||
disk_str=""
|
disk_str=""
|
||||||
data_root_location="main_disk"
|
data_root_location="main_disk"
|
||||||
|
@ -252,6 +344,12 @@ class Specs:
|
||||||
else: # Disk is overlay
|
else: # Disk is overlay
|
||||||
data_root_location="separate"
|
data_root_location="separate"
|
||||||
disk_str = f"{disk_type} {overlay_total_size}GB"
|
disk_str = f"{disk_type} {overlay_total_size}GB"
|
||||||
|
|
||||||
|
if benchmark_disk:
|
||||||
|
disk_speeds = await disk_speed()
|
||||||
|
else:
|
||||||
|
disk_speeds = [0,0]
|
||||||
|
|
||||||
response = {
|
response = {
|
||||||
"mb": await self.motherboard_type(),
|
"mb": await self.motherboard_type(),
|
||||||
"cpu":model_name,
|
"cpu":model_name,
|
||||||
|
@ -260,7 +358,7 @@ class Specs:
|
||||||
"swap": self.get_swap_size(),
|
"swap": self.get_swap_size(),
|
||||||
"data_root_location":data_root_location,
|
"data_root_location":data_root_location,
|
||||||
"disk": disk_str,
|
"disk": disk_str,
|
||||||
"disk_speed":0,
|
"disk_speed": disk_speeds[1],
|
||||||
"gpu":gpu_str,
|
"gpu":gpu_str,
|
||||||
"gpuram": gpu_mem,
|
"gpuram": gpu_mem,
|
||||||
"gpus": gpus,
|
"gpus": gpus,
|
||||||
|
|
|
@ -86,7 +86,7 @@ async def work_init(loader_event, init_token):
|
||||||
log.error("Cuda must be version 11.7+")
|
log.error("Cuda must be version 11.7+")
|
||||||
os._exit(1)
|
os._exit(1)
|
||||||
|
|
||||||
machine_specs = await specs.get(benchmark_internet=True)
|
machine_specs = await specs.get(benchmark_internet=True, benchmark_disk=True, require_same_gpus=(not config.allow_mixed_gpus))
|
||||||
|
|
||||||
loader_event.clear()
|
loader_event.clear()
|
||||||
complete_loader()
|
complete_loader()
|
||||||
|
|
|
@ -15,6 +15,9 @@ class log:
|
||||||
def error (message):
|
def error (message):
|
||||||
ts = time_str()
|
ts = time_str()
|
||||||
print(f"\033[91m{ts} | {message}\033[0m")
|
print(f"\033[91m{ts} | {message}\033[0m")
|
||||||
|
def info (message):
|
||||||
|
ts = time_str()
|
||||||
|
print(f"\033[94m{ts} | {message}\033[0m")
|
||||||
def debug(message):
|
def debug(message):
|
||||||
if config.debug:
|
if config.debug:
|
||||||
ts = time_str()
|
ts = time_str()
|
||||||
|
|
|
@ -0,0 +1,294 @@
|
||||||
|
from lib import config as config_module
|
||||||
|
from lib import logging as logging_lib
|
||||||
|
from lib import get_specs
|
||||||
|
|
||||||
|
config = config_module.config
|
||||||
|
log = logging_lib.log
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import pynvml
|
||||||
|
import json
|
||||||
|
|
||||||
|
is_hive = False
|
||||||
|
all_gpus_data_list=[]
|
||||||
|
get_data_fail=False
|
||||||
|
|
||||||
|
def init(gpu_specs_file=None):
|
||||||
|
global is_hive, all_gpus_data_list, get_data_fail
|
||||||
|
log.info("Loading GPU OC specs [ working ]")
|
||||||
|
try:
|
||||||
|
pynvml.nvmlInit()
|
||||||
|
kernel = get_specs.get_kernel()
|
||||||
|
if "hive" in kernel:
|
||||||
|
is_hive=True
|
||||||
|
|
||||||
|
specs_file_loc = gpu_specs_file if gpu_specs_file else config.gpu_specs_file
|
||||||
|
regenerate_specs = False
|
||||||
|
parsed_specs={}
|
||||||
|
try:
|
||||||
|
with open(specs_file_loc, "r") as specs_file:
|
||||||
|
parsed_specs = json.loads(specs_file.read())
|
||||||
|
except Exception as specs_load_fail:
|
||||||
|
log.error(f"Failed loading gpu_specs_file ({specs_load_fail}) | regenerating...")
|
||||||
|
regenerate_specs=True
|
||||||
|
|
||||||
|
parsed_specs_keys = parsed_specs.keys()
|
||||||
|
gpu_count = pynvml.nvmlDeviceGetCount()
|
||||||
|
for i in range(0,gpu_count):
|
||||||
|
if regenerate_specs:
|
||||||
|
break
|
||||||
|
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
|
||||||
|
gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
|
||||||
|
if not f"{i}-{gpu_uuid}" in parsed_specs_keys:
|
||||||
|
parsed_specs={}
|
||||||
|
regenerate_specs=True
|
||||||
|
break
|
||||||
|
|
||||||
|
if regenerate_specs:
|
||||||
|
for i in range(0,gpu_count):
|
||||||
|
gpu_spec={}
|
||||||
|
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
|
||||||
|
gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
|
||||||
|
power_limits = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(gpu_handle)
|
||||||
|
min_power_limit = int(power_limits[0] / 1000.0)
|
||||||
|
max_power_limit = int(power_limits[1] / 1000.0)
|
||||||
|
gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
|
||||||
|
gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
|
||||||
|
gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
|
||||||
|
|
||||||
|
pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
|
||||||
|
pci_bus_id = pci_info.bus
|
||||||
|
pci_device_id = pci_info.device
|
||||||
|
pci_domain_id = pci_info.domain
|
||||||
|
gpu_spec["pci_core"] = f"{pci_domain_id}:{pci_bus_id:02d}:{pci_device_id:02d}.0"
|
||||||
|
|
||||||
|
mem_range = get_hive_clock_range(is_hive, i, "mem")
|
||||||
|
core_range = get_hive_clock_range(is_hive, i, "core")
|
||||||
|
if type(mem_range) != list:
|
||||||
|
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
|
||||||
|
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
|
||||||
|
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
|
||||||
|
if (not failure_min) and (not failure_max):
|
||||||
|
mem_range=[min_oc_solution, max_oc_solution]
|
||||||
|
pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
|
||||||
|
pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
|
||||||
|
if type(core_range) != list:
|
||||||
|
pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
|
||||||
|
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
|
||||||
|
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
|
||||||
|
if (not failure_min) and (not failure_max):
|
||||||
|
core_range=[min_oc_solution, max_oc_solution]
|
||||||
|
pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
|
||||||
|
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
|
||||||
|
if type(mem_range) == list and type(core_range) == list and len(mem_range)==2 and len(core_range)==2:
|
||||||
|
gpu_spec["mem"]=mem_range
|
||||||
|
gpu_spec["core"]=core_range
|
||||||
|
else:
|
||||||
|
get_data_fail=True
|
||||||
|
|
||||||
|
parsed_specs[f"{i}-{gpu_uuid}"]=gpu_spec
|
||||||
|
with open(specs_file_loc, "w") as specs_file:
|
||||||
|
json.dump(parsed_specs, specs_file)
|
||||||
|
|
||||||
|
if not get_data_fail:
|
||||||
|
parsed_specs_keys=parsed_specs.keys()
|
||||||
|
for key in parsed_specs_keys:
|
||||||
|
all_gpus_data_list.append(parsed_specs[key])
|
||||||
|
except Exception as e:
|
||||||
|
get_data_fail=True
|
||||||
|
log.error("Loading GPU OC specs [ fail ]")
|
||||||
|
if not get_data_fail:
|
||||||
|
log.success("Loading GPU OC specs [ success ]")
|
||||||
|
|
||||||
|
print(all_gpus_data_list)
|
||||||
|
# Load GPU specs
|
||||||
|
|
||||||
|
def get_gpu_oc_specs():
|
||||||
|
global get_data_fail
|
||||||
|
if get_data_fail:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return all_gpus_data_list
|
||||||
|
|
||||||
|
def shutdown():
|
||||||
|
pynvml.nvmlShutdown()
|
||||||
|
|
||||||
|
def handle_nn(input_int):
|
||||||
|
if abs(4293967-input_int) < 10000:
|
||||||
|
return input_int-4293967
|
||||||
|
elif abs(8589934-input_int) < 10000:
|
||||||
|
return input_int-8589934
|
||||||
|
else:
|
||||||
|
return input_int
|
||||||
|
|
||||||
|
def pinpoint_find_dicts_negative(data):
|
||||||
|
false_success_items = [d for d in data if not d['success']]
|
||||||
|
true_success_items = [d for d in data if d['success']]
|
||||||
|
highest_false_success = max(false_success_items, key=lambda x: x['offset'], default=None)
|
||||||
|
lowest_true_success = min(true_success_items, key=lambda x: x['offset'], default=None)
|
||||||
|
return highest_false_success, lowest_true_success
|
||||||
|
|
||||||
|
def pinpoint_find_dicts_positive(data):
|
||||||
|
false_success_items = [d for d in data if not d['success']]
|
||||||
|
true_success_items = [d for d in data if d['success']]
|
||||||
|
lowest_false_success = min(false_success_items, key=lambda x: x['offset'], default=None)
|
||||||
|
highest_true_success = max(true_success_items, key=lambda x: x['offset'], default=None)
|
||||||
|
return highest_true_success, lowest_false_success
|
||||||
|
|
||||||
|
def pinpoint_oc_limits_negative(gpu_handle, core=False):
|
||||||
|
step_cnt = 0
|
||||||
|
found_solution = None
|
||||||
|
init_negative_max = -19855 # Probably
|
||||||
|
history_info = [{"offset": init_negative_max*2, "success":False}]
|
||||||
|
failure = False
|
||||||
|
max_step_cnt = 20
|
||||||
|
try:
|
||||||
|
while found_solution == None and step_cnt<max_step_cnt and not failure:
|
||||||
|
step_cnt+=1
|
||||||
|
#print("STEP", step_cnt)
|
||||||
|
#print(history_info)
|
||||||
|
highest_false_success, lowest_true_success = pinpoint_find_dicts_negative(history_info)
|
||||||
|
test_offset = None
|
||||||
|
if lowest_true_success == None:
|
||||||
|
test_offset = int(highest_false_success["offset"]/2)
|
||||||
|
elif highest_false_success != None:
|
||||||
|
test_offset = int((highest_false_success["offset"]+lowest_true_success["offset"])/2)
|
||||||
|
if not step_cnt<max_step_cnt:
|
||||||
|
found_solution=lowest_true_success["offset"]
|
||||||
|
test_offset=None
|
||||||
|
elif test_offset==lowest_true_success["offset"]:
|
||||||
|
found_solution=test_offset
|
||||||
|
test_offset=None
|
||||||
|
|
||||||
|
if test_offset != None:
|
||||||
|
any_exception = False
|
||||||
|
try:
|
||||||
|
if core:
|
||||||
|
pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, test_offset)
|
||||||
|
else:
|
||||||
|
pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, test_offset)
|
||||||
|
except Exception as e:
|
||||||
|
any_exception=True
|
||||||
|
if not "Unknown Error" in str(e):
|
||||||
|
failure=True
|
||||||
|
history_info.append({"offset": test_offset, "success":not any_exception})
|
||||||
|
except Exception as e:
|
||||||
|
failure=True
|
||||||
|
return failure, found_solution
|
||||||
|
|
||||||
|
def pinpoint_oc_limits_positive(gpu_handle, core=False):
    # Binary-search the highest positive clock VF offset the NVML driver
    # accepts for this GPU. core=True probes the GPC (core) clock, otherwise
    # the memory clock. Returns a (failure, found_solution) tuple:
    #   failure        - True if an unexpected error aborted the search
    #   found_solution - accepted offset, or None if none was pinned down
    step_cnt = 0
    found_solution = None
    init_negative_max = 20000 # Probably
    # Seed the history with a known-too-high offset marked as failed so the
    # bisection has an upper bound to start from.
    history_info = [{"offset": init_negative_max*2, "success":False}]
    failure = False
    max_step_cnt = 20
    try:
        while found_solution == None and step_cnt<max_step_cnt and not failure:
            step_cnt+=1
            #print("STEP", step_cnt)
            #print(history_info)
            # Largest offset that worked / smallest that failed so far
            # (helper defined elsewhere in this file).
            highest_true_success, lowest_false_success = pinpoint_find_dicts_positive(history_info)
            test_offset = None
            if highest_true_success == None:
                # Nothing has succeeded yet - halve the failing bound.
                test_offset = int(lowest_false_success["offset"]/2)
            elif lowest_false_success != None:
                # Probe the midpoint between the working and failing bounds.
                test_offset = int((highest_true_success["offset"]+lowest_false_success["offset"])/2)
            if not step_cnt<max_step_cnt:
                # Out of steps: settle for the best known working offset.
                found_solution=highest_true_success["offset"]
                test_offset=None
            elif test_offset==highest_true_success["offset"]:
                # Midpoint collapsed onto the known-good bound - converged.
                # NOTE(review): if highest_true_success is None this raises
                # TypeError and the outer except reports failure - confirm
                # pinpoint_find_dicts_positive's contract makes that impossible.
                found_solution=test_offset
                test_offset=None

            if test_offset != None:
                any_exception = False
                try:
                    if core:
                        pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, test_offset)
                    else:
                        pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, test_offset)
                except Exception as e:
                    any_exception=True
                    # An "Unknown Error" is treated as a plain rejection of this
                    # offset; any other NVML error aborts the whole search.
                    if not "Unknown Error" in str(e):
                        failure=True
                history_info.append({"offset": test_offset, "success":not any_exception})
    except Exception as e:
        failure=True
    return failure, found_solution
|
||||||
|
|
||||||
|
def set_oc(settings):
    """Apply overclock settings (clock offsets and power limit) to local NVIDIA GPUs.

    settings maps a GPU index (string key, e.g. "0") to a dict with optional keys:
      "core" - core clock offset in MHz (converted to 2x VF-offset units)
      "mem"  - memory clock offset in MHz (converted to 2x VF-offset units)
      "pl"   - power limit in watts

    An empty dict resets every GPU to zero offsets and its default power limit.
    Values outside the ranges recorded in all_gpus_data_list are rejected and
    logged, not applied. Returns True on success, False on any error.
    """
    try:
        gpu_count = pynvml.nvmlDeviceGetCount()
        settings_keys = settings.keys()
        if len(settings_keys)==0: # Configure default clocks/pl
            settings={}
            for i in range(0,gpu_count):
                settings[str(i)]={
                    "core":0,
                    "mem":0,
                    "pl": all_gpus_data_list[i]["default_power_limit"]
                }
            settings_keys = settings.keys()
            log.debug(f"Rewriting settings with: {json.dumps(settings)}")
        for oc_gpu_index in settings_keys:
            if oc_gpu_index.isdigit():
                oc_gpu_index=int(oc_gpu_index)
                if oc_gpu_index < gpu_count and type(settings[str(oc_gpu_index)])==dict:
                    gpu_oc_config = settings[str(oc_gpu_index)]
                    gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
                    gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)
                    if "core" in gpu_oc_config:
                        # NVML VF offset units are half-MHz, hence the *2.
                        wanted_core_clock = int(round(gpu_oc_config["core"]*2))
                        if gpu_possible_ranges["core"][0] <= wanted_core_clock and wanted_core_clock <= gpu_possible_ranges["core"][1]:
                            pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, wanted_core_clock)
                        else:
                            # BUGFIX: inner double quotes inside a double-quoted
                            # f-string are a SyntaxError before Python 3.12.
                            log.error(f"Requested OC for GPU:{oc_gpu_index} (CORE) out of bound | {wanted_core_clock} | [{gpu_possible_ranges['core'][0]}, {gpu_possible_ranges['core'][1]}]")
                    if "mem" in gpu_oc_config:
                        wanted_mem_clock = int(round(gpu_oc_config["mem"]*2))
                        if gpu_possible_ranges["mem"][0] <= wanted_mem_clock and wanted_mem_clock <= gpu_possible_ranges["mem"][1]:
                            pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, wanted_mem_clock)
                        else:
                            log.error(f"Requested OC for GPU:{oc_gpu_index} (MEMORY) out of bound | {wanted_mem_clock} | [{gpu_possible_ranges['mem'][0]}, {gpu_possible_ranges['mem'][1]}]")
                    if "pl" in gpu_oc_config:
                        wanted_power_limit_milliwatts = gpu_oc_config["pl"]*1000 # convert W to mW
                        # Range check is done in watts, the NVML call takes mW.
                        if gpu_possible_ranges["power_limits"][0] <= gpu_oc_config["pl"] and gpu_oc_config["pl"] <= gpu_possible_ranges["power_limits"][1]:
                            pynvml.nvmlDeviceSetPowerManagementLimit(gpu_handle, wanted_power_limit_milliwatts)
                        else:
                            log.error(f"Requested OC for GPU:{oc_gpu_index} (POWER LIMIT) out of bound | {gpu_oc_config['pl']} | [{gpu_possible_ranges['power_limits'][0]}, {gpu_possible_ranges['power_limits'][1]}]")
        return True
    except Exception as e:
        log.error(f"set_oc | ERROR | {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_hive_clock_range(is_hive, gpu_index, part):
    """Probe HiveOS's nvtool for the allowed clock-offset range of one GPU.

    Deliberately requests an absurd offset (-100000) so nvtool prints its
    "... is not in range of MIN to MAX" message, then parses that message for
    the device matching gpu_index. part selects "mem" or core clock.
    Returns [min_val, max_val] on success, False otherwise (including when
    not running on Hive).
    """
    if not is_hive:
        return False
    try:
        flag = "--setmemoffset" if part=="mem" else "--setcoreoffset"
        cmd = ["bash",'-c',f"nvtool -i 0 {flag} -100000"]

        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        cleaned = [ln.strip() for ln in proc.stdout.decode().splitlines()]
        cleaned = [ln for ln in cleaned if ln]

        current_device = None
        found_range = []
        for line in cleaned:
            if line[:8]=="DEVICE #":
                # "DEVICE #N:" header switches the device context.
                current_device = int(line[8:].replace(':',''))
            elif " is not in range of " in line and current_device!=None and current_device==gpu_index:
                # Tail looks like "MIN to MAX ..." - grab MIN and MAX.
                tail = line.split(" is not in range of ",1)[1].split(' ',4)
                found_range = [int(tail[0]), int(tail[2])]
        return found_range if len(found_range)!=0 else False
    except Exception as e:
        return False
|
18
lib/utils.py
18
lib/utils.py
|
@ -89,4 +89,20 @@ def validate_cuda_version(ver_str):
|
||||||
return False
|
return False
|
||||||
def generate_random_string(length):
    """Return a random string of `length` ASCII letters and digits."""
    alphanumeric = string.ascii_letters + string.digits
    return ''.join(random.choices(alphanumeric, k=length))
|
||||||
|
|
||||||
|
def hive_set_miner_status(enabled=False):
    ### control miner state - OFF/ON
    # Detects a running miner by looking for a screen session named "miner"
    # in `screen -ls` output, then only issues start/stop commands when the
    # observed state differs from the requested one.
    status = run_command("screen -ls")
    miner_is_up = False
    # screen -ls exits 0 or 1 depending on whether any sessions exist.
    if status[0] in (0, 1):
        for row in status[1].split('\n'):
            # Session lines look like "\tPID.name\t(...)\t(...)".
            fields = row.replace('\t', '', 1).split('\t')
            if len(fields) > 2 and '.' in fields[0]:
                if fields[0].split('.', 1)[1] == "miner":
                    miner_is_up = True
    if enabled and not miner_is_up:
        run_command("nvidia-oc && miner start")
    elif miner_is_up and not enabled:
        run_command("miner stop")
|
|
@ -6,4 +6,5 @@ speedtest-cli==2.1.3
|
||||||
psutil==5.9.0
|
psutil==5.9.0
|
||||||
python-iptables==1.0.1
|
python-iptables==1.0.1
|
||||||
websockets==12.0
|
websockets==12.0
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
|
git+https://git.clore.ai/clore/pynvml.git@main
|
Loading…
Reference in New Issue