Compare commits

...

31 Commits

Author SHA1 Message Date
clore 68e7dc215d V5.2.8 - when failing to set clock locks on HiveOS fallback on nvtool 2024-11-03 23:28:03 +00:00
clore 6c4995e19f fix reporting RAM usage 2024-10-31 04:29:52 +00:00
clore 7faadc76ea V5.2.7 - core, mem lock 2024-10-31 02:32:47 +00:00
clore 1375d5599e allow mixed GPU on machine initialization 2024-10-29 10:00:25 +00:00
clore d5620c64c4 allocate /dev/shm towards instances - V5.2.6 2024-10-17 17:01:41 +00:00
clore d6f90ab497 better hiveos miner integration support 2024-10-05 18:53:29 +02:00
clore 81e2659024 correctly submit hashrates of custom miner - V5.2.5 2024-10-05 13:54:19 +02:00
clore 15e0810359 add './' to HIVE_PATH (fix for running some miners) - V5.2.4 2024-10-04 00:59:25 +00:00
clore e71697cefa prevent multiple miner screen sessions under HiveOS 2024-09-12 01:19:01 +00:00
clore 20d3d9b6c8 add hive bin paths 2024-09-12 01:05:33 +00:00
clore 7ec35382b2 detect if hashrate in hs -> khs 2024-09-12 00:39:00 +00:00
clore 36c7db5831 bump version 2024-09-08 23:33:46 +00:00
clore c3e1b684fe fix submit pow info timings 2024-09-08 23:33:06 +00:00
clore 2d1c15c7bf force register_server requests to go throut IPv4 2024-09-08 22:00:57 +00:00
clore 6150cf48cb submit hashrates, algos of background pow job 2024-09-08 21:51:03 +00:00
clore cab037526a ALLOWED_CONTAINER_NAMES env variable 2024-09-04 12:18:23 +00:00
clore 590dc4b65e add optional whitelist for outside images 2024-09-04 00:51:52 +00:00
clore 5e35570d3c DONT_USE_HIVE_BINARIES, AUTH_TOKEN env parameters 2024-09-03 23:17:46 +00:00
clore 7e63ca5218 V5.2.2 - hostnames, Failed to initialize NVML fix for ubuntu 22 hosts 2024-07-06 13:05:22 +00:00
clore 73d19b5cd7 V5.2.1 | add dict for core clocks for problematic gpus on "function not found" error 2024-05-28 00:56:26 +00:00
clore 5e733fd0d6 V5.2.1 | fix reading OC specs for older GPUs 2024-05-28 00:25:58 +00:00
clore 36d3026d5d use requests==2.31.0 2024-05-22 11:54:57 +00:00
clore 79c17624f2 bump version 2024-05-22 00:43:38 +00:00
clore 2ef648df25 fix removing custom entrypoints 2024-05-22 00:35:33 +00:00
clore c71597af16 use pynvml==11.5.0 instead of clore fork 2024-05-16 19:29:07 +00:00
clore e2d309650b use correct env for hive binaries 2024-05-16 15:17:11 +00:00
clore 504aa74f5e fixes on use hive flightsheet flag 2024-05-16 13:57:00 +00:00
clore 35ce001a71 use full path for nvidia-oc 2024-05-16 12:15:38 +00:00
clore 1658ad4f66 use hive miner bin full paths 2024-05-16 12:05:56 +00:00
clore 12b4239cab increase timeout to register server 2024-05-12 11:22:47 +00:00
clore b0d7618592 fix 2024-05-12 11:20:34 +00:00
15 changed files with 662 additions and 69 deletions

View File

@ -4,6 +4,7 @@ from lib import custom_entrypoint
from lib import networking
from lib import wireguard
from lib import logging as logging_lib
from clore_hosting import utils as hosting_utils
import shutil
import os
import re
@ -53,9 +54,12 @@ def configure(containers):
for index, container in enumerate(containers):
ok_custom_entrypoint = False
invalid_hostname = False
if index < len(custom_entrypoint_state):
ok_custom_entrypoint = custom_entrypoint_state[index]
startup_script_name = f"{container['name']}.sh"
if "hostname" in container and not hosting_utils.validate_hostname(container["hostname"]):
invalid_hostname = True
if "ip" in container and len(container["ip"])>6 and type(container["ip"])==str:
if container["ip"][:8] == "; echo '":
last_occurrence, text_after_last_ip = get_last_ip_occurrence_and_text(container["ip"])
@ -95,14 +99,14 @@ def configure(containers):
newly_created_networks.append(container["network"])
else:
any_fail=True
if not any_fail and ok_custom_entrypoint:
if not any_fail and ok_custom_entrypoint and not invalid_hostname:
valid_containers.append(container)
elif "network" in container and container["network"][:len(config.clore_network_name_prefix)]==config.clore_network_name_prefix: # Subnet & gateway not defined, must be some of default networks, otherwise dump it
if container["network"] in default_network_names:
for docker_network in docker_networks:
if docker_network["Name"]==container["network"]:
for ipam in docker_network["IPAM"]:
if not ok_custom_entrypoint:
if not ok_custom_entrypoint or invalid_hostname:
break
elif not "ip" in container:
valid_containers.append(container)

View File

@ -2,6 +2,7 @@ from lib import config as config_module
from lib import logging as logging_lib
from lib import log_streaming_task
from lib import run_startup_script
from lib import hive_miner_interface
from lib import docker_interface
from lib import docker_deploy
from lib import docker_pull
@ -41,9 +42,9 @@ async def configure_networks(containers):
except Exception as e:
return False
async def deploy_containers(validated_containers):
async def deploy_containers(validated_containers, allowed_running_containers):
try:
all_running_container_names, all_stopped_container_names = await asyncio.to_thread(docker_deploy.deploy, validated_containers)
all_running_container_names, all_stopped_container_names = await asyncio.to_thread(docker_deploy.deploy, validated_containers, allowed_running_containers)
return types.DeployContainersRes(all_running_container_names=all_running_container_names, all_stopped_container_names=all_stopped_container_names)
except Exception as e:
return False
@ -97,7 +98,8 @@ class CloreClient:
"log_streaming_task": utils.unix_timestamp(),
"container_log_streaming_service": utils.unix_timestamp(),
"specs_service": utils.unix_timestamp(),
"oc_service": utils.unix_timestamp()
"oc_service": utils.unix_timestamp(),
"background_pow_data_collection": utils.unix_timestamp()
}
self.max_service_inactivity = 600 # seconds
@ -105,9 +107,24 @@ class CloreClient:
self.ws_peers[str(config.debug_ws_peer)]={
"expiration":"immune"
}
self.os_release = get_specs.get_os_release()
self.restart_docker = False
if "use_cgroupfs" in self.os_release:
self.updated_exec_opts = True if docker_interface.configure_exec_opts("native.cgroupdriver","cgroupfs") else False
if self.updated_exec_opts:
docker_info = docker_interface.get_info()
if "CgroupDriver" in docker_info and docker_info["CgroupDriver"]=="systemd":
self.restart_docker = True # Restart docker when it's loaded under systemd (accual restart will happen only if no orders running to not disrupt workload)
docker_interface.verify_docker_version()
nvml.init()
self.dont_use_hive_binaries = True if 'DONT_USE_HIVE_BINARIES' in os.environ else False
nvml.init(allow_hive_binaries=not self.dont_use_hive_binaries)
self.extra_allowed_images = utils.get_extra_allowed_images()
self.allowed_running_containers = utils.get_allowed_container_names()
self.gpu_oc_specs = nvml.get_gpu_oc_specs()
self.last_oc_service_submit = 0
@ -117,6 +134,9 @@ class CloreClient:
self.is_hive = get_specs.is_hive()
self.use_hive_flightsheet = False
self.hive_miner_interface = hive_miner_interface.hive_interface()
self.next_pow_background_job_send_update = 0
async def service(self):
global container_log_broken
@ -126,14 +146,15 @@ class CloreClient:
task1 = asyncio.create_task(self.main(pull_list, monitoring))
task2 = asyncio.create_task(self.handle_container_cache(pull_list, monitoring))
task3 = asyncio.create_task(self.startup_script_runner(monitoring))
task4 = asyncio.create_task(log_streaming_task.log_streaming_task(container_log_broken, monitoring))
task4 = asyncio.create_task(log_streaming_task.log_streaming_task(container_log_broken, monitoring, self.allowed_running_containers))
task5 = asyncio.create_task(self.container_log_streaming_service(monitoring))
task6 = asyncio.create_task(self.specs_service(monitoring))
task7 = asyncio.create_task(self.oc_service(monitoring))
task8 = asyncio.create_task(self.background_pow_data_collection(monitoring))
monitoring_task = asyncio.create_task(self.monitoring_service(monitoring))
# Wait for both tasks to complete (they won't in this case)
await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, monitoring_task)
await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, monitoring_task)
async def monitoring_service(self, monitoring):
while True:
@ -333,6 +354,7 @@ class CloreClient:
print("STEP",step,'|',self.containers_set, self.containers if config.log_containers_strings else '')
tasks = []
running_order = False
container_conf = WebSocketClient.get_containers()
@ -341,10 +363,11 @@ class CloreClient:
self.containers=container_conf[1]
tmp_images = []
for container in self.containers:
if "image" in container:
if "image" in container and "image" in container and container["image"]!="cloreai/hive-use-flightsheet":
log_pull = False
if "name" in container:
if "-order-" in container["name"]:
running_order=True
log_pull=True
image_config = {
"image":container["image"],
@ -362,6 +385,12 @@ class CloreClient:
if not image_config in tmp_images:
tmp_images.append(image_config)
if self.restart_docker and not running_order and len(self.containers)>0:
log.debug("Sending docker restart command")
utils.run_command_v2("systemctl restart docker")
self.restart_docker=False
if tmp_images!=self.needed_images:
self.needed_images=tmp_images
await pull_list.put(self.needed_images)
@ -375,7 +404,7 @@ class CloreClient:
tasks.append(WebSocketClient.stream_pull_logs())
if self.validated_containers_set:
tasks.append(deploy_containers(self.validated_containers))
tasks.append(deploy_containers(self.validated_containers, self.allowed_running_containers))
if step==1:
WebSocketClient.set_auth(self.auth_key)
@ -397,7 +426,7 @@ class CloreClient:
if type(result)==types.ServerConfig:
if result.success:
self.last_checked_ws_peers = utils.unix_timestamp()
self.allowed_images=result.allowed_images
self.allowed_images=result.allowed_images+self.extra_allowed_images
if not config.debug_ws_peer:
for pure_ws_peer in result.ws_peers:
self.ws_peers[pure_ws_peer]={
@ -411,6 +440,7 @@ class CloreClient:
self.validated_containers_set=True
self.validated_containers = result.valid_containers
self.use_hive_flightsheet = result.use_hive_flightsheet
log.debug(f"Use Hive flightsheet: {result.use_hive_flightsheet}")
elif type(result)==types.DeployContainersRes:
try:
self.all_running_container_names = result.all_running_container_names
@ -425,7 +455,7 @@ class CloreClient:
async def submit_specs(self, current_specs):
try:
if type(current_specs) == dict:
current_specs["backend_version"]=9
current_specs["backend_version"]=18
current_specs["update_hw"]=True
smallest_pcie_width = 999
for gpu in current_specs["gpus"]["nvidia"]:
@ -447,7 +477,7 @@ class CloreClient:
"update_realtime_data":True,
"gpus": gpu_list,
"cpu": cpu_usage,
"ram": ram_usage,
"ram": ram_usage.percent,
"all_running_container_names": self.all_running_container_names,
"all_stopped_container_names": self.all_stopped_container_names
}
@ -474,10 +504,10 @@ class CloreClient:
await monitoring.put("oc_service")
oc_apply_allowed = True
### OC Service should also hande Hive stuff
if self.use_hive_flightsheet and self.is_hive:
if self.use_hive_flightsheet and self.is_hive and not self.dont_use_hive_binaries:
await set_hive_miner_status(True)
oc_apply_allowed = False # Don't apply any OC when running HiveOS miner
elif self.is_hive:
elif self.is_hive and not self.dont_use_hive_binaries:
await set_hive_miner_status(False)
### Run OC tasks
oc_conf = WebSocketClient.get_oc()
@ -498,6 +528,22 @@ class CloreClient:
log.debug(f"FAIL | oc_service() | {e}")
await asyncio.sleep(2)
async def background_pow_data_collection(self, monitoring):
while True:
try:
await monitoring.put("background_pow_data_collection")
if not self.dont_use_hive_binaries and self.is_hive:
miner_config = await self.hive_miner_interface.export_miner_stats(get_hashrates=False)
if (miner_config["miner_uptime"]>0 and miner_config["miner_uptime"]<60) or self.next_pow_background_job_send_update < time.time():
self.next_pow_background_job_send_update = time.time()+(5*60)
current_statistics = await self.hive_miner_interface.export_miner_stats(get_hashrates=True)
submit_result = await WebSocketClient.send({"submit_hashrates": current_statistics})
if not submit_result:
self.next_pow_background_job_send_update = time.time()+40
except Exception as e:
log.debug(f"FAIL | background_pow_data_collection() | {e}")
await asyncio.sleep(6)
def expire_ws_peers(self):
for ws_peer_address in list(self.ws_peers.keys()):
ws_peer_info = self.ws_peers[ws_peer_address]

View File

@ -10,5 +10,13 @@ def is_valid_websocket_url(url):
return True
return False
def validate_hostname(hostname):
# Define the regular expression pattern for a valid hostname
pattern = re.compile(r'^[a-zA-Z0-9._-]{1,63}$')
if pattern.match(hostname):
return True
else:
return False
def unix_timestamp():
return int(time.time())

View File

@ -33,7 +33,7 @@ hard_config = {
"maximum_service_loop_time": 900, # Seconds, failsafe variable - if service is stuck processing longer than this timeframe it will lead into restarting the app
"maximum_pull_service_loop_time": 14400, # Exception for image pulling
"creation_engine": "wrapper", # "wrapper" or "sdk" | Wrapper - wrapped docker cli, SDK - docker sdk
"allow_mixed_gpus": False
"allow_mixed_gpus": True
}
parser = argparse.ArgumentParser(description='Example argparse usage')
@ -48,7 +48,8 @@ parser.add_argument('--startup-scripts-folder', type=str, default='/opt/clore-ho
parser.add_argument('--wireguard-config-folder', type=str, default='/opt/clore-hosting/wireguard/configs', help='Folder with wireguard configs')
parser.add_argument('--entrypoints-folder', type=str, default='/opt/clore-hosting/entrypoints', help='Folder with custom entrypoints')
parser.add_argument('--debug-ws-peer', type=str, help="Specific ws peer to connect to (for debugging only)")
parser.add_argument('--gpu-specs-file', type=str, default='/opt/clore-hosting/client/gpu_specs.json' ,help="Cache with specs of GPU possible OC/Power limit changes")
parser.add_argument('--gpu-specs-file', type=str, default='/opt/clore-hosting/client/gpu_specs.json', help="Cache with specs of GPU possible OC/Power limit changes")
parser.add_argument('--extra-allowed-images-file', type=str, default="/opt/clore-hosting/extra_allowed_images.json", help="Docker image whitelist, that are allowed by clore.ai hosting software")
# Parse arguments, ignoring any non-defined arguments
args, _ = parser.parse_known_args()

View File

@ -63,7 +63,7 @@ def cache_entrypoints(containers):
else:
valid_conf.append(True)
for remaining_file in entrypoint_files: # We can remove files that are not needed anymore
os.remove(remaining_file)
os.remove(os.path.join(config.entrypoints_folder,remaining_file))
return valid_conf
except Exception as e:
return 'e'

View File

@ -9,7 +9,7 @@ import docker
config = config_module.config
log = logging_lib.log
def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30):
# Sanitize and validate input
container_options = sanitize_input(container_options)
@ -21,6 +21,9 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
if "network_mode" in container_options:
command.extend(["--network", container_options["network_mode"]])
if "hostname" in container_options:
command.extend(["--hostname", container_options["hostname"]])
if "cap_add" in container_options:
for cap in container_options["cap_add"]:
command.extend(["--cap-add", cap])
@ -52,6 +55,10 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
if "runtime" in container_options:
command.extend(["--runtime", container_options["runtime"]])
if shm_size != 64:
command.extend(["--shm-size", f"{shm_size}m"])
if docker_gpus:
if type(docker_gpus)==list:
command.extend(['--gpus', '"device=' + ','.join(str(gpu_id) for gpu_id in docker_gpus) + '"'])

View File

@ -3,15 +3,18 @@ from lib import logging as logging_lib
from lib import docker_cli_wrapper
from lib import docker_interface
from lib import get_specs
from lib import utils
import docker
from docker.types import EndpointConfig, NetworkingConfig
import os
shm_calculator = utils.shm_calculator(get_specs.get_total_ram_mb())
client = docker_interface.client
config = config_module.config
log = logging_lib.log
def deploy(validated_containers):
def deploy(validated_containers, allowed_running_containers=[]):
local_images = docker_interface.get_local_images()
all_containers = docker_interface.get_containers(all=True)
@ -43,6 +46,7 @@ def deploy(validated_containers):
for validated_container in validated_containers:
try:
SHM_SIZE = 64 # MB - default
image_ready = False
docker_gpus = None
@ -76,12 +80,21 @@ def deploy(validated_containers):
)
}
if "hostname" in validated_container:
container_options["hostname"]=validated_container["hostname"]
elif "clore-order-" in validated_container["name"]:
try:
container_options["hostname"] = f"O-{int(validated_container["name"][12:])}"
except Exception as eon:
pass
if "network" in validated_container:
container_options["network_mode"]=validated_container["network"]
if "ip" in validated_container and config.creation_engine=="sdk":
del container_options["network_mode"]
if "gpus" in validated_container and type(validated_container["gpus"])==bool:
if "clore-order-" in validated_container["name"]:
SHM_SIZE = shm_calculator.calculate('*')
container_options["runtime"]="nvidia"
docker_gpus=True
container_options["device_requests"].append(docker.types.DeviceRequest(count=-1, capabilities=[['gpu']]))
@ -121,9 +134,11 @@ def deploy(validated_containers):
elif "entrypoint_command" in validated_container and type(validated_container["entrypoint_command"])==str and len(validated_container["entrypoint_command"])>0:
container_options["entrypoint"]=validated_container["entrypoint_command"]
container_options["shm_size"] = f"{SHM_SIZE}m"
if not validated_container["name"] in created_container_names and image_ready:
if config.creation_engine == "wrapper":
docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), docker_gpus=docker_gpus)
docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus)
else:
container = client.containers.create(**container_options)
if "ip" in validated_container:
@ -159,13 +174,13 @@ def deploy(validated_containers):
container.stop()
except Exception as e:
pass
elif container.name not in paused_names+needed_running_names and container.status == 'running':
elif container.name not in paused_names+needed_running_names+allowed_running_containers and container.status == 'running':
try:
container.stop()
container.remove()
except Exception as e:
pass
elif container.name not in paused_names+needed_running_names:
elif container.name not in paused_names+needed_running_names+allowed_running_containers:
try:
container.remove()
except Exception as e:

View File

@ -50,6 +50,14 @@ class DockerNetwork(BaseModel):
client = docker.from_env()
low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')
daemon_config_path = "/etc/docker/daemon.json"
def get_info():
try:
client_info = client.info()
return client_info
except Exception as e:
return {}
def check_docker_connection():
try:
@ -346,16 +354,15 @@ def validate_and_secure_networks():
def get_daemon_config():
config_path = "/etc/docker/daemon.json"
try:
with open(config_path, 'r') as file:
with open(daemon_config_path, 'r') as file:
config_data = json.load(file)
return config_data
except FileNotFoundError:
print(f"Error: {config_path} not found.")
print(f"Error: {daemon_config_path} not found.")
return None
except json.JSONDecodeError:
print(f"Error: Failed to parse JSON from {config_path}.")
print(f"Error: Failed to parse JSON from {daemon_config_path}.")
return None
def verify_docker_version(min_version="17.06"):
@ -367,4 +374,42 @@ def verify_docker_version(min_version="17.06"):
os._exit(1)
except Exception as e:
log.error(f"Failed to verify docker version | {e}")
os._exit(1)
os._exit(1)
def configure_exec_opts(key="native.cgroupdriver", value="cgroupfs"):
deamon_config = get_daemon_config()
if deamon_config:
try:
if (not "exec-opts" in deamon_config or type(deamon_config["exec-opts"])!=list) and value!=None:
deamon_config["exec-opts"]=[f"{key}={value}"]
elif "exec-opts" in deamon_config:
new_exec_opts=[]
matched_key=False
for exec_opt in deamon_config["exec-opts"]:
if '=' in exec_opt:
exec_opt_key, exec_opt_value = exec_opt.split('=',1)
if exec_opt_key==key:
matched_key=True
if value!=None:
new_exec_opts.append(f"{key}={value}")
else:
new_exec_opts.append(exec_opt)
else:
new_exec_opts.append(exec_opt)
if not matched_key:
new_exec_opts.append(f"{key}={value}")
if len(new_exec_opts)==0:
del deamon_config["exec-opts"]
else:
if deamon_config["exec-opts"] == new_exec_opts:
return "Same"
deamon_config["exec-opts"]=new_exec_opts
json_string = json.dumps(deamon_config, indent=4)
with open(daemon_config_path, 'w') as file:
file.write(json_string)
return True
except Exception as e:
log.error(f"Failed 'configure_exec_opts' | {e}")
return False
else:
return False

View File

@ -43,6 +43,32 @@ def get_kernel():
def is_hive():
return "hive" in get_kernel()
def get_total_ram_mb():
total_ram = psutil.virtual_memory().total
return total_ram / (1024 ** 2)
def get_os_release():
try:
with open("/etc/os-release") as f:
os_info = f.read()
os_release = {}
for line in os_info.split('\n'):
if '=' in line:
key, value = line.split('=', 1)
if value[:1]=='"' and value.endswith('"'):
value = value[1:len(value)-1]
os_release[key]=value
needed_cgroupfs_versions = ["22.04", "22.10"] # Mitigate issue https://github.com/NVIDIA/nvidia-docker/issues/1730
if "NAME" in os_release and "VERSION_ID" in os_release:
if os_release["NAME"].lower() == "ubuntu" and os_release["VERSION_ID"] in needed_cgroupfs_versions:
os_release["use_cgroupfs"]=True
return os_release
except Exception as e:
return {}
def drop_caches():
try:
with open('/proc/sys/vm/drop_caches', 'w') as f:
@ -296,7 +322,7 @@ class Specs:
gpu_str, gpu_mem, gpus, nvml_err = get_gpu_info()
if require_same_gpus:
last_gpu_name=''
for gpu in gpus:
for gpu in gpus["nvidia"]:
if not last_gpu_name:
last_gpu_name=gpu["name"]
elif last_gpu_name!=gpu["name"]:

205
lib/hive_miner_interface.py Normal file
View File

@ -0,0 +1,205 @@
import aiofiles
import asyncio
import json
import time
import os
import re
def extract_json_with_key(text, key):
json_pattern = r'({.*?})'
json_strings = re.findall(json_pattern, text, re.DOTALL)
json_objects_with_key = []
for json_str in json_strings:
try:
json_obj = json.loads(json_str)
if key in json.dumps(json_obj):
json_objects_with_key.append(json_obj)
except json.JSONDecodeError:
continue
return json_objects_with_key
async def async_run_bash_command(command):
process = await asyncio.create_subprocess_exec(
'/bin/bash', '-c', command,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stdout, stderr = await process.communicate()
return stdout.decode().strip(), stderr.decode().strip(), process.returncode
async def check_and_read_file(file_path):
try:
if os.path.exists(file_path):
async with aiofiles.open(file_path, mode='r') as file:
contents = await file.read()
return contents
else:
return "fail"
except Exception as e:
return "fail"
async def get_session_start_time(pid):
try:
async with aiofiles.open(f'/proc/{pid}/stat', 'r') as file:
stat_info = (await file.read()).split()
start_time_ticks = int(stat_info[21])
clock_ticks_per_sec = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
start_time_seconds = start_time_ticks / clock_ticks_per_sec
boot_time = None
async with aiofiles.open('/proc/stat', 'r') as file:
async for line in file:
if line.startswith('btime'):
boot_time = int(line.split()[1])
break
if boot_time is None:
raise ValueError("Failed to find boot time in /proc/stat")
start_time = (boot_time + start_time_seconds)
return start_time
except FileNotFoundError:
return None
except Exception as e:
print(f"Error retrieving session start time: {e}")
return None
async def get_miner_stats(miner_dir, api_timeout=15):
stdout, stderr, code = await async_run_bash_command(f'export PATH=$PATH:/hive/sbin:/hive/bin && export API_TIMEOUT={api_timeout}'+''' && read -t $((API_TIMEOUT + 5)) -d "" pid khs stats < <(function run_miner_scripts { echo "$BASHPID"; cd '''+miner_dir+''' || exit; cd '''+miner_dir+'''; source h-manifest.conf; cd '''+miner_dir+'''; source h-stats.sh; printf "%q\n" "$khs"; echo "$stats"; }; run_miner_scripts) && [[ $? -ge 128 ]] && [[ ! -z "$pid" && "$pid" -ne $$ ]] && kill -9 "$pid" 2>/dev/null ; echo $stats''')
try:
if stdout and not stderr and code == 0:
stats = json.loads(stdout)
return stats
else:
return 'fail'
except Exception as e:
try:
miner_stat_jsons = extract_json_with_key(stdout, "hs_units")
return miner_stat_jsons[0] if len(miner_stat_jsons) > 0 else 'fail'
except Exception :
return 'fail'
async def get_miner_uptime_stats():
stdout, stderr, code = await async_run_bash_command("screen -ls")
if not stderr and code == 0:
for line in stdout.split('\n'):
if line[:1]=='\t':
if line.split('\t')[1].endswith(".miner"):
miner_screen_pid = line.split('\t')[1].split('.')[0]
if miner_screen_pid.isdigit():
miner_start_time = await get_session_start_time(miner_screen_pid)
return miner_start_time
return None
def extract_miner_names(rig_conf):
miner_names=[]
for line in rig_conf.split('\n'):
if '=' in line:
key, value = line.split('=', 1)
if key[:5]=="MINER" and (len(key)==5 or str(key[5:]).isdigit()):
if value.startswith('"') and value.endswith('"'):
value = value.strip('"')
if len(value)>0:
miner_names.append(value)
return miner_names
def extract_miner_config(miner_names, wallet_conf):
lines = wallet_conf.split('\n')
meta_config = {}
miners = {}
for miner_idx, miner_name in enumerate(miner_names):
remaining_lines = []
algos = []
for line in lines:
if not line.startswith('#') and '=' in line:
key, value = line.split('=', 1)
if value.startswith('"') and value.endswith('"'):
value = value.strip('"')
if value.startswith("'") and value.endswith("'"):
value = value.strip("'")
if key[:len(miner_name.replace('-','_'))+1].lower() == f"{miner_name.replace('-','_')}_".lower():
if key.split('_')[-1][:4].lower()=="algo":
algos.append(value)
elif miner_name.lower()=="custom" and key.lower()=="custom_miner":
miner_names[miner_idx] = os.path.join(miner_names[miner_idx],value)
elif key.lower()=="meta":
try:
meta_config=json.loads(value)
except Exception as e:
pass
else:
remaining_lines.append(line)
lines = remaining_lines
miners[miner_name]={
"algos":algos,
"coins":[]
}
for miner_name in miner_names:
if "custom/" in miner_name.lower():
miner_name = "custom"
if miner_name in meta_config and type(meta_config[miner_name]) == dict:
for key in meta_config[miner_name].keys():
if "coin" in key:
miners[miner_name]["coins"].append(meta_config[miner_name][key])
return miner_names, miners
def sum_numbers_in_list(lst):
if all(isinstance(i, (int, float)) for i in lst):
return sum(lst)
else:
return "The list contains non-numeric elements."
class hive_interface:
def __init__(self):
self.hive_miners_dir = "/hive/miners"
self.hive_rig_config = "/hive-config/rig.conf"
self.hive_wallet_config = "/hive-config/wallet.conf"
async def get_miners_stats(self, miner_names):
scrape_tasks = []
for miner_name in miner_names:
scrape_tasks.append(get_miner_stats(os.path.join(self.hive_miners_dir, miner_name)))
results = await asyncio.gather(*scrape_tasks)
return results
async def get_configured_miners(self):
rig_conf, wallet_conf = await asyncio.gather(*[check_and_read_file(self.hive_rig_config), check_and_read_file(self.hive_wallet_config)])
miner_names = extract_miner_names(rig_conf)
miner_names, miner_config = extract_miner_config(miner_names, wallet_conf)
return miner_names, miner_config
async def export_miner_stats(self, get_hashrates=False):
output = {
"miner_uptime": 0
}
miner_start_ts = await get_miner_uptime_stats()
if miner_start_ts:
output["miner_uptime"] = int(time.time()-miner_start_ts)
miner_names, miner_config = await self.get_configured_miners()
output["miners"]=miner_config
if get_hashrates:
miners_stats = await self.get_miners_stats(miner_names)
for idx, miner_name in enumerate(miner_names):
miner_names[idx] = "custom" if "custom/" in miner_name.lower() else miner_name
for idx, miner_stats in enumerate(miners_stats):
if type(miner_stats) == dict:
for key in miner_stats.keys():
if key[:2]=="hs" and (key=="hs" or key[2:].isdigit()):
all_hs = sum_numbers_in_list(miner_stats[key])
try:
if "hs_units" in miner_stats:
if miner_stats["hs_units"]=="hs":
all_hs = all_hs/1000
if not "hashrates" in output['miners'][miner_names[idx]]:
output['miners'][miner_names[idx]]["hashrates"]=[]
if isinstance(all_hs, (float, int)):
output['miners'][miner_names[idx]]["hashrates"].append(all_hs)
else:
output['miners'][miner_names[idx]]["hashrates"].append(0)
except Exception as e:
pass
return output

View File

@ -3,6 +3,7 @@ from lib import logging as logging_lib
from lib import get_specs
from lib import utils
import threading
import socket
import aiohttp
import asyncio
import json
@ -47,9 +48,11 @@ async def register_server(data):
"Content-Type": "application/json"
}
async with aiohttp.ClientSession() as session:
connector = aiohttp.TCPConnector(family=socket.AF_INET)
async with aiohttp.ClientSession(connector=connector) as session:
try:
async with session.post(url, data=json_data, headers=headers, timeout=5) as response:
async with session.post(url, data=json_data, headers=headers, timeout=15) as response:
if response.status == 200:
# Successful response
response_data = await response.json()

View File

@ -10,7 +10,7 @@ from lib import container_logs
from concurrent.futures import ThreadPoolExecutor
import queue # Import the synchronous queue module
async def log_streaming_task(message_broker, monitoring):
async def log_streaming_task(message_broker, monitoring, do_not_stream_containers):
client = docker_interface.client
executor = ThreadPoolExecutor(max_workers=4)
tasks = {}
@ -29,14 +29,15 @@ async def log_streaming_task(message_broker, monitoring):
# Start tasks for new containers
for container_name, container in current_containers.items():
log_container_names.append(container_name)
if container_name not in tasks:
log.debug(f"log_streaming_task() | Starting task for {container_name}")
sync_queue = queue.Queue()
task = asyncio.ensure_future(asyncio.get_event_loop().run_in_executor(
executor, container_logs.stream_logs, container_name, sync_queue))
tasks[container_name] = task
queues[container_name] = sync_queue
if not container_name in do_not_stream_containers:
log_container_names.append(container_name)
if container_name not in tasks:
log.debug(f"log_streaming_task() | Starting task for {container_name}")
sync_queue = queue.Queue()
task = asyncio.ensure_future(asyncio.get_event_loop().run_in_executor(
executor, container_logs.stream_logs, container_name, sync_queue))
tasks[container_name] = task
queues[container_name] = sync_queue
await message_broker.put(log_container_names)

View File

@ -6,20 +6,77 @@ config = config_module.config
log = logging_lib.log
import subprocess
import pynvml
import clore_pynvml as pynvml
import json
import math
HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./"
GPU_MEM_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
"NVIDIA P102-100": [-2000, 2000],
"NVIDIA P104-100": [-2000, 2000],
"NVIDIA P106-090": [-2000, 2000],
"NVIDIA P106-100": [-2000, 2000],
"NVIDIA GeForce GTX 1050 Ti": [-2000, 2000],
"NVIDIA GeForce GTX 1060 3GB": [-2000, 2000],
"NVIDIA GeForce GTX 1060 6GB": [-2000, 2000],
"NVIDIA GeForce GTX 1070": [-2000, 2000],
"NVIDIA GeForce GTX 1070 Ti": [-2000, 2000],
"NVIDIA GeForce GTX 1080": [-2000, 2000],
"NVIDIA GeForce GTX 1080 Ti": [-2000, 2000],
"NVIDIA CMP 30HX": [-2000, 6000],
"NVIDIA CMP 40HX": [-2000, 6000],
"NVIDIA CMP 50HX": [-2000, 6000],
"NVIDIA CMP 90HX": [-2000, 6000],
"NVIDIA GeForce GTX 1650": [-2000, 6000],
"NVIDIA GeForce GTX 1660 SUPER": [-2000, 6000],
"NVIDIA GeForce GTX 1660 Ti": [-2000, 6000],
"NVIDIA GeForce RTX 2060": [-2000, 6000],
"NVIDIA GeForce RTX 2060 SUPER": [-2000, 6000],
"NVIDIA GeForce RTX 2070": [-2000, 6000],
"NVIDIA GeForce RTX 2070 SUPER": [-2000, 6000],
"NVIDIA GeForce RTX 2080": [-2000, 6000],
"NVIDIA GeForce RTX 2080 Ti": [-2000, 6000]
}
GPU_CORE_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
"NVIDIA P102-100": [-200, 1200],
"NVIDIA P104-100": [-200, 1200],
"NVIDIA P106-090": [-200, 1200],
"NVIDIA P106-100": [-200, 1200],
"NVIDIA GeForce GTX 1050 Ti": [-200, 1200],
"NVIDIA GeForce GTX 1060 3GB": [-200, 1200],
"NVIDIA GeForce GTX 1060 6GB": [-200, 1200],
"NVIDIA GeForce GTX 1070": [-200, 1200],
"NVIDIA GeForce GTX 1070 Ti": [-200, 1200],
"NVIDIA GeForce GTX 1080": [-200, 1200],
"NVIDIA GeForce GTX 1080 Ti": [-200, 1200],
"NVIDIA CMP 30HX": [-1000, 1000],
"NVIDIA CMP 40HX": [-1000, 1000],
"NVIDIA CMP 50HX": [-1000, 1000],
"NVIDIA CMP 90HX": [-1000, 1000],
"NVIDIA GeForce GTX 1650": [-1000, 1000],
"NVIDIA GeForce GTX 1660 SUPER": [-1000, 1000],
"NVIDIA GeForce GTX 1660 Ti": [-1000, 1000],
"NVIDIA GeForce RTX 2060": [-1000, 1000],
"NVIDIA GeForce RTX 2060 SUPER": [-1000, 1000],
"NVIDIA GeForce RTX 2070": [-1000, 1000],
"NVIDIA GeForce RTX 2070 SUPER": [-1000, 1000],
"NVIDIA GeForce RTX 2080": [-1000, 1000],
"NVIDIA GeForce RTX 2080 Ti": [-1000, 1000]
}
is_hive = False
all_gpus_data_list=[]
get_data_fail=False
def init(gpu_specs_file=None):
def init(gpu_specs_file=None, allow_hive_binaries=True):
global is_hive, all_gpus_data_list, get_data_fail
log.info("Loading GPU OC specs [ working ]")
try:
pynvml.nvmlInit()
kernel = get_specs.get_kernel()
if "hive" in kernel:
if "hive" in kernel and allow_hive_binaries:
is_hive=True
specs_file_loc = gpu_specs_file if gpu_specs_file else config.gpu_specs_file
@ -43,10 +100,15 @@ def init(gpu_specs_file=None):
parsed_specs={}
regenerate_specs=True
break
elif not "locks" in parsed_specs[f"{i}-{gpu_uuid}"]:
parsed_specs={}
regenerate_specs=True
break
if regenerate_specs:
for i in range(0,gpu_count):
gpu_spec={}
mem_to_core_allowed_locks = get_gpu_locked_clocks(i)
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
power_limits = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(gpu_handle)
@ -55,6 +117,7 @@ def init(gpu_specs_file=None):
gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
gpu_spec["locks"] = mem_to_core_allowed_locks
pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
pci_bus_id = pci_info.bus
@ -64,22 +127,57 @@ def init(gpu_specs_file=None):
mem_range = get_hive_clock_range(is_hive, i, "mem")
core_range = get_hive_clock_range(is_hive, i, "core")
if type(mem_range) != list:
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
if (not failure_min) and (not failure_max):
mem_range=[min_oc_solution, max_oc_solution]
pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
if type(core_range) != list:
pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
if (not failure_min) and (not failure_max):
core_range=[min_oc_solution, max_oc_solution]
pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
try:
if type(mem_range) != list:
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
if (not failure_min) and (not failure_max):
mem_range=[min_oc_solution, max_oc_solution]
pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
if type(core_range) != list:
pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
if (not failure_min) and (not failure_max):
core_range=[min_oc_solution, max_oc_solution]
pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
except Exception as e_pinpointing:
if "not supported" in str(e_pinpointing).lower():
try:
min_core_offset, max_core_offset = pynvml.nvmlDeviceGetGpcClkMinMaxVfOffset(gpu_handle)
if min_core_offset>0:
min_core_offset = min_core_offset - math.floor((2**32)/1000)
if min_core_offset > -20000 and min_core_offset <= 0 and max_core_offset>=0 and min_core_offset < 20000:
core_range=[min_core_offset, max_core_offset]
else:
core_range=[0,0]
min_mem_offset, max_mem_offset = pynvml.nvmlDeviceGetMemClkMinMaxVfOffset(gpu_handle)
if min_mem_offset>0:
min_mem_offset = min_mem_offset - math.floor((2**32)/1000)
if min_mem_offset==0 and max_mem_offset==0:
if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
else:
mem_range = [0,0]
elif min_mem_offset > -20000 and min_mem_offset <= 0 and max_mem_offset>=0 and max_mem_offset < 20000:
mem_range=[min_mem_offset, max_mem_offset]
else:
mem_range=[0,0]
except Exception as e2:
if "function not found" in str(e2).lower():
if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
else:
mem_range = [0,0]
if gpu_spec["name"] in GPU_CORE_ALLOWED_OC_RANGES:
core_range = GPU_CORE_ALLOWED_OC_RANGES[gpu_spec["name"]]
else:
core_range = [0,0]
else:
get_data_fail=True
if type(mem_range) == list and type(core_range) == list and len(mem_range)==2 and len(core_range)==2:
gpu_spec["mem"]=mem_range
gpu_spec["core"]=core_range
@ -113,6 +211,19 @@ def get_gpu_oc_specs():
def shutdown():
pynvml.nvmlShutdown()
def get_gpu_locked_clocks(gpu_index):
try:
handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
mem_clocks = pynvml.nvmlDeviceGetSupportedMemoryClocks(handle)
mem_to_core = {}
for idx, mem_clock in enumerate(mem_clocks):
if idx < 12 or idx == len(mem_clocks)-1:
graphics_clocks = pynvml.nvmlDeviceGetSupportedGraphicsClocks(handle, mem_clock)
mem_to_core[str(mem_clock)] = [min(graphics_clocks), max(graphics_clocks)]
return mem_to_core
except Exception as e:
return {}
def handle_nn(input_int):
if abs(4293967-input_int) < 10000:
return input_int-4293967
@ -218,6 +329,7 @@ def pinpoint_oc_limits_positive(gpu_handle, core=False):
return failure, found_solution
def set_oc(settings):
global is_hive
try:
gpu_count = pynvml.nvmlDeviceGetCount()
settings_keys = settings.keys()
@ -231,6 +343,10 @@ def set_oc(settings):
}
settings_keys = settings.keys()
log.debug(f"Rewriting settings with: {json.dumps(settings)}")
core_locks = []
mem_locks = []
any_lock_failure = False
for oc_gpu_index in settings_keys:
if oc_gpu_index.isdigit():
oc_gpu_index=int(oc_gpu_index)
@ -238,13 +354,42 @@ def set_oc(settings):
gpu_oc_config = settings[str(oc_gpu_index)]
gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)
if "core" in gpu_oc_config:
if "core_lock" in gpu_oc_config:
core_lock = int(gpu_oc_config["core_lock"])
core_locks.append(str(core_lock))
try:
pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
except Exception as core_lock_exception:
any_lock_failure=True
else:
core_locks.append('0')
try:
pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
except Exception as core_lock_exception:
any_lock_failure=True
if "mem_lock" in gpu_oc_config:
mem_lock = int(gpu_oc_config["mem_lock"])
mem_locks.append(str(mem_lock))
try:
pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
except Exception as mem_lock_exception:
any_lock_failure=True
else:
mem_locks.append('0')
try:
pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
except Exception as mem_lock_exception:
any_lock_failure=True
if "core" in gpu_oc_config: # Core offset
wanted_core_clock = int(round(gpu_oc_config["core"]*2))
if gpu_possible_ranges["core"][0] <= wanted_core_clock and wanted_core_clock <= gpu_possible_ranges["core"][1]:
pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, wanted_core_clock)
else:
log.error(f"Requested OC for GPU:{oc_gpu_index} (CORE) out of bound | {wanted_core_clock} | [{gpu_possible_ranges["core"][0]}, {gpu_possible_ranges["core"][1]}]")
if "mem" in gpu_oc_config:
if "mem" in gpu_oc_config: # Memory offset
wanted_mem_clock = int(round(gpu_oc_config["mem"]*2))
if gpu_possible_ranges["mem"][0] <= wanted_mem_clock and wanted_mem_clock <= gpu_possible_ranges["mem"][1]:
pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, wanted_mem_clock)
@ -256,6 +401,17 @@ def set_oc(settings):
pynvml.nvmlDeviceSetPowerManagementLimit(gpu_handle, wanted_power_limit_milliwatts)
else:
log.error(f"Requested OC for GPU:{oc_gpu_index} (POWER LIMIT) out of bound | {gpu_oc_config["pl"]} | [{gpu_possible_ranges["power_limits"][0]}, {gpu_possible_ranges["power_limits"][1]}]")
if is_hive and any_lock_failure and len(mem_locks)==len(core_locks):
try:
nvtool_commands = []
for idx, mem_lock in enumerate(mem_locks):
core_lock = core_locks[idx]
nvtool_commands.append(f"nvtool -i {str(idx)} --setmem {mem_lock} --setcore {core_lock}")
cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo {' && '.join(nvtool_commands)}"]
#print(cmd)
subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except Exception as hive_oc_settings:
pass
return True
except Exception as e:
log.error(f"set_oc | ERROR | {e}")
@ -267,7 +423,7 @@ def get_hive_clock_range(is_hive, gpu_index, part):
if is_hive:
try:
flag = "--setmemoffset" if part=="mem" else "--setcoreoffset"
cmd = ["bash",'-c',f"nvtool -i 0 {flag} -100000"]
cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo nvtool -i {gpu_index} {flag} -100000"]
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
lines = result.stdout.decode().splitlines()
@ -291,4 +447,17 @@ def get_hive_clock_range(is_hive, gpu_index, part):
except Exception as e:
return False
else:
return False
return False
def get_vram_per_gpu():
vram_per_gpu = []
try:
gpu_count = pynvml.nvmlDeviceGetCount()
for i in range(0,gpu_count):
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
vram_per_gpu.append(mem_info.total / 1024 ** 2)
except Exception as e:
log.error(f"Failed loading get_vram_per_gpu() | {e}")
pass
return vram_per_gpu

View File

@ -1,11 +1,13 @@
from lib import config as config_module
from lib import logging as logging_lib
from lib import nvml
import subprocess
import hashlib
import random
import string
import shlex
import time
import math
import json
import os
@ -41,12 +43,20 @@ def normalize_rule(rule_dict):
def get_auth():
try:
if 'AUTH_TOKEN' in os.environ:
return os.environ['AUTH_TOKEN']
auth_str = ''
with open(config.auth_file, "r", encoding="utf-8") as file:
auth_str = file.read().strip()
return auth_str
except Exception as e:
return ''
def get_allowed_container_names():
allowed_container_names = os.getenv("ALLOWED_CONTAINER_NAMES")
if type(allowed_container_names)==str and len(allowed_container_names)>0:
return [x for x in allowed_container_names.split(',') if x]
return []
def unix_timestamp():
return int(time.time())
@ -91,18 +101,70 @@ def generate_random_string(length):
characters = string.ascii_letters + string.digits
return ''.join(random.choice(characters) for _ in range(length))
HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./"
def hive_set_miner_status(enabled=False):
### control miner state - OFF/ON
screen_out = run_command("screen -ls")
miner_screen_running = False
miner_screen_session_pids = []
if screen_out[0] == 0 or screen_out[0] == 1:
screen_lines=screen_out[1].split('\n')
for screen_line in screen_lines:
screen_line_parts=screen_line.replace('\t', '', 1).split('\t')
if len(screen_line_parts)>2 and '.' in screen_line_parts[0]:
if screen_line_parts[0].split('.',1)[1]=="miner":
miner_screen_session_pids.append(screen_line_parts[0].split('.',1)[0])
miner_screen_running=True
if miner_screen_running and not enabled:
run_command("miner stop")
if len(miner_screen_session_pids) > 1: ## Something really bad going on, destroy all instances
for idx, miner_screen_session_pid in enumerate(miner_screen_session_pids):
run_command(f"kill -9 {miner_screen_session_pid}{' && screen -wipe' if idx==len(miner_screen_session_pids)-1 else ''}")
elif miner_screen_running and not enabled:
run_command(f"/bin/bash -c \"PATH={HIVE_PATH} && sudo /hive/bin/miner stop\"")
elif enabled and not miner_screen_running:
run_command("nvidia-oc && miner start")
run_command(f"/bin/bash -c \"export PATH={HIVE_PATH} && sudo /hive/sbin/nvidia-oc && source ~/.bashrc ; sudo /hive/bin/miner start\"")
def get_extra_allowed_images():
if os.path.exists(config.extra_allowed_images_file):
try:
with open(config.extra_allowed_images_file, 'r') as file:
content = file.read()
data = json.loads(content)
if isinstance(data, list):
if all(isinstance(item, dict) and set(item.keys()) == {'repository', 'allowed_tags'} and isinstance(item['repository'], str) and isinstance(item['allowed_tags'], list) and all(isinstance(tag, str) for tag in item['allowed_tags']) for item in data):
return data
else:
return []
else:
return []
except Exception as e:
log.error(f"get_extra_allowed_images() | error: {e}")
return []
else:
return []
class shm_calculator:
def __init__(self, total_ram):
self.total_ram = total_ram
self.gpu_vram_sizes = []
def calculate(self, used_gpu_ids):
assume_ram_utilised = 2500 #MB
default_shm_size = 64 #MB
if len(self.gpu_vram_sizes) == 0:
self.gpu_vram_sizes = nvml.get_vram_per_gpu()
instance_vram_total = 0
total_vram_size = sum(self.gpu_vram_sizes)
for idx, value in enumerate(self.gpu_vram_sizes):
if used_gpu_ids == '*' or idx in used_gpu_ids:
instance_vram_total += value
if instance_vram_total == 0 or total_vram_size == 0:
return default_shm_size
shm_size = instance_vram_total * 1.5 if instance_vram_total * 1.5 < self.total_ram - assume_ram_utilised else (
instance_vram_total/total_vram_size * (self.total_ram - assume_ram_utilised)
)
return math.floor(shm_size if shm_size > default_shm_size else default_shm_size)

View File

@ -7,4 +7,5 @@ psutil==5.9.0
python-iptables==1.0.1
websockets==12.0
packaging==23.2
git+https://git.clore.ai/clore/pynvml.git@main
clore-pynvml==11.5.4
requests==2.31.0