V5.2.8 - when setting clock locks fails on HiveOS, fall back to nvtool

fix reporting RAM usage
V5.2.7 - core, mem lock
2024-11-03 23:28:03 +00:00 · 2024-10-31 04:29:52 +00:00 · 2024-10-31 02:32:47 +00:00 · 2024-10-29 10:00:25 +00:00 · 2024-10-17 17:01:41 +00:00 · 2024-10-05 18:53:29 +02:00
15 changed files with 662 additions and 69 deletions
--- a/clore_hosting/docker_configurator.py
+++ b/clore_hosting/docker_configurator.py
@ -4,6 +4,7 @@ from lib import custom_entrypoint
 from lib import networking
 from lib import wireguard
 from lib import logging as logging_lib
+from clore_hosting import utils as hosting_utils
 import shutil
 import os
 import re
@ -53,9 +54,12 @@ def configure(containers):

        for index, container in enumerate(containers):
            ok_custom_entrypoint = False
+            invalid_hostname = False
            if index < len(custom_entrypoint_state):
                ok_custom_entrypoint = custom_entrypoint_state[index]
            startup_script_name = f"{container['name']}.sh"
+            if "hostname" in container and not hosting_utils.validate_hostname(container["hostname"]):
+                invalid_hostname = True
            if "ip" in container and len(container["ip"])>6 and type(container["ip"])==str:
                if container["ip"][:8] == "; echo '":
                    last_occurrence, text_after_last_ip = get_last_ip_occurrence_and_text(container["ip"])
@ -95,14 +99,14 @@ def configure(containers):
                            newly_created_networks.append(container["network"])
                        else:
                            any_fail=True
-                    if not any_fail and ok_custom_entrypoint:
+                    if not any_fail and ok_custom_entrypoint and not invalid_hostname:
                        valid_containers.append(container)
            elif "network" in container and container["network"][:len(config.clore_network_name_prefix)]==config.clore_network_name_prefix: # Subnet & gateway not defined, must be some of default networks, otherwise dump it
                if container["network"] in default_network_names:
                    for docker_network in docker_networks:
                        if docker_network["Name"]==container["network"]:
                            for ipam in docker_network["IPAM"]:
-                                if not ok_custom_entrypoint:
+                                if not ok_custom_entrypoint or invalid_hostname:
                                    break
                                elif not "ip" in container:
                                    valid_containers.append(container)
--- a/clore_hosting/main.py
+++ b/clore_hosting/main.py
@ -2,6 +2,7 @@ from lib import config as config_module
 from lib import logging as logging_lib
 from lib import log_streaming_task
 from lib import run_startup_script
+from lib import hive_miner_interface
 from lib import docker_interface
 from lib import docker_deploy
 from lib import docker_pull
@ -41,9 +42,9 @@ async def configure_networks(containers):
    except Exception as e:
        return False
    
-async def deploy_containers(validated_containers):
+async def deploy_containers(validated_containers, allowed_running_containers):
    try:
-        all_running_container_names, all_stopped_container_names = await asyncio.to_thread(docker_deploy.deploy, validated_containers)
+        all_running_container_names, all_stopped_container_names = await asyncio.to_thread(docker_deploy.deploy, validated_containers, allowed_running_containers)
        return types.DeployContainersRes(all_running_container_names=all_running_container_names, all_stopped_container_names=all_stopped_container_names)
    except Exception as e:
        return False
@ -97,7 +98,8 @@ class CloreClient:
            "log_streaming_task": utils.unix_timestamp(),
            "container_log_streaming_service": utils.unix_timestamp(),
            "specs_service": utils.unix_timestamp(),
-            "oc_service": utils.unix_timestamp()
+            "oc_service": utils.unix_timestamp(),
+            "background_pow_data_collection": utils.unix_timestamp()
        }
        self.max_service_inactivity = 600 # seconds

@ -105,9 +107,24 @@ class CloreClient:
            self.ws_peers[str(config.debug_ws_peer)]={
                "expiration":"immune"
            }
-        
+
+        self.os_release = get_specs.get_os_release()
+        self.restart_docker = False
+        if "use_cgroupfs" in self.os_release:
+            self.updated_exec_opts = True if docker_interface.configure_exec_opts("native.cgroupdriver","cgroupfs") else False
+            if self.updated_exec_opts:
+                docker_info = docker_interface.get_info()
+                if "CgroupDriver" in docker_info and docker_info["CgroupDriver"]=="systemd":
+                    self.restart_docker = True # Restart docker when it's loaded under systemd (accual restart will happen only if no orders running to not disrupt workload)
+
        docker_interface.verify_docker_version()
-        nvml.init()
+
+        self.dont_use_hive_binaries = True if 'DONT_USE_HIVE_BINARIES' in os.environ else False
+
+        nvml.init(allow_hive_binaries=not self.dont_use_hive_binaries)
+
+        self.extra_allowed_images = utils.get_extra_allowed_images()
+        self.allowed_running_containers = utils.get_allowed_container_names()

        self.gpu_oc_specs = nvml.get_gpu_oc_specs()
        self.last_oc_service_submit = 0
@ -117,6 +134,9 @@ class CloreClient:
        self.is_hive = get_specs.is_hive()
        self.use_hive_flightsheet = False

+        self.hive_miner_interface = hive_miner_interface.hive_interface()
+        self.next_pow_background_job_send_update = 0
+
    async def service(self):
        global container_log_broken

@ -126,14 +146,15 @@ class CloreClient:
        task1 = asyncio.create_task(self.main(pull_list, monitoring))
        task2 = asyncio.create_task(self.handle_container_cache(pull_list, monitoring))
        task3 = asyncio.create_task(self.startup_script_runner(monitoring))
-        task4 = asyncio.create_task(log_streaming_task.log_streaming_task(container_log_broken, monitoring))
+        task4 = asyncio.create_task(log_streaming_task.log_streaming_task(container_log_broken, monitoring, self.allowed_running_containers))
        task5 = asyncio.create_task(self.container_log_streaming_service(monitoring))
        task6 = asyncio.create_task(self.specs_service(monitoring))
        task7 = asyncio.create_task(self.oc_service(monitoring))
+        task8 = asyncio.create_task(self.background_pow_data_collection(monitoring))
        monitoring_task = asyncio.create_task(self.monitoring_service(monitoring))

        # Wait for both tasks to complete (they won't in this case)
-        await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, monitoring_task)
+        await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, monitoring_task)

    async def monitoring_service(self, monitoring):
        while True:
@ -333,6 +354,7 @@ class CloreClient:
                    print("STEP",step,'|',self.containers_set, self.containers if config.log_containers_strings else '')

                tasks = []
+                running_order = False

                container_conf = WebSocketClient.get_containers()

@ -341,10 +363,11 @@ class CloreClient:
                    self.containers=container_conf[1]
                    tmp_images = []
                    for container in self.containers:
-                        if "image" in container:
+                        if "image" in container and "image" in container and container["image"]!="cloreai/hive-use-flightsheet":
                            log_pull = False
                            if "name" in container:
                                if "-order-" in container["name"]:
+                                    running_order=True
                                    log_pull=True
                            image_config = {
                                "image":container["image"],
@ -362,6 +385,12 @@ class CloreClient:

                            if not image_config in tmp_images:
                                tmp_images.append(image_config)
+
+                    if self.restart_docker and not running_order and len(self.containers)>0:
+                        log.debug("Sending docker restart command")
+                        utils.run_command_v2("systemctl restart docker")
+                        self.restart_docker=False
+                    
                    if tmp_images!=self.needed_images:
                        self.needed_images=tmp_images
                        await pull_list.put(self.needed_images)
@ -375,7 +404,7 @@ class CloreClient:
                    tasks.append(WebSocketClient.stream_pull_logs())

                if self.validated_containers_set:
-                    tasks.append(deploy_containers(self.validated_containers))
+                    tasks.append(deploy_containers(self.validated_containers, self.allowed_running_containers))

                if step==1:
                    WebSocketClient.set_auth(self.auth_key)
@ -397,7 +426,7 @@ class CloreClient:
                        if type(result)==types.ServerConfig:
                            if result.success:
                                self.last_checked_ws_peers = utils.unix_timestamp()
-                                self.allowed_images=result.allowed_images
+                                self.allowed_images=result.allowed_images+self.extra_allowed_images
                                if not config.debug_ws_peer:
                                    for pure_ws_peer in result.ws_peers:
                                        self.ws_peers[pure_ws_peer]={
@ -411,6 +440,7 @@ class CloreClient:
                                self.validated_containers_set=True
                                self.validated_containers = result.valid_containers
                                self.use_hive_flightsheet = result.use_hive_flightsheet
+                                log.debug(f"Use Hive flightsheet: {result.use_hive_flightsheet}")
                        elif type(result)==types.DeployContainersRes:
                            try:
                                self.all_running_container_names = result.all_running_container_names
@ -425,7 +455,7 @@ class CloreClient:
    async def submit_specs(self, current_specs):
        try:
            if type(current_specs) == dict:
-                current_specs["backend_version"]=9
+                current_specs["backend_version"]=18
                current_specs["update_hw"]=True
                smallest_pcie_width = 999
                for gpu in current_specs["gpus"]["nvidia"]:
@ -447,7 +477,7 @@ class CloreClient:
                    "update_realtime_data":True,
                    "gpus": gpu_list,
                    "cpu": cpu_usage,
-                    "ram": ram_usage,
+                    "ram": ram_usage.percent,
                    "all_running_container_names": self.all_running_container_names,
                    "all_stopped_container_names": self.all_stopped_container_names
                }
@ -474,10 +504,10 @@ class CloreClient:
                await monitoring.put("oc_service")
                oc_apply_allowed = True
                ### OC Service should also hande Hive stuff
-                if self.use_hive_flightsheet and self.is_hive:
+                if self.use_hive_flightsheet and self.is_hive and not self.dont_use_hive_binaries:
                    await set_hive_miner_status(True)
                    oc_apply_allowed = False # Don't apply any OC when running HiveOS miner
-                elif self.is_hive:
+                elif self.is_hive and not self.dont_use_hive_binaries:
                    await set_hive_miner_status(False)
                ### Run OC tasks
                oc_conf = WebSocketClient.get_oc()
@ -498,6 +528,22 @@ class CloreClient:
                log.debug(f"FAIL | oc_service() | {e}")
            await asyncio.sleep(2)

+    async def background_pow_data_collection(self, monitoring):
+        """Periodically submit HiveOS miner statistics over the websocket client.
+
+        Every pass reports liveness on the *monitoring* queue. On Hive hosts
+        where Hive binaries are not disabled, statistics including hashrates
+        are submitted when the miner uptime is between 1 and 59 seconds, or
+        when the next scheduled update time has passed.
+        """
+        while True:
+            try:
+                await monitoring.put("background_pow_data_collection")
+                if self.is_hive and not self.dont_use_hive_binaries:
+                    quick_stats = await self.hive_miner_interface.export_miner_stats(get_hashrates=False)
+                    uptime = quick_stats["miner_uptime"]
+                    if 0 < uptime < 60 or self.next_pow_background_job_send_update < time.time():
+                        # Schedule the next regular report five minutes from now
+                        self.next_pow_background_job_send_update = time.time() + 300
+                        full_stats = await self.hive_miner_interface.export_miner_stats(get_hashrates=True)
+                        delivered = await WebSocketClient.send({"submit_hashrates": full_stats})
+                        if not delivered:
+                            # Submission did not go through, retry in 40 seconds instead
+                            self.next_pow_background_job_send_update = time.time() + 40
+            except Exception as e:
+                log.debug(f"FAIL | background_pow_data_collection() | {e}")
+            await asyncio.sleep(6)
+
    def expire_ws_peers(self):
        for ws_peer_address in list(self.ws_peers.keys()):
            ws_peer_info = self.ws_peers[ws_peer_address]
--- a/clore_hosting/utils.py
+++ b/clore_hosting/utils.py
@ -10,5 +10,13 @@ def is_valid_websocket_url(url):
            return True
    return False

+def validate_hostname(hostname):
+    """Return True when *hostname* is an acceptable container hostname.
+
+    A valid hostname is a string of 1 to 63 characters drawn from ASCII
+    letters, digits, '.', '_' and '-'. Anything else, including non-string
+    values, is rejected with False rather than raising.
+    """
+    if not isinstance(hostname, str):
+        return False
+    # fullmatch is used instead of match with a trailing "$" because "$" also
+    # matches just before a final newline, which would let "name\n" through.
+    return re.fullmatch(r'[a-zA-Z0-9._-]{1,63}', hostname) is not None
+
 def unix_timestamp():
    return int(time.time())
--- a/lib/config.py
+++ b/lib/config.py
@ -33,7 +33,7 @@ hard_config = {
    "maximum_service_loop_time": 900, # Seconds, failsafe variable - if service is stuck processing longer than this timeframe it will lead into restarting the app
    "maximum_pull_service_loop_time": 14400, # Exception for image pulling
    "creation_engine": "wrapper", # "wrapper" or "sdk" | Wrapper - wrapped docker cli, SDK - docker sdk
-    "allow_mixed_gpus": False
+    "allow_mixed_gpus": True
 }

 parser = argparse.ArgumentParser(description='Example argparse usage')
@ -48,7 +48,8 @@ parser.add_argument('--startup-scripts-folder', type=str, default='/opt/clore-ho
 parser.add_argument('--wireguard-config-folder', type=str, default='/opt/clore-hosting/wireguard/configs', help='Folder with wireguard configs')
 parser.add_argument('--entrypoints-folder', type=str, default='/opt/clore-hosting/entrypoints', help='Folder with custom entrypoints')
 parser.add_argument('--debug-ws-peer', type=str, help="Specific ws peer to connect to (for debugging only)")
-parser.add_argument('--gpu-specs-file', type=str, default='/opt/clore-hosting/client/gpu_specs.json' ,help="Cache with specs of GPU possible OC/Power limit changes")
+parser.add_argument('--gpu-specs-file', type=str, default='/opt/clore-hosting/client/gpu_specs.json', help="Cache with specs of GPU possible OC/Power limit changes")
+parser.add_argument('--extra-allowed-images-file', type=str, default="/opt/clore-hosting/extra_allowed_images.json", help="Docker image whitelist, that are allowed by clore.ai hosting software")

 # Parse arguments, ignoring any non-defined arguments
 args, _ = parser.parse_known_args()
--- a/lib/custom_entrypoint.py
+++ b/lib/custom_entrypoint.py
@ -63,7 +63,7 @@ def cache_entrypoints(containers):
            else:
                valid_conf.append(True)
        for remaining_file in entrypoint_files: # We can remove files that are not needed anymore
-            os.remove(remaining_file)
+            os.remove(os.path.join(config.entrypoints_folder,remaining_file))
        return valid_conf
    except Exception as e:
        return 'e'
--- a/lib/docker_cli_wrapper.py
+++ b/lib/docker_cli_wrapper.py
@ -9,7 +9,7 @@ import docker
 config = config_module.config
 log = logging_lib.log

-def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
+def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30):
    # Sanitize and validate input
    container_options = sanitize_input(container_options)

@ -21,6 +21,9 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
    if "network_mode" in container_options:
        command.extend(["--network", container_options["network_mode"]])

+    if "hostname" in container_options:
+        command.extend(["--hostname", container_options["hostname"]])
+
    if "cap_add" in container_options:
        for cap in container_options["cap_add"]:
            command.extend(["--cap-add", cap])
@ -52,6 +55,10 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):

    if "runtime" in container_options:
        command.extend(["--runtime", container_options["runtime"]])
+    
+    if shm_size != 64:
+        command.extend(["--shm-size", f"{shm_size}m"])
+
    if docker_gpus:
        if type(docker_gpus)==list:
            command.extend(['--gpus', '"device=' + ','.join(str(gpu_id) for gpu_id in docker_gpus) + '"'])
--- a/lib/docker_deploy.py
+++ b/lib/docker_deploy.py
@ -3,15 +3,18 @@ from lib import logging as logging_lib
 from lib import docker_cli_wrapper
 from lib import docker_interface
 from lib import get_specs
+from lib import utils
 import docker
 from docker.types import EndpointConfig, NetworkingConfig
 import os

+shm_calculator = utils.shm_calculator(get_specs.get_total_ram_mb())
+
 client = docker_interface.client
 config = config_module.config
 log = logging_lib.log

-def deploy(validated_containers):
+def deploy(validated_containers, allowed_running_containers=[]):
    local_images = docker_interface.get_local_images()
    all_containers = docker_interface.get_containers(all=True)

@ -43,6 +46,7 @@ def deploy(validated_containers):

    for validated_container in validated_containers:
        try:
+            SHM_SIZE = 64 # MB - default

            image_ready = False
            docker_gpus = None
@ -76,12 +80,21 @@ def deploy(validated_containers):
                )
            }

+            if "hostname" in validated_container:
+                container_options["hostname"]=validated_container["hostname"]
+            elif "clore-order-" in validated_container["name"]:
+                try:
+                    container_options["hostname"] = f"O-{int(validated_container["name"][12:])}"
+                except Exception as eon:
+                    pass
            if "network" in validated_container:
                container_options["network_mode"]=validated_container["network"]
            if "ip" in validated_container and config.creation_engine=="sdk":
                del container_options["network_mode"]

            if "gpus" in validated_container and type(validated_container["gpus"])==bool:
+                if "clore-order-" in validated_container["name"]:
+                    SHM_SIZE = shm_calculator.calculate('*')
                container_options["runtime"]="nvidia"
                docker_gpus=True
                container_options["device_requests"].append(docker.types.DeviceRequest(count=-1, capabilities=[['gpu']]))
@ -121,9 +134,11 @@ def deploy(validated_containers):
            elif "entrypoint_command" in validated_container and type(validated_container["entrypoint_command"])==str and len(validated_container["entrypoint_command"])>0:
                container_options["entrypoint"]=validated_container["entrypoint_command"]

+            container_options["shm_size"] = f"{SHM_SIZE}m"
+
            if not validated_container["name"] in created_container_names and image_ready:
                if config.creation_engine == "wrapper":
-                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), docker_gpus=docker_gpus)
+                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus)
                else:
                    container = client.containers.create(**container_options)
                    if "ip" in validated_container:
@ -159,13 +174,13 @@ def deploy(validated_containers):
                    container.stop()
                except Exception as e:
                    pass
-            elif container.name not in paused_names+needed_running_names and container.status == 'running':
+            elif container.name not in paused_names+needed_running_names+allowed_running_containers and container.status == 'running':
                try:
                    container.stop()
                    container.remove()
                except Exception as e:
                    pass
-            elif container.name not in paused_names+needed_running_names:
+            elif container.name not in paused_names+needed_running_names+allowed_running_containers:
                try:
                    container.remove()
                except Exception as e:
--- a/lib/docker_interface.py
+++ b/lib/docker_interface.py
@ -50,6 +50,14 @@ class DockerNetwork(BaseModel):

 client = docker.from_env()
 low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')
+daemon_config_path = "/etc/docker/daemon.json"
+
+def get_info():
+    """Return the result of the Docker client's info() call, or an empty dict on any error."""
+    try:
+        return client.info()
+    except Exception:
+        return {}

 def check_docker_connection():
    try:
@ -346,16 +354,15 @@ def validate_and_secure_networks():


 def get_daemon_config():
-    config_path = "/etc/docker/daemon.json"
    try:
-        with open(config_path, 'r') as file:
+        with open(daemon_config_path, 'r') as file:
            config_data = json.load(file)
            return config_data
    except FileNotFoundError:
-        print(f"Error: {config_path} not found.")
+        print(f"Error: {daemon_config_path} not found.")
        return None
    except json.JSONDecodeError:
-        print(f"Error: Failed to parse JSON from {config_path}.")
+        print(f"Error: Failed to parse JSON from {daemon_config_path}.")
        return None
    
 def verify_docker_version(min_version="17.06"):
@ -367,4 +374,42 @@ def verify_docker_version(min_version="17.06"):
            os._exit(1)
    except Exception as e:
        log.error(f"Failed to verify docker version | {e}")
-        os._exit(1)
+        os._exit(1)
+
+def configure_exec_opts(key="native.cgroupdriver", value="cgroupfs"):
+    """Set, or remove when *value* is None, one "exec-opts" entry of the Docker daemon config.
+
+    Args:
+        key: Name of the exec option (the part before '=').
+        value: Desired value; None removes an existing entry for *key*.
+
+    Returns:
+        True once the daemon config file has been rewritten, the string
+        "Same" when the existing exec-opts already match (nothing is written),
+        and False when the config could not be loaded or the update failed.
+    """
+    deamon_config = get_daemon_config()
+    if deamon_config:
+        try:
+            if (not "exec-opts" in deamon_config or type(deamon_config["exec-opts"])!=list) and value is not None:
+                # No usable exec-opts list yet, start a new one
+                deamon_config["exec-opts"]=[f"{key}={value}"]
+            elif "exec-opts" in deamon_config:
+                new_exec_opts=[]
+                matched_key=False
+                for exec_opt in deamon_config["exec-opts"]:
+                    if '=' in exec_opt:
+                        exec_opt_key, exec_opt_value = exec_opt.split('=',1)
+                        if exec_opt_key==key:
+                            matched_key=True
+                            if value is not None:
+                                new_exec_opts.append(f"{key}={value}")
+                        else:
+                            new_exec_opts.append(exec_opt)
+                    else:
+                        new_exec_opts.append(exec_opt)
+                # Only add the option when there is a value to set; a removal
+                # request (value None) for an absent key must not write "key=None"
+                if not matched_key and value is not None:
+                    new_exec_opts.append(f"{key}={value}")
+                if len(new_exec_opts)==0:
+                    del deamon_config["exec-opts"]
+                else:
+                    if deamon_config["exec-opts"] == new_exec_opts:
+                        return "Same"
+                    deamon_config["exec-opts"]=new_exec_opts
+            json_string = json.dumps(deamon_config, indent=4)
+            with open(daemon_config_path, 'w') as file:
+                file.write(json_string)
+            return True
+        except Exception as e:
+            log.error(f"Failed 'configure_exec_opts' | {e}")
+            return False
+    else:
+        return False
--- a/lib/get_specs.py
+++ b/lib/get_specs.py
@ -43,6 +43,32 @@ def get_kernel():
 def is_hive():
    return "hive" in get_kernel()

+def get_total_ram_mb():
+    """Return the total RAM reported by psutil.virtual_memory(), divided by 1024**2."""
+    bytes_per_mb = 1024 ** 2
+    return psutil.virtual_memory().total / bytes_per_mb
+
+def get_os_release():
+    """Parse /etc/os-release into a dict and flag releases that need the cgroupfs driver.
+
+    Returns:
+        A dict of the KEY=value pairs found in the file, with surrounding
+        quotes removed. When NAME is "ubuntu" (case-insensitive) and VERSION_ID
+        is 22.04 or 22.10, the extra key "use_cgroupfs" is set to True.
+        An empty dict is returned when the file cannot be read or parsed.
+    """
+    try:
+        with open("/etc/os-release") as f:
+            os_info = f.read()
+        os_release = {}
+        for line in os_info.split('\n'):
+            line = line.strip()
+            if not line or line.startswith('#') or '=' not in line:
+                continue
+            key, value = line.split('=', 1)
+            # Values may be double-quoted, single-quoted or bare; keep all of
+            # them instead of only the double-quoted ones
+            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
+                value = value[1:-1]
+            os_release[key] = value
+
+        needed_cgroupfs_versions = ["22.04", "22.10"] # Mitigate issue https://github.com/NVIDIA/nvidia-docker/issues/1730
+
+        if "NAME" in os_release and "VERSION_ID" in os_release:
+            if os_release["NAME"].lower() == "ubuntu" and os_release["VERSION_ID"] in needed_cgroupfs_versions:
+                os_release["use_cgroupfs"]=True
+
+        return os_release
+    except Exception as e:
+        return {}
+
 def drop_caches():
    try:
        with open('/proc/sys/vm/drop_caches', 'w') as f:
@ -296,7 +322,7 @@ class Specs:
        gpu_str, gpu_mem, gpus, nvml_err = get_gpu_info()
        if require_same_gpus:
            last_gpu_name=''
-            for gpu in gpus:
+            for gpu in gpus["nvidia"]:
                if not last_gpu_name:
                    last_gpu_name=gpu["name"]
                elif last_gpu_name!=gpu["name"]:
--- a/lib/hive_miner_interface.py
+++ b/lib/hive_miner_interface.py
@ -0,0 +1,205 @@
+
+import aiofiles
+import asyncio
+import json
+import time
+import os
+import re
+
+def extract_json_with_key(text, key):
+    """Collect the JSON objects embedded in *text* whose serialized form contains *key*.
+
+    Candidates are the shortest '{...}' spans found in the text; spans that do
+    not parse as JSON are skipped. *key* is matched as a substring of the
+    re-serialized object, so it may also match inside a value.
+    """
+    matches = []
+    for candidate in re.findall(r'({.*?})', text, re.DOTALL):
+        try:
+            parsed = json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+        if key in json.dumps(parsed):
+            matches.append(parsed)
+    return matches
+
+async def async_run_bash_command(command):
+    """Run *command* through '/bin/bash -c' and wait for it to finish.
+
+    Returns:
+        A tuple (stdout, stderr, returncode) where both streams are decoded
+        and stripped of surrounding whitespace.
+    """
+    pipe = asyncio.subprocess.PIPE
+    proc = await asyncio.create_subprocess_exec('/bin/bash', '-c', command, stdout=pipe, stderr=pipe)
+    raw_out, raw_err = await proc.communicate()
+    out_text = raw_out.decode().strip()
+    err_text = raw_err.decode().strip()
+    return out_text, err_text, proc.returncode
+
+async def check_and_read_file(file_path):
+    """Return the text content of *file_path*, or the string "fail" when it is missing or unreadable."""
+    try:
+        if not os.path.exists(file_path):
+            return "fail"
+        async with aiofiles.open(file_path, mode='r') as handle:
+            return await handle.read()
+    except Exception:
+        return "fail"
+    
+async def get_session_start_time(pid):
+    """Return the start time of process *pid* as seconds since the epoch.
+
+    The value is the 'btime' entry of /proc/stat plus field index 21 of
+    /proc/<pid>/stat converted from clock ticks to seconds.
+
+    Returns:
+        The start time as a number, or None when a file is missing or any
+        other error occurs (the latter is also printed).
+    """
+    try:
+        async with aiofiles.open(f'/proc/{pid}/stat', 'r') as stat_file:
+            fields = (await stat_file.read()).split()
+            ticks_since_boot = int(fields[21])
+
+        ticks_per_second = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
+        seconds_since_boot = ticks_since_boot / ticks_per_second
+
+        boot_epoch = None
+        async with aiofiles.open('/proc/stat', 'r') as proc_stat:
+            async for entry in proc_stat:
+                if entry.startswith('btime'):
+                    boot_epoch = int(entry.split()[1])
+                    break
+
+        if boot_epoch is None:
+            raise ValueError("Failed to find boot time in /proc/stat")
+
+        return boot_epoch + seconds_since_boot
+
+    except FileNotFoundError:
+        return None
+    except Exception as e:
+        print(f"Error retrieving session start time: {e}")
+        return None
+
+async def get_miner_stats(miner_dir, api_timeout=15):
+    """Run the miner's stats scripts in *miner_dir* via bash and parse the reported stats.
+
+    The shell command sources h-manifest.conf and h-stats.sh inside *miner_dir*
+    and echoes the resulting $stats variable, which is then parsed as JSON.
+
+    Args:
+        miner_dir: Directory containing the miner's h-manifest.conf / h-stats.sh.
+        api_timeout: Exported as API_TIMEOUT; the shell 'read' waits API_TIMEOUT + 5 seconds.
+
+    Returns:
+        The parsed stats object, or the string 'fail' when the command produced
+        no usable output.
+    """
+    # NOTE(review): miner_dir is concatenated into the shell command unescaped - confirm callers only pass trusted paths
+    stdout, stderr, code = await async_run_bash_command(f'export PATH=$PATH:/hive/sbin:/hive/bin && export API_TIMEOUT={api_timeout}'+''' && read -t $((API_TIMEOUT + 5)) -d "" pid khs stats < <(function run_miner_scripts { echo "$BASHPID"; cd '''+miner_dir+''' || exit; cd '''+miner_dir+'''; source h-manifest.conf; cd '''+miner_dir+'''; source h-stats.sh; printf "%q\n" "$khs"; echo "$stats"; }; run_miner_scripts) && [[ $? -ge 128 ]] && [[ ! -z "$pid" && "$pid" -ne $$ ]] && kill -9 "$pid" 2>/dev/null ; echo $stats''')
+    try:
+        # Only clean runs (output present, empty stderr, exit code 0) are parsed
+        if stdout and not stderr and code == 0:
+            stats = json.loads(stdout)
+            return stats
+        else:
+            return 'fail'
+    except Exception as e:
+        # stdout was not a single JSON document: fall back to the first embedded
+        # JSON object that mentions "hs_units"
+        try:
+            miner_stat_jsons = extract_json_with_key(stdout, "hs_units")
+            return miner_stat_jsons[0] if len(miner_stat_jsons) > 0 else 'fail'
+        except Exception :
+            return 'fail'
+
+async def get_miner_uptime_stats():
+    """Return the start time of the first screen session named '<pid>.miner', or None.
+
+    Sessions are read from the output of 'screen -ls'; None is returned when
+    the command fails, writes to stderr, or lists no matching session.
+    """
+    listing, errors, exit_code = await async_run_bash_command("screen -ls")
+    if errors or exit_code != 0:
+        return None
+    for row in listing.split('\n'):
+        # Session rows are the ones starting with a tab
+        if row[:1] != '\t':
+            continue
+        session_name = row.split('\t')[1]
+        if not session_name.endswith(".miner"):
+            continue
+        screen_pid = session_name.split('.')[0]
+        if screen_pid.isdigit():
+            return await get_session_start_time(screen_pid)
+    return None
+    
+def extract_miner_names(rig_conf):
+    """Return the non-empty values of the MINER, MINER2, MINER3, ... entries in *rig_conf*.
+
+    *rig_conf* is the text of a KEY=value file. Only keys that are exactly
+    "MINER" or "MINER" followed by digits are used; values wrapped in double
+    quotes have the quotes stripped.
+    """
+    names = []
+    for entry in rig_conf.split('\n'):
+        if '=' not in entry:
+            continue
+        key, value = entry.split('=', 1)
+        suffix = str(key[5:])
+        if not key.startswith("MINER") or not (len(key) == 5 or suffix.isdigit()):
+            continue
+        if value.startswith('"') and value.endswith('"'):
+            value = value.strip('"')
+        if value:
+            names.append(value)
+    return names
+
+def extract_miner_config(miner_names, wallet_conf):
+    """Collect the algos and coins configured for each miner in *wallet_conf*.
+
+    Args:
+        miner_names: List of miner names; it is modified in place - a "custom"
+            entry is replaced by "custom/<value of CUSTOM_MINER>".
+        wallet_conf: Text of a KEY=value file.
+
+    Returns:
+        A tuple (miner_names, miners) where miners maps each miner name (as it
+        was before any "custom/..." rewrite) to {"algos": [...], "coins": [...]}.
+    """
+    lines = wallet_conf.split('\n')
+    meta_config = {}
+    miners = {}
+    for miner_idx, miner_name in enumerate(miner_names):
+        # Lines not claimed by this miner are handed on to the next one;
+        # comment lines and lines without '=' are dropped along the way
+        remaining_lines = []
+        algos = []
+        for line in lines:
+            if not line.startswith('#') and '=' in line:
+                key, value = line.split('=', 1)
+                if value.startswith('"') and value.endswith('"'):
+                    value = value.strip('"')
+                if value.startswith("'") and value.endswith("'"):
+                    value = value.strip("'")
+                # Keys belonging to this miner start with "<miner name>_", compared
+                # case-insensitively with '-' in the miner name replaced by '_'
+                if key[:len(miner_name.replace('-','_'))+1].lower() == f"{miner_name.replace('-','_')}_".lower():
+                    if key.split('_')[-1][:4].lower()=="algo":
+                        algos.append(value)
+                    elif miner_name.lower()=="custom" and key.lower()=="custom_miner":
+                        miner_names[miner_idx] = os.path.join(miner_names[miner_idx],value)
+                elif key.lower()=="meta":
+                    # META holds a JSON document; an unparsable value is ignored
+                    try:
+                        meta_config=json.loads(value)
+                    except Exception as e:
+                        pass
+                else:
+                    remaining_lines.append(line)
+        lines = remaining_lines
+        miners[miner_name]={
+            "algos":algos,
+            "coins":[]
+        }
+    for miner_name in miner_names:
+        # "custom/<name>" entries were stored in miners under the plain "custom" key
+        if "custom/" in miner_name.lower():
+            miner_name = "custom"
+        if miner_name in meta_config and type(meta_config[miner_name]) == dict:
+            for key in meta_config[miner_name].keys():
+                if "coin" in key:
+                    miners[miner_name]["coins"].append(meta_config[miner_name][key])
+    return miner_names, miners
+
+def sum_numbers_in_list(lst):
+    """Return the sum of *lst*, or an explanatory string when any element is not an int or float."""
+    for item in lst:
+        if not isinstance(item, (int, float)):
+            return "The list contains non-numeric elements."
+    return sum(lst)
+
+class hive_interface:
+    """Reads the HiveOS rig/wallet configuration and scrapes stats of the configured miners."""
+
+    def __init__(self):
+        self.hive_miners_dir = "/hive/miners"
+        self.hive_rig_config = "/hive-config/rig.conf"
+        self.hive_wallet_config = "/hive-config/wallet.conf"
+
+    async def get_miners_stats(self, miner_names):
+        """Scrape all miners in *miner_names* concurrently; results follow the input order."""
+        scrape_tasks = []
+        for miner_name in miner_names:
+            scrape_tasks.append(get_miner_stats(os.path.join(self.hive_miners_dir, miner_name)))
+        results = await asyncio.gather(*scrape_tasks)
+        return results
+
+    async def get_configured_miners(self):
+        """Read rig.conf and wallet.conf and return (miner_names, miner_config)."""
+        rig_conf, wallet_conf = await asyncio.gather(*[check_and_read_file(self.hive_rig_config), check_and_read_file(self.hive_wallet_config)])
+        miner_names = extract_miner_names(rig_conf)
+        miner_names, miner_config = extract_miner_config(miner_names, wallet_conf)
+        return miner_names, miner_config
+
+    async def export_miner_stats(self, get_hashrates=False):
+        """Build a report of miner uptime, configuration and (optionally) hashrates.
+
+        Args:
+            get_hashrates: When True, every configured miner is scraped and a
+                "hashrates" list is added to its entry, one value per "hs",
+                "hs2", "hs3", ... key of the scraped stats.
+
+        Returns:
+            A dict with "miner_uptime" (whole seconds, 0 when no miner start
+            time was found) and "miners" (per-miner configuration).
+        """
+        output = {
+            "miner_uptime": 0
+        }
+        miner_start_ts = await get_miner_uptime_stats()
+        if miner_start_ts:
+            output["miner_uptime"] = int(time.time()-miner_start_ts)
+        miner_names, miner_config = await self.get_configured_miners()
+        output["miners"]=miner_config
+        if get_hashrates:
+            miners_stats = await self.get_miners_stats(miner_names)
+            # Config entries of custom miners are keyed by plain "custom"
+            for idx, miner_name in enumerate(miner_names):
+                miner_names[idx] = "custom" if "custom/" in miner_name.lower() else miner_name
+            for idx, miner_stats in enumerate(miners_stats):
+                if type(miner_stats) == dict:
+                    for key in miner_stats.keys():
+                        if key[:2]=="hs" and (key=="hs" or key[2:].isdigit()):
+                            try:
+                                all_hs = sum_numbers_in_list(miner_stats[key])
+                                if not isinstance(all_hs, (float, int)):
+                                    # Non-numeric readings are reported as 0; this check must come
+                                    # before the unit conversion, which would raise on the error string
+                                    all_hs = 0
+                                elif miner_stats.get("hs_units")=="hs":
+                                    all_hs = all_hs/1000
+                                output['miners'][miner_names[idx]].setdefault("hashrates", []).append(all_hs)
+                            except Exception:
+                                # Best effort: a malformed entry is skipped so the rest of the report survives
+                                pass
+        return output
--- a/lib/init_server.py
+++ b/lib/init_server.py
@ -3,6 +3,7 @@ from lib import logging as logging_lib
 from lib import get_specs
 from lib import utils
 import threading
+import socket
 import aiohttp
 import asyncio
 import json
@ -47,9 +48,11 @@ async def register_server(data):
        "Content-Type": "application/json"
    }

-    async with aiohttp.ClientSession() as session:
+    connector = aiohttp.TCPConnector(family=socket.AF_INET)
+
+    async with aiohttp.ClientSession(connector=connector) as session:
        try:
-            async with session.post(url, data=json_data, headers=headers, timeout=5) as response:
+            async with session.post(url, data=json_data, headers=headers, timeout=15) as response:
                if response.status == 200:
                    # Successful response
                    response_data = await response.json()
--- a/lib/log_streaming_task.py
+++ b/lib/log_streaming_task.py
@ -10,7 +10,7 @@ from lib import container_logs
 from concurrent.futures import ThreadPoolExecutor
 import queue  # Import the synchronous queue module

-async def log_streaming_task(message_broker, monitoring):
+async def log_streaming_task(message_broker, monitoring, do_not_stream_containers):
    client = docker_interface.client
    executor = ThreadPoolExecutor(max_workers=4)
    tasks = {}
@ -29,14 +29,15 @@ async def log_streaming_task(message_broker, monitoring):

            # Start tasks for new containers
            for container_name, container in current_containers.items():
-                log_container_names.append(container_name)
-                if container_name not in tasks:
-                    log.debug(f"log_streaming_task() | Starting task for {container_name}")
-                    sync_queue = queue.Queue()
-                    task = asyncio.ensure_future(asyncio.get_event_loop().run_in_executor(
-                        executor, container_logs.stream_logs, container_name, sync_queue))
-                    tasks[container_name] = task
-                    queues[container_name] = sync_queue
+                if not container_name in do_not_stream_containers:
+                    log_container_names.append(container_name)
+                    if container_name not in tasks:
+                        log.debug(f"log_streaming_task() | Starting task for {container_name}")
+                        sync_queue = queue.Queue()
+                        task = asyncio.ensure_future(asyncio.get_event_loop().run_in_executor(
+                            executor, container_logs.stream_logs, container_name, sync_queue))
+                        tasks[container_name] = task
+                        queues[container_name] = sync_queue

            await message_broker.put(log_container_names)

--- a/lib/nvml.py
+++ b/lib/nvml.py
@ -6,20 +6,77 @@ config = config_module.config
 log = logging_lib.log

 import subprocess
-import pynvml
+import clore_pynvml as pynvml
 import json
+import math
+
+HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./"
+
+GPU_MEM_ALLOWED_OC_RANGES = { # Known to be problematic GPUs: fallback [min, max] memory offset range per GPU name, used by init() when the driver reports a 0/0 range or the min/max offset query fails with "function not found"
+    "NVIDIA P102-100": [-2000, 2000],
+    "NVIDIA P104-100": [-2000, 2000],
+    "NVIDIA P106-090": [-2000, 2000],
+    "NVIDIA P106-100": [-2000, 2000],
+    "NVIDIA GeForce GTX 1050 Ti": [-2000, 2000],
+    "NVIDIA GeForce GTX 1060 3GB": [-2000, 2000],
+    "NVIDIA GeForce GTX 1060 6GB": [-2000, 2000],
+    "NVIDIA GeForce GTX 1070": [-2000, 2000],
+    "NVIDIA GeForce GTX 1070 Ti": [-2000, 2000],
+    "NVIDIA GeForce GTX 1080": [-2000, 2000],
+    "NVIDIA GeForce GTX 1080 Ti": [-2000, 2000],
+    "NVIDIA CMP 30HX": [-2000, 6000],
+    "NVIDIA CMP 40HX": [-2000, 6000],
+    "NVIDIA CMP 50HX": [-2000, 6000],
+    "NVIDIA CMP 90HX": [-2000, 6000],
+    "NVIDIA GeForce GTX 1650": [-2000, 6000],
+    "NVIDIA GeForce GTX 1660 SUPER": [-2000, 6000],
+    "NVIDIA GeForce GTX 1660 Ti": [-2000, 6000],
+    "NVIDIA GeForce RTX 2060": [-2000, 6000],
+    "NVIDIA GeForce RTX 2060 SUPER": [-2000, 6000],
+    "NVIDIA GeForce RTX 2070": [-2000, 6000],
+    "NVIDIA GeForce RTX 2070 SUPER": [-2000, 6000],
+    "NVIDIA GeForce RTX 2080": [-2000, 6000],
+    "NVIDIA GeForce RTX 2080 Ti": [-2000, 6000]
+}
+
+GPU_CORE_ALLOWED_OC_RANGES = { # Known to be problematic GPUs: fallback [min, max] core offset range per GPU name, used by init() when the min/max offset query fails with "function not found"
+    "NVIDIA P102-100": [-200, 1200],
+    "NVIDIA P104-100": [-200, 1200],
+    "NVIDIA P106-090": [-200, 1200],
+    "NVIDIA P106-100": [-200, 1200],
+    "NVIDIA GeForce GTX 1050 Ti": [-200, 1200],
+    "NVIDIA GeForce GTX 1060 3GB": [-200, 1200],
+    "NVIDIA GeForce GTX 1060 6GB": [-200, 1200],
+    "NVIDIA GeForce GTX 1070": [-200, 1200],
+    "NVIDIA GeForce GTX 1070 Ti": [-200, 1200],
+    "NVIDIA GeForce GTX 1080": [-200, 1200],
+    "NVIDIA GeForce GTX 1080 Ti": [-200, 1200],
+    "NVIDIA CMP 30HX": [-1000, 1000],
+    "NVIDIA CMP 40HX": [-1000, 1000],
+    "NVIDIA CMP 50HX": [-1000, 1000],
+    "NVIDIA CMP 90HX": [-1000, 1000],
+    "NVIDIA GeForce GTX 1650": [-1000, 1000],
+    "NVIDIA GeForce GTX 1660 SUPER": [-1000, 1000],
+    "NVIDIA GeForce GTX 1660 Ti": [-1000, 1000],
+    "NVIDIA GeForce RTX 2060": [-1000, 1000],
+    "NVIDIA GeForce RTX 2060 SUPER": [-1000, 1000],
+    "NVIDIA GeForce RTX 2070": [-1000, 1000],
+    "NVIDIA GeForce RTX 2070 SUPER": [-1000, 1000],
+    "NVIDIA GeForce RTX 2080": [-1000, 1000],
+    "NVIDIA GeForce RTX 2080 Ti": [-1000, 1000]
+}

 is_hive = False
 all_gpus_data_list=[]
 get_data_fail=False

-def init(gpu_specs_file=None):
+def init(gpu_specs_file=None, allow_hive_binaries=True):
    global is_hive, all_gpus_data_list, get_data_fail
    log.info("Loading GPU OC specs [ working ]")
    try:
        pynvml.nvmlInit()
        kernel = get_specs.get_kernel()
-        if "hive" in kernel:
+        if "hive" in kernel and allow_hive_binaries:
            is_hive=True
        
        specs_file_loc = gpu_specs_file if gpu_specs_file else config.gpu_specs_file
@ -43,10 +100,15 @@ def init(gpu_specs_file=None):
                parsed_specs={}
                regenerate_specs=True
                break
-        
+            elif not "locks" in parsed_specs[f"{i}-{gpu_uuid}"]:
+                parsed_specs={}
+                regenerate_specs=True
+                break
+
        if regenerate_specs:
            for i in range(0,gpu_count):
                gpu_spec={}
+                mem_to_core_allowed_locks = get_gpu_locked_clocks(i)
                gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
                power_limits = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(gpu_handle)
@ -55,6 +117,7 @@ def init(gpu_specs_file=None):
                gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
                gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
                gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
+                gpu_spec["locks"] = mem_to_core_allowed_locks

                pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
                pci_bus_id = pci_info.bus
@ -64,22 +127,57 @@ def init(gpu_specs_file=None):

                mem_range = get_hive_clock_range(is_hive, i, "mem")
                core_range = get_hive_clock_range(is_hive, i, "core")
-                if type(mem_range) != list:
-                    pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
-                    failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
-                    failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
-                    if (not failure_min) and (not failure_max):
-                        mem_range=[min_oc_solution, max_oc_solution]
-                    pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
-                    pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
-                if type(core_range) != list:
-                    pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
-                    failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
-                    failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
-                    if (not failure_min) and (not failure_max):
-                        core_range=[min_oc_solution, max_oc_solution]
-                    pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
-                    pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
+                try:
+                    if type(mem_range) != list:
+                        pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
+                        failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
+                        failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
+                        if (not failure_min) and (not failure_max):
+                            mem_range=[min_oc_solution, max_oc_solution]
+                        pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
+                        pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
+                    if type(core_range) != list:
+                        pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
+                        failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
+                        failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
+                        if (not failure_min) and (not failure_max):
+                            core_range=[min_oc_solution, max_oc_solution]
+                        pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
+                        pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
+                except Exception as e_pinpointing:
+                    if "not supported" in str(e_pinpointing).lower():
+                        try:
+                            min_core_offset, max_core_offset = pynvml.nvmlDeviceGetGpcClkMinMaxVfOffset(gpu_handle)
+                            if min_core_offset>0:
+                                min_core_offset = min_core_offset - math.floor((2**32)/1000)
+                            if min_core_offset > -20000 and min_core_offset <= 0 and max_core_offset>=0 and min_core_offset < 20000:
+                                core_range=[min_core_offset, max_core_offset]
+                            else:
+                                core_range=[0,0]
+                            min_mem_offset, max_mem_offset = pynvml.nvmlDeviceGetMemClkMinMaxVfOffset(gpu_handle)
+                            if min_mem_offset>0:
+                                min_mem_offset = min_mem_offset - math.floor((2**32)/1000)
+                            if min_mem_offset==0 and max_mem_offset==0:
+                                if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
+                                    mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
+                                else:
+                                    mem_range = [0,0]
+                            elif min_mem_offset > -20000 and min_mem_offset <= 0 and max_mem_offset>=0 and max_mem_offset < 20000:
+                                mem_range=[min_mem_offset, max_mem_offset]
+                            else:
+                                mem_range=[0,0]
+                        except Exception as e2:
+                            if "function not found" in str(e2).lower():
+                                if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
+                                    mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
+                                else:
+                                    mem_range = [0,0]
+                                if gpu_spec["name"] in GPU_CORE_ALLOWED_OC_RANGES:
+                                    core_range = GPU_CORE_ALLOWED_OC_RANGES[gpu_spec["name"]]
+                                else:
+                                    core_range = [0,0]
+                            else:
+                                get_data_fail=True
                if type(mem_range) == list and type(core_range) == list and len(mem_range)==2 and len(core_range)==2:
                    gpu_spec["mem"]=mem_range
                    gpu_spec["core"]=core_range
@ -113,6 +211,19 @@ def get_gpu_oc_specs():
 def shutdown():
    pynvml.nvmlShutdown()

+def get_gpu_locked_clocks(gpu_index):  # Map a sample of the GPU's supported memory clocks to the [min, max] graphics clock supported at each; returns {} on any error
+    try:
+        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
+        mem_clocks = pynvml.nvmlDeviceGetSupportedMemoryClocks(handle)
+        mem_to_core = {}
+        for idx, mem_clock in enumerate(mem_clocks):
+            if idx < 12 or idx == len(mem_clocks)-1:  # only the first 12 entries plus the last one are recorded
+                graphics_clocks = pynvml.nvmlDeviceGetSupportedGraphicsClocks(handle, mem_clock)
+                mem_to_core[str(mem_clock)] = [min(graphics_clocks), max(graphics_clocks)]  # keys are the memory clocks converted to strings
+        return mem_to_core
+    except Exception as e:
+        return {}  # any exception (including a partially built mapping) yields an empty dict
+
 def handle_nn(input_int):
    if abs(4293967-input_int) < 10000:
        return input_int-4293967
@ -218,6 +329,7 @@ def pinpoint_oc_limits_positive(gpu_handle, core=False):
    return failure, found_solution

 def set_oc(settings):
+    global is_hive
    try:
        gpu_count = pynvml.nvmlDeviceGetCount()
        settings_keys = settings.keys()
@ -231,6 +343,10 @@ def set_oc(settings):
                }
            settings_keys = settings.keys()
            log.debug(f"Rewriting settings with: {json.dumps(settings)}")
+        
+        core_locks = []
+        mem_locks = []
+        any_lock_failure = False
        for oc_gpu_index in settings_keys:
            if oc_gpu_index.isdigit():
                oc_gpu_index=int(oc_gpu_index)
@ -238,13 +354,42 @@ def set_oc(settings):
                    gpu_oc_config = settings[str(oc_gpu_index)]
                    gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
                    gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)
-                    if "core" in gpu_oc_config:
+
+                    if "core_lock" in gpu_oc_config:
+                        core_lock = int(gpu_oc_config["core_lock"])
+                        core_locks.append(str(core_lock))
+                        try:
+                            pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
+                        except Exception as core_lock_exception:
+                            any_lock_failure=True
+                    else:
+                        core_locks.append('0')
+                        try:
+                            pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
+                        except Exception as core_lock_exception:
+                            any_lock_failure=True
+
+                    if "mem_lock" in gpu_oc_config:
+                        mem_lock = int(gpu_oc_config["mem_lock"])
+                        mem_locks.append(str(mem_lock))
+                        try:
+                            pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
+                        except Exception as mem_lock_exception:
+                            any_lock_failure=True
+                    else:
+                        mem_locks.append('0')
+                        try:
+                            pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
+                        except Exception as mem_lock_exception:
+                            any_lock_failure=True
+
+                    if "core" in gpu_oc_config: # Core offset
                        wanted_core_clock = int(round(gpu_oc_config["core"]*2))
                        if gpu_possible_ranges["core"][0] <= wanted_core_clock and wanted_core_clock <= gpu_possible_ranges["core"][1]:
                            pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, wanted_core_clock)
                        else:
                            log.error(f"Requested OC for GPU:{oc_gpu_index} (CORE) out of bound | {wanted_core_clock} | [{gpu_possible_ranges["core"][0]}, {gpu_possible_ranges["core"][1]}]")
-                    if "mem" in gpu_oc_config:
+                    if "mem" in gpu_oc_config: # Memory offset
                        wanted_mem_clock = int(round(gpu_oc_config["mem"]*2))
                        if gpu_possible_ranges["mem"][0] <= wanted_mem_clock and wanted_mem_clock <= gpu_possible_ranges["mem"][1]:
                            pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, wanted_mem_clock)
@ -256,6 +401,17 @@ def set_oc(settings):
                            pynvml.nvmlDeviceSetPowerManagementLimit(gpu_handle, wanted_power_limit_milliwatts)
                        else:
                            log.error(f"Requested OC for GPU:{oc_gpu_index} (POWER LIMIT) out of bound | {gpu_oc_config["pl"]} | [{gpu_possible_ranges["power_limits"][0]}, {gpu_possible_ranges["power_limits"][1]}]")
+        if is_hive and any_lock_failure and len(mem_locks)==len(core_locks):
+            try:
+                nvtool_commands = []
+                for idx, mem_lock in enumerate(mem_locks):
+                    core_lock = core_locks[idx]
+                    nvtool_commands.append(f"nvtool -i {str(idx)} --setmem {mem_lock} --setcore {core_lock}")
+                cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo {' && '.join(nvtool_commands)}"]
+                #print(cmd)
+                subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            except Exception as hive_oc_settings:
+                pass
        return True
    except Exception as e:
        log.error(f"set_oc | ERROR | {e}")
@ -267,7 +423,7 @@ def get_hive_clock_range(is_hive, gpu_index, part):
    if is_hive:
        try:
            flag = "--setmemoffset" if part=="mem" else "--setcoreoffset"
-            cmd = ["bash",'-c',f"nvtool -i 0 {flag} -100000"]
+            cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo nvtool -i {gpu_index} {flag} -100000"]

            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            lines = result.stdout.decode().splitlines()
@ -291,4 +447,17 @@ def get_hive_clock_range(is_hive, gpu_index, part):
        except Exception as e:
            return False
    else:
-        return False
+        return False
+
+def get_vram_per_gpu():  # Return a list with the total memory of each GPU, in device-index order
+    vram_per_gpu = []
+    try:
+        gpu_count = pynvml.nvmlDeviceGetCount()
+        for i in range(0,gpu_count):
+            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+            mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
+            vram_per_gpu.append(mem_info.total / 1024 ** 2)  # float; assuming NVML reports total in bytes this is MiB - confirm against the NVML docs
+    except Exception as e:
+        log.error(f"Failed loading get_vram_per_gpu() | {e}")
+        pass  # the error is only logged; whatever was collected before the failure is still returned
+    return vram_per_gpu
--- a/lib/utils.py
+++ b/lib/utils.py
@ -1,11 +1,13 @@
 from lib import config as config_module
 from lib import logging as logging_lib
+from lib import nvml
 import subprocess
 import hashlib
 import random
 import string
 import shlex
 import time
+import math
 import json
 import os

@ -41,12 +43,20 @@ def normalize_rule(rule_dict):

 def get_auth():  # Return the auth token: AUTH_TOKEN env var if present, otherwise the stripped contents of config.auth_file, otherwise ''
     try:
+        if 'AUTH_TOKEN' in os.environ:
+            return os.environ['AUTH_TOKEN']  # environment value takes precedence and is returned as-is (not stripped)
         auth_str = ''
         with open(config.auth_file, "r", encoding="utf-8") as file:
             auth_str = file.read().strip()
         return auth_str
     except Exception as e:
         return ''  # any failure (e.g. the auth file cannot be read) yields an empty token
+
+def get_allowed_container_names():  # Parse the comma-separated ALLOWED_CONTAINER_NAMES env variable into a list of names
+    allowed_container_names = os.getenv("ALLOWED_CONTAINER_NAMES")  # None when the variable is unset
+    if type(allowed_container_names)==str and len(allowed_container_names)>0:
+        return [x for x in allowed_container_names.split(',') if x]  # empty segments (e.g. from "a,,b") are dropped; names are not whitespace-trimmed
+    return []  # unset or empty variable -> empty list
    
 def unix_timestamp():
    return int(time.time())
@ -91,18 +101,70 @@ def generate_random_string(length):
    characters = string.ascii_letters + string.digits
    return ''.join(random.choice(characters) for _ in range(length))

+HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./"
+
 def hive_set_miner_status(enabled=False):  # Start or stop the HiveOS miner so that its state matches `enabled`, based on the "miner" screen sessions found
     ### control miner state - OFF/ON
     screen_out = run_command("screen -ls")  # indexed below as [0] = exit code, [1] = output text
     miner_screen_running = False
+    miner_screen_session_pids = []
     if screen_out[0] == 0 or screen_out[0] == 1:  # both exit codes 0 and 1 are treated as a usable listing
         screen_lines=screen_out[1].split('\n')
         for screen_line in screen_lines:
             screen_line_parts=screen_line.replace('\t', '', 1).split('\t')  # drop the first tab, then split the remaining tab-separated fields
             if len(screen_line_parts)>2 and '.' in screen_line_parts[0]:
                 if screen_line_parts[0].split('.',1)[1]=="miner":  # first field has the form "<pid>.<session name>"
+                    miner_screen_session_pids.append(screen_line_parts[0].split('.',1)[0])
                     miner_screen_running=True
-    if miner_screen_running and not enabled:
-        run_command("miner stop")
+    if len(miner_screen_session_pids) > 1: ## Something really bad going on, destroy all instances
+        for idx, miner_screen_session_pid in enumerate(miner_screen_session_pids):
+            run_command(f"kill -9 {miner_screen_session_pid}{' && screen -wipe' if idx==len(miner_screen_session_pids)-1 else ''}")  # "screen -wipe" is appended only to the last kill; nothing is restarted in this call
+    elif miner_screen_running and not enabled:
+        run_command(f"/bin/bash -c \"PATH={HIVE_PATH} && sudo /hive/bin/miner stop\"")
     elif enabled and not miner_screen_running:
-        run_command("nvidia-oc && miner start")
+        run_command(f"/bin/bash -c \"export PATH={HIVE_PATH} && sudo /hive/sbin/nvidia-oc && source ~/.bashrc ; sudo /hive/bin/miner start\"")  # because of the ';', "miner start" runs even if nvidia-oc or sourcing ~/.bashrc fails
+
+def get_extra_allowed_images():  # Load and validate the optional JSON list at config.extra_allowed_images_file; returns [] when it is missing, unreadable or malformed
+    if os.path.exists(config.extra_allowed_images_file):
+        try:
+            with open(config.extra_allowed_images_file, 'r') as file:
+                content = file.read()
+            
+            data = json.loads(content)
+            
+            if isinstance(data, list):
+                if all(isinstance(item, dict) and set(item.keys()) == {'repository', 'allowed_tags'} and isinstance(item['repository'], str) and isinstance(item['allowed_tags'], list) and all(isinstance(tag, str) for tag in item['allowed_tags']) for item in data):  # every item must be a dict with exactly the keys 'repository' (str) and 'allowed_tags' (list of str)
+                    return data
+                else:
+                    return []  # a single invalid item rejects the whole list
+            else:
+                return []  # top-level JSON value is not a list
+        except Exception as e:
+            log.error(f"get_extra_allowed_images() | error: {e}")  # read or JSON parse failure
+            return []
+    else:
+        return []  # file does not exist
+    
+class shm_calculator:  # Computes a shared-memory size for an instance from the VRAM of the GPUs it uses and the host RAM
+    def __init__(self, total_ram):  # total_ram: host RAM, presumably in MB to match the MB constants in calculate() - confirm with caller
+        self.total_ram = total_ram
+        self.gpu_vram_sizes = []  # filled lazily on the first calculate() call
+
+    def calculate(self, used_gpu_ids):  # used_gpu_ids: '*' for all GPUs, or a container of GPU indices; returns an int size (floored), never below 64
+        assume_ram_utilised = 2500 #MB
+        default_shm_size = 64 #MB
+        
+        if len(self.gpu_vram_sizes) == 0:
+            self.gpu_vram_sizes = nvml.get_vram_per_gpu()  # queried again on every call for as long as the result is empty
+
+        instance_vram_total = 0
+        total_vram_size = sum(self.gpu_vram_sizes)
+        for idx, value in enumerate(self.gpu_vram_sizes):
+            if used_gpu_ids == '*' or idx in used_gpu_ids:
+                instance_vram_total += value
+        if instance_vram_total == 0 or total_vram_size == 0:
+            return default_shm_size  # no matching GPU / no VRAM information -> default size
+        shm_size = instance_vram_total * 1.5 if instance_vram_total * 1.5 < self.total_ram - assume_ram_utilised else (  # 1.5x the instance's VRAM when that fits into RAM minus the assumed-used amount ...
+            instance_vram_total/total_vram_size * (self.total_ram - assume_ram_utilised)  # ... otherwise the instance's VRAM share of that remaining RAM
+        )
+        return math.floor(shm_size if shm_size > default_shm_size else default_shm_size)  # clamp to the default as a lower bound
--- a/requirements.txt
+++ b/requirements.txt
@ -7,4 +7,5 @@ psutil==5.9.0
 python-iptables==1.0.1
 websockets==12.0
 packaging==23.2
-git+https://git.clore.ai/clore/pynvml.git@main
+clore-pynvml==11.5.4
+requests==2.31.0
Author	SHA1	Message	Date
clore	68e7dc215d	V5.2.8 - when failing to set clock locks on HiveOS fallback on nvtool	2024-11-03 23:28:03 +00:00
clore	6c4995e19f	fix reporting RAM usage	2024-10-31 04:29:52 +00:00
clore	7faadc76ea	V5.2.7 - core, mem lock	2024-10-31 02:32:47 +00:00
clore	1375d5599e	allow mixed GPU on machine initialization	2024-10-29 10:00:25 +00:00
clore	d5620c64c4	allocate /dev/shm towards instances - V5.2.6	2024-10-17 17:01:41 +00:00
clore	d6f90ab497	better hiveos miner integration support	2024-10-05 18:53:29 +02:00
clore	81e2659024	correctly submit hashrates of custom miner - V5.2.5	2024-10-05 13:54:19 +02:00
clore	15e0810359	add './' to HIVE_PATH (fix for running some miners) - V5.2.4	2024-10-04 00:59:25 +00:00
clore	e71697cefa	prevent multiple miner screen sessions under HiveOS	2024-09-12 01:19:01 +00:00
clore	20d3d9b6c8	add hive bin paths	2024-09-12 01:05:33 +00:00
clore	7ec35382b2	detect if hashrate in hs -> khs	2024-09-12 00:39:00 +00:00
clore	36c7db5831	bump version	2024-09-08 23:33:46 +00:00
clore	c3e1b684fe	fix submit pow info timings	2024-09-08 23:33:06 +00:00
clore	2d1c15c7bf	force register_server requests to go throut IPv4	2024-09-08 22:00:57 +00:00
clore	6150cf48cb	submit hashrates, algos of background pow job	2024-09-08 21:51:03 +00:00
clore	cab037526a	ALLOWED_CONTAINER_NAMES env variable	2024-09-04 12:18:23 +00:00
clore	590dc4b65e	add optional whitelist for outside images	2024-09-04 00:51:52 +00:00
clore	5e35570d3c	DONT_USE_HIVE_BINARIES, AUTH_TOKEN env parameters	2024-09-03 23:17:46 +00:00
clore	7e63ca5218	V5.2.2 - hostnames, Failed to initialize NVML fix for ubuntu 22 hosts	2024-07-06 13:05:22 +00:00
clore	73d19b5cd7	V5.2.1 \| add dict for core clocks for problematic gpus on "function not found" error	2024-05-28 00:56:26 +00:00
clore	5e733fd0d6	V5.2.1 \| fix reading OC specs for older GPUs	2024-05-28 00:25:58 +00:00
clore	36d3026d5d	use requests==2.31.0	2024-05-22 11:54:57 +00:00
clore	79c17624f2	bump version	2024-05-22 00:43:38 +00:00
clore	2ef648df25	fix removing custom entrypoints	2024-05-22 00:35:33 +00:00
clore	c71597af16	use pynvml==11.5.0 instead of clore fork	2024-05-16 19:29:07 +00:00
clore	e2d309650b	use correct env for hive binaries	2024-05-16 15:17:11 +00:00
clore	504aa74f5e	fixes on use hive flightsheet flag	2024-05-16 13:57:00 +00:00
clore	35ce001a71	use full path for nvidia-oc	2024-05-16 12:15:38 +00:00
clore	1658ad4f66	use hive miner bin full paths	2024-05-16 12:05:56 +00:00
clore	12b4239cab	increase timeout to register server	2024-05-12 11:22:47 +00:00
clore	b0d7618592	fix	2024-05-12 11:20:34 +00:00