Compare commits

..

31 Commits

Author SHA1 Message Date
clore 68e7dc215d V5.2.8 - when failing to set clock locks on HiveOS fallback on nvtool 2024-11-03 23:28:03 +00:00
clore 6c4995e19f fix reporting RAM usage 2024-10-31 04:29:52 +00:00
clore 7faadc76ea V5.2.7 - core, mem lock 2024-10-31 02:32:47 +00:00
clore 1375d5599e allow mixed GPU on machine initialization 2024-10-29 10:00:25 +00:00
clore d5620c64c4 allocate /dev/shm towards instances - V5.2.6 2024-10-17 17:01:41 +00:00
clore d6f90ab497 better hiveos miner integration support 2024-10-05 18:53:29 +02:00
clore 81e2659024 correctly submit hashrates of custom miner - V5.2.5 2024-10-05 13:54:19 +02:00
clore 15e0810359 add './' to HIVE_PATH (fix for running some miners) - V5.2.4 2024-10-04 00:59:25 +00:00
clore e71697cefa prevent multiple miner screen sessions under HiveOS 2024-09-12 01:19:01 +00:00
clore 20d3d9b6c8 add hive bin paths 2024-09-12 01:05:33 +00:00
clore 7ec35382b2 detect if hashrate in hs -> khs 2024-09-12 00:39:00 +00:00
clore 36c7db5831 bump version 2024-09-08 23:33:46 +00:00
clore c3e1b684fe fix submit pow info timings 2024-09-08 23:33:06 +00:00
clore 2d1c15c7bf force register_server requests to go throut IPv4 2024-09-08 22:00:57 +00:00
clore 6150cf48cb submit hashrates, algos of background pow job 2024-09-08 21:51:03 +00:00
clore cab037526a ALLOWED_CONTAINER_NAMES env variable 2024-09-04 12:18:23 +00:00
clore 590dc4b65e add optional whitelist for outside images 2024-09-04 00:51:52 +00:00
clore 5e35570d3c DONT_USE_HIVE_BINARIES, AUTH_TOKEN env parameters 2024-09-03 23:17:46 +00:00
clore 7e63ca5218 V5.2.2 - hostnames, Failed to initialize NVML fix for ubuntu 22 hosts 2024-07-06 13:05:22 +00:00
clore 73d19b5cd7 V5.2.1 | add dict for core clocks for problematic gpus on "function not found" error 2024-05-28 00:56:26 +00:00
clore 5e733fd0d6 V5.2.1 | fix reading OC specs for older GPUs 2024-05-28 00:25:58 +00:00
clore 36d3026d5d use requests==2.31.0 2024-05-22 11:54:57 +00:00
clore 79c17624f2 bump version 2024-05-22 00:43:38 +00:00
clore 2ef648df25 fix removing custom entrypoints 2024-05-22 00:35:33 +00:00
clore c71597af16 use pynvml==11.5.0 instead of clore fork 2024-05-16 19:29:07 +00:00
clore e2d309650b use correct env for hive binaries 2024-05-16 15:17:11 +00:00
clore 504aa74f5e fixes on use hive flightsheet flag 2024-05-16 13:57:00 +00:00
clore 35ce001a71 use full path for nvidia-oc 2024-05-16 12:15:38 +00:00
clore 1658ad4f66 use hive miner bin full paths 2024-05-16 12:05:56 +00:00
clore 12b4239cab increase timeout to register server 2024-05-12 11:22:47 +00:00
clore b0d7618592 fix 2024-05-12 11:20:34 +00:00
15 changed files with 662 additions and 69 deletions

View File

@@ -4,6 +4,7 @@ from lib import custom_entrypoint
from lib import networking
from lib import wireguard
from lib import logging as logging_lib
+from clore_hosting import utils as hosting_utils
import shutil
import os
import re
@@ -53,9 +54,12 @@ def configure(containers):
    for index, container in enumerate(containers):
        ok_custom_entrypoint = False
+        invalid_hostname = False
        if index < len(custom_entrypoint_state):
            ok_custom_entrypoint = custom_entrypoint_state[index]
        startup_script_name = f"{container['name']}.sh"
+        if "hostname" in container and not hosting_utils.validate_hostname(container["hostname"]):
+            invalid_hostname = True
        if "ip" in container and len(container["ip"])>6 and type(container["ip"])==str:
            if container["ip"][:8] == "; echo '":
                last_occurrence, text_after_last_ip = get_last_ip_occurrence_and_text(container["ip"])
@@ -95,14 +99,14 @@ def configure(containers):
                    newly_created_networks.append(container["network"])
                else:
                    any_fail=True
-            if not any_fail and ok_custom_entrypoint:
+            if not any_fail and ok_custom_entrypoint and not invalid_hostname:
                valid_containers.append(container)
        elif "network" in container and container["network"][:len(config.clore_network_name_prefix)]==config.clore_network_name_prefix: # Subnet & gateway not defined, must be some of default networks, otherwise dump it
            if container["network"] in default_network_names:
                for docker_network in docker_networks:
                    if docker_network["Name"]==container["network"]:
                        for ipam in docker_network["IPAM"]:
-                            if not ok_custom_entrypoint:
+                            if not ok_custom_entrypoint or invalid_hostname:
                                break
                            elif not "ip" in container:
                                valid_containers.append(container)

View File

@@ -2,6 +2,7 @@ from lib import config as config_module
from lib import logging as logging_lib
from lib import log_streaming_task
from lib import run_startup_script
+from lib import hive_miner_interface
from lib import docker_interface
from lib import docker_deploy
from lib import docker_pull
@@ -41,9 +42,9 @@ async def configure_networks(containers):
    except Exception as e:
        return False

-async def deploy_containers(validated_containers):
+async def deploy_containers(validated_containers, allowed_running_containers):
    try:
-        all_running_container_names, all_stopped_container_names = await asyncio.to_thread(docker_deploy.deploy, validated_containers)
+        all_running_container_names, all_stopped_container_names = await asyncio.to_thread(docker_deploy.deploy, validated_containers, allowed_running_containers)
        return types.DeployContainersRes(all_running_container_names=all_running_container_names, all_stopped_container_names=all_stopped_container_names)
    except Exception as e:
        return False
@@ -97,7 +98,8 @@ class CloreClient:
            "log_streaming_task": utils.unix_timestamp(),
            "container_log_streaming_service": utils.unix_timestamp(),
            "specs_service": utils.unix_timestamp(),
-            "oc_service": utils.unix_timestamp()
+            "oc_service": utils.unix_timestamp(),
+            "background_pow_data_collection": utils.unix_timestamp()
        }
        self.max_service_inactivity = 600 # seconds
@@ -106,8 +108,23 @@ class CloreClient:
            "expiration":"immune"
        }

+        self.os_release = get_specs.get_os_release()
+        self.restart_docker = False
+        if "use_cgroupfs" in self.os_release:
+            self.updated_exec_opts = True if docker_interface.configure_exec_opts("native.cgroupdriver","cgroupfs") else False
+            if self.updated_exec_opts:
+                docker_info = docker_interface.get_info()
+                if "CgroupDriver" in docker_info and docker_info["CgroupDriver"]=="systemd":
+                    self.restart_docker = True # Restart docker when it's loaded under systemd (accual restart will happen only if no orders running to not disrupt workload)
+
        docker_interface.verify_docker_version()
-        nvml.init()
+        self.dont_use_hive_binaries = True if 'DONT_USE_HIVE_BINARIES' in os.environ else False
+        nvml.init(allow_hive_binaries=not self.dont_use_hive_binaries)
+
+        self.extra_allowed_images = utils.get_extra_allowed_images()
+        self.allowed_running_containers = utils.get_allowed_container_names()

        self.gpu_oc_specs = nvml.get_gpu_oc_specs()
        self.last_oc_service_submit = 0
@@ -117,6 +134,9 @@ class CloreClient:
        self.is_hive = get_specs.is_hive()
        self.use_hive_flightsheet = False

+        self.hive_miner_interface = hive_miner_interface.hive_interface()
+        self.next_pow_background_job_send_update = 0
+
    async def service(self):
        global container_log_broken
@@ -126,14 +146,15 @@ class CloreClient:
        task1 = asyncio.create_task(self.main(pull_list, monitoring))
        task2 = asyncio.create_task(self.handle_container_cache(pull_list, monitoring))
        task3 = asyncio.create_task(self.startup_script_runner(monitoring))
-        task4 = asyncio.create_task(log_streaming_task.log_streaming_task(container_log_broken, monitoring))
+        task4 = asyncio.create_task(log_streaming_task.log_streaming_task(container_log_broken, monitoring, self.allowed_running_containers))
        task5 = asyncio.create_task(self.container_log_streaming_service(monitoring))
        task6 = asyncio.create_task(self.specs_service(monitoring))
        task7 = asyncio.create_task(self.oc_service(monitoring))
+        task8 = asyncio.create_task(self.background_pow_data_collection(monitoring))
        monitoring_task = asyncio.create_task(self.monitoring_service(monitoring))

        # Wait for both tasks to complete (they won't in this case)
-        await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, monitoring_task)
+        await asyncio.gather(task1, task2, task3, task4, task5, task6, task7, task8, monitoring_task)

    async def monitoring_service(self, monitoring):
        while True:
@@ -333,6 +354,7 @@ class CloreClient:
        print("STEP",step,'|',self.containers_set, self.containers if config.log_containers_strings else '')

        tasks = []
+        running_order = False

        container_conf = WebSocketClient.get_containers()
@@ -341,10 +363,11 @@ class CloreClient:
            self.containers=container_conf[1]
            tmp_images = []
            for container in self.containers:
-                if "image" in container:
+                if "image" in container and "image" in container and container["image"]!="cloreai/hive-use-flightsheet":
                    log_pull = False
                    if "name" in container:
                        if "-order-" in container["name"]:
+                            running_order=True
                            log_pull=True
                    image_config = {
                        "image":container["image"],
@@ -362,6 +385,12 @@ class CloreClient:
                    if not image_config in tmp_images:
                        tmp_images.append(image_config)

+            if self.restart_docker and not running_order and len(self.containers)>0:
+                log.debug("Sending docker restart command")
+                utils.run_command_v2("systemctl restart docker")
+                self.restart_docker=False
+
            if tmp_images!=self.needed_images:
                self.needed_images=tmp_images
                await pull_list.put(self.needed_images)
@@ -375,7 +404,7 @@ class CloreClient:
            tasks.append(WebSocketClient.stream_pull_logs())

        if self.validated_containers_set:
-            tasks.append(deploy_containers(self.validated_containers))
+            tasks.append(deploy_containers(self.validated_containers, self.allowed_running_containers))

        if step==1:
            WebSocketClient.set_auth(self.auth_key)
@@ -397,7 +426,7 @@ class CloreClient:
                if type(result)==types.ServerConfig:
                    if result.success:
                        self.last_checked_ws_peers = utils.unix_timestamp()
-                        self.allowed_images=result.allowed_images
+                        self.allowed_images=result.allowed_images+self.extra_allowed_images
                        if not config.debug_ws_peer:
                            for pure_ws_peer in result.ws_peers:
                                self.ws_peers[pure_ws_peer]={
@@ -411,6 +440,7 @@ class CloreClient:
                        self.validated_containers_set=True
                        self.validated_containers = result.valid_containers
                    self.use_hive_flightsheet = result.use_hive_flightsheet
+                    log.debug(f"Use Hive flightsheet: {result.use_hive_flightsheet}")
                elif type(result)==types.DeployContainersRes:
                    try:
                        self.all_running_container_names = result.all_running_container_names
@@ -425,7 +455,7 @@ class CloreClient:
    async def submit_specs(self, current_specs):
        try:
            if type(current_specs) == dict:
-                current_specs["backend_version"]=9
+                current_specs["backend_version"]=18
                current_specs["update_hw"]=True
                smallest_pcie_width = 999
                for gpu in current_specs["gpus"]["nvidia"]:
@@ -447,7 +477,7 @@ class CloreClient:
                    "update_realtime_data":True,
                    "gpus": gpu_list,
                    "cpu": cpu_usage,
-                    "ram": ram_usage,
+                    "ram": ram_usage.percent,
                    "all_running_container_names": self.all_running_container_names,
                    "all_stopped_container_names": self.all_stopped_container_names
                }
@@ -474,10 +504,10 @@ class CloreClient:
            await monitoring.put("oc_service")
            oc_apply_allowed = True
            ### OC Service should also hande Hive stuff
-            if self.use_hive_flightsheet and self.is_hive:
+            if self.use_hive_flightsheet and self.is_hive and not self.dont_use_hive_binaries:
                await set_hive_miner_status(True)
                oc_apply_allowed = False # Don't apply any OC when running HiveOS miner
-            elif self.is_hive:
+            elif self.is_hive and not self.dont_use_hive_binaries:
                await set_hive_miner_status(False)
            ### Run OC tasks
            oc_conf = WebSocketClient.get_oc()
@@ -498,6 +528,22 @@ class CloreClient:
                log.debug(f"FAIL | oc_service() | {e}")
            await asyncio.sleep(2)

+    async def background_pow_data_collection(self, monitoring):
+        while True:
+            try:
+                await monitoring.put("background_pow_data_collection")
+                if not self.dont_use_hive_binaries and self.is_hive:
+                    miner_config = await self.hive_miner_interface.export_miner_stats(get_hashrates=False)
+                    if (miner_config["miner_uptime"]>0 and miner_config["miner_uptime"]<60) or self.next_pow_background_job_send_update < time.time():
+                        self.next_pow_background_job_send_update = time.time()+(5*60)
+                        current_statistics = await self.hive_miner_interface.export_miner_stats(get_hashrates=True)
+                        submit_result = await WebSocketClient.send({"submit_hashrates": current_statistics})
+                        if not submit_result:
+                            self.next_pow_background_job_send_update = time.time()+40
+            except Exception as e:
+                log.debug(f"FAIL | background_pow_data_collection() | {e}")
+            await asyncio.sleep(6)
+
    def expire_ws_peers(self):
        for ws_peer_address in list(self.ws_peers.keys()):
            ws_peer_info = self.ws_peers[ws_peer_address]

View File

@@ -10,5 +10,13 @@ def is_valid_websocket_url(url):
            return True
    return False

+def validate_hostname(hostname):
+    # Define the regular expression pattern for a valid hostname
+    pattern = re.compile(r'^[a-zA-Z0-9._-]{1,63}$')
+    if pattern.match(hostname):
+        return True
+    else:
+        return False
+
def unix_timestamp():
    return int(time.time())
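For illustration (not part of the diff), a minimal sketch of how the new hostname check behaves; the regex is the one added above, the sample inputs are made up:

import re

def validate_hostname(hostname):
    # Same pattern as above: 1-63 characters of letters, digits, dot, underscore, hyphen
    return bool(re.match(r'^[a-zA-Z0-9._-]{1,63}$', hostname))

print(validate_hostname("my-host_01"))  # True
print(validate_hostname("bad host"))    # False - space is rejected
print(validate_hostname("a" * 64))      # False - longer than 63 characters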

View File

@@ -33,7 +33,7 @@ hard_config = {
    "maximum_service_loop_time": 900, # Seconds, failsafe variable - if service is stuck processing longer than this timeframe it will lead into restarting the app
    "maximum_pull_service_loop_time": 14400, # Exception for image pulling
    "creation_engine": "wrapper", # "wrapper" or "sdk" | Wrapper - wrapped docker cli, SDK - docker sdk
-    "allow_mixed_gpus": False
+    "allow_mixed_gpus": True
}

parser = argparse.ArgumentParser(description='Example argparse usage')
@@ -48,7 +48,8 @@ parser.add_argument('--startup-scripts-folder', type=str, default='/opt/clore-ho
parser.add_argument('--wireguard-config-folder', type=str, default='/opt/clore-hosting/wireguard/configs', help='Folder with wireguard configs')
parser.add_argument('--entrypoints-folder', type=str, default='/opt/clore-hosting/entrypoints', help='Folder with custom entrypoints')
parser.add_argument('--debug-ws-peer', type=str, help="Specific ws peer to connect to (for debugging only)")
-parser.add_argument('--gpu-specs-file', type=str, default='/opt/clore-hosting/client/gpu_specs.json' ,help="Cache with specs of GPU possible OC/Power limit changes")
+parser.add_argument('--gpu-specs-file', type=str, default='/opt/clore-hosting/client/gpu_specs.json', help="Cache with specs of GPU possible OC/Power limit changes")
+parser.add_argument('--extra-allowed-images-file', type=str, default="/opt/clore-hosting/extra_allowed_images.json", help="Docker image whitelist, that are allowed by clore.ai hosting software")

# Parse arguments, ignoring any non-defined arguments
args, _ = parser.parse_known_args()

View File

@@ -63,7 +63,7 @@ def cache_entrypoints(containers):
            else:
                valid_conf.append(True)
        for remaining_file in entrypoint_files: # We can remove files that are not needed anymore
-            os.remove(remaining_file)
+            os.remove(os.path.join(config.entrypoints_folder,remaining_file))
        return valid_conf
    except Exception as e:
        return 'e'

View File

@@ -9,7 +9,7 @@ import docker
config = config_module.config
log = logging_lib.log

-def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
+def create_container(container_options, ip=None, docker_gpus=False, shm_size=64, timeout=30):
    # Sanitize and validate input
    container_options = sanitize_input(container_options)
@@ -21,6 +21,9 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
    if "network_mode" in container_options:
        command.extend(["--network", container_options["network_mode"]])

+    if "hostname" in container_options:
+        command.extend(["--hostname", container_options["hostname"]])
+
    if "cap_add" in container_options:
        for cap in container_options["cap_add"]:
            command.extend(["--cap-add", cap])
@@ -52,6 +55,10 @@ def create_container(container_options, ip=None, docker_gpus=False, timeout=30):
    if "runtime" in container_options:
        command.extend(["--runtime", container_options["runtime"]])

+    if shm_size != 64:
+        command.extend(["--shm-size", f"{shm_size}m"])
+
    if docker_gpus:
        if type(docker_gpus)==list:
            command.extend(['--gpus', '"device=' + ','.join(str(gpu_id) for gpu_id in docker_gpus) + '"'])
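For illustration (not part of the diff), a sketch of the extra docker CLI arguments the wrapper now emits; build_args is a hypothetical helper and the values are made up:

def build_args(container_options, shm_size=64):
    # Mirror the two additions above: --hostname and a non-default --shm-size
    command = []
    if "hostname" in container_options:
        command.extend(["--hostname", container_options["hostname"]])
    if shm_size != 64:  # 64 MB is Docker's default, so the flag is only passed when it differs
        command.extend(["--shm-size", f"{shm_size}m"])
    return command

print(build_args({"hostname": "O-12345"}, shm_size=8192))
# ['--hostname', 'O-12345', '--shm-size', '8192m']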

View File

@@ -3,15 +3,18 @@ from lib import logging as logging_lib
from lib import docker_cli_wrapper
from lib import docker_interface
from lib import get_specs
+from lib import utils
import docker
from docker.types import EndpointConfig, NetworkingConfig
import os

+shm_calculator = utils.shm_calculator(get_specs.get_total_ram_mb())
+
client = docker_interface.client
config = config_module.config
log = logging_lib.log

-def deploy(validated_containers):
+def deploy(validated_containers, allowed_running_containers=[]):
    local_images = docker_interface.get_local_images()
    all_containers = docker_interface.get_containers(all=True)
@@ -43,6 +46,7 @@ def deploy(validated_containers):
    for validated_container in validated_containers:
        try:
+            SHM_SIZE = 64 # MB - default
            image_ready = False
            docker_gpus = None
@@ -76,12 +80,21 @@
                )
            }

+            if "hostname" in validated_container:
+                container_options["hostname"]=validated_container["hostname"]
+            elif "clore-order-" in validated_container["name"]:
+                try:
+                    container_options["hostname"] = f"O-{int(validated_container["name"][12:])}"
+                except Exception as eon:
+                    pass
+
            if "network" in validated_container:
                container_options["network_mode"]=validated_container["network"]
            if "ip" in validated_container and config.creation_engine=="sdk":
                del container_options["network_mode"]
            if "gpus" in validated_container and type(validated_container["gpus"])==bool:
+                if "clore-order-" in validated_container["name"]:
+                    SHM_SIZE = shm_calculator.calculate('*')
                container_options["runtime"]="nvidia"
                docker_gpus=True
                container_options["device_requests"].append(docker.types.DeviceRequest(count=-1, capabilities=[['gpu']]))
@@ -121,9 +134,11 @@
            elif "entrypoint_command" in validated_container and type(validated_container["entrypoint_command"])==str and len(validated_container["entrypoint_command"])>0:
                container_options["entrypoint"]=validated_container["entrypoint_command"]

+            container_options["shm_size"] = f"{SHM_SIZE}m"
+
            if not validated_container["name"] in created_container_names and image_ready:
                if config.creation_engine == "wrapper":
-                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), docker_gpus=docker_gpus)
+                    docker_cli_wrapper.create_container(container_options, ip=(validated_container["ip"] if "ip" in validated_container else None), shm_size=SHM_SIZE, docker_gpus=docker_gpus)
                else:
                    container = client.containers.create(**container_options)
                    if "ip" in validated_container:
@@ -159,13 +174,13 @@
                    container.stop()
                except Exception as e:
                    pass
-            elif container.name not in paused_names+needed_running_names and container.status == 'running':
+            elif container.name not in paused_names+needed_running_names+allowed_running_containers and container.status == 'running':
                try:
                    container.stop()
                    container.remove()
                except Exception as e:
                    pass
-            elif container.name not in paused_names+needed_running_names:
+            elif container.name not in paused_names+needed_running_names+allowed_running_containers:
                try:
                    container.remove()
                except Exception as e:

View File

@@ -50,6 +50,14 @@ class DockerNetwork(BaseModel):
client = docker.from_env()
low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')

+daemon_config_path = "/etc/docker/daemon.json"
+
+def get_info():
+    try:
+        client_info = client.info()
+        return client_info
+    except Exception as e:
+        return {}
+
def check_docker_connection():
    try:
@@ -346,16 +354,15 @@ def validate_and_secure_networks():

def get_daemon_config():
-    config_path = "/etc/docker/daemon.json"
    try:
-        with open(config_path, 'r') as file:
+        with open(daemon_config_path, 'r') as file:
            config_data = json.load(file)
        return config_data
    except FileNotFoundError:
-        print(f"Error: {config_path} not found.")
+        print(f"Error: {daemon_config_path} not found.")
        return None
    except json.JSONDecodeError:
-        print(f"Error: Failed to parse JSON from {config_path}.")
+        print(f"Error: Failed to parse JSON from {daemon_config_path}.")
        return None

def verify_docker_version(min_version="17.06"):
@@ -368,3 +375,41 @@ def verify_docker_version(min_version="17.06"):
    except Exception as e:
        log.error(f"Failed to verify docker version | {e}")
        os._exit(1)
+
+def configure_exec_opts(key="native.cgroupdriver", value="cgroupfs"):
+    deamon_config = get_daemon_config()
+    if deamon_config:
+        try:
+            if (not "exec-opts" in deamon_config or type(deamon_config["exec-opts"])!=list) and value!=None:
+                deamon_config["exec-opts"]=[f"{key}={value}"]
+            elif "exec-opts" in deamon_config:
+                new_exec_opts=[]
+                matched_key=False
+                for exec_opt in deamon_config["exec-opts"]:
+                    if '=' in exec_opt:
+                        exec_opt_key, exec_opt_value = exec_opt.split('=',1)
+                        if exec_opt_key==key:
+                            matched_key=True
+                            if value!=None:
+                                new_exec_opts.append(f"{key}={value}")
+                        else:
+                            new_exec_opts.append(exec_opt)
+                    else:
+                        new_exec_opts.append(exec_opt)
+                if not matched_key:
+                    new_exec_opts.append(f"{key}={value}")
+                if len(new_exec_opts)==0:
+                    del deamon_config["exec-opts"]
+                else:
+                    if deamon_config["exec-opts"] == new_exec_opts:
+                        return "Same"
+                    deamon_config["exec-opts"]=new_exec_opts
+            json_string = json.dumps(deamon_config, indent=4)
+            with open(daemon_config_path, 'w') as file:
+                file.write(json_string)
+            return True
+        except Exception as e:
+            log.error(f"Failed 'configure_exec_opts' | {e}")
+            return False
+    else:
+        return False
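For context (not part of the diff), a sketch of what configure_exec_opts("native.cgroupdriver", "cgroupfs") is intended to leave in /etc/docker/daemon.json on the affected Ubuntu hosts; the pre-existing "runtimes" block is only an illustrative example of other content:

import json

daemon_config = {
    "runtimes": {"nvidia": {"path": "nvidia-container-runtime", "runtimeArgs": []}}  # example existing content
}
daemon_config["exec-opts"] = ["native.cgroupdriver=cgroupfs"]  # what the helper adds or rewrites
print(json.dumps(daemon_config, indent=4))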

View File

@@ -43,6 +43,32 @@ def get_kernel():
def is_hive():
    return "hive" in get_kernel()

+def get_total_ram_mb():
+    total_ram = psutil.virtual_memory().total
+    return total_ram / (1024 ** 2)
+
+def get_os_release():
+    try:
+        with open("/etc/os-release") as f:
+            os_info = f.read()
+        os_release = {}
+        for line in os_info.split('\n'):
+            if '=' in line:
+                key, value = line.split('=', 1)
+                if value[:1]=='"' and value.endswith('"'):
+                    value = value[1:len(value)-1]
+                os_release[key]=value
+        needed_cgroupfs_versions = ["22.04", "22.10"] # Mitigate issue https://github.com/NVIDIA/nvidia-docker/issues/1730
+        if "NAME" in os_release and "VERSION_ID" in os_release:
+            if os_release["NAME"].lower() == "ubuntu" and os_release["VERSION_ID"] in needed_cgroupfs_versions:
+                os_release["use_cgroupfs"]=True
+        return os_release
+    except Exception as e:
+        return {}
+
def drop_caches():
    try:
        with open('/proc/sys/vm/drop_caches', 'w') as f:
@@ -296,7 +322,7 @@ class Specs:
        gpu_str, gpu_mem, gpus, nvml_err = get_gpu_info()
        if require_same_gpus:
            last_gpu_name=''
-            for gpu in gpus:
+            for gpu in gpus["nvidia"]:
                if not last_gpu_name:
                    last_gpu_name=gpu["name"]
                elif last_gpu_name!=gpu["name"]:

lib/hive_miner_interface.py (new file, 205 lines)
View File

@@ -0,0 +1,205 @@
import aiofiles
import asyncio
import json
import time
import os
import re

def extract_json_with_key(text, key):
    json_pattern = r'({.*?})'
    json_strings = re.findall(json_pattern, text, re.DOTALL)
    json_objects_with_key = []
    for json_str in json_strings:
        try:
            json_obj = json.loads(json_str)
            if key in json.dumps(json_obj):
                json_objects_with_key.append(json_obj)
        except json.JSONDecodeError:
            continue
    return json_objects_with_key

async def async_run_bash_command(command):
    process = await asyncio.create_subprocess_exec(
        '/bin/bash', '-c', command,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE
    )
    stdout, stderr = await process.communicate()
    return stdout.decode().strip(), stderr.decode().strip(), process.returncode

async def check_and_read_file(file_path):
    try:
        if os.path.exists(file_path):
            async with aiofiles.open(file_path, mode='r') as file:
                contents = await file.read()
                return contents
        else:
            return "fail"
    except Exception as e:
        return "fail"

async def get_session_start_time(pid):
    try:
        async with aiofiles.open(f'/proc/{pid}/stat', 'r') as file:
            stat_info = (await file.read()).split()
        start_time_ticks = int(stat_info[21])
        clock_ticks_per_sec = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
        start_time_seconds = start_time_ticks / clock_ticks_per_sec
        boot_time = None
        async with aiofiles.open('/proc/stat', 'r') as file:
            async for line in file:
                if line.startswith('btime'):
                    boot_time = int(line.split()[1])
                    break
        if boot_time is None:
            raise ValueError("Failed to find boot time in /proc/stat")
        start_time = (boot_time + start_time_seconds)
        return start_time
    except FileNotFoundError:
        return None
    except Exception as e:
        print(f"Error retrieving session start time: {e}")
        return None

async def get_miner_stats(miner_dir, api_timeout=15):
    stdout, stderr, code = await async_run_bash_command(f'export PATH=$PATH:/hive/sbin:/hive/bin && export API_TIMEOUT={api_timeout}'+''' && read -t $((API_TIMEOUT + 5)) -d "" pid khs stats < <(function run_miner_scripts { echo "$BASHPID"; cd '''+miner_dir+''' || exit; cd '''+miner_dir+'''; source h-manifest.conf; cd '''+miner_dir+'''; source h-stats.sh; printf "%q\n" "$khs"; echo "$stats"; }; run_miner_scripts) && [[ $? -ge 128 ]] && [[ ! -z "$pid" && "$pid" -ne $$ ]] && kill -9 "$pid" 2>/dev/null ; echo $stats''')
    try:
        if stdout and not stderr and code == 0:
            stats = json.loads(stdout)
            return stats
        else:
            return 'fail'
    except Exception as e:
        try:
            miner_stat_jsons = extract_json_with_key(stdout, "hs_units")
            return miner_stat_jsons[0] if len(miner_stat_jsons) > 0 else 'fail'
        except Exception:
            return 'fail'

async def get_miner_uptime_stats():
    stdout, stderr, code = await async_run_bash_command("screen -ls")
    if not stderr and code == 0:
        for line in stdout.split('\n'):
            if line[:1]=='\t':
                if line.split('\t')[1].endswith(".miner"):
                    miner_screen_pid = line.split('\t')[1].split('.')[0]
                    if miner_screen_pid.isdigit():
                        miner_start_time = await get_session_start_time(miner_screen_pid)
                        return miner_start_time
    return None

def extract_miner_names(rig_conf):
    miner_names=[]
    for line in rig_conf.split('\n'):
        if '=' in line:
            key, value = line.split('=', 1)
            if key[:5]=="MINER" and (len(key)==5 or str(key[5:]).isdigit()):
                if value.startswith('"') and value.endswith('"'):
                    value = value.strip('"')
                if len(value)>0:
                    miner_names.append(value)
    return miner_names

def extract_miner_config(miner_names, wallet_conf):
    lines = wallet_conf.split('\n')
    meta_config = {}
    miners = {}
    for miner_idx, miner_name in enumerate(miner_names):
        remaining_lines = []
        algos = []
        for line in lines:
            if not line.startswith('#') and '=' in line:
                key, value = line.split('=', 1)
                if value.startswith('"') and value.endswith('"'):
                    value = value.strip('"')
                if value.startswith("'") and value.endswith("'"):
                    value = value.strip("'")
                if key[:len(miner_name.replace('-','_'))+1].lower() == f"{miner_name.replace('-','_')}_".lower():
                    if key.split('_')[-1][:4].lower()=="algo":
                        algos.append(value)
                elif miner_name.lower()=="custom" and key.lower()=="custom_miner":
                    miner_names[miner_idx] = os.path.join(miner_names[miner_idx],value)
                elif key.lower()=="meta":
                    try:
                        meta_config=json.loads(value)
                    except Exception as e:
                        pass
                else:
                    remaining_lines.append(line)
        lines = remaining_lines
        miners[miner_name]={
            "algos":algos,
            "coins":[]
        }
    for miner_name in miner_names:
        if "custom/" in miner_name.lower():
            miner_name = "custom"
        if miner_name in meta_config and type(meta_config[miner_name]) == dict:
            for key in meta_config[miner_name].keys():
                if "coin" in key:
                    miners[miner_name]["coins"].append(meta_config[miner_name][key])
    return miner_names, miners

def sum_numbers_in_list(lst):
    if all(isinstance(i, (int, float)) for i in lst):
        return sum(lst)
    else:
        return "The list contains non-numeric elements."

class hive_interface:
    def __init__(self):
        self.hive_miners_dir = "/hive/miners"
        self.hive_rig_config = "/hive-config/rig.conf"
        self.hive_wallet_config = "/hive-config/wallet.conf"

    async def get_miners_stats(self, miner_names):
        scrape_tasks = []
        for miner_name in miner_names:
            scrape_tasks.append(get_miner_stats(os.path.join(self.hive_miners_dir, miner_name)))
        results = await asyncio.gather(*scrape_tasks)
        return results

    async def get_configured_miners(self):
        rig_conf, wallet_conf = await asyncio.gather(*[check_and_read_file(self.hive_rig_config), check_and_read_file(self.hive_wallet_config)])
        miner_names = extract_miner_names(rig_conf)
        miner_names, miner_config = extract_miner_config(miner_names, wallet_conf)
        return miner_names, miner_config

    async def export_miner_stats(self, get_hashrates=False):
        output = {
            "miner_uptime": 0
        }
        miner_start_ts = await get_miner_uptime_stats()
        if miner_start_ts:
            output["miner_uptime"] = int(time.time()-miner_start_ts)
        miner_names, miner_config = await self.get_configured_miners()
        output["miners"]=miner_config
        if get_hashrates:
            miners_stats = await self.get_miners_stats(miner_names)
            for idx, miner_name in enumerate(miner_names):
                miner_names[idx] = "custom" if "custom/" in miner_name.lower() else miner_name
            for idx, miner_stats in enumerate(miners_stats):
                if type(miner_stats) == dict:
                    for key in miner_stats.keys():
                        if key[:2]=="hs" and (key=="hs" or key[2:].isdigit()):
                            all_hs = sum_numbers_in_list(miner_stats[key])
                            try:
                                if "hs_units" in miner_stats:
                                    if miner_stats["hs_units"]=="hs":
                                        all_hs = all_hs/1000
                                if not "hashrates" in output['miners'][miner_names[idx]]:
                                    output['miners'][miner_names[idx]]["hashrates"]=[]
                                if isinstance(all_hs, (float, int)):
                                    output['miners'][miner_names[idx]]["hashrates"].append(all_hs)
                                else:
                                    output['miners'][miner_names[idx]]["hashrates"].append(0)
                            except Exception as e:
                                pass
        return output
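A hedged usage sketch of the new module (assuming a HiveOS host with a configured flightsheet and a running miner screen session; the output keys follow export_miner_stats above):

import asyncio
from lib import hive_miner_interface

async def main():
    hive = hive_miner_interface.hive_interface()
    # Cheap call: uptime plus configured miners/algos/coins, no h-stats.sh scraping
    overview = await hive.export_miner_stats(get_hashrates=False)
    print(overview["miner_uptime"], list(overview["miners"].keys()))
    # Full call: also collects per-miner hashrates (normalized to kH/s)
    full = await hive.export_miner_stats(get_hashrates=True)
    print(full["miners"])

asyncio.run(main())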

View File

@@ -3,6 +3,7 @@ from lib import logging as logging_lib
from lib import get_specs
from lib import utils
import threading
+import socket
import aiohttp
import asyncio
import json
@@ -47,9 +48,11 @@ async def register_server(data):
        "Content-Type": "application/json"
    }

-    async with aiohttp.ClientSession() as session:
+    connector = aiohttp.TCPConnector(family=socket.AF_INET)
+    async with aiohttp.ClientSession(connector=connector) as session:
        try:
-            async with session.post(url, data=json_data, headers=headers, timeout=5) as response:
+            async with session.post(url, data=json_data, headers=headers, timeout=15) as response:
                if response.status == 200:
                    # Successful response
                    response_data = await response.json()

View File

@@ -10,7 +10,7 @@ from lib import container_logs
from concurrent.futures import ThreadPoolExecutor
import queue # Import the synchronous queue module

-async def log_streaming_task(message_broker, monitoring):
+async def log_streaming_task(message_broker, monitoring, do_not_stream_containers):
    client = docker_interface.client
    executor = ThreadPoolExecutor(max_workers=4)
    tasks = {}
@@ -29,14 +29,15 @@ async def log_streaming_task(message_broker, monitoring):
            # Start tasks for new containers
            for container_name, container in current_containers.items():
-                log_container_names.append(container_name)
-                if container_name not in tasks:
-                    log.debug(f"log_streaming_task() | Starting task for {container_name}")
-                    sync_queue = queue.Queue()
-                    task = asyncio.ensure_future(asyncio.get_event_loop().run_in_executor(
-                        executor, container_logs.stream_logs, container_name, sync_queue))
-                    tasks[container_name] = task
-                    queues[container_name] = sync_queue
+                if not container_name in do_not_stream_containers:
+                    log_container_names.append(container_name)
+                    if container_name not in tasks:
+                        log.debug(f"log_streaming_task() | Starting task for {container_name}")
+                        sync_queue = queue.Queue()
+                        task = asyncio.ensure_future(asyncio.get_event_loop().run_in_executor(
+                            executor, container_logs.stream_logs, container_name, sync_queue))
+                        tasks[container_name] = task
+                        queues[container_name] = sync_queue

            await message_broker.put(log_container_names)

View File

@@ -6,20 +6,77 @@ config = config_module.config
log = logging_lib.log

import subprocess
-import pynvml
+import clore_pynvml as pynvml
import json
+import math
+
+HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./"
+
+GPU_MEM_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
+    "NVIDIA P102-100": [-2000, 2000],
+    "NVIDIA P104-100": [-2000, 2000],
+    "NVIDIA P106-090": [-2000, 2000],
+    "NVIDIA P106-100": [-2000, 2000],
+    "NVIDIA GeForce GTX 1050 Ti": [-2000, 2000],
+    "NVIDIA GeForce GTX 1060 3GB": [-2000, 2000],
+    "NVIDIA GeForce GTX 1060 6GB": [-2000, 2000],
+    "NVIDIA GeForce GTX 1070": [-2000, 2000],
+    "NVIDIA GeForce GTX 1070 Ti": [-2000, 2000],
+    "NVIDIA GeForce GTX 1080": [-2000, 2000],
+    "NVIDIA GeForce GTX 1080 Ti": [-2000, 2000],
+    "NVIDIA CMP 30HX": [-2000, 6000],
+    "NVIDIA CMP 40HX": [-2000, 6000],
+    "NVIDIA CMP 50HX": [-2000, 6000],
+    "NVIDIA CMP 90HX": [-2000, 6000],
+    "NVIDIA GeForce GTX 1650": [-2000, 6000],
+    "NVIDIA GeForce GTX 1660 SUPER": [-2000, 6000],
+    "NVIDIA GeForce GTX 1660 Ti": [-2000, 6000],
+    "NVIDIA GeForce RTX 2060": [-2000, 6000],
+    "NVIDIA GeForce RTX 2060 SUPER": [-2000, 6000],
+    "NVIDIA GeForce RTX 2070": [-2000, 6000],
+    "NVIDIA GeForce RTX 2070 SUPER": [-2000, 6000],
+    "NVIDIA GeForce RTX 2080": [-2000, 6000],
+    "NVIDIA GeForce RTX 2080 Ti": [-2000, 6000]
+}
+
+GPU_CORE_ALLOWED_OC_RANGES = { # Known to be problematic GPUs
+    "NVIDIA P102-100": [-200, 1200],
+    "NVIDIA P104-100": [-200, 1200],
+    "NVIDIA P106-090": [-200, 1200],
+    "NVIDIA P106-100": [-200, 1200],
+    "NVIDIA GeForce GTX 1050 Ti": [-200, 1200],
+    "NVIDIA GeForce GTX 1060 3GB": [-200, 1200],
+    "NVIDIA GeForce GTX 1060 6GB": [-200, 1200],
+    "NVIDIA GeForce GTX 1070": [-200, 1200],
+    "NVIDIA GeForce GTX 1070 Ti": [-200, 1200],
+    "NVIDIA GeForce GTX 1080": [-200, 1200],
+    "NVIDIA GeForce GTX 1080 Ti": [-200, 1200],
+    "NVIDIA CMP 30HX": [-1000, 1000],
+    "NVIDIA CMP 40HX": [-1000, 1000],
+    "NVIDIA CMP 50HX": [-1000, 1000],
+    "NVIDIA CMP 90HX": [-1000, 1000],
+    "NVIDIA GeForce GTX 1650": [-1000, 1000],
+    "NVIDIA GeForce GTX 1660 SUPER": [-1000, 1000],
+    "NVIDIA GeForce GTX 1660 Ti": [-1000, 1000],
+    "NVIDIA GeForce RTX 2060": [-1000, 1000],
+    "NVIDIA GeForce RTX 2060 SUPER": [-1000, 1000],
+    "NVIDIA GeForce RTX 2070": [-1000, 1000],
+    "NVIDIA GeForce RTX 2070 SUPER": [-1000, 1000],
+    "NVIDIA GeForce RTX 2080": [-1000, 1000],
+    "NVIDIA GeForce RTX 2080 Ti": [-1000, 1000]
+}

is_hive = False
all_gpus_data_list=[]
get_data_fail=False

-def init(gpu_specs_file=None):
+def init(gpu_specs_file=None, allow_hive_binaries=True):
    global is_hive, all_gpus_data_list, get_data_fail
    log.info("Loading GPU OC specs [ working ]")
    try:
        pynvml.nvmlInit()
        kernel = get_specs.get_kernel()
-        if "hive" in kernel:
+        if "hive" in kernel and allow_hive_binaries:
            is_hive=True

        specs_file_loc = gpu_specs_file if gpu_specs_file else config.gpu_specs_file
@@ -43,10 +100,15 @@ def init(gpu_specs_file=None):
                    parsed_specs={}
                    regenerate_specs=True
                    break
+                elif not "locks" in parsed_specs[f"{i}-{gpu_uuid}"]:
+                    parsed_specs={}
+                    regenerate_specs=True
+                    break

        if regenerate_specs:
            for i in range(0,gpu_count):
                gpu_spec={}
+                mem_to_core_allowed_locks = get_gpu_locked_clocks(i)
                gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                gpu_uuid = pynvml.nvmlDeviceGetUUID(gpu_handle)
                power_limits = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(gpu_handle)
@@ -55,6 +117,7 @@ def init(gpu_specs_file=None):
                gpu_spec["default_power_limit"] = int(pynvml.nvmlDeviceGetPowerManagementDefaultLimit(gpu_handle) / 1000.0)
                gpu_spec["power_limits"] = [min_power_limit, max_power_limit]
                gpu_spec["name"] = pynvml.nvmlDeviceGetName(gpu_handle)
+                gpu_spec["locks"] = mem_to_core_allowed_locks

                pci_info = pynvml.nvmlDeviceGetPciInfo(gpu_handle)
                pci_bus_id = pci_info.bus
@@ -64,22 +127,57 @@ def init(gpu_specs_file=None):
                mem_range = get_hive_clock_range(is_hive, i, "mem")
                core_range = get_hive_clock_range(is_hive, i, "core")
-                if type(mem_range) != list:
-                    pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
-                    failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
-                    failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
-                    if (not failure_min) and (not failure_max):
-                        mem_range=[min_oc_solution, max_oc_solution]
-                        pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
-                    pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
-                if type(core_range) != list:
-                    pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
-                    failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
-                    failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
-                    if (not failure_min) and (not failure_max):
-                        core_range=[min_oc_solution, max_oc_solution]
-                        pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
-                    pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
+                try:
+                    if type(mem_range) != list:
+                        pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, 200, 300) # Force low clocks, so the GPU can't crash when testing if under load
+                        failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle)
+                        failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle)
+                        if (not failure_min) and (not failure_max):
+                            mem_range=[min_oc_solution, max_oc_solution]
+                            pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, 0)
+                        pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
+                    if type(core_range) != list:
+                        pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, 300, 350) # Force low clocks, so the GPU can't crash when testing if under load
+                        failure_min, min_oc_solution = pinpoint_oc_limits_negative(gpu_handle, True)
+                        failure_max, max_oc_solution = pinpoint_oc_limits_positive(gpu_handle, True)
+                        if (not failure_min) and (not failure_max):
+                            core_range=[min_oc_solution, max_oc_solution]
+                            pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, 0)
+                        pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
+                except Exception as e_pinpointing:
+                    if "not supported" in str(e_pinpointing).lower():
+                        try:
+                            min_core_offset, max_core_offset = pynvml.nvmlDeviceGetGpcClkMinMaxVfOffset(gpu_handle)
+                            if min_core_offset>0:
+                                min_core_offset = min_core_offset - math.floor((2**32)/1000)
+                            if min_core_offset > -20000 and min_core_offset <= 0 and max_core_offset>=0 and min_core_offset < 20000:
+                                core_range=[min_core_offset, max_core_offset]
+                            else:
+                                core_range=[0,0]
+                            min_mem_offset, max_mem_offset = pynvml.nvmlDeviceGetMemClkMinMaxVfOffset(gpu_handle)
+                            if min_mem_offset>0:
+                                min_mem_offset = min_mem_offset - math.floor((2**32)/1000)
+                            if min_mem_offset==0 and max_mem_offset==0:
+                                if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
+                                    mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
+                                else:
+                                    mem_range = [0,0]
+                            elif min_mem_offset > -20000 and min_mem_offset <= 0 and max_mem_offset>=0 and max_mem_offset < 20000:
+                                mem_range=[min_mem_offset, max_mem_offset]
+                            else:
+                                mem_range=[0,0]
+                        except Exception as e2:
+                            if "function not found" in str(e2).lower():
+                                if gpu_spec["name"] in GPU_MEM_ALLOWED_OC_RANGES:
+                                    mem_range = GPU_MEM_ALLOWED_OC_RANGES[gpu_spec["name"]]
+                                else:
+                                    mem_range = [0,0]
+                                if gpu_spec["name"] in GPU_CORE_ALLOWED_OC_RANGES:
+                                    core_range = GPU_CORE_ALLOWED_OC_RANGES[gpu_spec["name"]]
+                                else:
+                                    core_range = [0,0]
+                            else:
+                                get_data_fail=True
                if type(mem_range) == list and type(core_range) == list and len(mem_range)==2 and len(core_range)==2:
                    gpu_spec["mem"]=mem_range
                    gpu_spec["core"]=core_range
@@ -113,6 +211,19 @@ def get_gpu_oc_specs():

def shutdown():
    pynvml.nvmlShutdown()

+def get_gpu_locked_clocks(gpu_index):
+    try:
+        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
+        mem_clocks = pynvml.nvmlDeviceGetSupportedMemoryClocks(handle)
+        mem_to_core = {}
+        for idx, mem_clock in enumerate(mem_clocks):
+            if idx < 12 or idx == len(mem_clocks)-1:
+                graphics_clocks = pynvml.nvmlDeviceGetSupportedGraphicsClocks(handle, mem_clock)
+                mem_to_core[str(mem_clock)] = [min(graphics_clocks), max(graphics_clocks)]
+        return mem_to_core
+    except Exception as e:
+        return {}
+
def handle_nn(input_int):
    if abs(4293967-input_int) < 10000:
        return input_int-4293967
@@ -218,6 +329,7 @@ def pinpoint_oc_limits_positive(gpu_handle, core=False):
    return failure, found_solution

def set_oc(settings):
+    global is_hive
    try:
        gpu_count = pynvml.nvmlDeviceGetCount()
        settings_keys = settings.keys()
@@ -231,6 +343,10 @@ def set_oc(settings):
            }
            settings_keys = settings.keys()
        log.debug(f"Rewriting settings with: {json.dumps(settings)}")

+        core_locks = []
+        mem_locks = []
+        any_lock_failure = False
        for oc_gpu_index in settings_keys:
            if oc_gpu_index.isdigit():
                oc_gpu_index=int(oc_gpu_index)
@@ -238,13 +354,42 @@ def set_oc(settings):
                gpu_oc_config = settings[str(oc_gpu_index)]
                gpu_possible_ranges = all_gpus_data_list[oc_gpu_index]
                gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(oc_gpu_index)
-                if "core" in gpu_oc_config:
+
+                if "core_lock" in gpu_oc_config:
+                    core_lock = int(gpu_oc_config["core_lock"])
+                    core_locks.append(str(core_lock))
+                    try:
+                        pynvml.nvmlDeviceSetGpuLockedClocks(gpu_handle, core_lock, core_lock)
+                    except Exception as core_lock_exception:
+                        any_lock_failure=True
+                else:
+                    core_locks.append('0')
+                    try:
+                        pynvml.nvmlDeviceResetGpuLockedClocks(gpu_handle)
+                    except Exception as core_lock_exception:
+                        any_lock_failure=True
+
+                if "mem_lock" in gpu_oc_config:
+                    mem_lock = int(gpu_oc_config["mem_lock"])
+                    mem_locks.append(str(mem_lock))
+                    try:
+                        pynvml.nvmlDeviceSetMemoryLockedClocks(gpu_handle, mem_lock, mem_lock)
+                    except Exception as mem_lock_exception:
+                        any_lock_failure=True
+                else:
+                    mem_locks.append('0')
+                    try:
+                        pynvml.nvmlDeviceResetMemoryLockedClocks(gpu_handle)
+                    except Exception as mem_lock_exception:
+                        any_lock_failure=True
+
+                if "core" in gpu_oc_config: # Core offset
                    wanted_core_clock = int(round(gpu_oc_config["core"]*2))
                    if gpu_possible_ranges["core"][0] <= wanted_core_clock and wanted_core_clock <= gpu_possible_ranges["core"][1]:
                        pynvml.nvmlDeviceSetGpcClkVfOffset(gpu_handle, wanted_core_clock)
                    else:
                        log.error(f"Requested OC for GPU:{oc_gpu_index} (CORE) out of bound | {wanted_core_clock} | [{gpu_possible_ranges["core"][0]}, {gpu_possible_ranges["core"][1]}]")
-                if "mem" in gpu_oc_config:
+                if "mem" in gpu_oc_config: # Memory offset
                    wanted_mem_clock = int(round(gpu_oc_config["mem"]*2))
                    if gpu_possible_ranges["mem"][0] <= wanted_mem_clock and wanted_mem_clock <= gpu_possible_ranges["mem"][1]:
                        pynvml.nvmlDeviceSetMemClkVfOffset(gpu_handle, wanted_mem_clock)
@@ -256,6 +401,17 @@ def set_oc(settings):
                        pynvml.nvmlDeviceSetPowerManagementLimit(gpu_handle, wanted_power_limit_milliwatts)
                    else:
                        log.error(f"Requested OC for GPU:{oc_gpu_index} (POWER LIMIT) out of bound | {gpu_oc_config["pl"]} | [{gpu_possible_ranges["power_limits"][0]}, {gpu_possible_ranges["power_limits"][1]}]")
+
+        if is_hive and any_lock_failure and len(mem_locks)==len(core_locks):
+            try:
+                nvtool_commands = []
+                for idx, mem_lock in enumerate(mem_locks):
+                    core_lock = core_locks[idx]
+                    nvtool_commands.append(f"nvtool -i {str(idx)} --setmem {mem_lock} --setcore {core_lock}")
+                cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo {' && '.join(nvtool_commands)}"]
+                #print(cmd)
+                subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            except Exception as hive_oc_settings:
+                pass
        return True
    except Exception as e:
        log.error(f"set_oc | ERROR | {e}")
@@ -267,7 +423,7 @@ def get_hive_clock_range(is_hive, gpu_index, part):
    if is_hive:
        try:
            flag = "--setmemoffset" if part=="mem" else "--setcoreoffset"
-            cmd = ["bash",'-c',f"nvtool -i 0 {flag} -100000"]
+            cmd = ["bash",'-c',f"PATH={HIVE_PATH} && sudo nvtool -i {gpu_index} {flag} -100000"]

            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            lines = result.stdout.decode().splitlines()
@@ -292,3 +448,16 @@ def get_hive_clock_range(is_hive, gpu_index, part):
            return False
    else:
        return False
+
+def get_vram_per_gpu():
+    vram_per_gpu = []
+    try:
+        gpu_count = pynvml.nvmlDeviceGetCount()
+        for i in range(0,gpu_count):
+            gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+            mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
+            vram_per_gpu.append(mem_info.total / 1024 ** 2)
+    except Exception as e:
+        log.error(f"Failed loading get_vram_per_gpu() | {e}")
+        pass
+    return vram_per_gpu
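For illustration (not part of the diff), a sketch of the nvtool fallback command that set_oc() assembles when NVML clock locking fails under HiveOS; build_nvtool_fallback is a hypothetical helper and the lock values are made up:

HIVE_PATH = "/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./"

def build_nvtool_fallback(core_locks, mem_locks):
    # One nvtool invocation per GPU, chained with && and run through sudo, as above
    nvtool_commands = []
    for idx, mem_lock in enumerate(mem_locks):
        nvtool_commands.append(f"nvtool -i {idx} --setmem {mem_lock} --setcore {core_locks[idx]}")
    return ["bash", "-c", f"PATH={HIVE_PATH} && sudo {' && '.join(nvtool_commands)}"]

print(build_nvtool_fallback(["1500", "0"], ["5001", "0"]))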

View File

@@ -1,11 +1,13 @@
from lib import config as config_module
from lib import logging as logging_lib
+from lib import nvml

import subprocess
import hashlib
import random
import string
import shlex
import time
+import math
import json
import os
@@ -41,6 +43,8 @@ def normalize_rule(rule_dict):

def get_auth():
    try:
+        if 'AUTH_TOKEN' in os.environ:
+            return os.environ['AUTH_TOKEN']
        auth_str = ''
        with open(config.auth_file, "r", encoding="utf-8") as file:
            auth_str = file.read().strip()
@@ -48,6 +52,12 @@ def get_auth():
    except Exception as e:
        return ''

+def get_allowed_container_names():
+    allowed_container_names = os.getenv("ALLOWED_CONTAINER_NAMES")
+    if type(allowed_container_names)==str and len(allowed_container_names)>0:
+        return [x for x in allowed_container_names.split(',') if x]
+    return []
+
def unix_timestamp():
    return int(time.time())
@@ -91,18 +101,70 @@ def generate_random_string(length):
    characters = string.ascii_letters + string.digits
    return ''.join(random.choice(characters) for _ in range(length))

+HIVE_PATH="/hive/bin:/hive/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:./"
+
def hive_set_miner_status(enabled=False):
    ### control miner state - OFF/ON
    screen_out = run_command("screen -ls")
    miner_screen_running = False
+    miner_screen_session_pids = []
    if screen_out[0] == 0 or screen_out[0] == 1:
        screen_lines=screen_out[1].split('\n')
        for screen_line in screen_lines:
            screen_line_parts=screen_line.replace('\t', '', 1).split('\t')
            if len(screen_line_parts)>2 and '.' in screen_line_parts[0]:
                if screen_line_parts[0].split('.',1)[1]=="miner":
+                    miner_screen_session_pids.append(screen_line_parts[0].split('.',1)[0])
                    miner_screen_running=True
-    if miner_screen_running and not enabled:
-        run_command("miner stop")
+    if len(miner_screen_session_pids) > 1: ## Something really bad going on, destroy all instances
+        for idx, miner_screen_session_pid in enumerate(miner_screen_session_pids):
+            run_command(f"kill -9 {miner_screen_session_pid}{' && screen -wipe' if idx==len(miner_screen_session_pids)-1 else ''}")
+    elif miner_screen_running and not enabled:
+        run_command(f"/bin/bash -c \"PATH={HIVE_PATH} && sudo /hive/bin/miner stop\"")
    elif enabled and not miner_screen_running:
-        run_command("nvidia-oc && miner start")
+        run_command(f"/bin/bash -c \"export PATH={HIVE_PATH} && sudo /hive/sbin/nvidia-oc && source ~/.bashrc ; sudo /hive/bin/miner start\"")
+
+def get_extra_allowed_images():
+    if os.path.exists(config.extra_allowed_images_file):
+        try:
+            with open(config.extra_allowed_images_file, 'r') as file:
+                content = file.read()
+            data = json.loads(content)
+            if isinstance(data, list):
+                if all(isinstance(item, dict) and set(item.keys()) == {'repository', 'allowed_tags'} and isinstance(item['repository'], str) and isinstance(item['allowed_tags'], list) and all(isinstance(tag, str) for tag in item['allowed_tags']) for item in data):
+                    return data
+                else:
+                    return []
+            else:
+                return []
+        except Exception as e:
+            log.error(f"get_extra_allowed_images() | error: {e}")
+            return []
+    else:
+        return []
+
+class shm_calculator:
+    def __init__(self, total_ram):
+        self.total_ram = total_ram
+        self.gpu_vram_sizes = []
+
+    def calculate(self, used_gpu_ids):
+        assume_ram_utilised = 2500 #MB
+        default_shm_size = 64 #MB
+
+        if len(self.gpu_vram_sizes) == 0:
+            self.gpu_vram_sizes = nvml.get_vram_per_gpu()
+
+        instance_vram_total = 0
+        total_vram_size = sum(self.gpu_vram_sizes)
+        for idx, value in enumerate(self.gpu_vram_sizes):
+            if used_gpu_ids == '*' or idx in used_gpu_ids:
+                instance_vram_total += value
+        if instance_vram_total == 0 or total_vram_size == 0:
+            return default_shm_size
+        shm_size = instance_vram_total * 1.5 if instance_vram_total * 1.5 < self.total_ram - assume_ram_utilised else (
+            instance_vram_total/total_vram_size * (self.total_ram - assume_ram_utilised)
+        )
+        return math.floor(shm_size if shm_size > default_shm_size else default_shm_size)
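As a rough illustration of the sizing rule above (all numbers are made up: a 64 GB host with two 24 GB GPUs and an order renting both, i.e. used_gpu_ids == '*'):

import math

total_ram = 64 * 1024              # MB of host RAM
gpu_vram_sizes = [24576, 24576]    # MB of VRAM per GPU
assume_ram_utilised = 2500         # MB held back for the host, as in shm_calculator
instance_vram_total = sum(gpu_vram_sizes)
total_vram_size = sum(gpu_vram_sizes)

shm_size = instance_vram_total * 1.5 if instance_vram_total * 1.5 < total_ram - assume_ram_utilised else (
    instance_vram_total / total_vram_size * (total_ram - assume_ram_utilised)
)
print(math.floor(shm_size))  # 63036 -> later passed to Docker as shm_size / --shm-size in MB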

View File

@@ -7,4 +7,5 @@ psutil==5.9.0
python-iptables==1.0.1
websockets==12.0
packaging==23.2
-git+https://git.clore.ai/clore/pynvml.git@main
+clore-pynvml==11.5.4
+requests==2.31.0