From bce71c4574187784a428a1cabef05bee00dd117b Mon Sep 17 00:00:00 2001
From: clore
Date: Sat, 28 Dec 2024 23:08:10 +0000
Subject: [PATCH] auto xfs migration, allow smaller disk size for xfs, restrict machines booting from usb from xfs migration

---
Review notes (text between `---` and the first `diff --git` is commentary
and is not applied):

* Layout: this copy of the patch had been flattened onto three physical
  lines. Line breaks, blank context lines and indentation were rebuilt from
  the hunk headers and from Python syntax. Every hunk's old/new line count
  and the diffstat totals below match the rebuilt hunks.
  NOTE(review): the leading whitespace of context lines cannot be recovered
  from the flattened copy and is inferred (notably hunk 2 of
  clore_hosting/main.py, the lib/get_specs.py hunk and the init() hunks of
  lib/xfs.py) -- confirm against the tree before applying.
* Behaviour, as far as the hunks show:
  - main.py: when no order is running and xfs_state is "disabled", write
    "enabled" to /opt/clore-hosting/xfs_state and exit the process with
    os._exit(0); backend_version is bumped from 20 to 21.
  - get_specs.py: get_root_device() returns the findmnt SOURCE of "/" or
    None on failure; is_usb_device() scans `lsblk -o NAME,TRAN -n` and
    returns True on failure (fail-safe: treat as USB).
  - xfs.py: get_to_use_storage_values() picks the HP_* limits when all GPUs
    are the same model listed in HIGH_PERFORMANCE_GPUS or when
    max_free_space > 1024 * 70, otherwise the GENERIC_* limits; it returns
    (None, None) on an NVML error, an empty GPU list or any exception.
    migrate() returns "not-supported-boot-device" when the root device is
    unknown or reported as USB.
* Changed relative to the flattened copy (line counts unchanged):
  - get_root_device() / is_usb_device() now also catch OSError, so a missing
    findmnt/lsblk binary takes the same fail-safe path as a non-zero exit.
  - `leave_free_space == None` -> `leave_free_space is None`.
* Open points, left as in the original:
  - device_name = basename.split('p')[0].rstrip('0123456789') turns
    "nvme0n1p2" into "nvme0n" and leaves names such as
    "ubuntu--vg-ubuntu--lv" untouched; neither matches a two-column lsblk
    row, so such roots are always treated as non-USB. Presumably acceptable
    for NVMe; verify for LVM/overlay roots.
  - gpu_str, gpu_mem and the bound exception `e` are unused.
  - `migarion_status` is an existing misspelling in context lines.

 clore_hosting/main.py |  9 ++++++-
 lib/get_specs.py      | 20 ++++++++++++++-
 lib/xfs.py            | 57 ++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 78 insertions(+), 8 deletions(-)

diff --git a/clore_hosting/main.py b/clore_hosting/main.py
index 28ce979..988b965 100644
--- a/clore_hosting/main.py
+++ b/clore_hosting/main.py
@@ -27,6 +27,7 @@ import asyncio
 import time
 import json
 from aiofiles import os as async_os
+import aiofiles
 import os
 
 specs = get_specs.Specs()
@@ -438,6 +439,12 @@ class CloreClient:
             can_run_partner_workloads = False if ((not is_order_spot) and running_order) else True
             clore_partner_socket.set_can_deploy(can_run_partner_workloads)
 
+            if not running_order and self.xfs_state == "disabled":
+                async with aiofiles.open("/opt/clore-hosting/xfs_state", mode='w') as file:
+                    await file.write("enabled")
+                log.info("No order running, requesting XFS migration")
+                os._exit(0)
+
             if self.restart_docker and not running_order and len(self.containers)>0:
                 log.debug("Sending docker restart command")
                 utils.run_command_v2("systemctl restart docker")
@@ -512,7 +519,7 @@ class CloreClient:
     async def submit_specs(self, current_specs):
         try:
             if type(current_specs) == dict:
-                current_specs["backend_version"]=20
+                current_specs["backend_version"]=21
                 current_specs["update_hw"]=True
                 smallest_pcie_width = 999
                 for gpu in current_specs["gpus"]["nvidia"]:
diff --git a/lib/get_specs.py b/lib/get_specs.py
index eac7726..5092333 100644
--- a/lib/get_specs.py
+++ b/lib/get_specs.py
@@ -459,4 +459,22 @@ class Specs:
             total_swap_gb = total_swap_kb / (1024) / 1000 # Convert KB to GB
             return round(total_swap_gb, 4)
         except Exception as e:
-            return 0
\ No newline at end of file
+            return 0
+
+def get_root_device():
+    try:
+        mount_info = subprocess.check_output(['findmnt', '-n', '-o', 'SOURCE', '/']).decode().strip()
+        return mount_info
+    except (subprocess.CalledProcessError, OSError):
+        return None
+
+def is_usb_device(device):
+    try:
+        lsblk_output = subprocess.check_output(['lsblk', '-o', 'NAME,TRAN', '-n']).decode().strip()
+        for line in lsblk_output.splitlines():
+            parts = line.split()
+            if len(parts) == 2 and device.endswith(parts[0]):
+                return parts[1] == 'usb'
+    except (subprocess.CalledProcessError, OSError):
+        return True
+    return False
\ No newline at end of file
diff --git a/lib/xfs.py b/lib/xfs.py
index 4d7a0ac..2949d8e 100644
--- a/lib/xfs.py
+++ b/lib/xfs.py
@@ -3,6 +3,7 @@ from lib import ensure_packages_installed
 from lib import logging as logging_lib
 from lib import docker_interface
 from lib import networking
+from lib import get_specs
 from lib import utils
 
 import asyncio
@@ -13,12 +14,20 @@ log = logging_lib.log
 
 DOCKER_ROOT = "/var/lib/docker"
 DOCKER_DATA_IMG = "/opt/clore-hosting/data.img"
-LEAVE_FREE_SPACE_MB = 1024*24 # 24 GB
 
-MIN_XFS_PARTITION_SIZE = 1024*24 # 24 GB
+HP_LEAVE_FREE_SPACE_MB = 1024*24 # 24 GB
+HP_MIN_XFS_PARTITION_SIZE = 1024*24 # 24 GB
+
+GENERIC_LEAVE_FREE_SPACE_MB = 1024*8 # 8 GB
+GENERIC_MIN_XFS_PARTITION_SIZE = 1024*10 # 10 GB
 
 XFS_STATE_FILE = "/opt/clore-hosting/xfs_state"
 
+HIGH_PERFORMANCE_GPUS = [
+    "NVIDIA GeForce RTX 4090",
+    "NVIDIA GeForce RTX 3090"
+]
+
 MANDATORY_PACKAGES = [
     "xfsprogs",
     "dmidecode",
@@ -34,9 +43,28 @@ MANDATORY_PACKAGES = [
 # sudo mkfs.xfs /docker-storage.img
 # mount -o loop,pquota /docker-storage.img /mnt/docker-storage
 
+def get_to_use_storage_values(max_free_space):
+    gpu_str, gpu_mem, gpus, nvml_err = get_specs.get_gpu_info()
+    if nvml_err:
+        return None, None
+    try:
+        gpu_names = []
+        for gpu in gpus["nvidia"]:
+            gpu_names.append(gpu["name"])
+        if len(gpu_names) > 0:
+            all_gpus_same = all(item == gpu_names[0] for item in gpu_names)
+            if (all_gpus_same and gpu_names[0] in HIGH_PERFORMANCE_GPUS) or max_free_space > 1024 * 70:
+                return HP_LEAVE_FREE_SPACE_MB, HP_MIN_XFS_PARTITION_SIZE
+            else:
+                return GENERIC_LEAVE_FREE_SPACE_MB, GENERIC_MIN_XFS_PARTITION_SIZE
+        else:
+            return None, None
+    except Exception as e:
+        return None, None
+
+
 def migrate():
     docker_xfs_state = validate_docker_xfs()
-    #print(docker_xfs_state)
     if docker_xfs_state == "skip":
         return
     elif docker_xfs_state == "valid":
@@ -49,6 +77,15 @@ def migrate():
     if not packages_available:
         return 'packages-missing'
 
+    root_device = get_specs.get_root_device()
+    if not root_device:
+        return "not-supported-boot-device"
+
+    device_name = os.path.basename(root_device).split('p')[0].rstrip('0123456789')
+
+    if get_specs.is_usb_device(device_name):
+        return "not-supported-boot-device"
+
     log.info("Starting migration to xfs")
     docker_interface.stop_all_containers()
 
@@ -60,9 +97,13 @@ def migrate():
         return "failure"
 
     max_free_space = utils.get_free_space_mb('/') + utils.get_directory_size_mb(DOCKER_ROOT)
+
+    leave_free_space, min_xfs_size = get_to_use_storage_values(max_free_space)
+    if leave_free_space is None:
+        return "failure"
 
-    data_img_size = int(max_free_space - LEAVE_FREE_SPACE_MB)
-    if data_img_size < MIN_XFS_PARTITION_SIZE:
+    data_img_size = int(max_free_space - leave_free_space)
+    if data_img_size < min_xfs_size:
         return 'not-enough-space'
 
     docker_config_success = False
@@ -184,6 +225,10 @@ def init():
                 with open(XFS_STATE_FILE, 'w') as file:
                     file.write("not-enough-space")
                 return 'not-enough-space'
+            elif migarion_status == "not-supported-boot-device":
+                with open(XFS_STATE_FILE, 'w') as file:
+                    file.write("not-supported-boot-device")
+                return 'failed'
             else:
                 with open(XFS_STATE_FILE, 'w') as file:
                     file.write("failed-migration")
@@ -198,4 +243,4 @@ def init():
             return "disabled"
     except Exception as e:
         print(e)
-        pass
\ No newline at end of file
+        return "failed"
\ No newline at end of file