Automatic XFS migration; allow a smaller disk size for XFS; exclude machines that boot from USB from XFS migration

This commit is contained in:
clore 2024-12-28 23:08:10 +00:00
parent 4e1e72da25
commit bce71c4574
3 changed files with 78 additions and 8 deletions

View File

@ -27,6 +27,7 @@ import asyncio
import time import time
import json import json
from aiofiles import os as async_os from aiofiles import os as async_os
import aiofiles
import os import os
specs = get_specs.Specs() specs = get_specs.Specs()
@ -438,6 +439,12 @@ class CloreClient:
can_run_partner_workloads = False if ((not is_order_spot) and running_order) else True can_run_partner_workloads = False if ((not is_order_spot) and running_order) else True
clore_partner_socket.set_can_deploy(can_run_partner_workloads) clore_partner_socket.set_can_deploy(can_run_partner_workloads)
if not running_order and self.xfs_state == "disabled":
async with aiofiles.open("/opt/clore-hosting/xfs_state", mode='w') as file:
await file.write("enabled")
log.info("No order running, requesting XFS migration")
os._exit(0)
if self.restart_docker and not running_order and len(self.containers)>0: if self.restart_docker and not running_order and len(self.containers)>0:
log.debug("Sending docker restart command") log.debug("Sending docker restart command")
utils.run_command_v2("systemctl restart docker") utils.run_command_v2("systemctl restart docker")
@ -512,7 +519,7 @@ class CloreClient:
async def submit_specs(self, current_specs): async def submit_specs(self, current_specs):
try: try:
if type(current_specs) == dict: if type(current_specs) == dict:
current_specs["backend_version"]=20 current_specs["backend_version"]=21
current_specs["update_hw"]=True current_specs["update_hw"]=True
smallest_pcie_width = 999 smallest_pcie_width = 999
for gpu in current_specs["gpus"]["nvidia"]: for gpu in current_specs["gpus"]["nvidia"]:

View File

@ -459,4 +459,22 @@ class Specs:
total_swap_gb = total_swap_kb / (1024) / 1000 # Convert KB to GB total_swap_gb = total_swap_kb / (1024) / 1000 # Convert KB to GB
return round(total_swap_gb, 4) return round(total_swap_gb, 4)
except Exception as e: except Exception as e:
return 0 return 0
def get_root_device():
    """Return the source device of the root ("/") mount, or None on failure.

    Runs ``findmnt -n -o SOURCE /`` and returns its stripped output
    (for example ``/dev/sda2``). The string may be empty if findmnt
    printed nothing.

    Returns:
        str | None: the reported mount source, or None when findmnt exits
        with a non-zero status or cannot be executed at all.
    """
    try:
        raw_output = subprocess.check_output(['findmnt', '-n', '-o', 'SOURCE', '/'])
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: findmnt ran but reported an error.
        # OSError (e.g. FileNotFoundError): findmnt is missing or not
        # executable; report it the same way instead of crashing the caller.
        return None
    return raw_output.decode().strip()
def is_usb_device(device):
    """Report whether *device* is attached over the USB transport.

    Runs ``lsblk -o NAME,TRAN -n`` and inspects every output line that has
    exactly two whitespace-separated fields (name and transport). The first
    line whose name is a suffix of *device* decides the answer.

    Args:
        device: block device name or path, e.g. ``sda`` or ``/dev/sda``.

    Returns:
        bool: True if the matching device's transport is ``usb``; True as
        well when lsblk fails or cannot be executed (the failure path keeps
        the original "assume USB" answer); False if the transport differs or
        no line matches.
    """
    try:
        lsblk_output = subprocess.check_output(['lsblk', '-o', 'NAME,TRAN', '-n']).decode().strip()
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing/non-executable lsblk binary, which would
        # otherwise propagate; treat it like any other lsblk failure.
        return True

    for line in lsblk_output.splitlines():
        parts = line.split()
        # Lines without a transport column (typically partitions) have a
        # single field and are skipped.
        if len(parts) == 2 and device.endswith(parts[0]):
            return parts[1] == 'usb'
    return False

View File

@ -3,6 +3,7 @@ from lib import ensure_packages_installed
from lib import logging as logging_lib from lib import logging as logging_lib
from lib import docker_interface from lib import docker_interface
from lib import networking from lib import networking
from lib import get_specs
from lib import utils from lib import utils
import asyncio import asyncio
@ -13,12 +14,20 @@ log = logging_lib.log
DOCKER_ROOT = "/var/lib/docker" DOCKER_ROOT = "/var/lib/docker"
DOCKER_DATA_IMG = "/opt/clore-hosting/data.img" DOCKER_DATA_IMG = "/opt/clore-hosting/data.img"
LEAVE_FREE_SPACE_MB = 1024*24 # 24 GB
MIN_XFS_PARTITION_SIZE = 1024*24 # 24 GB HP_LEAVE_FREE_SPACE_MB = 1024*24 # 24 GB
HP_MIN_XFS_PARTITION_SIZE = 1024*24 # 24 GB
GENERIC_LEAVE_FREE_SPACE_MB = 1024*8 # 8 GB
GENERIC_MIN_XFS_PARTITION_SIZE = 1024*10 # 10 GB
XFS_STATE_FILE = "/opt/clore-hosting/xfs_state" XFS_STATE_FILE = "/opt/clore-hosting/xfs_state"
HIGH_PERFORMANCE_GPUS = [
"NVIDIA GeForce RTX 4090",
"NVIDIA GeForce RTX 3090"
]
MANDATORY_PACKAGES = [ MANDATORY_PACKAGES = [
"xfsprogs", "xfsprogs",
"dmidecode", "dmidecode",
@ -34,9 +43,28 @@ MANDATORY_PACKAGES = [
# sudo mkfs.xfs /docker-storage.img # sudo mkfs.xfs /docker-storage.img
# mount -o loop,pquota /docker-storage.img /mnt/docker-storage # mount -o loop,pquota /docker-storage.img /mnt/docker-storage
def get_to_use_storage_values(max_free_space):
    """Pick the (leave-free-space, minimum-XFS-size) pair for this machine.

    Args:
        max_free_space: space available for the XFS image; compared against
            ``1024 * 70`` (presumably MB, matching the *_MB constants —
            confirm against the caller).

    Returns:
        tuple: ``(HP_LEAVE_FREE_SPACE_MB, HP_MIN_XFS_PARTITION_SIZE)`` when
        all NVIDIA GPUs share one name listed in ``HIGH_PERFORMANCE_GPUS`` or
        when ``max_free_space`` exceeds ``1024 * 70``; the GENERIC pair
        otherwise; ``(None, None)`` when GPU info reports an NVML error, no
        NVIDIA GPU is listed, or anything raises while inspecting the data.
    """
    _, _, gpus, nvml_err = get_specs.get_gpu_info()
    if nvml_err:
        return None, None

    try:
        names = [gpu["name"] for gpu in gpus["nvidia"]]
        if not names:
            return None, None

        reference_name = names[0]
        uniform_high_perf = (
            all(name == reference_name for name in names)
            and reference_name in HIGH_PERFORMANCE_GPUS
        )
        if uniform_high_perf or max_free_space > 1024 * 70:
            return HP_LEAVE_FREE_SPACE_MB, HP_MIN_XFS_PARTITION_SIZE
        return GENERIC_LEAVE_FREE_SPACE_MB, GENERIC_MIN_XFS_PARTITION_SIZE
    except Exception:
        # Any malformed GPU data is reported as "no usable values".
        return None, None
def migrate(): def migrate():
docker_xfs_state = validate_docker_xfs() docker_xfs_state = validate_docker_xfs()
#print(docker_xfs_state)
if docker_xfs_state == "skip": if docker_xfs_state == "skip":
return return
elif docker_xfs_state == "valid": elif docker_xfs_state == "valid":
@ -49,6 +77,15 @@ def migrate():
if not packages_available: if not packages_available:
return 'packages-missing' return 'packages-missing'
root_device = get_specs.get_root_device()
if not root_device:
return "not-supported-boot-device"
device_name = os.path.basename(root_device).split('p')[0].rstrip('0123456789')
if get_specs.is_usb_device(device_name):
return "not-supported-boot-device"
log.info("Starting migration to xfs") log.info("Starting migration to xfs")
docker_interface.stop_all_containers() docker_interface.stop_all_containers()
@ -60,9 +97,13 @@ def migrate():
return "failure" return "failure"
max_free_space = utils.get_free_space_mb('/') + utils.get_directory_size_mb(DOCKER_ROOT) max_free_space = utils.get_free_space_mb('/') + utils.get_directory_size_mb(DOCKER_ROOT)
leave_free_space, min_xfs_size = get_to_use_storage_values(max_free_space)
if leave_free_space == None:
return "failure"
data_img_size = int(max_free_space - LEAVE_FREE_SPACE_MB) data_img_size = int(max_free_space - leave_free_space)
if data_img_size < MIN_XFS_PARTITION_SIZE: if data_img_size < min_xfs_size:
return 'not-enough-space' return 'not-enough-space'
docker_config_success = False docker_config_success = False
@ -184,6 +225,10 @@ def init():
with open(XFS_STATE_FILE, 'w') as file: with open(XFS_STATE_FILE, 'w') as file:
file.write("not-enough-space") file.write("not-enough-space")
return 'not-enough-space' return 'not-enough-space'
elif migarion_status == "not-supported-boot-device":
with open(XFS_STATE_FILE, 'w') as file:
file.write("not-supported-boot-device")
return 'failed'
else: else:
with open(XFS_STATE_FILE, 'w') as file: with open(XFS_STATE_FILE, 'w') as file:
file.write("failed-migration") file.write("failed-migration")
@ -198,4 +243,4 @@ def init():
return "disabled" return "disabled"
except Exception as e: except Exception as e:
print(e) print(e)
pass return "failed"