2024-12-02 00:06:53 +00:00
|
|
|
from lib import ensure_packages_installed
|
|
|
|
from lib import config as config_module
|
|
|
|
from lib import logging as logging_lib
|
|
|
|
from lib import clore_partner_socket
|
|
|
|
from lib import latency_test
|
|
|
|
from lib import openvpn
|
|
|
|
from lib import utils
|
|
|
|
import asyncio
|
|
|
|
import random
|
|
|
|
import json
|
|
|
|
import time
|
|
|
|
import re
|
|
|
|
|
|
|
|
import os
|
|
|
|
import aiofiles.os
|
|
|
|
from aiohttp import ClientSession, ClientTimeout
|
|
|
|
|
|
|
|
# Module-level shortcuts to the shared configuration object and logger.
config = config_module.config
log = logging_lib.log

# System packages handed to ensure_packages_installed() by initialize().
MANDATORY_PACKEGES = ['dmidecode', 'openvpn', 'iproute2']

# Image name that filter_partner_dummy_workload_container() removes from
# container lists.
DUMMY_WORKLOAD_CONTAINER = "cloreai/partner-dummy-workload"
|
|
|
|
|
2024-12-10 22:28:51 +00:00
|
|
|
# Environment mapping passed as the third argument of
# utils.async_run_command() when pulling the selftest image.
non_interactive_env = {
    'DEBIAN_FRONTEND': 'noninteractive',
    'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
}
|
|
|
|
|
2024-12-02 00:06:53 +00:00
|
|
|
# Directories below the partner base dir; both are created by initialize()
# and bind-mounted into the partner container by configure().
host_facts_location = os.path.join(config.clore_partner_base_dir, "host_facts")
partner_cache_location = os.path.join(config.clore_partner_base_dir, "partner_cache")

# Unix timestamp after which initialize() re-runs the package check;
# float('inf') once the packages were installed successfully.
next_ensupe_packages_check = 0
# Set to True by initialize() once the partner socket service task was started.
is_socket_running = False

# Container description built by configure(); None until it has run.
partner_container_config = None
|
|
|
|
|
|
|
|
async def initialize():
    """Prepare the host for running the partner service.

    Steps, in order:
      1. create ``host_facts_location``, ``partner_cache_location`` and
         ``/etc/openvpn/client``;
      2. start ``clore_partner_socket.socket_service`` as a background task
         (only on the first call, guarded by ``is_socket_running``);
      3. make sure ``MANDATORY_PACKEGES`` are installed, retrying at most
         once per hour after a failure;
      4. store the output of ``dmidecode -t 2`` and ``dmidecode`` in
         ``host_facts_location``.

    Returns:
        bool: True when every step succeeded. False when the package check
        failed (or is still in its retry back-off), a ``dmidecode`` call
        failed, or any exception was raised (it is logged, not propagated).
    """
    global next_ensupe_packages_check
    global is_socket_running
    global _socket_service_task
    try:
        for directory in (host_facts_location, partner_cache_location, "/etc/openvpn/client"):
            await aiofiles.os.makedirs(directory, exist_ok=True)

        if not is_socket_running:
            is_socket_running = True
            # Keep a strong reference to the task: the event loop only holds
            # weak references, so an unreferenced task may be garbage
            # collected before it finishes.
            _socket_service_task = asyncio.create_task(clore_partner_socket.socket_service(
                location=os.path.join(host_facts_location, "partner_interface.socket")
            ))

        if next_ensupe_packages_check < time.time():
            success = await ensure_packages_installed.ensure_packages_installed(MANDATORY_PACKEGES, None)
            # inf = never check again; otherwise retry in 1hr
            next_ensupe_packages_check = float('inf') if success else time.time() + 60*60
            if not success:
                return False
        elif next_ensupe_packages_check != float('inf'):
            # A previous attempt failed and the retry time is not reached yet.
            return False

        # stderr is redirected into stdout by the shell ("2>&1").
        # The second positional argument is presumably a timeout in seconds
        # (TODO confirm against utils.async_run_command).
        for command, file_name in (
            ("dmidecode -t 2 2>&1", "dmidecode_t2.txt"),
            ("dmidecode 2>&1", "dmidecode.txt"),
        ):
            code, stdout, stderr = await utils.async_run_command(command, 20)
            if code != 0 or stderr:
                return False
            async with aiofiles.open(os.path.join(host_facts_location, file_name), mode='w') as file:
                await file.write(stdout)
        return True
    except Exception as e:
        log.error(f"FAIL | clore_partner.initialize | {e}")
        return False
|
|
|
|
|
|
|
|
async def get_partner_allowed_images():
    """List the distinct images named in the partner's cached container list.

    Reads ``container_list.json`` from ``partner_cache_location`` and collects
    each entry's ``Config.Image`` value, keeping first-seen order and skipping
    empty values and duplicates.

    Returns:
        list: the image names; an empty list when the file does not exist.
        None: when reading or parsing the file raised an exception.
    """
    try:
        list_path = os.path.join(partner_cache_location, "container_list.json")
        if not await aiofiles.os.path.exists(list_path):
            return []

        async with aiofiles.open(list_path, mode='r') as handle:
            raw = await handle.read()

        unique_images = []
        for entry in json.loads(raw):
            image_name = entry.get("Config", {}).get("Image")
            if image_name and image_name not in unique_images:
                unique_images.append(image_name)
        return unique_images
    except Exception:
        return None
|
|
|
|
|
|
|
|
def validate_partner_container_name(name):
    """Tell whether *name* is a container name belonging to the partner.

    Accepted are the configured partner container name
    (``config.clore_partner_container_name``) and names of the exact form
    ``C.<digits>``.

    Args:
        name: value to check; anything that is not a ``str`` is rejected.

    Returns:
        bool: True when the name is accepted, False otherwise.
    """
    if not isinstance(name, str):
        return False
    if name == config.clore_partner_container_name:
        return True
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, so "C.1\n" used to be accepted. [0-9] instead of \d
    # keeps non-ASCII Unicode digits out.
    return re.fullmatch(r"C\.[0-9]+", name) is not None
|
|
|
|
|
|
|
|
def validate_partner_workload_container_name(name):
    """Tell whether *name* has the exact form ``C.<digits>``.

    Args:
        name: value to check; anything that is not a ``str`` is rejected.

    Returns:
        bool: True for names such as ``"C.123"``, False otherwise.
    """
    if not isinstance(name, str):
        return False
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, so "C.1\n" used to be accepted. [0-9] instead of \d
    # keeps non-ASCII Unicode digits out.
    return re.fullmatch(r"C\.[0-9]+", name) is not None
|
|
|
|
|
|
|
|
# Last partner_config for which openvpn.clore_partner_configure() returned a
# truthy result; configure() compares against it to skip repeated work.
last_openvpn_config = None
|
|
|
|
|
|
|
|
def get_partner_container_config():
    """Return the module-level ``partner_container_config`` value.

    This is the dict assigned by ``configure()``, or None while the
    module-level initial value is still in place.
    """
    # Reading a module global needs no ``global`` declaration.
    return partner_container_config
|
|
|
|
|
|
|
|
async def configure(partner_config):
    """Build the partner container description and configure OpenVPN.

    Does nothing when *partner_config* equals the configuration that was
    last applied successfully. Otherwise ``partner_container_config`` is
    rebuilt and *partner_config* is passed to
    ``openvpn.clore_partner_configure``; only a truthy result from that call
    marks the configuration as applied, so a failed attempt is retried on
    the next call.

    Args:
        partner_config: mapping providing ``partner_image``, ``partner_id``,
            ``machine_id``, ``openvpn_host`` and ``ports`` (an indexable
            with at least two entries).

    Returns:
        None
    """
    global last_openvpn_config
    global partner_container_config
    if last_openvpn_config == partner_config:
        return

    ports = partner_config["ports"]
    partner_container_config = {
        "image": partner_config["partner_image"],
        "name": config.clore_partner_container_name,
        "hostname": f"{partner_config['partner_id'][:16]}-m{partner_config['machine_id']}",
        "env": {
            "AUTH": partner_config["partner_id"],
            "ip_addr": partner_config["openvpn_host"],
            # Reusing the enclosing quote character inside an f-string is a
            # SyntaxError before Python 3.12, so no nested quotes here.
            "port_range": f"{ports[0]}-{ports[1]}",
        },
        "volumes": {
            f"{host_facts_location}": {"bind": "/var/lib/vastai_kaalia/specs_source"},
            f"{partner_cache_location}": {"bind": "/var/lib/vastai_kaalia/data"},
            "/var/lib/docker": {"bind": "/var/lib/docker"},
            "/var/run/docker.sock": {"bind": "/var/run/docker.sock"},
        },
        "gpus": True,
        "command": '',
        "network": "clore-partner-br0",
        "ip": "172.19.0.254",
        "cap_add": ["SYS_ADMIN"],
        "devices": ["/dev/fuse"],
        #"security_opt": ["apparmor:unconfined"],
        # Only the second entry of ``ports`` is published, mapped to itself.
        "ports": [f"{ports[1]}:{ports[1]}"],
    }

    r = await openvpn.clore_partner_configure(partner_config)
    if r:
        last_openvpn_config = partner_config
|
|
|
|
|
|
|
|
# -----------------------------------------
|
|
|
|
|
|
|
|
# Unix timestamp of the next allowed latency measurement: while it lies in
# the future measure_forwarding_latency() returns False immediately.
# Updated through set_next_latency_measurment().
next_latency_measurment = 0
|
|
|
|
|
|
|
|
async def fetch_forwarding_nodes():
    """Fetch the relay description from the Clore API.

    Issues ``GET https://api.clore.ai/v1/get_relay`` with a total timeout of
    30 seconds.

    Returns:
        The decoded JSON body on success; None when the session could not be
        set up, the request failed, the status was an error, or the body was
        not valid JSON (the error is logged).
    """
    url = "https://api.clore.ai/v1/get_relay"
    timeout = ClientTimeout(total=30)

    try:
        async with ClientSession(timeout=timeout) as session:
            async with session.get(url) as response:
                response.raise_for_status()
                return await response.json()
    except Exception as e:
        # Report through the module logger, like initialize() does, instead
        # of printing to stdout.
        log.error(f"FAIL | clore_partner.fetch_forwarding_nodes | {e}")
        return None
|
|
|
|
|
|
|
|
async def set_next_latency_measurment(ts):
    """Set the time of the next latency measurement and persist it.

    The in-memory value is always updated. Writing it to
    ``.next_latency_measurment`` in the partner base dir is best-effort:
    a failure is logged and otherwise ignored.

    Args:
        ts: timestamp to store; written to the file as ``str(ts)``.
    """
    global next_latency_measurment
    next_latency_measurment = ts
    try:
        async with aiofiles.open(os.path.join(config.clore_partner_base_dir, ".next_latency_measurment"), mode='w') as file:
            await file.write(str(ts))
    except Exception as e:
        # Still best-effort, but no longer silent: without the file the
        # schedule does not survive a restart.
        log.error(f"FAIL | clore_partner.set_next_latency_measurment | {e}")
|
|
|
|
|
|
|
|
async def measure_forwarding_latency():
    """Measure ICMP latency to the relay nodes when a measurement is due.

    The due time lives in ``next_latency_measurment`` and is refreshed from
    the ``.next_latency_measurment`` file in the partner base dir. After an
    attempt the next run is scheduled depending on the outcome:

      * relay response has an unexpected shape -> retry in 12 hours
      * response lists no node addresses       -> retry in 72 hours
      * not every listed region was reachable  -> retry in 24 hours
      * every listed region was reachable      -> re-run in 30 days

    A region counts as reachable when at least one of its addresses has a
    result with ``received > 2``.

    Returns:
        The list returned by ``latency_test.measure_latency_icmp`` (each
        entry gets a ``"region"`` key added) when every region was
        reachable; False in every other case, including when no measurement
        is due or an exception was raised.
    """
    global next_latency_measurment

    if next_latency_measurment > time.time():
        return False
    try:
        await aiofiles.os.makedirs(config.clore_partner_base_dir, exist_ok=True)
        schedule_path = os.path.join(config.clore_partner_base_dir, ".next_latency_measurment")
        if await aiofiles.os.path.exists(schedule_path):
            async with aiofiles.open(schedule_path, mode='r') as file:
                # strip(): tolerate a trailing newline in the stored value
                content = (await file.read()).strip()
            if content.isdigit():
                next_latency_measurment = int(content)

        if next_latency_measurment >= time.time():
            return False

        node_info = await fetch_forwarding_nodes()
        response_ok = (
            isinstance(node_info, dict)
            and node_info.get("country")
            and isinstance(node_info.get("nodes"), dict)
            and node_info.get("code") == 0
        )
        if not response_ok:
            await set_next_latency_measurment(int(
                time.time() + 60*60*12 # Retry in 12hr, the response was not matching the required format
            ))
            return False

        to_test_nodes = []
        ip_to_region = {}
        for node_region, nodes_ip_list in node_info["nodes"].items():
            if isinstance(nodes_ip_list, list) and nodes_ip_list:
                to_test_nodes.extend(nodes_ip_list)
                for node_ip in nodes_ip_list:
                    ip_to_region[node_ip] = node_region

        if not to_test_nodes:
            await set_next_latency_measurment(int(
                time.time() + 60*60*72 # Retry in 72hr (clore partner service is not available in host country yet)
            ))
            return False

        measurment_result = await latency_test.measure_latency_icmp(to_test_nodes)
        valid_regions = set()
        if measurment_result:
            for res in measurment_result:
                region = ip_to_region.get(res["host"])
                if res["received"] > 2:
                    valid_regions.add(region)
                res["region"] = region

        # Compare against the set of regions, not the number of addresses:
        # ip_to_region is keyed by IP, so counting its keys made success
        # impossible as soon as one region listed more than one address.
        expected_regions = set(ip_to_region.values())
        if expected_regions <= valid_regions:
            await set_next_latency_measurment(int(
                time.time() + 60*60*24*30 # Re run in 30 days, because measurment succeeded
            ))
            return measurment_result

        await set_next_latency_measurment(int(
            time.time() + 60*60*24 # Retry in 24hr, all regions in country should be reacheable
        ))
        return False
    except Exception:
        return False
|
|
|
|
|
|
|
|
def filter_partner_dummy_workload_container(containers):
    """Drop every entry whose ``"image"`` is ``DUMMY_WORKLOAD_CONTAINER``.

    Args:
        containers: iterable of mappings, each with an ``"image"`` key.

    Returns:
        A new list with the remaining entries in their original order. When
        filtering raises (for example an entry without ``"image"`` or a
        non-iterable argument) the *containers* argument is returned as is.
    """
    try:
        return [
            entry
            for entry in containers
            if entry["image"] != DUMMY_WORKLOAD_CONTAINER
        ]
    except Exception:
        return containers
|
|
|
|
|
|
|
|
# GPU model names for which check_to_pull_selftest() may pre-pull the
# selftest image.
auto_pull_selftest_gpus = ["NVIDIA GeForce RTX 3090", "NVIDIA GeForce RTX 4090"]
|
|
|
|
|
|
|
|
async def check_to_pull_selftest(current_specs):
    """Pre-pull ``vastai/test:selftest`` when the machine specs qualify.

    The pull is started only when all of the following hold:
      * the last listed NVIDIA GPU reports a driver major version >= 550
      * its name is in ``auto_pull_selftest_gpus`` and all GPUs share a name
      * ``ram`` > 7 and the number before ``/`` in ``cpus`` is >= 4
      * the narrowest ``pcie_width`` (capped at 16) is > 1
      * summed GPU memory (MiB) is below ``ram * 1024``
      * the number in the last space-separated token of ``disk`` is > 25

    Any exception, including one caused by missing keys in *current_specs*,
    is swallowed.

    Args:
        current_specs: mapping with ``gpus["nvidia"]`` (list of mappings with
            ``name``, ``driver``, ``pcie_width``, ``mem_total``), ``ram``,
            ``cpus`` and ``disk``.

    Returns:
        None
    """
    try:
        narrowest_link = 16
        vram_mib = 0
        last_name = ''
        heterogeneous = False
        driver_major = 0

        for position, card in enumerate(current_specs["gpus"]["nvidia"]):
            if position > 0 and card["name"] != last_name:
                heterogeneous = True
            last_name = card["name"]
            driver_major = int(card["driver"].split('.')[0])
            narrowest_link = min(narrowest_link, card["pcie_width"])
            if " MiB" in card["mem_total"]:
                vram_mib += int(card["mem_total"].replace(" MiB", ''))

        eligible = (
            driver_major >= 550
            and last_name in auto_pull_selftest_gpus
            and current_specs["ram"] > 7
            and int(current_specs["cpus"].split('/')[0]) >= 4
            and not heterogeneous
            and narrowest_link > 1
            and vram_mib < current_specs["ram"] * 1024
            and float(current_specs["disk"].split(' ')[-1].replace("GB", '')) > 25
        )
        if eligible:
            await utils.async_run_command("docker pull vastai/test:selftest", 14400, non_interactive_env)
    except Exception:
        pass
|