Compare commits

..

3 Commits

Author SHA1 Message Date
empresa bf2d3e4fd8 Merge branch 'main' of https://git.clore.ai/clore/onboarding 2025-09-10 00:58:00 +07:00
empresa 9fe12b767c Clear last_used_config on error. 2025-09-10 00:56:15 +07:00
clore 5b36318cd4 clore-rentals (#1) - Clore rentals flightsheet implemented. 2025-09-09 12:10:06 +00:00
1 changed file with 3 additions and 41 deletions

View File

@@ -11,7 +11,6 @@ import re
import os import os
import socket import socket
import asyncio import asyncio
import traceback
from urllib.parse import urlparse from urllib.parse import urlparse
import subprocess import subprocess
from functools import partial from functools import partial
@@ -21,34 +20,22 @@ class logger:
GREEN = '\033[92m' GREEN = '\033[92m'
BLUE = '\033[94m' BLUE = '\033[94m'
RESET = '\033[0m' RESET = '\033[0m'
LOG_FILE = '/opt/clore-hosting/clore_onboarding.log'
@staticmethod @staticmethod
def _get_current_time(): def _get_current_time():
return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@staticmethod
def _log_to_file(level, message):
try:
with open(logger.LOG_FILE, 'a') as f:
f.write(f"{logger._get_current_time()} | {level} | {message}\n")
except Exception:
pass
@staticmethod @staticmethod
def error(message): def error(message):
print(f"{logger.RED}{logger._get_current_time()} | ERROR | {message}{logger.RESET}") print(f"{logger.RED}{logger._get_current_time()} | ERROR | {message}{logger.RESET}")
logger._log_to_file("ERROR", message)
@staticmethod @staticmethod
def success(message): def success(message):
print(f"{logger.GREEN}{logger._get_current_time()} | SUCCESS | {message}{logger.RESET}") print(f"{logger.GREEN}{logger._get_current_time()} | SUCCESS | {message}{logger.RESET}")
logger._log_to_file("SUCCESS", message)
@staticmethod @staticmethod
def info(message): def info(message):
print(f"{logger.BLUE}{logger._get_current_time()} | INFO | {message}{logger.RESET}") print(f"{logger.BLUE}{logger._get_current_time()} | INFO | {message}{logger.RESET}")
logger._log_to_file("INFO", message)
if os.geteuid() != 0: if os.geteuid() != 0:
logger.error("This script must be run as root!") logger.error("This script must be run as root!")
@@ -395,13 +382,7 @@ async def post_request(url, body, headers=None, timeout=15):
return status_code, response_data return status_code, response_data
except (http.client.HTTPException, TimeoutError) as e: except (http.client.HTTPException, TimeoutError) as e:
logger.error(f"Request failed: {e}") print(f"Request failed: {e}")
return None, None
except socket.gaierror as e:
logger.error(f"DNS resolution failed for {url}: {e}")
return None, None
except Exception as e:
logger.error(f"Unexpected error in post_request: {e}")
return None, None return None, None
finally: finally:
conn.close() conn.close()
@@ -458,11 +439,8 @@ if args.write_linux_config:
sys.exit(1) sys.exit(1)
async def main(machine_specs): async def main(machine_specs):
logger.info("Started onboarding")
global next_retry_reached_server_limit global next_retry_reached_server_limit
last_used_config = None last_used_config = None
logger.info("logger.info(last_used_config) 1")
logger.info(last_used_config)
ever_pending_creation = False ever_pending_creation = False
machine_id = get_machine_id() machine_id = get_machine_id()
@@ -476,13 +454,9 @@ async def main(machine_specs):
logger.error("Can't load default power limits of nVidia GPU(s)") logger.error("Can't load default power limits of nVidia GPU(s)")
sys.exit(1) sys.exit(1)
logger.info("logger.info(last_used_config) 2")
logger.info(last_used_config)
oc_config = {} oc_config = {}
while True: while True:
try: try:
logger.info("Looping")
if args.mode == "linux": if args.mode == "linux":
clore_config = await async_read_file(clore_conf_path) clore_config = await async_read_file(clore_conf_path)
clore_config = json.loads(clore_config) clore_config = json.loads(clore_config)
@@ -492,10 +466,7 @@ async def main(machine_specs):
machine_name, clore_config, oc_config = await hive_load_configs(default_power_limits, static_clore_config) machine_name, clore_config, oc_config = await hive_load_configs(default_power_limits, static_clore_config)
#print(f"Machine Name: {machine_name}") #print(f"Machine Name: {machine_name}")
logger.info(args.mode)
config_validation = validate_clore_config(clore_config) config_validation = validate_clore_config(clore_config)
if config_validation == "Validation successful": if config_validation == "Validation successful":
if "save_config" in clore_config and args.mode == "hive": if "save_config" in clore_config and args.mode == "hive":
verify_or_update_file(clore_conf_path, json.dumps(clore_config)) verify_or_update_file(clore_conf_path, json.dumps(clore_config))
@@ -504,12 +475,6 @@ async def main(machine_specs):
clore_config["clear_oc_override"] = True clore_config["clear_oc_override"] = True
else: else:
clore_config["stock_oc_override"] = oc_config clore_config["stock_oc_override"] = oc_config
logger.info("Is config different")
logger.info(clore_config != last_used_config)
logger.info(clore_config)
logger.info(last_used_config)
if clore_config != last_used_config or (time.time() > next_retry_reached_server_limit and next_retry_reached_server_limit > 0): if clore_config != last_used_config or (time.time() > next_retry_reached_server_limit and next_retry_reached_server_limit > 0):
last_used_config = clore_config.copy() last_used_config = clore_config.copy()
if type(clore_config) == dict and "hostname_override" in clore_config: if type(clore_config) == dict and "hostname_override" in clore_config:
@@ -556,13 +521,10 @@ async def main(machine_specs):
logger.error(f"Could not parse config - {' | '.join(config_validation)}") logger.error(f"Could not parse config - {' | '.join(config_validation)}")
except Exception as e: except Exception as e:
logger.error(f"Exception: {e}") print(e)
logger.error(f"Traceback: {traceback.format_exc()}") last_used_config = None
await asyncio.sleep(5) await asyncio.sleep(5)
logger.info("logger.info(last_used_config) 3")
logger.info(last_used_config)
if __name__ == "__main__": if __name__ == "__main__":
machine_specs = specs.get(benchmark_disk=True, mock=args.mock) machine_specs = specs.get(benchmark_disk=True, mock=args.mock)
asyncio.run(main(machine_specs)) asyncio.run(main(machine_specs))