dev: debug
This commit is contained in:
parent
daa634bfa9
commit
96bd92e5f0
|
@ -554,10 +554,13 @@ class CloreClient:
|
||||||
try:
|
try:
|
||||||
await monitoring.put("specs_service")
|
await monitoring.put("specs_service")
|
||||||
current_specs = await specs.get()
|
current_specs = await specs.get()
|
||||||
|
print(current_specs)
|
||||||
if self.last_hw_specs_submit < (utils.unix_timestamp()-1800):
|
if self.last_hw_specs_submit < (utils.unix_timestamp()-1800):
|
||||||
self.last_hw_specs_submit=utils.unix_timestamp()
|
self.last_hw_specs_submit=utils.unix_timestamp()
|
||||||
await self.submit_specs(current_specs)
|
await self.submit_specs(current_specs)
|
||||||
|
print("submit specs")
|
||||||
await self.update_realtime_data(current_specs)
|
await self.update_realtime_data(current_specs)
|
||||||
|
print("update realtime")
|
||||||
try:
|
try:
|
||||||
if self.xfs_state == "active" and len(current_specs["gpus"]["nvidia"]) > 0 and not self.runned_pull_selftest:
|
if self.xfs_state == "active" and len(current_specs["gpus"]["nvidia"]) > 0 and not self.runned_pull_selftest:
|
||||||
await clore_partner.check_to_pull_selftest(current_specs)
|
await clore_partner.check_to_pull_selftest(current_specs)
|
||||||
|
|
|
@ -291,8 +291,6 @@ def get_gpu_info():
|
||||||
"mem_used": parts[12]
|
"mem_used": parts[12]
|
||||||
}
|
}
|
||||||
|
|
||||||
print(xl_gpu_info)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
pci_query = parts[2][parts[2].find(':')+1:]
|
pci_query = parts[2][parts[2].find(':')+1:]
|
||||||
for index, valid_pci_dev in enumerate(valid_pci_dev_list):
|
for index, valid_pci_dev in enumerate(valid_pci_dev_list):
|
||||||
|
@ -302,8 +300,6 @@ def get_gpu_info():
|
||||||
xl_gpu_info["pcie_width"]=bus_spec.width
|
xl_gpu_info["pcie_width"]=bus_spec.width
|
||||||
xl_gpu_info["pcie_revision"]=bus_spec.revision
|
xl_gpu_info["pcie_revision"]=bus_spec.revision
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("err")
|
|
||||||
print(e)
|
|
||||||
pass
|
pass
|
||||||
gpus["nvidia"].append(xl_gpu_info)
|
gpus["nvidia"].append(xl_gpu_info)
|
||||||
lines = nvidia_smi_stdout.split('\n')
|
lines = nvidia_smi_stdout.split('\n')
|
||||||
|
@ -313,21 +309,13 @@ def get_gpu_info():
|
||||||
|
|
||||||
gpu_name = parts[1].strip()
|
gpu_name = parts[1].strip()
|
||||||
gpu_id = parts[5].strip();
|
gpu_id = parts[5].strip();
|
||||||
print("parts")
|
|
||||||
print(parts)
|
|
||||||
print("p1:" + gpu_name)
|
|
||||||
print("p1:" + gpu_id)
|
|
||||||
if gpu_name == "NVIDIA Graphics Device" and gpu_id in GPU_ID_TO_NAME:
|
if gpu_name == "NVIDIA Graphics Device" and gpu_id in GPU_ID_TO_NAME:
|
||||||
gpu_name = GPU_ID_TO_NAME[gpu_id]
|
gpu_name = GPU_ID_TO_NAME[gpu_id]
|
||||||
print("p2:" + gpu_name)
|
|
||||||
|
|
||||||
gpu_str = f"{len(lines)-1}x {gpu_name}"
|
gpu_str = f"{len(lines)-1}x {gpu_name}"
|
||||||
print(gpu_str)
|
|
||||||
gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2)
|
gpu_mem = round(int(filter_non_numeric(parts[4]).strip())/1024, 2)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
nvml_err=True
|
nvml_err=True
|
||||||
print("err2")
|
|
||||||
print(e)
|
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
nvml_err=True
|
nvml_err=True
|
||||||
|
|
Loading…
Reference in New Issue