Xin-Rui
/

QWen2.5-eval-NEWA800

Model card Files Files and versions

QWen2.5-eval-NEWA800 / remaining_eval /GPU_hunter_1.py

Xin-Rui's picture

Upload folder using huggingface_hub

a80200a verified 7 months ago

history blame contribute delete

1.78 kB

	import subprocess
	import time
	import logging


	# Root-logger setup: write timestamped DEBUG-and-above records to a file.
	#
	# The handler, formatter and level are configured explicitly instead of via
	# logging.basicConfig(): basicConfig() silently does nothing when the root
	# logger already has a handler, so combining addHandler() with basicConfig()
	# left the level and the format string unapplied.
	log_file_path = "gpu_hunter.log"
	logger = logging.getLogger()
	logger.setLevel(logging.DEBUG)
	_file_handler = logging.FileHandler(log_file_path)
	_file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
	logger.addHandler(_file_handler)
	print("start hunting")
	def get_gpu_memory_usage(gpu_ids="4,5,6,7"):
	    """Query used and total memory of selected GPUs through ``nvidia-smi``.

	    Args:
	        gpu_ids: Comma-separated GPU indices handed to ``nvidia-smi -i``.
	            Defaults to ``"4,5,6,7"``.

	    Returns:
	        A list with one ``"used, total"`` string per reported GPU (plain
	        numbers, because the query uses ``noheader,nounits``), an empty
	        list if ``nvidia-smi`` printed nothing, or ``None`` if the query
	        could not be run or exited with an error.
	    """
	    query = [
	        "nvidia-smi",
	        "--query-gpu=memory.used,memory.total",
	        "--format=csv,noheader,nounits",
	        "-i", gpu_ids,
	    ]
	    try:
	        output = subprocess.check_output(query, universal_newlines=True)
	    except (subprocess.SubprocessError, OSError) as e:
	        # Missing binary, non-zero exit status, etc.: report and let the
	        # caller treat the GPUs as unavailable.
	        print(f"Error getting GPU info: {e}")
	        return None

	    # Drop blank lines so empty output yields [] rather than [''].
	    return [line.strip() for line in output.splitlines() if line.strip()]

	def check_low_usage(threshold=10, gpu_data=None):
	    """Tell whether every monitored GPU is below a memory-usage threshold.

	    Args:
	        threshold: Usage percentage (used / total * 100) that a GPU must
	            stay strictly below to count as idle.
	        gpu_data: Optional list of ``"used, total"`` rows to evaluate.
	            When omitted, the rows are fetched with
	            ``get_gpu_memory_usage()``.

	    Returns:
	        True only if at least one row was read and every row is below
	        ``threshold``. False if no data is available, if a row cannot be
	        parsed, or if any GPU is at or above the threshold.
	    """
	    if gpu_data is None:
	        gpu_data = get_gpu_memory_usage()
	    if not gpu_data:
	        return False

	    rows_checked = 0
	    for row in gpu_data:
	        if not row.strip():
	            continue  # ignore blank lines in the tool output
	        try:
	            # int() tolerates surrounding whitespace, so both "1,2" and
	            # "1, 2" parse; a wrong field count also raises ValueError.
	            used, total = (int(field) for field in row.split(','))
	        except ValueError:
	            # Be conservative: never report "idle" based on unreadable data.
	            print(f"Unparseable GPU memory row: {row!r}")
	            return False
	        if total <= 0:
	            return False  # avoid dividing by zero on a bogus total
	        usage_percent = (used / total) * 100
	        if usage_percent >= threshold:
	            return False
	        rows_checked += 1
	    return rows_checked > 0

	def main(check_interval=60 * 3,
	         command_to_run="bash /mnt/lyc/wuxinrui/Qwen2.5-Math/evaluation/remaining_eval/TCMv2_RL.sh",
	         conda_env="QMath-wxr",
	         threshold=10):
	    """Poll GPU memory until the GPUs are idle, then launch a command once.

	    Args:
	        check_interval: Seconds to wait between two checks.
	        command_to_run: Shell command to execute once the GPUs are idle;
	            replace it with the command you need.
	        conda_env: Name of the conda environment to activate for the command.
	        threshold: Usage percentage passed to ``check_low_usage``.
	    """
	    # Each subprocess.run(..., shell=True) call gets its own shell, so an
	    # activation done in one call is gone before the next call starts.
	    # Activate the environment and run the command in a single bash process.
	    launch_script = (
	        'source "$(conda info --base)/etc/profile.d/conda.sh" && '
	        f'conda activate {conda_env} && '
	        f'{command_to_run}'
	    )

	    while True:
	        if check_low_usage(threshold=threshold):
	            print(f"All GPUs have memory usage below {threshold}%. Executing command...")

	            result = subprocess.run(["bash", "-c", launch_script])
	            if result.returncode != 0:
	                print(f"Command exited with status {result.returncode}.")

	            print("Command executed. Exiting GPU monitoring.")
	            break  # leave the loop and stop monitoring

	        print("GPUs are in use. Waiting...")
	        time.sleep(check_interval)

	# Start the GPU monitor only when this file is executed as a script.
	if __name__ == "__main__":
	    main()