这是一个用于监视 NVIDIA GPU 训练进程的工具，可以防止卡死的训练进程长期占用 GPU 资源。
可以根据自己的情况调整时间阈值（代码中 KILL_THRESHOLD / zerocount > 300 的判断）。
"""Watchdog for NVIDIA GPU training jobs.

Polls ``nvidia-smi pmon`` once per second and tracks, for every compute
process, how many consecutive samples showed 0% SM utilization.  A process
stuck at zero utilization for more than KILL_THRESHOLD consecutive samples
is assumed to be hung and is killed, freeing the GPU.

Tune KILL_THRESHOLD below to your own situation (roughly seconds of zero
utilization tolerated, since we sample about once per second).
"""
import os
import signal
import subprocess
import time

import psutil

# Consecutive zero-utilization samples (~seconds) before a process is killed.
KILL_THRESHOLD = 300
# Consecutive zero-utilization samples after which we start printing warnings.
WARN_THRESHOLD = 10


def poll_gpu_processes():
    """Return the per-process data lines of one ``nvidia-smi pmon -c 1`` sample."""
    result = subprocess.run(['nvidia-smi', 'pmon', '-c', '1'],
                            stdout=subprocess.PIPE)
    # The first two lines of pmon output are column headers — drop them.
    return result.stdout.decode('utf-8').split('\n')[2:]


def update_counts(nvtask, lines):
    """Update per-PID consecutive-zero-utilization counters in place.

    ``nvtask`` maps pid -> number of consecutive samples at 0% SM utilization.
    ``lines`` are raw pmon data lines (columns: gpu pid type sm mem enc dec cmd).
    """
    for line in lines:
        fields = line.split()
        # Need at least gpu/pid/type/sm — short or blank lines are skipped.
        # (Original guard was len < 2, which crashed on 2-3 field lines.)
        if len(fields) < 4:
            continue
        # Skip graphics processes and the '-' placeholder rows of idle GPUs.
        if fields[2] in ('G', '-'):
            continue
        pid = int(fields[1])
        # SM utilization may be printed as '-' when unreadable; treat that
        # as "not provably idle" rather than crashing on int('-').
        if fields[3].isdigit() and int(fields[3]) == 0:
            nvtask[pid] = nvtask.get(pid, 0) + 1
            if nvtask[pid] > WARN_THRESHOLD:
                print('Process {} counted for {}'.format(fields[1], nvtask[pid]))
        else:
            nvtask[pid] = 0


def reap(nvtask):
    """Drop counters for exited PIDs and kill processes stuck at zero util."""
    # Iterate a snapshot: we delete entries while walking the dict.
    for pid, zerocount in list(nvtask.items()):
        if not psutil.pid_exists(pid):
            del nvtask[pid]
        elif zerocount > KILL_THRESHOLD:
            os.kill(pid, signal.SIGKILL)


def main():
    """Sample forever: count idle processes, reap hung ones, refresh screen."""
    nvtask = {}
    while True:
        update_counts(nvtask, poll_gpu_processes())
        reap(nvtask)
        # Show live GPU and temperature status for the operator.
        os.system('nvidia-smi')
        os.system('sensors')
        time.sleep(1)
        os.system('clear')


if __name__ == '__main__':
    main()