A GPU Monitoring Tool

This is a tool for monitoring training jobs on NVIDIA GPUs. It prevents a stuck training process from hanging on and holding GPU resources: any compute process whose GPU (SM) utilization stays at zero for too long is killed.
You can adjust the time threshold (the zerocount > 300 check, roughly 300 seconds at one sample per second) to suit your own situation.
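
The script parses the output of nvidia-smi pmon -c 1, which on typical driver versions looks roughly like the sample below (newer drivers may add extra columns such as jpg/ofa; only the first four, gpu/pid/type/sm, are used, and the two '#' header lines are skipped). The PIDs shown here are made up for illustration:

# gpu        pid  type    sm   mem   enc   dec   command
# Idx          #   C/G     %     %     %     %   name
    0       3254     C    95    40     0     0   python
    0       1137     G     0     0     0     0   Xorg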

import os
import signal
import subprocess
import time
import psutil

# Maps each compute process PID to the number of consecutive
# samples in which its GPU (SM) utilization was zero.
nvtask = {}

while True:

    # Take one sample of per-process GPU utilization.
    nvrescmd = subprocess.run(['nvidia-smi', 'pmon', '-c', '1'], stdout=subprocess.PIPE)
    nvresout = nvrescmd.stdout.decode('utf-8')
    # Drop the two pmon header lines.
    nvprocarr = nvresout.split('\n')[2:]
    for nvproc in nvprocarr:
        # Columns: gpu, pid, type, sm, mem, enc, dec, command
        nvprocinfo = nvproc.split()
        if len(nvprocinfo) < 4:
            continue
        # Skip graphics processes and GPUs with no running process ('G' or '-').
        if nvprocinfo[2] == 'G' or nvprocinfo[2] == '-':
            continue
        # The sm column can be '-' when utilization is not reported; skip those samples.
        if nvprocinfo[3] == '-':
            continue
        #print('gpu:{}, pid:{}, util:{}'.format(nvprocinfo[0], nvprocinfo[1], nvprocinfo[3]))
        pid = int(nvprocinfo[1])
        if int(nvprocinfo[3]) == 0:
            # Count consecutive samples with zero utilization for this process.
            nvtask[pid] = nvtask.get(pid, 0) + 1
            if nvtask[pid] > 10:
                # Warn once a process has been idle for more than ~10 samples.
                print('Process {} counted for {}'.format(pid, nvtask[pid]))
        else:
            # Any activity resets the counter.
            nvtask[pid] = 0
    for pid, zerocount in nvtask.copy().items():
        if not psutil.pid_exists(pid):
            # The process has exited; stop tracking it.
            del nvtask[pid]
        elif zerocount > 300:
            # Roughly 300 consecutive seconds of zero utilization: kill the stalled process.
            os.kill(pid, signal.SIGKILL)
    # Show current GPU and temperature status ('sensors' comes from lm-sensors), then refresh.
    os.system('nvidia-smi')
    os.system('sensors')
    time.sleep(1)
    os.system('clear')
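
To run the monitor, save the script (for example as gpu_monitor.py; the file name is arbitrary) and start it in a terminal, or under nohup or tmux so it keeps running after you log out:

nohup python gpu_monitor.py &

It must run as root or as the owner of the training processes, otherwise os.kill cannot deliver SIGKILL. The sensors call assumes the lm-sensors package is installed; remove that line if you do not need temperature readings.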