# nvidia-smi命令可以查看GPU使用情況,但是只能看到占用每個GPU的進程ID。根據進程ID可以得到進程詳情,進程詳情中包括用戶ID,根據用戶ID可以獲取用戶名稱,從而知道哪個用戶在使用GPU。
import json
import os
import re
import sys
import time
import typing
import bidict
"""
查看誰在使用GPU
"""
def get_user_id_map() -> typing.Dict[str, str]:
    """Build the mapping from user ID to user name.

    Candidate user names are the entries of the directory that contains the
    current user's home directory. Each candidate is looked up in the system
    password database; entries that are not accounts are skipped.

    Returns:
        A ``bidict`` whose keys are user IDs (as strings) and whose values
        are the matching user names.
    """
    # Imported locally: ``pwd`` only exists on Unix, and the rest of this
    # module does not need it.
    import pwd

    home = os.path.expanduser('~')
    users = bidict.bidict()
    for user_name in os.listdir(os.path.join(home, '..')):
        # Query the password database directly instead of shelling out to
        # ``id``: no shell quoting issues with odd directory names, no
        # dependence on the locale of the error message, and names that
        # contain '-' or '.' are handled.
        try:
            entry = pwd.getpwnam(user_name)
        except KeyError:
            continue  # this directory does not belong to an account
        try:
            users[str(entry.pw_uid)] = entry.pw_name  # userid ==> username
        except bidict.DuplicationError as e:
            # Best effort, as before: report the clash and keep going.
            print(e)
    return users
def nvidia_smi(
    smi_output: typing.Optional[str] = None,
) -> typing.Tuple[int, typing.Dict[int, typing.List[str]]]:
    """Parse an ``nvidia-smi`` report into a GPU count and per-GPU process IDs.

    The report has two parts: an upper table with one entry per GPU, and a
    lower "Processes:" table with one row per process running on a GPU.

    Args:
        smi_output: Text of an ``nvidia-smi`` report. When omitted, the
            ``nvidia-smi`` command is run and its standard output is parsed.

    Returns:
        A ``(gpu_count, gpu_usage)`` tuple. ``gpu_usage`` maps a GPU index to
        the list of process IDs (as strings) listed for that GPU, in report
        order. GPUs with no listed process have no key. If the report has no
        recognisable tables the result is ``(0, {})``.
    """
    if smi_output is None:
        smi_output = os.popen('nvidia-smi').read()
    lines = smi_output.split('\n')

    # Upper table: starts after the first '|====' rule and ends at the first
    # blank line. Each GPU entry spans several lines (the number differs
    # between driver versions), but only its first line has the GPU index as
    # the first cell, so count those lines.
    gpu_count = 0
    in_gpu_table = False
    for line in lines:
        if not in_gpu_table:
            in_gpu_table = line.startswith('|====')
            continue
        if not line.strip():
            break
        fields = line.split()
        if len(fields) > 1 and fields[1].isdecimal():
            gpu_count += 1

    # Lower table: everything after the "Processes:" line.
    gpu_usage = dict()
    in_processes = False
    pid_col = 2  # position of the PID cell when no column header is found
    for line in lines:
        fields = line.split()
        if len(fields) < 2:
            continue  # blank lines and '+----+' / '|====|' rules
        if not in_processes:
            in_processes = fields[1] == 'Processes:'
            continue
        if fields[1] == 'GPU' and 'PID' in fields:
            # Column header: the PID is not always the second column (some
            # layouts put GI/CI columns in front of it), so read its
            # position from the header instead of assuming it.
            pid_col = fields.index('PID')
        elif (len(fields) > pid_col and fields[1].isdecimal()
                and fields[pid_col].isdecimal()):
            gpu_usage.setdefault(int(fields[1]), []).append(fields[pid_col])
        # Anything else (e.g. "No running processes found", continuation
        # header lines) carries no process and is ignored.
    return gpu_count, gpu_usage
def get_thread_info(thread_id: str):
    """Describe a process: owning user, accumulated time and command line.

    The details come from the first data row printed by ``ps -l <thread_id>``.

    Args:
        thread_id: Process ID, as a string.

    Returns:
        A dict with the keys ``user``, ``use_time``, ``thread_id`` and
        ``cmd``. ``user`` is the name found through ``get_user_id_map`` (or
        ``None`` when the UID is not in that map). When ``ps`` yields no
        usable row for the process (for example because it has already
        exited), ``user``, ``use_time`` and ``cmd`` are all ``None``.
    """
    ps_lines = os.popen('ps -l ' + thread_id).read().split('\n')
    # Line 0 is the column header; line 1 is the process itself, if present.
    thread_info = ps_lines[1].split() if len(ps_lines) > 1 else []
    # NOTE(review): the indices below assume the column order
    # F S UID PID PPID C PRI NI ADDR SZ WCHAN TTY TIME CMD... - confirm on
    # the target platform.
    time_match = re.search(r'\d+', thread_info[12]) if len(thread_info) > 12 else None
    if time_match is None:
        # The process vanished between the GPU query and this lookup, or the
        # row is not in the expected shape: report it as unknown instead of
        # failing with IndexError/AttributeError.
        return dict(user=None, use_time=None, thread_id=thread_id, cmd=None)

    # Only build the user map once we know there is a row to resolve.
    id2user = get_user_id_map()
    thread_user = id2user.get(thread_info[2])
    # The first number of the TIME column is divided by 60 and labelled
    # hours, i.e. it is presumably a minutes field - TODO confirm.
    thread_time = time_match.group()
    thread_cmd = ' '.join(thread_info[13:])
    return dict(user=thread_user, use_time="{} hours".format(float(thread_time) / 60), thread_id=thread_id, cmd=thread_cmd)
def grep_gpu(task, poll_interval=2):
    """Wait until some GPU has no listed process, then run a shell command.

    ``nvidia_smi`` is polled until a GPU index below the reported GPU count
    is missing from the usage map. The lowest such index is printed and
    ``task`` is then executed with ``os.system``.

    Note that this function does not set ``CUDA_VISIBLE_DEVICES``; the index
    that was found is only printed. It keeps polling for as long as no GPU
    is free (including when no GPU is reported at all).

    Args:
        task: Shell command line to execute once a GPU is free.
        poll_interval: Seconds to wait between two polls.
    """
    while True:
        gpu_count, usage = nvidia_smi()
        free_gpu = next((i for i in range(gpu_count) if i not in usage), None)
        if free_gpu is not None:
            break
        # Sleep only after an unsuccessful poll, so a GPU that is already
        # free is taken without delay.
        time.sleep(poll_interval)
    print('free gpu found ! ', free_gpu)
    os.system(task)
def show():
    """Print the GPU count and, for each GPU in use, details of its processes.

    The per-GPU process IDs reported by ``nvidia_smi`` are expanded with
    ``get_thread_info`` and the resulting mapping is printed as indented
    JSON without ASCII escaping.
    """
    gpu_count, usage = nvidia_smi()
    details = {
        gpu_id: [get_thread_info(pid) for pid in pids]
        for gpu_id, pids in usage.items()
    }
    print('gpu count', gpu_count)
    report = json.dumps(details, ensure_ascii=False, indent=2)
    print(report)
def run(gpu_id, task):
    """Run a shell command with ``CUDA_VISIBLE_DEVICES`` set to ``gpu_id``.

    The variable is set in this process's environment (as ``str(gpu_id)``),
    echoed through the shell, and then ``task`` is executed with
    ``os.system``.
    """
    os.environ.update(CUDA_VISIBLE_DEVICES=str(gpu_id))
    for command in ('echo CUDA_VISIBLE_DEVICES:$CUDA_VISIBLE_DEVICES', task):
        os.system(command)
if __name__ == '__main__':
    # Command-line entry point. The first argument selects the action:
    #   show            print GPU usage together with process details
    #   grep <cmd...>   wait for a free GPU, then run the command
    #   <N> <cmd...>    run the command with CUDA_VISIBLE_DEVICES=N
    print(sys.argv)
    if len(sys.argv) == 1:
        print("""
GPU utility
gpu show
gpu grep your command here
gpu 1 python haha.py
""")
        # sys.exit rather than the bare exit(): the latter is injected by
        # the site module and is not guaranteed to exist.
        sys.exit(0)
    action = sys.argv[1]
    if action == 'show':  # display GPU usage
        show()
    elif action == 'grep':  # compete for a GPU, run the command once one is free
        cmd = ' '.join(sys.argv[2:])
        print('grep gpu and run', cmd)
        grep_gpu(cmd)
    elif re.fullmatch(r"\d+", action):  # run the command on the given gpu_id
        # fullmatch: the whole argument must be numeric, so something like
        # "1abc" is reported as unknown instead of crashing in int().
        gpu_id = int(action)
        cmd = ' '.join(sys.argv[2:])
        print('run on gpu', gpu_id, 'cmd', cmd)
        run(gpu_id, cmd)
    else:
        print("unkown command")