Reference:
https://cloud.tencent.com/developer/article/1626387
It is said that calling torch.cuda.empty_cache() in PyTorch releases cached GPU memory, so I ran a few experiments.
The code:
import torch
import time
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "3"
device = 'cuda:2'

dummy_tensor_4 = torch.randn(120, 3, 512, 512).float().to(device)  # 120*3*512*512*4/1024/1024 = 360.0M

memory_allocated = torch.cuda.memory_allocated(device)/1024/1024
memory_reserved = torch.cuda.memory_reserved(device)/1024/1024
print("Stage 1:")
print("Tensor dtype:", dummy_tensor_4.dtype)
print("Memory the tensor actually occupies:", 120*3*512*512*4/1024/1024, "M")
print("GPU memory allocated to tensors:", memory_allocated, "M")
print("GPU memory reserved (cache):", memory_reserved, "M")

torch.cuda.empty_cache()
time.sleep(15)
memory_allocated = torch.cuda.memory_allocated(device)/1024/1024
memory_reserved = torch.cuda.memory_reserved(device)/1024/1024
print("Stage 2:")
print("After emptying the cache:", "."*100)
print("Memory the tensor actually occupies:", 120*3*512*512*4/1024/1024, "M")
print("GPU memory allocated to tensors:", memory_allocated, "M")
print("GPU memory reserved (cache):", memory_reserved, "M")

del dummy_tensor_4
torch.cuda.empty_cache()
time.sleep(15)
memory_allocated = torch.cuda.memory_allocated(device)/1024/1024
memory_reserved = torch.cuda.memory_reserved(device)/1024/1024
print("Stage 3:")
print("After deleting the tensor and emptying the cache:", "."*100)
print("Memory the tensor actually occupies:", 0, "M")
print("GPU memory allocated to tensors:", memory_allocated, "M")
print("GPU memory reserved (cache):", memory_reserved, "M")

time.sleep(60)
Output:
(Screenshot of the stage 1, 2, and 3 output omitted.)
===================================================
As the output shows, creating a 360M tensor in PyTorch actually occupied 1321M of GPU memory in total: the tensor itself accounted for 360M and the cache reported another 360M, leaving 1321 - 360*2 = 601M that I could not explain at the time, which seemed quite strange. (In hindsight, memory_reserved already includes memory_allocated, as the 1060 experiment below suggests, so the leftover is most likely the per-process CUDA context, which typically takes several hundred MB.)
Overall, torch.cuda.empty_cache() does have its uses, but they are fairly limited.
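Distilled from the first experiment, here is a minimal sketch of the pattern that actually frees memory back to the driver, assuming any CUDA-enabled PyTorch build: every reference to the tensor must be dropped before the cache is emptied.

import torch

x = torch.randn(120, 3, 512, 512, device='cuda:0')  # ~360M, as in the experiment above
del x                       # the block moves from "allocated" into PyTorch's cache
torch.cuda.empty_cache()    # unreferenced cached blocks are returned to the driver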
===================================================
Modified code:
import torch
import time
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "3"
device = 'cuda:2'

dummy_tensor_4 = torch.randn(120, 3, 512, 512).float().to(device)  # 120*3*512*512*4/1024/1024 = 360.0M
dummy_tensor_5 = torch.randn(120, 3, 512, 512).float().to(device)  # 120*3*512*512*4/1024/1024 = 360.0M

memory_allocated = torch.cuda.memory_allocated(device)/1024/1024
memory_reserved = torch.cuda.memory_reserved(device)/1024/1024
print("Stage 1:")
print("Tensor dtype:", dummy_tensor_4.dtype)
print("Memory the tensors actually occupy:", 2*120*3*512*512*4/1024/1024, "M")
print("GPU memory allocated to tensors:", memory_allocated, "M")
print("GPU memory reserved (cache):", memory_reserved, "M")

torch.cuda.empty_cache()
time.sleep(15)
memory_allocated = torch.cuda.memory_allocated(device)/1024/1024
memory_reserved = torch.cuda.memory_reserved(device)/1024/1024
print("Stage 2:")
print("After emptying the cache:", "."*100)
print("Memory the tensors actually occupy:", 2*120*3*512*512*4/1024/1024, "M")
print("GPU memory allocated to tensors:", memory_allocated, "M")
print("GPU memory reserved (cache):", memory_reserved, "M")

del dummy_tensor_4
del dummy_tensor_5
torch.cuda.empty_cache()
time.sleep(15)
memory_allocated = torch.cuda.memory_allocated(device)/1024/1024
memory_reserved = torch.cuda.memory_reserved(device)/1024/1024
print("Stage 3:")
print("After deleting the tensors and emptying the cache:", "."*100)
print("Memory the tensors actually occupy:", 0, "M")
print("GPU memory allocated to tensors:", memory_allocated, "M")
print("GPU memory reserved (cache):", memory_reserved, "M")

time.sleep(60)
(Screenshot of the stage 1, 2, and 3 output omitted.)
There is still GPU memory that cannot be accounted for.
=============================================
The experiments above were all run on a Titan with 24G of memory. I then decided to repeat them on a 1060, whose 6G of memory makes the numbers easier to interpret.
Code:
import torch
import time
import os
import functools

# os.environ["CUDA_VISIBLE_DEVICES"] = "3"
device = 'cuda:0'

shape_ = (4, 1024, 512, 512)  # 4GB
# dummy_tensor_4 = torch.randn(120, 3, 512, 512).float().to(device)  # 120*3*512*512*4/1024/1024 = 360.0M
# dummy_tensor_5 = torch.randn(10, 120, 3, 512, 512).float().to(device)  # 120*3*512*512*4/1024/1024 = 360.0M
dummy_tensor_6 = torch.randn(*shape_).float().to(device)

memory_allocated = torch.cuda.memory_allocated(device)/1024/1024
memory_reserved = torch.cuda.memory_reserved(device)/1024/1024
print("Stage 1:")
print("Tensor dtype:", dummy_tensor_6.dtype)
print("Memory the tensor actually occupies:", functools.reduce(lambda x, y: x*y, shape_)*4/1024/1024, "M")
print("GPU memory allocated to tensors:", memory_allocated, "M")
print("GPU memory reserved (cache):", memory_reserved, "M")

torch.cuda.empty_cache()
time.sleep(15)
memory_allocated = torch.cuda.memory_allocated(device)/1024/1024
memory_reserved = torch.cuda.memory_reserved(device)/1024/1024
print("Stage 2:")
print("After emptying the cache:", "."*100)
print("GPU memory allocated to tensors:", memory_allocated, "M")
print("GPU memory reserved (cache):", memory_reserved, "M")

del dummy_tensor_6
torch.cuda.empty_cache()
time.sleep(15)
memory_allocated = torch.cuda.memory_allocated(device)/1024/1024
memory_reserved = torch.cuda.memory_reserved(device)/1024/1024
print("Stage 3:")
print("After deleting the tensor and emptying the cache:", "."*100)
print("GPU memory allocated to tensors:", memory_allocated, "M")
print("GPU memory reserved (cache):", memory_reserved, "M")

time.sleep(60)
Output:
(Screenshot of the stage 1, 2, and 3 output omitted.)
Since the card has only 6G of memory in total, while
memory_allocated
memory_reserved
both report about 4G, the two numbers must refer to overlapping memory: memory_allocated is counted inside memory_reserved, not in addition to it.
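A quick sketch to check this overlap directly (any CUDA-enabled PyTorch; the tensor size here is arbitrary):

import torch

device = 'cuda:0'
t = torch.randn(1024, 1024, device=device)  # ~4M tensor
allocated = torch.cuda.memory_allocated(device)
reserved = torch.cuda.memory_reserved(device)
assert allocated <= reserved  # allocated blocks always live inside the reserved pool
print(allocated/1024/1024, "M allocated inside", reserved/1024/1024, "M reserved")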
As the output shows, running torch.cuda.empty_cache() on its own
does not release any memory (nvidia-smi still reports 4775MB), but after running:
del dummy_tensor_6
torch.cuda.empty_cache()
the memory is released, dropping to 679MB.
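To confirm this at the driver level without leaving Python, here is a hedged sketch using torch.cuda.mem_get_info, which I believe requires PyTorch 1.10 or later:

import torch

device = 'cuda:0'
t = torch.randn(256, 512, 512, device=device)  # ~256M
free_before, total = torch.cuda.mem_get_info(device)

del t
torch.cuda.empty_cache()  # hand the cached blocks back to the driver

free_after, _ = torch.cuda.mem_get_info(device)
print("driver-visible free memory grew by", (free_after - free_before)/1024/1024, "M")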
Modified code:
import torch
import time
import os
import functools

# os.environ["CUDA_VISIBLE_DEVICES"] = "3"
device = 'cuda:0'

shape_ = (4, 1024, 512, 512)  # 4GB
# dummy_tensor_4 = torch.randn(120, 3, 512, 512).float().to(device)  # 120*3*512*512*4/1024/1024 = 360.0M
# dummy_tensor_5 = torch.randn(10, 120, 3, 512, 512).float().to(device)  # 120*3*512*512*4/1024/1024 = 360.0M
dummy_tensor_6 = torch.randn(*shape_).float().to(device)

memory_allocated = torch.cuda.memory_allocated(device)/1024/1024
memory_reserved = torch.cuda.memory_reserved(device)/1024/1024
print("Stage 1:")
print("After creating the tensor:", "."*100)
print("Tensor dtype:", dummy_tensor_6.dtype)
print("Memory the tensor actually occupies:", functools.reduce(lambda x, y: x*y, shape_)*4/1024/1024, "M")
print("GPU memory allocated to tensors:", memory_allocated, "M")
print("GPU memory reserved (cache):", memory_reserved, "M")

torch.cuda.empty_cache()
time.sleep(15)
memory_allocated = torch.cuda.memory_allocated(device)/1024/1024
memory_reserved = torch.cuda.memory_reserved(device)/1024/1024
print("Stage 2:")
print("After emptying the cache:", "."*100)
print("Tensor dtype:", dummy_tensor_6.dtype)
print("GPU memory allocated to tensors:", memory_allocated, "M")
print("GPU memory reserved (cache):", memory_reserved, "M")

# for _ in range(10000):
#     dummy_tensor_6 += 0.001
#     print(torch.sum(dummy_tensor_6))

del dummy_tensor_6  # note: no torch.cuda.empty_cache() this time
time.sleep(15)
memory_allocated = torch.cuda.memory_allocated(device)/1024/1024
memory_reserved = torch.cuda.memory_reserved(device)/1024/1024
print("Stage 3:")
print("After deleting the tensor (no empty_cache):", "."*100)
print("GPU memory allocated to tensors:", memory_allocated, "M")
print("GPU memory reserved (cache):", memory_reserved, "M")

time.sleep(60)
Output:
nvidia-smi reports the same memory usage for stages 1, 2, and 3 (screenshot omitted).
Without calling torch.cuda.empty_cache(), deleting a GPU tensor does not release its memory back to the GPU; that memory remains held by PyTorch's cache.
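A minimal sketch of this behaviour, assuming a fresh process: deleting the tensor empties "allocated" but leaves "reserved" untouched.

import torch

device = 'cuda:0'
t = torch.randn(256, 512, 512, device=device)  # ~256M
del t  # no empty_cache() afterwards

print(torch.cuda.memory_allocated(device)/1024/1024, "M allocated")  # drops to ~0
print(torch.cuda.memory_reserved(device)/1024/1024, "M reserved")    # still ~256M
# nvidia-smi keeps reporting this memory against the process until
# torch.cuda.empty_cache() is called or the process exits.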
================================================
Summary:
torch.cuda.memory_reserved() is the total GPU memory the process has reserved (tensor memory plus cache, etc.).
torch.cuda.memory_allocated() is the GPU memory currently allocated to live tensors in the process.
torch.cuda.memory_reserved() - torch.cuda.memory_allocated()
is the free space inside the process's reservation, i.e. the size of the cache: memory the process holds but is not currently using (not the GPU's globally free memory).
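The summary can be packaged into a small reporting helper (the function name is my own, not a PyTorch API):

import torch

def gpu_mem_report(device='cuda:0'):
    allocated = torch.cuda.memory_allocated(device)/1024/1024
    reserved = torch.cuda.memory_reserved(device)/1024/1024
    # reserved - allocated = memory the process holds but is not using,
    # i.e. the caching allocator's free blocks (not the GPU's global free memory)
    print(f"allocated: {allocated:.1f} M | reserved: {reserved:.1f} M | "
          f"cache free: {reserved - allocated:.1f} M")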
================================================