实例
数据集
笔记本
笔记本

如何使用 PyTorch 进行半精度、混(合)精度训练
V100 的正确使用姿势?
PyTorch 半精度、混(合)精度训练,更少的 GPU 显存占用,更快的速度
笔记本内容
如何使用 PyTorch 进行半精度训练 #
实验: 混合精度训练对比 (RTX 3090 VS TESLA V100-SXM2) #
经常有小伙伴问我: TESLA V100 的显存只有 16GB,比 RTX 3090 的 24GB 少了 8GB,价格还一样,为啥要用 V100 呢?

使用低于 32 位浮点精度的数值格式有很多好处。首先,它们占用更少的内存,从而能够训练和部署更大的神经网络。其次,它们需要更少的内存带宽,从而加快数据传输。第三,低精度下的数学运算速度更快,尤其是在 Tensor Core 支持该精度的 GPU 上。混合精度训练可以同时获得以上所有好处,并且与全精度训练相比不会损失任务精度。
使用混合精度训练需要以下两个步骤:
- 移植模型以使用 FP16 数据类型。
- 添加损失缩放以保留较小的梯度值。
# Download the dataset (notebook shell escape; replace [token] with your own token)
!featurize -t [token] dataset download 12d20991-7d1a-4722-bf42-b3933bf34689
100%|██████████████████████████████████████| 1.16G/1.16G [00:05<00:00, 225MiB/s] 🍬 下载完成,正在解压... 🏁 数据集已经成功添加
# 以下两个包如果自己环境没有的同学请去掉注释运行
#%pip install nvidia-ml-py3 timm
import torch
import timm
import cv2
import os
import nvidia_smi
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Use seaborn's "whitegrid" theme for the plots in this notebook.
sns.set_theme(style="whitegrid")
# Load the training index; the preview below shows image, label, fold and path columns.
df = pd.read_csv('/home/featurize/data/FMY/train.csv')
# Preview the first two rows.
df.head(2)
image | label | fold | path | |
---|---|---|---|---|
0 | 1XmrT4KUOZldxPjf3JvzWBL8QGh2CFMV.jpg | 1 | 0 | /home/featurize/data/FMY/train/0_phone/JPEGImages |
1 | vHrM4APykiDYsfGghexZctRbldCNj5FK.jpg | 1 | 0 | /home/featurize/data/FMY/train/0_phone/JPEGImages |
# 定义 PyTorch 的 Dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that loads the images listed in a DataFrame.

    Each row of ``df`` is read through its ``path`` (directory), ``image``
    (file name) and ``label`` columns. Indexing returns an
    ``(image, label)`` pair of float tensors: the image is RGB in
    channel-first layout after being resized, and the label has shape
    ``(1,)``.

    Raises:
        FileNotFoundError: if the image file is missing or cannot be
            decoded by OpenCV.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df

    def __getitem__(self, index: int):
        row = self.df.iloc[index]
        image_path = os.path.join(row.path, row.image)

        # Read the image. cv2.imread reports failure by returning None
        # rather than raising, so check explicitly to get a clear error.
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(
                f'Image is missing or unreadable: {image_path}'
            )
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Bring every image to one size. cv2.resize takes dsize as
        # (width, height), so the result is 512 rows by 384 columns.
        image = cv2.resize(image, (384, 512), interpolation=cv2.INTER_LINEAR)
        # HWC -> CHW
        image = image.transpose(2, 0, 1)

        label = np.array([row.label])
        return torch.from_numpy(image).float(), torch.from_numpy(label).float()

    def __len__(self):
        return len(self.df)
# Wrap the DataFrame in the Dataset and build the shuffled training DataLoader.
train_dataset = Dataset(df)
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=32,
    num_workers=8,
    pin_memory=True,
)
# Pick one random sample and display it together with its label.
image, label = train_dataset[random.randrange(len(train_dataset))]
# The tensor is channel-first; imshow expects channel-last integer pixels.
plt.imshow(image.int().permute(1, 2, 0));
print('Label:', label.int().item())
Label: 1
# Create the model, optimizer and loss function.
# The timm model is built with num_classes=1 and moved to the GPU.
model = timm.create_model('tf_efficientnet_b0', num_classes=1).cuda()
# SGD with momentum over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
# BCEWithLogitsLoss applies the sigmoid itself, so the model output is used as a raw logit.
criterion = torch.nn.BCEWithLogitsLoss()
实验一 RTX 3090 VS V100 (单精度) #
# FP32 baseline: train for one epoch, then print the wall-clock start/finish
# times and the GPU memory usage reported by NVML.
now = datetime.now()
start_time = now.strftime("%H:%M:%S")
print("Start Time =", start_time)

for epoch in range(1):
    running_loss = 0.0
    for i, data in enumerate(train_dataloader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs, labels = inputs.cuda(), labels.cuda()

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # running_loss is the SUM of the batch losses since the last print,
        # not their mean.
        running_loss += loss.item()
        if i % 100 == 99:  # print every 100 mini-batches
            print(f'Epoch: {epoch + 1} Iterations: {i + 1} Loss: {running_loss}')
            running_loss = 0.0

print('Finished Training')
now = datetime.now()
finish_time = now.strftime("%H:%M:%S")
print("Finish Time =", finish_time)

# Query GPU 0 through NVML. The figures are device-wide, in bytes.
nvidia_smi.nvmlInit()
try:
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    print("Total memory:", info.total)
    print("Free memory:", info.free)
    print("Used memory:", info.used)
    print("GPU: ", nvidia_smi.nvmlDeviceGetName(handle))
finally:
    # Release NVML even if one of the queries above raises.
    nvidia_smi.nvmlShutdown()
Start Time = 15:17:17 Epoch: 1 Iterations: 100 Loss: 85.16591739654541 Epoch: 1 Iterations: 200 Loss: 79.9305801987648 Epoch: 1 Iterations: 300 Loss: 65.93301039934158 Epoch: 1 Iterations: 400 Loss: 67.16754561662674 Epoch: 1 Iterations: 500 Loss: 57.44469214975834 Finished Training Finish Time = 15:19:11 Total memory: 25447170048 Free memory: 11241324544 Used memory: 14205845504 GPU: b'NVIDIA GeForce RTX 3090'
# FP32 baseline, same cell as the previous run; the recorded output that
# follows it was produced on a Tesla V100. Trains for one epoch, then prints
# the wall-clock start/finish times and the GPU memory usage reported by NVML.
now = datetime.now()
start_time = now.strftime("%H:%M:%S")
print("Start Time =", start_time)

for epoch in range(1):
    running_loss = 0.0
    for i, data in enumerate(train_dataloader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs, labels = inputs.cuda(), labels.cuda()

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # running_loss is the SUM of the batch losses since the last print,
        # not their mean.
        running_loss += loss.item()
        if i % 100 == 99:  # print every 100 mini-batches
            print(f'Epoch: {epoch + 1} Iterations: {i + 1} Loss: {running_loss}')
            running_loss = 0.0

print('Finished Training')
now = datetime.now()
finish_time = now.strftime("%H:%M:%S")
print("Finish Time =", finish_time)

# Query GPU 0 through NVML. The figures are device-wide, in bytes.
nvidia_smi.nvmlInit()
try:
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    print("Total memory:", info.total)
    print("Free memory:", info.free)
    print("Used memory:", info.used)
    print("GPU: ", nvidia_smi.nvmlDeviceGetName(handle))
finally:
    # Release NVML even if one of the queries above raises.
    nvidia_smi.nvmlShutdown()
Start Time = 16:53:45 Epoch: 1 Iterations: 100 Loss: 97.43529093265533 Epoch: 1 Iterations: 200 Loss: 65.64091369509697 Epoch: 1 Iterations: 300 Loss: 59.835417211055756 Epoch: 1 Iterations: 400 Loss: 59.492636412382126 Epoch: 1 Iterations: 500 Loss: 54.997731268405914 Finished Training Finish Time = 16:55:47 Total memory: 16945512448 Free memory: 3001810944 Used memory: 13943701504 GPU: b'Tesla V100-SXM2-16GB'
实验二 RTX 3090 VS V100 (混合精度) #
# Mixed-precision run: the forward pass and loss are computed under autocast,
# and GradScaler scales the loss so that small gradients are preserved.
scaler = torch.cuda.amp.GradScaler()
autocast = torch.cuda.amp.autocast

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)

for epoch in range(1):
    running_loss = 0.0
    for i, data in enumerate(train_dataloader):
        inputs, labels = data
        inputs, labels = inputs.cuda(), labels.cuda()

        optimizer.zero_grad()
        # Only the forward pass and the loss belong inside autocast.
        with autocast():
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        # Backward runs outside autocast on the scaled loss. scaler.step
        # skips the optimizer step when the gradients contain inf/NaN, and
        # scaler.update adjusts the scale for the next iteration.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # running_loss is the SUM of the (unscaled) batch losses since the
        # last print, not their mean.
        running_loss += loss.item()
        if i % 100 == 99:  # print every 100 mini-batches
            print(f'Epoch: {epoch + 1} Iterations: {i + 1} Loss: {running_loss}')
            running_loss = 0.0

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Finish Time =", current_time)

# Query GPU 0 through NVML. The figures are device-wide, in bytes.
nvidia_smi.nvmlInit()
try:
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    print("Total memory:", info.total)
    print("Free memory:", info.free)
    print("Used memory:", info.used)
    print("GPU: ", nvidia_smi.nvmlDeviceGetName(handle))
finally:
    # Release NVML even if one of the queries above raises.
    nvidia_smi.nvmlShutdown()
Start Time = 15:20:14 Epoch: 1 Iterations: 100 Loss: 123.89001935720444 Epoch: 1 Iterations: 200 Loss: 73.2363004386425 Epoch: 1 Iterations: 300 Loss: 72.05745524168015 Epoch: 1 Iterations: 400 Loss: 58.50000710785389 Epoch: 1 Iterations: 500 Loss: 54.99997836351395 Finish Time = 15:22:38 Total memory: 25447170048 Free memory: 16354181120 Used memory: 9092988928 GPU: b'NVIDIA GeForce RTX 3090'
# Mixed-precision run, same cell as the previous AMP run; the recorded output
# that follows it was produced on a Tesla V100. The forward pass and loss are
# computed under autocast, and GradScaler scales the loss so that small
# gradients are preserved.
scaler = torch.cuda.amp.GradScaler()
autocast = torch.cuda.amp.autocast

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)

for epoch in range(1):
    running_loss = 0.0
    for i, data in enumerate(train_dataloader):
        inputs, labels = data
        inputs, labels = inputs.cuda(), labels.cuda()

        optimizer.zero_grad()
        # Only the forward pass and the loss belong inside autocast.
        with autocast():
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        # Backward runs outside autocast on the scaled loss. scaler.step
        # skips the optimizer step when the gradients contain inf/NaN, and
        # scaler.update adjusts the scale for the next iteration.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # running_loss is the SUM of the (unscaled) batch losses since the
        # last print, not their mean.
        running_loss += loss.item()
        if i % 100 == 99:  # print every 100 mini-batches
            print(f'Epoch: {epoch + 1} Iterations: {i + 1} Loss: {running_loss}')
            running_loss = 0.0

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Finish Time =", current_time)

# Query GPU 0 through NVML. The figures are device-wide, in bytes.
nvidia_smi.nvmlInit()
try:
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    print("Total memory:", info.total)
    print("Free memory:", info.free)
    print("Used memory:", info.used)
    print("GPU: ", nvidia_smi.nvmlDeviceGetName(handle))
finally:
    # Release NVML even if one of the queries above raises.
    nvidia_smi.nvmlShutdown()
Start Time = 17:00:27 Epoch: 1 Iterations: 100 Loss: 51.12177401781082 Epoch: 1 Iterations: 200 Loss: 51.52170616388321 Epoch: 1 Iterations: 300 Loss: 47.08749358355999 Epoch: 1 Iterations: 400 Loss: 50.16143310070038 Epoch: 1 Iterations: 500 Loss: 49.084938019514084 Finish Time = 17:01:56 Total memory: 16945512448 Free memory: 8861253632 Used memory: 8084258816 GPU: b'Tesla V100-SXM2-16GB'
# Used GPU memory in bytes, taken from the "Used memory" line printed by each
# of the four runs.
sns.barplot(
    x=[
        '3090 FP32',
        'V100 FP32',
        '3090 FP16',
        'V100 FP16',
    ],
    y=[
        14205845504,
        13943701504,  # corrected to the value printed by the V100 FP32 run
        9092988928,
        8084258816,
    ],
).set_title('GPU Memory');
# Speed is plotted as 1 / elapsed seconds, so a taller bar means a faster run.
# The durations are the Start Time -> Finish Time gaps from the run logs
# (the V100 FP16 gap is 89 s by the timestamps, plotted as 90).
sns.barplot(
    x=['3090 FP32', 'V100 FP32', '3090 FP16', 'V100 FP16'],
    y=[1 / seconds for seconds in (114, 122, 144, 90)],
).set_title('Speed');

评论(0条)