Using and Improving PyTorch's ReduceLROnPlateau Learning Rate Scheduler
When training deep learning models, the learning rate is one of the most important hyperparameters. A learning rate that is too large makes the model hard to converge, while one that is too small makes convergence unnecessarily slow. Learning rate schedulers address this by adjusting the learning rate adaptively during training.
PyTorch provides a scheduler called ReduceLROnPlateau, which reduces the learning rate when a monitored validation metric stops improving.
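Before walking through the full script, here is a minimal, self-contained sketch of the conventional calling pattern: the scheduler wraps the optimizer and scheduler.step() receives the monitored validation metric once per epoch. The toy linear model and the constant val_accuracy below are placeholders for illustration only, not part of the training script discussed later.

```python
import torch
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Toy model and optimizer, only to make the example self-contained
model = nn.Linear(10, 3)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# mode='max': the monitored metric (accuracy) should increase; once it has not
# improved for `patience` epochs, the learning rate is multiplied by `factor`.
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=10)

for epoch in range(30):
    # ... training step omitted ...
    val_accuracy = 0.5  # stand-in for the accuracy measured on the validation set
    scheduler.step(val_accuracy)  # pass the monitored metric once per epoch
    print(epoch, optimizer.param_groups[0]['lr'])
```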
Code Analysis
The following script shows how the ReduceLROnPlateau scheduler is used in a complete training loop:

```python
import torch
import os
import torchvision.models as models
from torch.utils import data
from torch import nn
from torch import optim
import numpy as np
import argparse
import time
from torch.optim.lr_scheduler import ReduceLROnPlateau
from data.MyDataset import Mydatasetpro
from data.MyDataset import all_imgs_path, all_labels, transform
from tensorboardX import SummaryWriter

parser = argparse.ArgumentParser(description='Soft')
parser.add_argument('--name', type=str, default='resnet50', metavar='N', help='model name')
parser.add_argument('--batch_size', type=int, default=32, metavar='N', help='input batch size for training')
parser.add_argument('--epochs', type=int, default=100, metavar='N', help='number of epochs to train')
parser.add_argument('--lr', type=float, default=1e-3, metavar='LR', help='learning rate')
parser.add_argument('--sample_size', type=int, default=256, help='sizes of input samples')
parser.add_argument('--log_path', type=str, default=r'D:\UC_data\resnet50log')
parser.add_argument('--lr_decay', type=float, default=0.1, help='learning rate decay value (default: 0.1)')
parser.add_argument('--lr_decay_epoch', type=int, nargs='+', default=[50, 80], help='decrease learning rate at these epochs.')
args = parser.parse_args()
print(args)

# Directory for saved checkpoints (raw string so the backslashes are not escape sequences)
DESTINATION_PATH = os.path.join(r'D:\UC_data\resnet50\saved_model', args.name + '_models')
if not os.path.isdir(DESTINATION_PATH):
    os.makedirs(DESTINATION_PATH)

# ==================================================================
# Set Device
# ==================================================================
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ==================================================================
# logger setting
# ==================================================================
store_name = '_'.join([args.name, str(args.sample_size), str(args.lr), str(time.time())])

# ==================================================================
# get data
# ==================================================================
# Shuffle, then split into training and test sets (80% for training)
index = np.random.permutation(len(all_imgs_path))
all_imgs_path = np.array(all_imgs_path)[index]
all_labels = np.array(all_labels)[index]

s = int(len(all_imgs_path) * 0.8)
train_imgs = all_imgs_path[:s]
train_labels = all_labels[:s]
test_imgs = all_imgs_path[s:]
test_labels = all_labels[s:]

train_ds = Mydatasetpro(train_imgs, train_labels, transform)
test_ds = Mydatasetpro(test_imgs, test_labels, transform)
train_loader = data.DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)
test_loader = data.DataLoader(test_ds, batch_size=args.batch_size, shuffle=False)

# ==================================================================
# Set Model
# ==================================================================
model = models.resnet50(pretrained=True)
in_features = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(in_features, 256),
    nn.ReLU(),
    nn.Linear(256, 3),
    nn.LogSoftmax(dim=1)
)

# Move the model to the GPU
model = model.to(DEVICE)

# Loss and optimizer: the classification head ends with LogSoftmax, so NLLLoss is the matching criterion
loss_fn = nn.NLLLoss()
loss_fn = loss_fn.to(DEVICE)
optimizer = torch.optim.Adam(model.fc.parameters(), lr=args.lr)  # only the new head is optimized
# ReduceLROnPlateau scheduler: accuracy is monitored, so mode='max'
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=10, verbose=True)

tf_writer = SummaryWriter(log_dir=os.path.join(args.log_path, args.name, store_name))

best_accuracy = 0.0   # best validation accuracy seen so far
best_epoch = 0
no_improve_count = 0  # consecutive epochs without improvement

for epoch in range(args.epochs):
    # ==============================================================
    # Train
    # ==============================================================
    count = 0
    train_loss = 0.0
    model.train()
    start_time = time.time()
    for imgs, labels in train_loader:
        count += 1
        labels = torch.as_tensor(labels, dtype=torch.long)
        imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()        # clear accumulated gradients
        outputs = model(imgs)
        loss = loss_fn(outputs, labels)
        loss.backward()              # backpropagation
        optimizer.step()             # parameter update
        train_loss += loss.item()
    train_loss = train_loss / count
    train_time = time.time() - start_time

    # ==============================================================
    # Validate
    # ==============================================================
    model.eval()
    count = 0
    test_loss = 0.0
    accuracy = 0.0
    with torch.no_grad():
        for imgs, labels in test_loader:
            count += 1
            labels = torch.as_tensor(labels, dtype=torch.long)
            imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
            outputs = model(imgs)
            loss = loss_fn(outputs, labels)
            test_loss += loss.item()
            ps = torch.exp(outputs)  # log-probabilities -> probabilities
            top_p, top_class = ps.topk(1, dim=1)
            equals = top_class == labels.view(*top_class.shape)
            accuracy += torch.mean(equals.type(torch.FloatTensor)).item()
    test_loss = test_loss / count
    accuracy = accuracy / count

    print(
        f'Epoch {epoch + 1}/{args.epochs}.. '
        f'Train time: {train_time:.3f}.. '
        f'Train loss: {train_loss:.3f}.. '
        f'Test loss: {test_loss:.3f}.. '
        f'Test accuracy: {accuracy:.3f}.. '
        f'lr: {optimizer.param_groups[0]["lr"]:.4f}'
    )

    if accuracy > best_accuracy:
        # Improvement: record it and save the current best model
        best_accuracy = accuracy
        best_epoch = epoch
        torch.save(model, os.path.join(DESTINATION_PATH, 'Direction_model.pth'))
        no_improve_count = 0
    else:
        # No improvement on the validation set: pass the metric to the scheduler
        scheduler.step(accuracy)
        no_improve_count += 1
        if no_improve_count >= 10:
            if optimizer.param_groups[0]['lr'] < 1e-5:
                # The learning rate has become too small, stop training
                print('Learning rate too small, training stopped.')
                break
```
Improvements
In the original code, scheduler.step() was called in the wrong place, so the learning rate could not be updated in time. The better approach is to check the validation accuracy at the end of every epoch and use it to decide whether the learning rate should be reduced: whenever the accuracy fails to improve, scheduler.step(accuracy) is called immediately so the scheduler can update the learning rate.
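As an illustration of this pattern, the sketch below isolates the epoch-end decision logic. The fake_accuracies list and the toy model are made-up stand-ins so the control flow (reset on improvement, scheduler.step() plus early stopping on a plateau) can be observed without running any real training.

```python
import torch
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Toy setup: placeholder model and a synthetic accuracy curve
model = nn.Linear(10, 3)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=10)

best_accuracy = 0.0
no_improve_count = 0
fake_accuracies = [0.6, 0.7] + [0.65] * 40  # two improvements, then a long plateau

for epoch, accuracy in enumerate(fake_accuracies):
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        no_improve_count = 0          # improvement: keep the current lr, save the model here
    else:
        scheduler.step(accuracy)      # no improvement: let the scheduler decide on a decay
        no_improve_count += 1
        if no_improve_count >= 10 and optimizer.param_groups[0]['lr'] < 1e-5:
            print('Learning rate too small, training stopped.')
            break
    print(f'epoch {epoch}: acc={accuracy:.2f}, lr={optimizer.param_groups[0]["lr"]:.6f}')
```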
Summary
ReduceLROnPlateau is a very practical learning rate scheduler that adjusts the learning rate automatically during training. Configured and used properly, it can noticeably improve both training efficiency and final model performance.