Runnable PyTorch DQN code for CartPole-v1
Below is a runnable DQN implementation for CartPole-v1 written in PyTorch. Note that it uses the classic Gym API (env.reset() returning the state and env.step() returning four values), so it targets gym versions earlier than 0.26:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
# Training hyperparameters
EPISODES = 300
BATCH_SIZE = 64
GAMMA = 0.99
EPSILON = 1.0
EPSILON_DECAY = 0.995
EPSILON_MIN = 0.01
LEARNING_RATE = 0.001
MEMORY_SIZE = 1000000
# Environment
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
# Device (GPU if available)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Q-network: a simple 3-layer MLP mapping a state to one Q-value per action
class QNetwork(torch.nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = torch.nn.Linear(state_size, 64)
        self.fc2 = torch.nn.Linear(64, 64)
        self.fc3 = torch.nn.Linear(64, action_size)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x
# DQN agent: online Q-network, target network, and replay buffer
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.q_network = QNetwork(state_size, action_size).to(device)
        self.target_network = QNetwork(state_size, action_size).to(device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = torch.optim.Adam(self.q_network.parameters(), lr=LEARNING_RATE)
        self.loss_fn = torch.nn.MSELoss()

    def remember(self, state, action, reward, next_state, done):
        # Store one transition in the replay buffer
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, epsilon):
        # Epsilon-greedy action selection
        if np.random.rand() <= epsilon:
            return np.random.choice(self.action_size)
        else:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            with torch.no_grad():
                q_values = self.q_network(state)
            return np.argmax(q_values.cpu().data.numpy())

    def replay(self):
        # Sample a minibatch and take one gradient step on the TD error
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.from_numpy(np.array(states)).float().to(device)
        actions = torch.from_numpy(np.array(actions)).long().to(device)
        rewards = torch.from_numpy(np.array(rewards)).float().to(device)
        next_states = torch.from_numpy(np.array(next_states)).float().to(device)
        dones = torch.from_numpy(np.array(dones, dtype=np.uint8)).float().to(device)
        # Q(s, a) for the actions actually taken
        q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        # Bootstrapped target: r + gamma * max_a' Q_target(s', a'), zeroed for terminal states
        next_q_values = self.target_network(next_states).max(1)[0]
        expected_q_values = rewards + (1 - dones) * GAMMA * next_q_values
        loss = self.loss_fn(q_values, expected_q_values.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        # Copy the online network weights into the target network
        self.target_network.load_state_dict(self.q_network.state_dict())
# Training loop
def train(agent):
    scores = []
    epsilons = []
    for episode in range(EPISODES):
        state = env.reset()
        score = 0
        # Exponentially decayed exploration rate, clipped at EPSILON_MIN
        epsilon = max(EPSILON * EPSILON_DECAY ** episode, EPSILON_MIN)
        while True:
            action = agent.act(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            agent.replay()
            score += reward
            state = next_state
            if done:
                agent.update_target_network()  # sync the target network once per episode
                scores.append(score)
                epsilons.append(epsilon)
                print("Episode: {}/{}, Score: {}, Epsilon: {:.2f}".format(episode + 1, EPISODES, score, epsilon))
                break
    return scores, epsilons
# Train the agent
agent = DQNAgent(state_size, action_size)
scores, epsilons = train(agent)
# Plot the results
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
ax[0].plot(scores)
ax[0].set_xlabel("Episode")
ax[0].set_ylabel("Score")
ax[1].plot(epsilons)
ax[1].set_xlabel("Episode")
ax[1].set_ylabel("Epsilon")
plt.show()
The code uses PyTorch's neural-network modules and optimizer together with OpenAI Gym's CartPole-v1 environment. The DQNAgent class implements the DQN agent, including experience replay, epsilon-greedy action selection, and target-network updates (the target network is synced with the online network at the end of every episode). The train function trains the agent and returns the per-episode scores and epsilon values, which are then plotted with matplotlib.
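After training, it can be useful to check the learned policy by running the agent greedily (epsilon = 0) and, optionally, saving the network weights. The snippet below is a minimal sketch, not part of the original listing: it reuses the agent and env objects defined above and the same classic Gym API, and the file name dqn_cartpole.pt is just an illustrative choice.
# Minimal sketch (illustrative): save the trained weights and run one greedy evaluation episode
torch.save(agent.q_network.state_dict(), "dqn_cartpole.pt")  # example file name

state = env.reset()
done = False
eval_score = 0
while not done:
    action = agent.act(state, epsilon=0.0)  # epsilon = 0 -> always pick the argmax-Q action
    state, reward, done, _ = env.step(action)
    eval_score += reward
print("Greedy evaluation score:", eval_score)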