使用 Python、MATLAB 和 Julia 实现 DQN 算法
使用 Python、MATLAB 和 Julia 实现 DQN 算法
深度 Q 网络 (DQN, Deep Q-Network) 是一种流行的强化学习算法,它使用神经网络来学习最优策略。在本篇文章中,我们将展示如何使用三种编程语言 Python、MATLAB 和 Julia 实现 DQN 算法。
Python
import numpy as np
import tensorflow as tf
from tensorflow import keras
# 定义 DQN 类
class DQNAgent:
    """Deep Q-Network agent with replay memory and an epsilon-greedy policy."""

    def __init__(self, state_size, action_size):
        self.state_size = state_size      # dimensionality of the observation vector
        self.action_size = action_size    # number of discrete actions
        self.memory = []                  # replay buffer of (s, a, r, s', done) tuples
        self.gamma = 0.95                 # discount factor for future rewards
        self.epsilon = 1.0                # exploration rate, decayed after each replay
        self.epsilon_decay = 0.995        # multiplicative epsilon decay
        self.epsilon_min = 0.01           # exploration floor
        self.learning_rate = 0.001        # Adam learning rate
        self.model = self.build_model()

    def build_model(self):
        """Build the Q-network: two hidden ReLU layers, linear Q-value outputs."""
        model = keras.Sequential()
        model.add(keras.layers.Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(keras.layers.Dense(24, activation='relu'))
        model.add(keras.layers.Dense(self.action_size, activation='linear'))
        # Fix: the `lr` keyword was removed in TF2 Keras; the argument is
        # `learning_rate`.
        model.compile(loss='mse',
                      optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an epsilon-greedy action for `state` (batched, shape (1, state_size))."""
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.action_size)
        return np.argmax(self.model.predict(state)[0])

    def replay(self, batch_size):
        """Train on a random minibatch from memory, then decay epsilon."""
        if len(self.memory) < batch_size:
            return
        # Fix: np.random.choice cannot sample from a list of tuples (it requires
        # a 1-D array-like of scalars), so sample indices instead.
        indices = np.random.choice(len(self.memory), batch_size, replace=False)
        for i in indices:
            state, action, reward, next_state, done = self.memory[i]
            # Bellman target: r + gamma * max_a' Q(s', a') for non-terminal steps.
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            # Regress only the taken action's Q-value toward the target.
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
# --- Environment and agent setup -------------------------------------------
# Fix: `gym` is used below but was never imported anywhere in the listing.
import gym

env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

# --- Training loop ---------------------------------------------------------
# NOTE(review): this loop targets the classic gym API (reset() -> obs,
# step() -> 4-tuple). gym >= 0.26 returns (obs, info) and a 5-tuple; confirm
# the installed version before running.
batch_size = 32
num_episodes = 1000
for episode in range(num_episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    for time_step in range(500):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        # Penalize termination to speed up learning on CartPole.
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print('episode: {}/{}, score: {}, epsilon: {:.2}'.format(
                episode, num_episodes, time_step, agent.epsilon))
            break
    agent.replay(batch_size)
MATLAB
% DQNAgent: minimal Deep Q-Network agent as a MATLAB handle class.
% NOTE(review): rlSimpleNetwork / linearLayer and the model.Options fields used
% below do not correspond to a documented MathWorks API -- confirm the intended
% toolbox and release before attempting to run this code.
classdef DQNAgent < handle
properties
% Dimensionality of the observation vector.
state_size
% Number of discrete actions.
action_size
% Replay buffer; each row is a cell {state, action, reward, next_state, done}.
memory = []
% Discount factor for future rewards.
gamma = 0.95
% Current exploration rate for the epsilon-greedy policy.
epsilon = 1.0
% Multiplicative decay applied to epsilon after each replay.
epsilon_decay = 0.995
% Lower bound on epsilon.
epsilon_min = 0.01
% Learning rate handed to the network.
learning_rate = 0.001
% Q-value network.
model
end
methods
% Constructor: store sizes and build the Q-network.
function obj = DQNAgent(state_size, action_size)
obj.state_size = state_size;
obj.action_size = action_size;
obj.model = obj.build_model();
end
% Build the Q-network: state_size -> 24 -> 24 -> action_size.
function model = build_model(obj)
model = rlSimpleNetwork([obj.state_size, 24, 24, obj.action_size], ...
[reluLayer, reluLayer, linearLayer], ...
'Observation', {'input'}, ...
'Action', {'rlAction'}, ...
'LearningRate', obj.learning_rate, ...
'Name', 'DQN');
model.Options.GradientThreshold = 1;
% Presumably selects GPU execution -- verify these option names exist.
model.Options.UseDevice = 'gpu';
model.Options.ExecutionEnvironment = 'gpu';
model.Options.EpsilonGreedyExploration.Epsilon = obj.epsilon;
end
% Append one transition to the replay buffer.
function remember(obj, state, action, reward, next_state, done)
obj.memory = [obj.memory; {state, action, reward, next_state, done}];
end
% Epsilon-greedy action selection.
function action = act(obj, state)
if rand < obj.epsilon
% Explore: uniform random action in [1, action_size].
action = randi([1, obj.action_size]);
else
% NOTE(review): predict would return a vector of Q-values, not an
% action index; an argmax over the outputs is likely intended here.
action = predict(obj.model, state);
end
end
% Train on a random minibatch from memory, then decay epsilon.
function replay(obj, batch_size)
if size(obj.memory, 1) < batch_size
return;
end
% Sample batch_size rows without replacement.
minibatch = datasample(obj.memory, batch_size, 'Replace', false);
for i = 1:batch_size
state = minibatch{i, 1};
action = minibatch{i, 2};
reward = minibatch{i, 3};
next_state = minibatch{i, 4};
done = minibatch{i, 5};
% Bellman target: r + gamma * max_a' Q(s', a') for non-terminal steps.
target = reward;
if ~done
target = reward + obj.gamma * max(predict(obj.model, next_state));
end
% Regress only the taken action's Q-value toward the target.
target_f = predict(obj.model, state);
target_f(action) = target;
train(obj.model, state, target_f);
end
% Anneal exploration toward epsilon_min and keep the model option in sync.
if obj.epsilon > obj.epsilon_min
obj.epsilon = obj.epsilon * obj.epsilon_decay;
obj.model.Options.EpsilonGreedyExploration.Epsilon = obj.epsilon;
end
end
end
end
% Create the predefined discrete CartPole environment and a DQN agent for it.
env = rlPredefinedEnv('CartPole-Discrete');
state_size = env.ObservationInfo.Dimension(1);
action_size = env.ActionInfo.Dimension(1);
agent = DQNAgent(state_size, action_size);
% Training hyperparameters.
batch_size = 32;
num_episodes = 1000;
for episode = 1:num_episodes
state = reset(env);
done = false;
% Cap each episode at 500 steps.
for time_step = 1:500
action = act(agent, state);
[next_state, reward, done, ~] = step(env, action);
% Zero out the reward on terminal steps (unlike the Python listing,
% which uses a -10 terminal penalty).
reward = reward * (1 - double(done));
remember(agent, state, action, reward, next_state, done);
state = next_state;
if done
fprintf('episode: %d/%d, score: %d, epsilon: %.2f\n', episode, num_episodes, time_step, agent.epsilon);
break
end
end
% Learn from one random minibatch after each episode.
replay(agent, batch_size);
end
Julia
using Flux, Random, Statistics, Base.Iterators, ReinforcementLearningEnvironments
# DQN agent state: replay buffer, hyperparameters, and the Q-network.
# Fix: must be `mutable struct` -- `replay` reassigns `agent.epsilon`, which
# throws `setfield!: immutable struct ... cannot be changed` on a plain struct.
mutable struct DQNAgent
    state_size::Int
    action_size::Int
    # Replay buffer of (state, action, reward, next_state, done) transitions.
    memory::Vector{Tuple{Vector{Float32}, Int64, Float32, Vector{Float32}, Bool}}
    gamma::Float32          # discount factor
    epsilon::Float32        # exploration rate, decayed after each replay
    epsilon_decay::Float32  # multiplicative epsilon decay
    epsilon_min::Float32    # exploration floor
    learning_rate::Float32  # optimizer learning rate
    model::Chain            # Q-value network
end
"""
    DQNAgent(state_size::Int, action_size::Int)

Construct a DQN agent with a `state_size -> 24 -> 24 -> action_size` Q-network
and default hyperparameters (gamma = 0.95, epsilon = 1.0, lr = 0.001).
"""
function DQNAgent(state_size::Int, action_size::Int)
    model = Chain(Dense(state_size, 24, relu), Dense(24, 24, relu), Dense(24, action_size))
    model = gpu(model)  # no-op fallback to CPU when no GPU backend is loaded
    # Fix: the original called `ADAM(learning_rate=0.001)` -- ADAM takes the
    # learning rate positionally, so that threw a MethodError -- and then
    # discarded the optimizer anyway. The optimizer is built where it is used
    # (in `replay`) from `learning_rate` stored on the agent.
    memory = Tuple{Vector{Float32}, Int64, Float32, Vector{Float32}, Bool}[]
    return DQNAgent(state_size, action_size, memory, 0.95f0, 1.0f0, 0.995f0,
                    0.01f0, 0.001f0, model)
end
"""
    remember(agent, state, action, reward, next_state, done)

Record one transition `(s, a, r, s', done)` in the agent's replay buffer.
"""
function remember(agent::DQNAgent, state::Vector{Float32}, action::Int64, reward::Float32, next_state::Vector{Float32}, done::Bool)
    experience = (state, action, reward, next_state, done)
    return push!(agent.memory, experience)
end
"""
    act(agent::DQNAgent, state::Vector{Float32}) -> Int

Pick an action epsilon-greedily: with probability `epsilon` a uniform random
action, otherwise the argmax of the Q-network's outputs (1-based index).
"""
function act(agent::DQNAgent, state::Vector{Float32})
    if rand(Float32) < agent.epsilon
        return rand(1:agent.action_size)
    end
    # Fix: the original wrapped the model in `Flux.gpu` on every call, which
    # re-transfers all parameters each time; the model was already moved to its
    # device once at construction.
    q_values = agent.model(state)
    return argmax(q_values)
end
"""
    replay(agent::DQNAgent, batch_size::Int64)

Fit the Q-network on `batch_size` transitions drawn uniformly without
replacement from the replay buffer, then decay `epsilon`. No-op while the
buffer holds fewer than `batch_size` transitions.
"""
function replay(agent::DQNAgent, batch_size::Int64)
    if length(agent.memory) < batch_size
        return
    end
    # Fix: `sample` comes from StatsBase, which was never imported; draw a
    # random index permutation with Random.randperm instead.
    batch_idx = randperm(length(agent.memory))[1:batch_size]
    opt = ADAM(agent.learning_rate)
    for (state, action, reward, next_state, done) in agent.memory[batch_idx]
        # Bellman target: r + gamma * max_a' Q(s', a') for non-terminal steps.
        target = reward
        if !done
            target = reward + agent.gamma * maximum(agent.model(next_state))
        end
        # Regress only the taken action's Q-value toward the target.
        target_f = agent.model(state)
        target_f[action] = target
        # Fix: the original passed a loss *value* (not a function), a
        # nonexistent `optimizer=` keyword, and `agent.optimizer`, which is
        # not a field of DQNAgent.
        Flux.train!((x, y) -> Flux.Losses.mse(agent.model(x), y),
                    Flux.params(agent.model), [(state, target_f)], opt)
    end
    # Anneal exploration (requires DQNAgent to be a mutable struct).
    if agent.epsilon > agent.epsilon_min
        agent.epsilon *= agent.epsilon_decay
    end
end
"""
    loss_fn(state, target_f, model) -> Float32

Mean-squared error between the model's Q-values for `state` and the Bellman
targets `target_f`.
"""
function loss_fn(state::Vector{Float32}, target_f::Vector{Float32}, model::Chain)
    # Fix: the original re-uploaded the model with `Flux.gpu` on every call;
    # the model already lives on its device, so evaluate it directly.
    return Flux.Losses.mse(model(state), target_f)
end
# --- Environment and agent setup -------------------------------------------
# NOTE(review): the `env.observation_space` / `reset!` / `step!` accessors used
# below do not match current ReinforcementLearningEnvironments.jl releases --
# confirm the intended package version.
env = CartPoleEnv();
state_size = length(env.observation_space.low)
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

# --- Training loop ---------------------------------------------------------
batch_size = 32
num_episodes = 1000
for episode in 1:num_episodes
    state = reset!(env)
    done = false
    for time_step in 1:500
        action = act(agent, state)
        next_state, reward, done, _ = step!(env, action)
        remember(agent, state, action, reward, next_state, done)
        state = next_state
        if done
            # Fix: the original used single quotes, which delimit Char
            # literals in Julia -- a multi-character 'string' is a syntax
            # error. Strings must be double-quoted.
            println("episode: $episode/$num_episodes, score: $time_step, epsilon: $(agent.epsilon)")
            break
        end
    end
    replay(agent, batch_size)
end
总结
这篇文章展示了如何使用 Python、MATLAB 和 Julia 实现 DQN 算法。每个代码示例都包含了构建模型、训练模型和评估模型的基本步骤。您可以在这些代码的基础上进行修改和扩展,以应用于不同的强化学习问题。
希望这篇文章对您有所帮助!如果您有任何问题,请随时提出。
原文地址: http://www.cveoy.top/t/topic/lMx6 著作权归作者所有。请勿转载和采集!