Getting Started with Reinforcement Learning: Solving a Maze with Q-learning
Reinforcement learning is a machine learning approach in which an agent learns an optimal policy by interacting with its environment. Below is a simple reinforcement learning program that uses the Q-learning algorithm to solve a maze problem.
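The heart of the algorithm is the tabular Q-learning update Q(s, a) ← Q(s, a) + α · (r + γ · max_a' Q(s', a') − Q(s, a)), where s is the current state, a the chosen action, r the immediate reward, s' the resulting state, α the learning rate, and γ the discount factor. The update_q_table method of the agent below implements exactly this rule.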
import numpy as np

# Define a simple environment: a size x size grid maze
class Environment:
    def __init__(self, size):
        self.size = size
        self.state = (0, 0)               # current position (row, col)
        self.goal = (size - 1, size - 1)  # goal cell in the bottom-right corner

    def reset(self):
        self.state = (0, 0)

    def step(self, action):
        row, col = self.state
        if action == 0:    # move up
            row = max(row - 1, 0)
        elif action == 1:  # move down
            row = min(row + 1, self.size - 1)
        elif action == 2:  # move left
            col = max(col - 1, 0)
        elif action == 3:  # move right
            col = min(col + 1, self.size - 1)
        self.state = (row, col)
        if self.state == self.goal:
            return 1  # reached the goal: reward 1
        else:
            return 0  # not at the goal yet: reward 0

# Define a simple reinforcement learning agent
class Agent:
    def __init__(self, size, epsilon=0.1, alpha=0.5, gamma=0.9):
        self.size = size
        self.epsilon = epsilon  # exploration rate of the epsilon-greedy policy
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.q_table = np.zeros((size, size, 4))  # Q-table: one value per (row, col, action)

    def choose_action(self, state):
        if np.random.uniform() < self.epsilon:
            action = np.random.randint(0, 4)         # explore: pick a random action
        else:
            action = np.argmax(self.q_table[state])  # exploit: pick the action with the highest Q-value
        return action

    def update_q_table(self, state, action, reward, next_state):
        # Q-learning update: Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        next_max_q = np.max(self.q_table[next_state])
        self.q_table[state][action] += self.alpha * (reward + self.gamma * next_max_q - self.q_table[state][action])

# Main program
def main():
    size = 5             # maze size
    num_episodes = 1000  # number of training episodes
    env = Environment(size)
    agent = Agent(size)
    for episode in range(num_episodes):
        env.reset()
        total_reward = 0
        while True:
            state = env.state
            action = agent.choose_action(state)
            reward = env.step(action)
            next_state = env.state
            agent.update_q_table(state, action, reward, next_state)
            total_reward += reward
            if env.state == env.goal:
                break
        print(f'Episode {episode+1}: Total reward = {total_reward}')

if __name__ == '__main__':
    main()
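To inspect what the agent has learned, one can read the greedy action out of the Q-table for every cell. The helper below is a minimal sketch and is not part of the original program; it assumes access to the trained agent, for example by having main() return it.

def print_greedy_policy(agent):
    # Hypothetical helper (not in the original program): print the greedy
    # action for every cell, using the action encoding from Environment.step
    # (0 = up, 1 = down, 2 = left, 3 = right).
    symbols = ['U', 'D', 'L', 'R']
    for row in range(agent.size):
        actions = [symbols[np.argmax(agent.q_table[row, col])] for col in range(agent.size)]
        print(' '.join(actions))

The entry shown for the goal cell is arbitrary: every episode ends as soon as the goal is reached, so no update is ever applied to that cell's Q-values.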
The program above implements a simple reinforcement learning agent that uses the Q-learning algorithm to learn an optimal policy in a maze environment. The Environment class defines the maze, and the Agent class defines the learning agent. The main program runs 1000 training episodes; in each episode the agent interacts with the environment step by step, updates its Q-table, and prints the episode's total reward. Note that because the only nonzero reward is the 1 granted on reaching the goal, and each episode ends exactly when the goal is reached, the printed total reward is always 1; the learning progress shows up instead in how quickly each episode finishes.
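As a sketch of one way to make that progress visible, the variant below mirrors the training loop in main() but reports the number of steps per episode instead of the total reward. The function name train_verbose and the reporting interval are my own choices, not part of the original program.

def train_verbose(size=5, num_episodes=1000):
    # Same training loop as main(), but report episode length: as the Q-table
    # converges, the step count should approach the shortest-path length,
    # which is 2 * (size - 1) moves on an empty grid.
    env = Environment(size)
    agent = Agent(size)
    for episode in range(num_episodes):
        env.reset()
        steps = 0
        while True:
            state = env.state
            action = agent.choose_action(state)
            reward = env.step(action)
            agent.update_q_table(state, action, reward, env.state)
            steps += 1
            if env.state == env.goal:
                break
        if (episode + 1) % 100 == 0:
            print(f'Episode {episode + 1}: steps = {steps}')
    return agent

Returning the trained agent also makes it easy to pass it to a helper such as print_greedy_policy above.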