def run(self):
    """Train the agent for a fixed number of episodes, checkpointing the best run.

    NOTE(review): `env`, `n_steps`, and `Memory` are assumed to be defined at
    module level (they are referenced but not created here) — confirm against
    the rest of the file.
    """
    # Initialize the replay memory buffer.
    memory = Memory(n_steps, env.observation_space.shape, env.action_space.shape[0])

    # Number of episodes to train for.
    self.episodes = 5

    # Main training loop: one iteration per episode.
    for e in range(self.episodes):
        state = env.reset()
        total_reward = 0

        for time in range(self.max_steps):
            # Choose an action (epsilon-greedy) and advance the environment.
            action = self.act(state, memory)
            next_state, reward, done, info = env.step(action)

            # Store the transition for later replay-based training.
            memory.add_step((state, action, reward, next_state, done))
            state = next_state

            total_reward += reward

            if done:
                break

        # Train once enough transitions have been collected to fill a batch.
        if memory.size > self.batch_size:
            self.train(memory)

        print('Episode: {}/{}, score: {}, epsilon: {}'.format(e, self.episodes, total_reward, self.epsilon))

        # Checkpoint whenever this episode beat the best score so far.
        if total_reward > self.best_reward:
            self.save()
            self.best_reward = total_reward

    # Release the environment's resources when training finishes.
    env.close()

def act(self, state, memory):
    """Select an action epsilon-greedily.

    With probability epsilon a random action is sampled from the action
    space (exploration); otherwise the action with the highest predicted
    Q-value for `state` is returned (exploitation).
    """
    # Draw a uniform random number in [0, 1).
    # (Bug fix: this statement had been collapsed into the def-line comment,
    # leaving `rand` undefined at the comparison below.)
    rand = np.random.rand()

    # Explore: with probability epsilon, pick a random action.
    # NOTE(review): `env` is assumed to be a module-level global — confirm.
    if rand <= self.epsilon:
        action = env.action_space.sample()
    # Exploit: pick the action with the highest predicted Q-value.
    else:
        action = np.argmax(self.model.predict(np.array([state])))

    return action

def train(self, memory):
    """Run one Q-learning update on a batch sampled from replay memory.

    Builds Bellman targets from the model's own next-state predictions,
    fits the model for one epoch, then decays epsilon toward its floor.
    """
    # Sample a batch of transitions from the replay memory.
    # (Bug fix: this statement had been collapsed into the def-line comment,
    # leaving `batch` undefined below.)
    batch = memory.sample(self.batch_size)

    # Unpack the (state, action, reward, next_state, done) tuples into arrays.
    states = np.array([each[0] for each in batch])
    actions = np.array([each[1] for each in batch])
    rewards = np.array([each[2] for each in batch])
    next_states = np.array([each[3] for each in batch])
    dones = np.array([each[4] for each in batch])

    # Q-values of the successor states, used for the bootstrap term.
    next_Qs = self.model.predict(next_states)

    # Bellman targets; terminal transitions (done == 1) get no bootstrap term.
    targets = rewards + self.gamma * np.max(next_Qs, axis=1) * (1 - dones)

    # Start from the current predictions and overwrite only the entries for
    # the actions actually taken, so the error on other actions stays zero.
    targets_full = self.model.predict(states)
    indices = np.arange(self.batch_size)
    targets_full[indices, actions] = targets

    # Single gradient step toward the updated targets.
    self.model.fit(states, targets_full, epochs=1, verbose=0)

    # Decay the exploration rate until it reaches its floor.
    if self.epsilon > self.epsilon_min:
        self.epsilon *= self.epsilon_decay

def save(self):
    """Persist the current model weights to disk."""
    self.model.save('cartpole_model.h5')

# Script entry point: build an agent and start training.
# (Bug fix: the guard had lost its dunders — `name`/`main` instead of
# `__name__`/`__main__` — and two statements were fused onto one line.)
if __name__ == '__main__':
    agent = Agent()
    agent.run()

# Reinforcement Learning Agent for the CartPole Environment

# Original source: https://www.cveoy.top/t/topic/lidR — copyright belongs to the author; do not reproduce or scrape.

# (Scraper advertisement, translated: "Free AI — click here, no registration or login required.")