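"""One-step Advantage Actor-Critic (A2C) on CartPole-v0.

A small Keras actor (softmax policy) and critic (state-value estimate) are
updated online after every environment transition; episode scores are
collected and plotted at the end, and the trained actor is saved through the
project's `utils` helper.
"""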
import gym
import numpy as np
import utils
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from matplotlib import pyplot as plt
class A2CAgent:
    """Advantage actor-critic agent with separate actor and critic networks."""

    def __init__(self, state_size, action_size):
        self.render = False
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = 1

        # Hyperparameters: discount factor and per-network learning rates.
        self.discount_factor = 0.99
        self.actor_lr = 0.001
        self.critic_lr = 0.005

        self.actor = self.build_actor()
        self.critic = self.build_critic()
    def build_actor(self):
        # The actor maps a state to a softmax distribution over actions.
        actor = Sequential()
        actor.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        actor.add(Dense(self.action_size, activation='softmax',
                        kernel_initializer='he_uniform'))
        # Cross-entropy against advantage-weighted one-hot targets gives the
        # policy-gradient update.
        actor.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=self.actor_lr))
        return actor
    def build_critic(self):
        # The critic estimates the state value V(s).
        critic = Sequential()
        critic.add(Dense(24, input_dim=self.state_size, activation='relu',
                         kernel_initializer='he_uniform'))
        critic.add(Dense(self.value_size, activation='linear',
                         kernel_initializer='he_uniform'))
        critic.compile(loss='mse', optimizer=Adam(lr=self.critic_lr))
        return critic
    def get_action(self, state):
        # Sample an action from the policy produced by the actor.
        policy = self.actor.predict(state, batch_size=1).flatten()
        return np.random.choice(self.action_size, 1, p=policy)[0]
    def train_model(self, state, action, reward, next_state, done):
        target = np.zeros((1, self.value_size))
        advantages = np.zeros((1, self.action_size))

        value = self.critic.predict(state)[0]
        next_value = self.critic.predict(next_state)[0]

        # One-step TD target and advantage: bootstrap from the next state
        # unless the episode has terminated.
        if done:
            advantages[0][action] = reward - value
            target[0][0] = reward
        else:
            advantages[0][action] = reward + self.discount_factor * next_value - value
            target[0][0] = reward + self.discount_factor * next_value

        self.actor.fit(state, advantages, epochs=1, verbose=0)
        self.critic.fit(state, target, epochs=1, verbose=0)
def run_A2C():
    episodes = 500
    seed = 1
    results = []

    game = 'CartPole-v0'
    env = gym.make(game)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = A2CAgent(state_size, action_size)

    for e in range(episodes):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if agent.render:
                env.render()

            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            # Update the actor and critic online from this single transition.
            agent.train_model(state, action, reward, next_state, done)

            score += reward
            state = next_state

        results.append(score)

    utils.save_trained_model(game, seed, 'A2C', agent.actor)
    plt.plot(results)
    plt.show()


if __name__ == "__main__":
    run_A2C()