# Advantage Actor-Critic (A2C) agent for CartPole-v0, built from two small
# Keras MLPs: a softmax policy (actor) and a state-value estimator (critic).
import os
import sys
import gym
import utils

import numpy as np

from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

from matplotlib import pyplot as plt


class A2CAgent:
    def __init__(self, state_size, action_size):
        self.render = False
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = 1
        self.discount_factor = 0.99
        self.actor_lr = 0.001
        self.critic_lr = 0.005
        self.actor = self.build_actor()
        self.critic = self.build_critic()
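
    # Actor network: maps the state to a softmax distribution over actions.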
    def build_actor(self):
        actor = Sequential()
        actor.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        actor.add(Dense(self.action_size, activation='softmax',
                        kernel_initializer='he_uniform'))
        actor.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=self.actor_lr))

        return actor
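
    # Critic network: maps the state to a single scalar value estimate V(s).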
    def build_critic(self):
        critic = Sequential()
        critic.add(Dense(24, input_dim=self.state_size, activation='relu',
                         kernel_initializer='he_uniform'))
        critic.add(Dense(self.value_size, activation='linear',
                         kernel_initializer='he_uniform'))
        critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))

        return critic
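
    # Sample an action from the current policy's softmax output.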
    def get_action(self, state):
        policy = self.actor.predict(state, batch_size=1).flatten()

        return np.random.choice(self.action_size, 1, p=policy)[0]
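
    # One-step update: the critic is regressed toward the TD target
    # r + gamma * V(s'), and the actor is trained on the TD-error advantage
    # r + gamma * V(s') - V(s) for the action that was taken.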
    def train_model(self, state, action, reward, next_state, done):
        target = np.zeros((1, self.value_size))
        advantages = np.zeros((1, self.action_size))

        value = self.critic.predict(state)[0]
        next_value = self.critic.predict(next_state)[0]

        if done:
            advantages[0][action] = reward - value
            target[0][0] = reward
        else:
            advantages[0][action] = reward + self.discount_factor * next_value - value
            target[0][0] = reward + self.discount_factor * next_value

        self.actor.fit(state, advantages, epochs=1, verbose=0)
        self.critic.fit(state, target, epochs=1, verbose=0)
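

# Train the agent on CartPole-v0 for a fixed number of episodes, save the
# trained actor, and plot the per-episode scores.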
def run_A2C():
    episodes = 500
    seed = 1
    results = []
    game = 'CartPole-v0'

    env = gym.make(game)
    # Seed the environment and numpy so runs are reproducible.
    env.seed(seed)
    np.random.seed(seed)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = A2CAgent(state_size, action_size)

    for e in range(episodes):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if agent.render:
                env.render()

            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            agent.train_model(state, action, reward, next_state, done)

            score += reward
            state = next_state

        results.append(score)

    utils.save_trained_model(game, seed, 'A2C', agent.actor)

    plt.plot(results)
    plt.show()


if __name__ == '__main__':
    run_A2C()