import os
import sys

import gym
import utils
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from matplotlib import pyplot as plt


class A2CAgent:
    def __init__(self, state_size, action_size):
        self.render = False
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = 1

        # Hyperparameters for the one-step advantage actor-critic update.
        self.discount_factor = 0.99
        self.actor_lr = 0.001
        self.critic_lr = 0.005

        self.actor = self.build_actor()
        self.critic = self.build_critic()

    def build_actor(self):
        """Policy network: maps a state to a probability distribution over actions."""
        actor = Sequential()
        actor.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        actor.add(Dense(self.action_size, activation='softmax',
                        kernel_initializer='he_uniform'))
        actor.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=self.actor_lr))
        return actor

    def build_critic(self):
        """Value network: maps a state to a scalar state-value estimate."""
        critic = Sequential()
        critic.add(Dense(24, input_dim=self.state_size, activation='relu',
                         kernel_initializer='he_uniform'))
        critic.add(Dense(self.value_size, activation='linear',
                         kernel_initializer='he_uniform'))
        critic.compile(loss='mse', optimizer=Adam(lr=self.critic_lr))
        return critic

    def get_action(self, state):
        # Sample an action from the current policy distribution.
        policy = self.actor.predict(state, batch_size=1).flatten()
        return np.random.choice(self.action_size, 1, p=policy)[0]

    def train_model(self, state, action, reward, next_state, done):
        target = np.zeros((1, self.value_size))
        advantages = np.zeros((1, self.action_size))

        value = self.critic.predict(state)[0]
        next_value = self.critic.predict(next_state)[0]

        # One-step TD target and advantage; bootstrap from the critic's
        # estimate of the next state unless the episode has terminated.
        if done:
            advantages[0][action] = reward - value
            target[0][0] = reward
        else:
            advantages[0][action] = reward + self.discount_factor * next_value - value
            target[0][0] = reward + self.discount_factor * next_value

        self.actor.fit(state, advantages, epochs=1, verbose=0)
        self.critic.fit(state, target, epochs=1, verbose=0)


def run_A2C():
    episodes = 500
    seed = 1
    results = []

    game = 'CartPole-v0'
    env = gym.make(game)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = A2CAgent(state_size, action_size)

    for e in range(episodes):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            # Update actor and critic on every transition (online A2C).
            agent.train_model(state, action, reward, next_state, done)

            score += reward
            state = next_state

        results.append(score)

    utils.save_trained_model(game, seed, 'A2C', agent.actor)

    plt.plot(results)
    plt.show()


run_A2C()