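"""One-step Advantage Actor-Critic (A2C) on CartPole-v0.

A small actor network (softmax policy) and a critic network (state-value
estimate) are trained online: after every transition the critic regresses
towards the one-step TD target and the actor is updated with an
advantage-weighted cross-entropy (policy-gradient) loss.
"""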
import gym
import utils  # project-local helper that provides save_trained_model

import numpy as np

from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

from matplotlib import pyplot as plt 

class A2CAgent:
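	"""Advantage actor-critic agent with separate actor and critic networks."""
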
	def __init__(self, state_size, action_size):
		self.render = False
		self.state_size = state_size
		self.action_size = action_size
		self.value_size = 1            # the critic outputs a single state value
		self.discount_factor = 0.99    # gamma for the one-step TD target
		self.actor_lr = 0.001
		self.critic_lr = 0.005
		self.actor = self.build_actor()
		self.critic = self.build_critic()

	def build_actor(self):
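		"""Policy network: maps a state to a softmax distribution over actions."""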
		actor = Sequential()
		actor.add(Dense(24, input_dim=self.state_size, activation='relu',
						kernel_initializer='he_uniform'))
		actor.add(Dense(self.action_size, activation='softmax',
						kernel_initializer='he_uniform'))
		# With the advantage-weighted one-hot targets built in train_model, this
		# cross-entropy loss reduces to the policy-gradient loss. Note that newer
		# Keras versions spell the argument `learning_rate` instead of `lr`.
		actor.compile(loss='categorical_crossentropy',
					  optimizer=Adam(lr=self.actor_lr))

		return actor

	def build_critic(self):
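		"""Value network: maps a state to a single state-value estimate."""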
		critic = Sequential()
		critic.add(Dense(24, input_dim=self.state_size, activation='relu',
						 kernel_initializer='he_uniform'))
		critic.add(Dense(self.value_size, activation='linear',
						 kernel_initializer='he_uniform'))
		critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))

		return critic

	def get_action(self, state):
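		"""Sample an action from the policy returned by the actor."""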
		policy = self.actor.predict(state, batch_size=1).flatten()
		return np.random.choice(self.action_size, 1, p=policy)[0]

	def train_model(self, state, action, reward, next_state, done):
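		"""Update actor and critic from a single (s, a, r, s', done) transition."""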
		target = np.zeros((1, self.value_size))
		advantages = np.zeros((1, self.action_size))

		# Current and bootstrapped state values from the critic.
		value = self.critic.predict(state)[0][0]
		next_value = self.critic.predict(next_state)[0][0]

		# One-step TD target and advantage; the bootstrap term is dropped on
		# terminal transitions.
		if done:
			advantages[0][action] = reward - value
			target[0][0] = reward
		else:
			advantages[0][action] = reward + self.discount_factor * next_value - value
			target[0][0] = reward + self.discount_factor * next_value

		self.actor.fit(state, advantages, epochs=1, verbose=0)
		self.critic.fit(state, target, epochs=1, verbose=0)

def run_A2C():
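    """Train the agent on CartPole-v0 and plot the per-episode scores."""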
    episodes = 500
    seed = 1
    results = []
    game = 'CartPole-v0'

    env = gym.make(game)
    # Apply the seed that is recorded with the saved model (classic gym API).
    env.seed(seed)
    np.random.seed(seed)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = A2CAgent(state_size, action_size)

    for e in range(episodes):
        done = False
        score = 0
        # Classic gym (<0.26) API: reset() returns only the observation and
        # step() returns (state, reward, done, info).
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if agent.render:
                env.render()
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            # Online one-step actor-critic update on every transition.
            agent.train_model(state, action, reward, next_state, done)

            score += reward
            state = next_state

        results.append(score)

    utils.save_trained_model(game, seed, 'A2C', agent.actor)

    plt.plot(results)
    plt.xlabel('Episode')
    plt.ylabel('Score')
    plt.show()


if __name__ == '__main__':
    run_A2C()