import random

import gym
import keras
import numpy as np
from matplotlib import pyplot as plt

import utils
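
# Upside-Down Reinforcement Learning (UDRL) on Atari Pong: instead of a
# value function, a behaviour function is trained by supervised learning to
# map (observation, command) pairs to actions, where a command is a
# (desired_return, desired_horizon) tuple. Acting means asking the network
# for an action that should achieve the commanded return within the
# commanded number of steps.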


class ReplayBuffer:
    """
    Episodic replay buffer that keeps only the highest-return episodes.

    Thank you: https://github.com/BY571/
    """

    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = []

    def add_sample(self, states, actions, rewards):
        # Store the whole episode together with its total reward.
        episode = {"states": states, "actions": actions, "rewards": rewards,
                   "summed_rewards": sum(rewards)}
        self.buffer.append(episode)

    def sort(self):
        # Order episodes by total reward (best first) and keep the top max_size.
        self.buffer = sorted(self.buffer, key=lambda i: i["summed_rewards"], reverse=True)
        self.buffer = self.buffer[:self.max_size]

    def get_random_samples(self, batch_size):
        # Uniform sampling (with replacement) over the retained top episodes.
        self.sort()
        idxs = np.random.randint(0, len(self.buffer), batch_size)
        batch = [self.buffer[idx] for idx in idxs]
        return batch

    def get_n_best(self, n):
        self.sort()
        return self.buffer[:n]

    def __len__(self):
        return len(self.buffer)
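

# Example (illustrative): with max_size=2, adding three episodes whose
# summed rewards are 5, 1 and 3 and then calling sort() keeps only the
# two episodes with returns 5 and 3.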


class UpsideDownAgent:
    def __init__(self, environment, approximator):
        self.environment = gym.make(environment)
        self.approximator = approximator
        self.state_size = (84, 84, 4)   # four stacked 84x84 grayscale frames
        self.action_size = 3            # the agent chooses among three Pong actions
        self.warm_up_episodes = 1
        self.render = False
        self.memory = ReplayBuffer(700)
        self.last_few = 50              # top episodes used to sample exploratory commands
        self.batch_size = 256
        self.command_size = 2           # command = (desired_return, desired_horizon)
        self.desired_return = 1
        self.desired_horizon = 1
        self.horizon_scale = 0.02       # keep command inputs in a small numeric range
        self.return_scale = 0.02

        self.behaviour_function = utils.get_atari_behaviour_function(self.action_size)

        self.testing_rewards = []
        self.warm_up_buffer()
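
    # warm_up_buffer() plays a few episodes with the untrained network and
    # arbitrary initial commands, so that the replay buffer is non-empty
    # before the first call to train_behaviour_function().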

    def warm_up_buffer(self):
        print('Warming up')

        for i in range(self.warm_up_episodes):

            states = []
            rewards = []
            actions = []

            dead = False
            done = False
            desired_return = 1
            desired_horizon = 1

            step, score, start_life = 0, 0, 5
            observe = self.environment.reset()

            # A random number of FIRE (action 1) steps so that episodes do
            # not all start from the identical state.
            for _ in range(random.randint(1, 30)):
                observe, _, _, _ = self.environment.step(1)

            state = utils.pre_processing(observe)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 84, 84, 4))
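
            # The agent's state is the stack of the four most recent
            # preprocessed 84x84 frames, shaped (1, 84, 84, 4).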

            while not done:

                states.append(history)
                command = np.asarray([desired_return * self.return_scale,
                                      desired_horizon * self.horizon_scale])
                command = np.reshape(command, [1, len(command)])

                action = self.get_action(history, command)
                actions.append(action)

                # Map the network's three outputs onto Pong's FIRE/RIGHT/LEFT.
                if action == 0:
                    real_action = 1
                elif action == 1:
                    real_action = 2
                else:
                    real_action = 3

                next_state, reward, done, info = self.environment.step(real_action)
                next_state = utils.pre_processing(next_state)
                next_state = np.reshape([next_state], (1, 84, 84, 1))
                next_history = np.append(next_state, history[:, :, :, :3], axis=3)

                # Clip rewards as in generate_episode() so stored returns stay
                # comparable (a no-op for Pong's -1/0/+1 rewards).
                reward = np.clip(reward, -1, 1)
                rewards.append(reward)

                if start_life > info['ale.lives']:
                    dead = True
                    start_life = info['ale.lives']

                # After losing a life, keep the old frame stack for one step;
                # otherwise roll it forward.
                if dead:
                    dead = False
                else:
                    history = next_history

                # Shrink the command: less return and less time remain.
                desired_return -= reward
                desired_horizon -= 1
                desired_horizon = np.maximum(desired_horizon, 1)

            self.memory.add_sample(states, actions, rewards)

    def get_action(self, observation, command):
        """
        Sample from the action distribution modelled by the behaviour function.
        """

        # The network expects pixel values scaled to [0, 1].
        observation = np.float32(observation / 255.0)

        action_probs = self.behaviour_function.predict([observation, command])
        action = np.random.choice(np.arange(0, self.action_size), p=action_probs[0])

        return action
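
    # Example (illustrative): if the network outputs probabilities
    # [0.2, 0.5, 0.3], get_action() returns action 1 about half the time,
    # while get_greedy_action() below always returns 1.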

    def get_greedy_action(self, observation, command):
        # Scale pixels exactly as in get_action() so that evaluation inputs
        # match what the network saw during training.
        observation = np.float32(observation / 255.0)

        action_probs = self.behaviour_function.predict([observation, command])
        action = np.argmax(action_probs)

        return action

    def train_behaviour_function(self):
        """
        Hindsight-style supervised training: for a random step t1 of a stored
        episode, the command is set to what actually happened afterwards (the
        return collected up to a later step t2, and the horizon t2 - t1), and
        the network is trained to reproduce the action taken at t1.
        """

        random_episodes = self.memory.get_random_samples(self.batch_size)

        training_observations = np.zeros((self.batch_size, self.state_size[0],
                                          self.state_size[1], self.state_size[2]))
        training_commands = np.zeros((self.batch_size, 2))

        y = []

        for idx, episode in enumerate(random_episodes):
            # Assumes every stored episode is at least two steps long.
            T = len(episode['states'])
            t1 = np.random.randint(0, T - 1)
            t2 = np.random.randint(t1 + 1, T)

            state = np.float32(episode['states'][t1] / 255.)
            desired_return = sum(episode["rewards"][t1:t2])
            desired_horizon = t2 - t1

            target = episode['actions'][t1]

            training_observations[idx] = state[0]
            training_commands[idx] = np.asarray([desired_return * self.return_scale,
                                                 desired_horizon * self.horizon_scale])
            y.append(target)

        _y = keras.utils.to_categorical(y, num_classes=self.action_size)

        self.behaviour_function.fit([training_observations, training_commands], _y, verbose=0)
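
    # Example (illustrative): for an episode with rewards [0, 0, 1] and
    # sampled t1 = 0, t2 = 2, the training input is the state at t = 0 with
    # command (0 * 0.02, 2 * 0.02), and the target is the action taken at t = 0.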

    def sample_exploratory_commands(self):
        best_episodes = self.memory.get_n_best(self.last_few)

        # Command horizon: the mean length of the best stored episodes.
        exploratory_desired_horizon = np.mean([len(i["states"]) for i in best_episodes])

        # Command return: drawn uniformly from [mean, mean + std] of the best
        # returns, i.e. slightly more ambitious than past behaviour.
        returns = [i["summed_rewards"] for i in best_episodes]
        exploratory_desired_returns = np.random.uniform(np.mean(returns), np.mean(returns) + np.std(returns))

        return [exploratory_desired_returns, exploratory_desired_horizon]
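
    # Example (illustrative): if the best returns are [3, 5, 7] (mean 5.0,
    # std ~1.63), the desired return is drawn uniformly from [5.0, ~6.63].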

    def generate_episode(self, environment, e, desired_return, desired_horizon, testing):

        env = gym.make(environment)

        done = False
        dead = False

        states = []
        actions = []
        rewards = []

        step, score, start_life = 0, 0, 5

        observe = env.reset()
        for _ in range(random.randint(1, 30)):
            observe, _, _, _ = env.step(1)

        state = utils.pre_processing(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 84, 84, 4))

        while not done:
            states.append(history)

            command = np.asarray([desired_return * self.return_scale,
                                  desired_horizon * self.horizon_scale])
            command = np.reshape(command, [1, len(command)])

            # Sample during training, act greedily during evaluation.
            if not testing:
                action = self.get_action(history, command)
            else:
                action = self.get_greedy_action(history, command)
            actions.append(action)

            # Map the network's three outputs onto Pong's FIRE/RIGHT/LEFT.
            if action == 0:
                real_action = 1
            elif action == 1:
                real_action = 2
            else:
                real_action = 3

            next_state, reward, done, info = env.step(real_action)
            next_state = utils.pre_processing(next_state)
            next_state = np.reshape([next_state], (1, 84, 84, 1))
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)

            # Train on clipped rewards, but report the raw game score.
            clipped_reward = np.clip(reward, -1, 1)
            rewards.append(clipped_reward)

            score += reward

            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            if dead:
                dead = False
            else:
                history = next_history

            # Shrink the command by what was just achieved and consumed.
            desired_return -= clipped_reward
            desired_horizon -= 1
            desired_horizon = np.maximum(desired_horizon, 1)

        self.memory.add_sample(states, actions, rewards)

        self.testing_rewards.append(score)

        if testing:
            print('Querying the model ...')
            print('Testing score: {}'.format(score))

        return score


def run_experiment():

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--approximator', type=str, default='neural_network')
    parser.add_argument('--environment', type=str, default='PongDeterministic-v4')
    parser.add_argument('--seed', type=int, default=1)

    args = parser.parse_args()

    approximator = args.approximator
    environment = args.environment
    seed = args.seed

    episodes = 1500
    returns = []

    agent = UpsideDownAgent(environment, approximator)
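
    # Each iteration alternates 100 supervised updates of the behaviour
    # function with 15 exploratory episodes that refill the replay buffer
    # with fresher (ideally higher-return) experience.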

    for e in range(episodes):

        print("Episode {}".format(e))

        for i in range(100):
            agent.train_behaviour_function()

        print("Finished training B!")

        # tmp_r collects the returns of all 15 exploratory episodes.
        tmp_r = []
        for i in range(15):
            exploratory_commands = agent.sample_exploratory_commands()
            desired_return = exploratory_commands[0]
            desired_horizon = exploratory_commands[1]
            r = agent.generate_episode(environment, e, desired_return, desired_horizon, False)
            tmp_r.append(r)

        print(np.mean(tmp_r))
        returns.append(np.mean(tmp_r))

    utils.save_results(environment, approximator, seed, returns)

    if approximator == 'neural_network':
        utils.save_trained_model(environment, seed, agent.behaviour_function)

    plt.plot(returns)
    plt.show()


if __name__ == "__main__":
    run_experiment()