vimmoos@Thor committed
Commit b49af5c (0 parents)

In the beginning there was darkness

Changed files:
- .gitignore +4 -0
- README.md +64 -0
- old_code/experiment_1/train_agent.py +405 -0
- old_code/experiment_1/utils.py +121 -0
- old_code/experiment_2/catch.py +93 -0
- old_code/experiment_2/catch_v2.py +98 -0
- old_code/experiment_2/catch_v3.py +91 -0
- old_code/experiment_2/catch_v4.py +88 -0
- old_code/experiment_2/train_catch_cnn_agent.py +295 -0
- old_code/experiment_2/utils.py +121 -0
- old_code/experiment_3/q_networks/a2c.py +104 -0
- old_code/experiment_3/q_networks/buffers/CartPole-v0/1/DQN/memory_buffer.p +0 -0
- old_code/experiment_3/q_networks/ddqn.py +136 -0
- old_code/experiment_3/q_networks/dqn.py +130 -0
- old_code/experiment_3/q_networks/prepare_buffer.py +104 -0
- old_code/experiment_3/q_networks/train_offline_a2c.py +123 -0
- old_code/experiment_3/q_networks/train_offline_ddqn.py +126 -0
- old_code/experiment_3/q_networks/train_offline_dqn.py +124 -0
- old_code/experiment_3/q_networks/utils.py +29 -0
- old_code/experiment_3/upside_down/prepare_offline_buffer.py +157 -0
- old_code/experiment_3/upside_down/train_agent.py +248 -0
- old_code/experiment_3/upside_down/train_offline_agent.py +196 -0
- old_code/experiment_3/upside_down/utils.py +131 -0
- old_code/train_atari_agent.py +321 -0
- old_code/utils.py +121 -0
- poetry.lock +0 -0
- udrl/__main__.py +238 -0
- udrl/agent.py +180 -0
- udrl/buffer.py +70 -0
- udrl/catch/__init__.py +35 -0
- udrl/catch/adptor.py +126 -0
- udrl/catch/core.py +190 -0
- udrl/catch/renderer.py +65 -0
- udrl/cli.py +192 -0
- udrl/data_proc.py +51 -0
- udrl/inference.py +122 -0
- udrl/plot.py +189 -0
- udrl/policies.py +364 -0
- udrl/test.py +137 -0
- udrl/viz.py +310 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
__pycache__/
data/
**/*.npy
**/*.pyc
README.md
ADDED
@@ -0,0 +1,64 @@
# Upside-Down RL

This project implements an Upside-Down Reinforcement Learning (UDRL) agent.

### Installation

1. Make sure you have Python 3.10 installed. You can check your version with `python --version`.
   **NOTE** Use a virtual environment to avoid dependency clashes.
2. Install the project dependencies using Poetry:

```bash
poetry install
```

If you do not have Poetry, use pip to install the requirements like so:

```bash
pip install -r requirements.txt
```

### Running the Experiment

You can run the experiment with various configuration options from the command line:

```bash
poetry run python -m udrl [options]
```

**Note** If you are already inside a virtual environment, `python -m udrl [options]` is enough.
**Note** All defaults are for the `CartPole-v0` environment.

Available options include:

* `--env_name`: Name of the Gym environment (default: `CartPole-v0`)
* `--estimator_name`: `"neural"` for a neural network, or the fully qualified name of a scikit-learn estimator class (default: `ensemble.RandomForestClassifier`)
* `--seed`: Random seed (default: `42`)
* `--max_episode`: Maximum number of training episodes (default: `500`)
* `--collect_episode`: Episodes to collect between training rounds (default: `15`)
* `--batch_size`: Batch size for training (default: `0`, which uses the entire replay buffer)
* Other options related to warm-up, memory size, exploration, testing, saving, etc.
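For example, an explicit run on CartPole-v0 with the random-forest estimator could look as follows; the flags are the ones documented above, and the values simply restate the defaults, so this is only an illustrative invocation:

```bash
python -m udrl \
    --env_name CartPole-v0 \
    --estimator_name ensemble.RandomForestClassifier \
    --seed 42 \
    --max_episode 500 \
    --collect_episode 15 \
    --batch_size 0
```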
**NOTE** The CartPole, Acrobot, MountainCar and LunarLander environments were tested.

### Result Data

* The experiment configuration and final test results are saved in a JSON file (`conf.json`) within a directory structure based on the environment, seed, and non-default configuration values (e.g. `data/[env-name]/[experiment_name]/[seed]/conf.json`).
* If `save_policy` is True, the trained policy is saved in the same directory (`policy`).
* If `save_learning_infos` is True, learning info and rewards gathered during training are saved as a NumPy file (e.g. `test_rewards.npy`) and a JSON file (e.g. `learning_infos.json`) in the same directory.
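The following is a minimal sketch of how the saved artifacts could be loaded for inspection, assuming the directory layout described above; the concrete environment name, experiment name, and seed in the path are placeholders:

```python
import json
from pathlib import Path

import numpy as np

# Placeholder run directory following data/[env-name]/[experiment_name]/[seed]/
run_dir = Path("data/CartPole-v0/my_experiment/42")

# Experiment configuration and final test results.
conf = json.loads((run_dir / "conf.json").read_text())

# Learning info written when save_learning_infos is True.
test_rewards = np.load(run_dir / "test_rewards.npy")
learning_infos = json.loads((run_dir / "learning_infos.json").read_text())

print(conf)
print(test_rewards.mean(), test_rewards.std())
```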
### Process Data

* A basic post-processing step is available to convert the result data into CSV files; run it with `python -m udrl.data_proc`.

### Project Structure

* `data`: Stores experiment results and other data.
* `old_code`: Contains previous code versions (not used in the current setup).
* `poetry.lock`, `pyproject.toml`: Manage project dependencies and configuration.
* `README.md`: This file.
* `udrl`: Contains the main Python modules for the UDRL agent.

Please refer to the code and comments for further details on the implementation.

## Troubleshooting

If you encounter any errors during installation or execution, or if you have any questions about the project, feel free to reach out to me at [[email protected]](mailto:[email protected]). I'll be happy to assist you!
old_code/experiment_1/train_agent.py
ADDED
@@ -0,0 +1,405 @@
import os
import math
import time
import gymnasium as gym
import random
import utils
import keras
import numpy as np

from collections import deque
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.exceptions import NotFittedError
from sklearn.ensemble import GradientBoostingClassifier
from tqdm import trange


class ReplayBuffer:
    """
    Thank you: https://github.com/BY571/
    """

    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = []

    def add_sample(self, states, actions, rewards):
        episode = {
            "states": states,
            "actions": actions,
            "rewards": rewards,
            "summed_rewards": sum(rewards),
        }
        self.buffer.append(episode)

    def sort(self):
        # sort buffer
        self.buffer = sorted(
            self.buffer, key=lambda i: i["summed_rewards"], reverse=True
        )
        # keep the max buffer size
        self.buffer = self.buffer[: self.max_size]

    def get_random_samples(self, batch_size):
        self.sort()

        idxs = np.random.randint(0, len(self.buffer), batch_size)
        batch = [self.buffer[idx] for idx in idxs]

        return batch

    def get_n_best(self, n):
        self.sort()
        return self.buffer[:n]

    def __len__(self):
        return len(self.buffer)


class UpsideDownAgent:
    def __init__(self, environment, approximator):
        print(environment)
        self.environment = gym.make(environment)
        self.approximator = approximator
        self.state_size = self.environment.observation_space.shape[0]
        self.action_size = self.environment.action_space.n
        self.warm_up_episodes = 50
        self.render = False
        self.memory = ReplayBuffer(700)
        self.last_few = 75
        self.batch_size = 32
        self.command_size = 2  # desired return + desired horizon
        self.desired_return = 1
        self.desired_horizon = 1
        self.horizon_scale = 0.02
        self.return_scale = 0.02
        self.testing_state = 0

        if approximator == "neural_network":
            self.behaviour_function = utils.get_functional_behaviour_function(
                self.state_size, self.command_size, self.action_size
            )

        elif approximator == "forest":
            self.behaviour_function = RandomForestClassifier(200)

        elif approximator == "extra-trees":
            self.behaviour_function = ExtraTreesClassifier()

        elif approximator == "knn":
            self.behaviour_function = KNeighborsClassifier()

        elif approximator == "adaboost":
            self.behaviour_function = AdaBoostClassifier()

        self.testing_rewards = []
        self.warm_up_buffer()

    def warm_up_buffer(self):

        for i in range(self.warm_up_episodes):
            # Gymnasium returns (state, info_dict)
            state, _ = self.environment.reset()
            states = []
            rewards = []
            actions = []
            done = False
            desired_return = 1
            desired_horizon = 1

            while not done:
                state = np.reshape(state, [1, self.state_size])
                states.append(state)

                observation = state

                command = np.asarray(
                    [
                        desired_return * self.return_scale,
                        desired_horizon * self.horizon_scale,
                    ]
                )

                command = np.reshape(command, [1, len(command)])

                action = self.get_action(observation, command)
                actions.append(action)
                # Gymnasium returns (s, r, tr, te, info)
                next_state, reward, tru, ter, info = self.environment.step(action)
                done = tru or ter
                next_state = np.reshape(next_state, [1, self.state_size])

                rewards.append(reward)

                state = next_state

                desired_return -= reward  # Line 8 Algorithm 2
                desired_horizon -= 1  # Line 9 Algorithm 2
                desired_horizon = np.maximum(desired_horizon, 1)

            self.memory.add_sample(states, actions, rewards)

    def get_action(self, observation, command):
        """
        We will sample from the action distribution modeled by the Behavior Function
        """

        if self.approximator == "neural_network":
            action_probs = self.behaviour_function.predict([observation, command])
            action = np.random.choice(np.arange(0, self.action_size), p=action_probs[0])

            return action

        elif self.approximator in ["forest", "extra-trees", "knn", "svm", "adaboost"]:
            try:
                input_state = np.concatenate((observation, command), axis=1)
                action = self.behaviour_function.predict(input_state)
                # print(action)
                if np.random.rand() > 0.8:
                    return int(not np.argmax(action))

                return np.argmax(action)

            except NotFittedError as e:
                return random.randint(0, 1)

    def get_greedy_action(self, observation, command):

        if self.approximator == "neural_network":
            action_probs = self.behaviour_function.predict([observation, command])
            action = np.argmax(action_probs)

            return action

        else:
            input_state = np.concatenate((observation, command), axis=1)
            action = self.behaviour_function.predict(input_state)

            self.testing_state += 1

            feature_importances = {}

            for t in self.behaviour_function.estimators_:
                branch = t.decision_path(input_state).todense()
                branch = np.array(branch, dtype=bool)
                imp = t.tree_.impurity[branch[0]]
                for f, i in zip(t.tree_.feature[branch[0]][:-1], imp[:-1] - imp[1:]):
                    feature_importances.setdefault(f, []).append(i)

            summed_importances = [
                sum(feature_importances[0]),
                sum(feature_importances[1]),
                sum(feature_importances[2]),
                sum(feature_importances[3]),
                sum(feature_importances[4]),
                sum(feature_importances[5]),
            ]

            x = np.arange(len(summed_importances))

            plt.figure()
            plt.title("Cartpole-v0")
            plt.bar(x, summed_importances)
            plt.xticks(
                x,
                [
                    "feature-1",
                    "feature-2",
                    "feature-3",
                    "feature-4",
                    r"$d_t^{r}$",
                    r"$d_t^{h}$",
                ],
            )
            plt.savefig("importances_state_" + str(self.testing_state) + ".jpg")

            return np.argmax(action)

    def train_behaviour_function(self):

        random_episodes = self.memory.get_random_samples(self.batch_size)

        training_observations = np.zeros((self.batch_size, self.state_size))
        training_commands = np.zeros((self.batch_size, 2))

        y = []

        for idx, episode in enumerate(random_episodes):
            T = len(episode["states"])
            t1 = np.random.randint(0, T - 1)
            t2 = np.random.randint(t1 + 1, T)

            state = episode["states"][t1]
            desired_return = sum(episode["rewards"][t1:t2])
            desired_horizon = t2 - t1

            target = episode["actions"][t1]

            training_observations[idx] = state[0]
            training_commands[idx] = np.asarray(
                [
                    desired_return * self.return_scale,
                    desired_horizon * self.horizon_scale,
                ]
            )
            y.append(target)

        _y = keras.utils.to_categorical(y)

        if self.approximator == "neural_network":
            self.behaviour_function.fit(
                [training_observations, training_commands], _y, verbose=0
            )

        elif self.approximator in ["forest", "extra-trees", "adaboost"]:
            input_classifier = np.concatenate(
                (training_observations, training_commands), axis=1
            )

            self.behaviour_function.fit(input_classifier, _y)

    def sample_exploratory_commands(self):
        best_episodes = self.memory.get_n_best(self.last_few)
        exploratory_desired_horizon = np.mean([len(i["states"]) for i in best_episodes])

        returns = [i["summed_rewards"] for i in best_episodes]
        exploratory_desired_returns = np.random.uniform(
            np.mean(returns), np.mean(returns) + np.std(returns)
        )

        return [exploratory_desired_returns, exploratory_desired_horizon]

    def generate_episode(
        self, environment, e, desired_return, desired_horizon, testing
    ):

        env = gym.make(environment)
        tot_rewards = []
        done = False

        score = 0
        # Gymnasium returns (state, info_dict)
        state, _ = env.reset()

        scores = []
        states = []
        actions = []
        rewards = []

        while not done:
            state = np.reshape(state, [1, self.state_size])
            states.append(state)

            observation = state

            command = np.asarray(
                [
                    desired_return * self.return_scale,
                    desired_horizon * self.horizon_scale,
                ]
            )
            command = np.reshape(command, [1, len(command)])

            if not testing:
                action = self.get_action(observation, command)
                actions.append(action)
            else:
                action = self.get_greedy_action(observation, command)

            # Gymnasium returns (s, r, tr, te, info)
            next_state, reward, tru, ter, info = env.step(action)
            done = tru or ter
            next_state = np.reshape(next_state, [1, self.state_size])

            rewards.append(reward)
            score += reward

            state = next_state

            desired_return -= reward  # Line 8 Algorithm 2
            desired_horizon -= 1  # Line 9 Algorithm 2
            desired_horizon = np.maximum(desired_horizon, 1)

        self.memory.add_sample(states, actions, rewards)

        self.testing_rewards.append(score)

        if testing:
            print("Querying the model ...")
            print("Testing score: {}".format(score))

        return score


def run_experiment():

    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument("--approximator", type=str, default="forest")
    parser.add_argument("--environment", type=str, default="CartPole-v0")
    parser.add_argument("--seed", type=int, default=42)

    args = parser.parse_args()

    approximator = args.approximator
    environment = args.environment
    seed = args.seed
    print(args)

    episodes = 500
    returns = []

    agent = UpsideDownAgent(environment, approximator)
    epi_bar = trange(episodes)
    for e in epi_bar:
        for i in range(100):
            agent.train_behaviour_function()

        for i in range(15):
            tmp_r = []
            exploratory_commands = (
                agent.sample_exploratory_commands()
            )  # Line 5 Algorithm 1
            desired_return = exploratory_commands[0]
            desired_horizon = exploratory_commands[1]
            r = agent.generate_episode(
                environment, e, desired_return, desired_horizon, False
            )
            tmp_r.append(r)

        epi_bar.set_postfix(
            {
                "mean": np.mean(tmp_r),
                "std": np.std(tmp_r),
            }
        )
        # print()
        returns.append(np.mean(tmp_r))

    exploratory_commands = agent.sample_exploratory_commands()

    agent.generate_episode(environment, 1, 200, 200, True)

    utils.save_results(environment, approximator, seed, returns)

    if approximator == "neural_network":
        utils.save_trained_model(environment, seed, agent.behaviour_function)


if __name__ == "__main__":
    import warnings

    warnings.simplefilter("ignore", DeprecationWarning)
    run_experiment()
old_code/experiment_1/utils.py
ADDED
@@ -0,0 +1,121 @@
import os
import argparse
import pickle
import keras
import numpy as np

from keras.layers import Dense, Multiply, Input, Conv2D, Flatten
from keras.models import Sequential, Model
from keras.optimizers import Adam, RMSprop, SGD

from skimage.transform import resize
from skimage.color import rgb2gray

STORING_PATH = './results/'
MODELS_PATH = './trained_models/'

def save_results(environment, approximator, seed, rewards):
    storing_path = os.path.join(STORING_PATH, environment, approximator, str(seed))
    if not os.path.exists(storing_path):
        os.makedirs(storing_path)

    np.save(storing_path + '/' + 'upside_down_rewards.npy', rewards)

def get_functional_behaviour_function(state_size, command_size, action_size):
    observation_input = keras.Input(shape=(state_size,))
    linear_layer = Dense(64, activation='sigmoid')(observation_input)

    command_input = keras.Input(shape=(command_size,))
    sigmoidal_layer = Dense(64, activation='sigmoid')(command_input)

    multiplied_layer = Multiply()([linear_layer, sigmoidal_layer])

    layer_1 = Dense(64, activation='relu')(multiplied_layer)
    layer_2 = Dense(64, activation='relu')(layer_1)
    layer_3 = Dense(64, activation='relu')(layer_2)
    layer_4 = Dense(64, activation='relu')(layer_3)
    final_layer = Dense(action_size, activation='softmax')(layer_4)

    model = Model(inputs=[observation_input, command_input], outputs=final_layer)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001))

    return model

def get_atari_behaviour_function(action_size):

    print('Getting the model')

    input_state = Input(shape=(84,84,4))

    first_conv = Conv2D(
        32, (8, 8), strides=(4,4), activation='relu')(input_state)
    second_conv = Conv2D(
        64, (4, 4), strides=(2,2), activation='relu')(first_conv)
    third_conv = Conv2D(
        64, (3, 3), strides=(1,1), activation='relu')(second_conv)

    flattened = Flatten()(third_conv)
    dense_layer = Dense(512, activation='relu')(flattened)

    command_input = keras.Input(shape=(2,))
    sigmoidal_layer = Dense(512, activation='sigmoid')(command_input)

    multiplied_layer = Multiply()([dense_layer, sigmoidal_layer])
    final_layer = Dense(256, activation='relu')(multiplied_layer)

    action_layer = Dense(action_size, activation='softmax')(final_layer)

    model = Model(inputs=[input_state, command_input], outputs=action_layer)
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001, rho=0.95, epsilon=0.01))

    print(model.summary())

    return model

def get_catch_behaviour_function(action_size):

    print('Getting the Catch-model')

    input_state = Input(shape=(84,84,4))

    first_conv = Conv2D(
        32, (8, 8), strides=(4,4), activation='relu')(input_state)
    second_conv = Conv2D(
        64, (4, 4), strides=(2,2), activation='relu')(first_conv)
    third_conv = Conv2D(
        64, (3, 3), strides=(1,1), activation='relu')(second_conv)

    flattened = Flatten()(third_conv)
    dense_layer = Dense(512, activation='relu')(flattened)

    command_input = keras.Input(shape=(2,))
    sigmoidal_layer = Dense(512, activation='sigmoid')(command_input)

    multiplied_layer = Multiply()([dense_layer, sigmoidal_layer])
    final_layer = Dense(256, activation='relu')(multiplied_layer)

    action_layer = Dense(action_size, activation='softmax')(final_layer)

    model = Model(inputs=[input_state, command_input], outputs=action_layer)
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001, rho=0.95, epsilon=0.01))

    print(model.summary())

    return model


def pre_processing(state):
    processed_state = np.uint8(
        resize(rgb2gray(state), (84, 84), mode='constant')*255)

    return processed_state

def save_trained_model(environment, seed, model):
    storing_path = os.path.join(MODELS_PATH, environment, str(seed))
    if not os.path.exists(storing_path):
        os.makedirs(storing_path)

    model.save_weights(storing_path + '/' + 'trained_model.h5')
old_code/experiment_2/catch.py
ADDED
@@ -0,0 +1,93 @@
from scipy.misc import imresize
import gym
import random
import numpy as np
from queue import Queue

from matplotlib import pyplot as plt
from PIL import Image

class CatchEnv:
    def __init__(self):
        self.size = 21
        self.image = np.zeros((self.size, self.size))
        self.state = []
        self.fps = 4
        self.output_shape = (84, 84)

    def reset_random(self):
        self.image.fill(0)
        self.pos = np.random.randint(2, self.size-2)
        self.vx = np.random.randint(5) - 2
        self.vy = 1
        self.ballx, self.bally = np.random.randint(self.size), 4
        self.image[self.bally, self.ballx] = 1
        self.image[-5, self.pos - 2:self.pos + 3] = np.ones(5)

        return self.step(2)[0]


    def step(self, action):
        def left():
            if self.pos > 3:
                self.pos -= 2
        def right():
            if self.pos < 17:
                self.pos += 2
        def noop():
            pass
        {0: left, 1: right, 2: noop}[action]()


        self.image[self.bally, self.ballx] = 0
        self.ballx += self.vx
        self.bally += self.vy
        if self.ballx > self.size - 1:
            self.ballx -= 2 * (self.ballx - (self.size-1))
            self.vx *= -1
        elif self.ballx < 0:
            self.ballx += 2 * (0 - self.ballx)
            self.vx *= -1
        self.image[self.bally, self.ballx] = 1

        self.image[-5].fill(0)
        self.image[-5, self.pos-2:self.pos+3] = np.ones(5)

        terminal = self.bally == self.size - 1 - 4
        reward = int(self.pos - 2 <= self.ballx <= self.pos + 2) if terminal else 0

        [self.state.append(imresize(self.image, (84, 84))) for _ in range(self.fps - len(self.state) + 1)]
        self.state = self.state[-self.fps:]

        return np.transpose(self.state, [1, 2, 0]), reward, terminal

    def get_num_actions(self):
        return 3

    def reset(self):
        return self.reset_random()

    def state_shape(self):
        return (self.fps,) + self.output_shape


def test():
    env = CatchEnv()
    i = 0

    for ep in range(1):
        env.reset()

        state, reward, terminal = env.step(1)

        while not terminal:
            state, reward, terminal = env.step(random.randint(0,2))

            state = np.squeeze(state)

            #print(reward)
            #print(terminal)
            i += 1

if __name__ == "__main__":
    test()
old_code/experiment_2/catch_v2.py
ADDED
@@ -0,0 +1,98 @@
import random
import numpy as np
from scipy.misc import imresize

import random
import numpy as np
from scipy.ndimage import rotate
from scipy.misc import imresize
from matplotlib import pyplot as plt

class CatchEnv2:
    def __init__(self):
        self.size = 21
        self.image = np.zeros((self.size, self.size))
        self.state = []
        self.fps = 4
        self.output_shape = (84, 84)

    def reset_random(self):
        self.image.fill(0)
        self.pos = np.random.randint(2, self.size-2)
        self.vx = np.random.randint(5) - 2
        self.vy = 1
        self.ballx, self.bally = np.random.randint(self.size), 4

        self.image[self.bally, self.ballx] = 1
        self.image[-5, self.pos - 2:self.pos + 3] = np.ones(5)

        for i in range(0, self.size):
            for j in range(0, self.size):
                self.image[i][j] = random.randint(2,5)

        return self.step(2)[0]


    def step(self, action):
        def left():
            if self.pos > 3:
                self.pos -= 2
        def right():
            if self.pos < 17:
                self.pos += 2
        def noop():
            pass
        {0: left, 1: right, 2: noop}[action]()


        self.image[self.bally, self.ballx] = 0
        self.ballx += self.vx
        self.bally += self.vy
        if self.ballx > self.size - 1:
            self.ballx -= 2 * (self.ballx - (self.size-1))
            self.vx *= -1
        elif self.ballx < 0:
            self.ballx += 2 * (0 - self.ballx)
            self.vx *= -1

        self.image[self.bally, self.ballx] = 1
        self.image[-5].fill(random.randint(2,5))
        self.image[-5, self.pos-2:self.pos+3] = np.ones(5)

        terminal = self.bally == self.size - 1 - 4
        reward = int(self.pos - 2 <= self.ballx <= self.pos + 2) if terminal else 0

        [self.state.append(imresize(self.image, (84, 84))) for _ in range(self.fps - len(self.state) + 1)]
        self.state = self.state[-self.fps:]

        return np.transpose(self.state, [1, 2, 0]), reward, terminal

    def get_num_actions(self):
        return 3

    def reset(self):
        return self.reset_random()

    def state_shape(self):
        return (self.fps,) + self.output_shape

    def show_state(self, i):
        plt.imshow(self.image)
        plt.imsave('image_'+str(i)+'.jpg', self.image)

def test():
    env = CatchEnv2()
    i = 0
    for ep in range(1):
        env.reset()
        env.show_state(i)

        state, reward, terminal = env.step(1)
        while not terminal:
            env.show_state(i)
            state, reward, terminal = env.step(np.random.randint(0,2))
            i += 1
            #print(reward)

if __name__ == "__main__":
    test()
old_code/experiment_2/catch_v3.py
ADDED
@@ -0,0 +1,91 @@
import random
import numpy as np
from scipy.misc import imresize
from matplotlib import pyplot as plt

class CatchEnv3:
    def __init__(self):
        self.size = 21
        self.image = np.zeros((self.size, self.size))
        self.state = []
        self.fps = 4
        self.output_shape = (84, 84)

    def reset_random(self):
        self.image.fill(0)
        self.pos = np.random.randint(2, self.size-2)
        self.vx = np.random.randint(5) - 2
        self.vy = 1
        self.ballx, self.bally = np.random.randint(self.size), 4

        self.image[self.bally, self.ballx] = 1
        self.image[-5, self.pos - 2:self.pos + 3] = np.ones(5)

        return self.step(2)[0]


    def step(self, action):
        def left():
            if self.pos > 3:
                self.pos -= 2
        def right():
            if self.pos < 17:
                self.pos += 2
        def noop():
            pass
        {0: left, 1: right, 2: noop}[action]()


        self.image[self.bally, self.ballx] = 0
        self.ballx += self.vx
        self.bally += self.vy
        if self.ballx > self.size - 1:
            self.ballx -= 2 * (self.ballx - (self.size-1))
            self.vx *= -1
        elif self.ballx < 0:
            self.ballx += 2 * (0 - self.ballx)
            self.vx *= -1
        self.image[self.bally, self.ballx] = 1

        self.image[-5].fill(0)
        self.image[-5, self.pos-2:self.pos+3] = np.ones(5)

        terminal = self.bally == self.size - 1 - 4
        reward = int(self.pos - 2 <= self.ballx <= self.pos + 2) if terminal else 0

        [self.state.append(imresize(self.image, (84, 84))) for _ in range(self.fps - len(self.state) + 1)]
        self.state = self.state[-self.fps:]

        self.state[0] = self.state[0][::-1,:]
        self.state[1] = self.state[1][::-1,:]
        self.state[2] = self.state[2][::-1,:]
        self.state[3] = self.state[3][::-1,:]

        return np.transpose(self.state, [1, 2, 0]), reward, terminal

    def get_num_actions(self):
        return 3

    def reset(self):
        return self.reset_random()

    def state_shape(self):
        return (self.fps,) + self.output_shape


def test():
    env = CatchEnv3()
    i = 0
    for ep in range(1):
        env.reset()
        state, reward, terminal = env.step(1)
        while not terminal:
            env.show_state(i)
            state, reward, terminal = env.step(1)
            state = np.squeeze(state)

            plt.imsave('image_'+str(i)+'.jpg', state)

            i += 1

if __name__ == "__main__":
    test()
old_code/experiment_2/catch_v4.py
ADDED
@@ -0,0 +1,88 @@
import random
import numpy as np
from scipy.ndimage import rotate
from scipy.misc import imresize
from matplotlib import pyplot as plt

class CatchEnv4:
    def __init__(self):
        self.size = 21
        self.image = np.zeros((self.size, self.size))
        self.state = []
        self.fps = 4
        self.output_shape = (84, 84)

    def reset_random(self):
        self.image.fill(0)
        self.pos = np.random.randint(2, self.size-2)
        self.vx = np.random.randint(5) - 2
        self.vy = 1
        self.ballx, self.bally = np.random.randint(self.size), 4

        self.image[self.bally, self.ballx] = 1
        self.image[-5, self.pos - 1:self.pos+1] = np.ones(1)

        return self.step(2)[0]


    def step(self, action):
        def left():
            if self.pos > 3:
                self.pos -= 2
        def right():
            if self.pos < 17:
                self.pos += 2
        def noop():
            pass
        {0: left, 1: right, 2: noop}[action]()


        self.image[self.bally, self.ballx] = 0
        self.ballx += self.vx
        self.bally += self.vy
        if self.ballx > self.size - 1:
            self.ballx -= 2 * (self.ballx - (self.size-1))
            self.vx *= -1
        elif self.ballx < 0:
            self.ballx += 2 * (0 - self.ballx)
            self.vx *= -1
        self.image[self.bally, self.ballx] = 1

        self.image[-5].fill(0)
        self.image[-5, self.pos-1:self.pos+1] = np.ones(1)

        terminal = self.bally == self.size - 2 - 4
        reward = int(self.pos - 1 <= self.ballx <= self.pos + 1) if terminal else 0

        [self.state.append(imresize(self.image, (84, 84))) for _ in range(self.fps - len(self.state) + 1)]
        self.state = self.state[-self.fps:]

        return np.transpose(self.state, [1, 2, 0]), reward, terminal

    def get_num_actions(self):
        return 3

    def reset(self):
        return self.reset_random()

    def state_shape(self):
        return (self.fps,) + self.output_shape

    def show_state(self, i):
        plt.imshow(self.image)
        plt.imsave('image_'+str(i)+'.jpg', self.image)

def test():
    env = CatchEnv4()
    i = 0
    for ep in range(1):
        env.reset()
        state, reward, terminal = env.step(1)
        while not terminal:
            state, reward, terminal = env.step(np.random.randint(0,2))
            state = np.squeeze(state)
            env.show_state(i)
            i += 1
        print(reward)

if __name__ == "__main__":
    test()
old_code/experiment_2/train_catch_cnn_agent.py
ADDED
@@ -0,0 +1,295 @@
import os
import math
import time
import gym
import random
import utils
import keras
import catch
import catch_v2
import catch_v3
import catch_v4

import numpy as np

from collections import deque
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

class ReplayBuffer():
    """
    Thank you: https://github.com/BY571/
    """

    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = []

    def add_sample(self, states, actions, rewards):
        episode = {"states": states, "actions": actions, "rewards": rewards, "summed_rewards": sum(rewards)}
        self.buffer.append(episode)

    def sort(self):
        # sort buffer
        self.buffer = sorted(self.buffer, key=lambda i: i["summed_rewards"], reverse=True)
        # keep the max buffer size
        self.buffer = self.buffer[:self.max_size]

    def get_random_samples(self, batch_size):
        self.sort()
        idxs = np.random.randint(0, len(self.buffer), batch_size)
        batch = [self.buffer[idx] for idx in idxs]
        return batch

    def get_n_best(self, n):
        self.sort()
        return self.buffer[:n]

    def __len__(self):
        return len(self.buffer)

class UpsideDownAgent():
    def __init__(self, environment, approximator):
        if environment == "Catch-v0":
            self.environment = catch.CatchEnv()
        elif environment == "Catch-v2":
            self.environment = catch_v2.CatchEnv2()
        elif environment == "Catch-v3":
            self.environment = catch_v3.CatchEnv3()
        elif environment == "Catch-v4":
            self.environment = catch_v4.CatchEnv4()

        self.approximator = approximator
        self.state_size = (84, 84, 4)
        self.action_size = 3
        self.warm_up_episodes = 50
        self.memory = ReplayBuffer(700)
        self.last_few = 50
        self.batch_size = 32
        self.command_size = 2  # desired return + desired horizon
        self.desired_return = 1
        self.desired_horizon = 1
        self.horizon_scale = 0.02
        self.return_scale = 0.02

        self.behaviour_function = utils.get_catch_behaviour_function(self.action_size)

        self.testing_rewards = []
        self.warm_up_buffer()

    def warm_up_buffer(self):
        print('Warming up')

        for i in range(self.warm_up_episodes):

            states = []
            rewards = []
            actions = []

            dead = False
            done = False
            desired_return = 1
            desired_horizon = 1

            step, score, start_life = 0, 0, 5
            observe = self.environment.reset()

            observe, reward, terminal = self.environment.step(1)

            state = utils.pre_processing(observe)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 84, 84, 4))


            while not done:

                states.append(history)
                command = np.asarray([desired_return * self.return_scale, desired_horizon * self.horizon_scale])
                command = np.reshape(command, [1, len(command)])

                action = self.get_action(history, command)
                actions.append(action)

                next_state, reward, done = self.environment.step(action)
                next_state = utils.pre_processing(observe)
                next_state = np.reshape([next_state], (1, 84, 84, 1))
                next_history = np.append(next_state, history[:, :, :, :3], axis=3)

                rewards.append(reward)

                state = next_state
                history = next_history

                desired_return -= reward  # Line 8 Algorithm 2
                desired_horizon -= 1  # Line 9 Algorithm 2
                desired_horizon = np.maximum(desired_horizon, 1)

            self.memory.add_sample(states, actions, rewards)


    def get_action(self, observation, command):
        """
        We will sample from the action distribution modeled by the Behavior Function
        """

        observation = np.float32(observation / 255.0)

        action_probs = self.behaviour_function.predict([observation, command])
        action = np.random.choice(np.arange(0, self.action_size), p=action_probs[0])

        return action

    def get_greedy_action(self, observation, command):

        action_probs = self.behaviour_function.predict([observation, command])
        action = np.argmax(action_probs)

        return action

    def train_behaviour_function(self):

        random_episodes = self.memory.get_random_samples(self.batch_size)

        training_observations = np.zeros((self.batch_size, self.state_size[0], self.state_size[1], self.state_size[2]))
        training_commands = np.zeros((self.batch_size, 2))

        y = []

        for idx, episode in enumerate(random_episodes):
            T = len(episode['states'])
            t1 = np.random.randint(0, T-1)
            t2 = np.random.randint(t1+1, T)

            state = np.float32(episode['states'][t1] / 255.)
            desired_return = sum(episode["rewards"][t1:t2])
            desired_horizon = t2 - t1

            target = episode['actions'][t1]

            training_observations[idx] = state[0]
            training_commands[idx] = np.asarray([desired_return*self.return_scale, desired_horizon*self.horizon_scale])
            y.append(target)

        _y = keras.utils.to_categorical(y, num_classes=self.action_size)

        self.behaviour_function.fit([training_observations, training_commands], _y, verbose=0)


    def sample_exploratory_commands(self):
        best_episodes = self.memory.get_n_best(self.last_few)
        exploratory_desired_horizon = np.mean([len(i["states"]) for i in best_episodes])

        returns = [i["summed_rewards"] for i in best_episodes]
        exploratory_desired_returns = np.random.uniform(np.mean(returns), np.mean(returns)+np.std(returns))

        return [exploratory_desired_returns, exploratory_desired_horizon]

    def generate_episode(self, environment, e, desired_return, desired_horizon, testing):

        if environment == "Catch-v0":
            env = catch.CatchEnv()
        elif environment == "Catch-v2":
            env = catch_v2.CatchEnv2()
        elif environment == "Catch-v3":
            env = catch_v3.CatchEnv3()
        elif environment == "Catch-v4":
            env = catch_v4.CatchEnv4()

        tot_rewards = []

        done = False
        dead = False

        scores = []
        states = []
        actions = []
        rewards = []

        step, score, start_life = 0, 0, 5

        observe = env.reset()
        observe, _, _ = env.step(1)

        state = utils.pre_processing(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 84, 84, 4))

        while not done:
            states.append(history)

            command = np.asarray([desired_return * self.return_scale, desired_horizon * self.horizon_scale])
            command = np.reshape(command, [1, len(command)])

            if not testing:
                action = self.get_action(history, command)
                actions.append(action)
            else:
                action = self.get_greedy_action(history, command)

            next_state, reward, done = env.step(action)
            next_state = utils.pre_processing(observe)
            next_state = np.reshape([next_state], (1, 84, 84, 1))
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)


            score += reward
            history = next_history

            desired_return -= reward  # Line 8 Algorithm 2
            desired_horizon -= 1  # Line 9 Algorithm 2
            desired_horizon = np.maximum(desired_horizon, 1)

        self.memory.add_sample(states, actions, rewards)
        self.testing_rewards.append(score)

        if testing:
            print('Querying the model ...')
            print('Testing score: {}'.format(score))

        return score

def run_experiment():

    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument('--approximator', type=str, default='neural_network')
    parser.add_argument('--environment', type=str, default='PongDeterministic-v4')
    parser.add_argument('--seed', type=int, default=1)

    args = parser.parse_args()

    approximator = args.approximator
    environment = args.environment
    seed = args.seed

    training_episodes = 10
    warm_up_episodes = 10
    testing_returns = []

    agent = UpsideDownAgent(environment, approximator)

    for e in range(training_episodes):
        print("Training Episode {}".format(e))

        for i in range(100):
            agent.train_behaviour_function()

        print("Finished training B!")

        for i in range(15):
            exploratory_commands = agent.sample_exploratory_commands()  # Line 5 Algorithm 1
            desired_return = exploratory_commands[0]
            desired_horizon = exploratory_commands[1]
            agent.generate_episode(environment, e, desired_return, desired_horizon, False)

        if e % 2 == 0:
            for i in range(1):
                r = agent.generate_episode(environment, e, desired_return, desired_horizon, True)
                testing_returns.append(r)

    exploratory_commands = agent.sample_exploratory_commands()

if __name__ == "__main__":
    run_experiment()
old_code/experiment_2/utils.py
ADDED
@@ -0,0 +1,121 @@
import os
import argparse
import pickle
import keras
import numpy as np

from keras.layers import Dense, Multiply, Input, Conv2D, Flatten
from keras.models import Sequential, Model
from keras.optimizers import Adam, RMSprop, SGD

from skimage.transform import resize
from skimage.color import rgb2gray

STORING_PATH = './results/'
MODELS_PATH = './trained_models/'

def save_results(environment, approximator, seed, rewards):
    storing_path = os.path.join(STORING_PATH, environment, approximator, str(seed))
    if not os.path.exists(storing_path):
        os.makedirs(storing_path)

    np.save(storing_path + '/' + 'upside_down_rewards.npy', rewards)

def get_functional_behaviour_function(state_size, command_size, action_size):
    observation_input = keras.Input(shape=(state_size,))
    linear_layer = Dense(64, activation='sigmoid')(observation_input)

    command_input = keras.Input(shape=(command_size,))
    sigmoidal_layer = Dense(64, activation='sigmoid')(command_input)

    multiplied_layer = Multiply()([linear_layer, sigmoidal_layer])

    layer_1 = Dense(64, activation='relu')(multiplied_layer)
    layer_2 = Dense(64, activation='relu')(layer_1)
    layer_3 = Dense(64, activation='relu')(layer_2)
    layer_4 = Dense(64, activation='relu')(layer_3)
    final_layer = Dense(action_size, activation='softmax')(layer_4)

    model = Model(inputs=[observation_input, command_input], outputs=final_layer)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001))

    return model

def get_atari_behaviour_function(action_size):

    print('Getting the model')

    input_state = Input(shape=(84,84,4))

    first_conv = Conv2D(
        32, (8, 8), strides=(4,4), activation='relu')(input_state)
    second_conv = Conv2D(
        64, (4, 4), strides=(2,2), activation='relu')(first_conv)
    third_conv = Conv2D(
        64, (3, 3), strides=(1,1), activation='relu')(second_conv)

    flattened = Flatten()(third_conv)
    dense_layer = Dense(512, activation='relu')(flattened)

    command_input = keras.Input(shape=(2,))
    sigmoidal_layer = Dense(512, activation='sigmoid')(command_input)

    multiplied_layer = Multiply()([dense_layer, sigmoidal_layer])
    final_layer = Dense(256, activation='relu')(multiplied_layer)

    action_layer = Dense(action_size, activation='softmax')(final_layer)

    model = Model(inputs=[input_state, command_input], outputs=action_layer)
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001, rho=0.95, epsilon=0.01))

    print(model.summary())

    return model

def get_catch_behaviour_function(action_size):

    print('Getting the Catch-model')

    input_state = Input(shape=(84,84,4))

    first_conv = Conv2D(
        32, (8, 8), strides=(4,4), activation='relu')(input_state)
    second_conv = Conv2D(
        64, (4, 4), strides=(2,2), activation='relu')(first_conv)
    third_conv = Conv2D(
        64, (3, 3), strides=(1,1), activation='relu')(second_conv)

    flattened = Flatten()(third_conv)
    dense_layer = Dense(512, activation='relu')(flattened)

    command_input = keras.Input(shape=(2,))
    sigmoidal_layer = Dense(512, activation='sigmoid')(command_input)

    multiplied_layer = Multiply()([dense_layer, sigmoidal_layer])
    final_layer = Dense(256, activation='relu')(multiplied_layer)

    action_layer = Dense(action_size, activation='softmax')(final_layer)

    model = Model(inputs=[input_state, command_input], outputs=action_layer)
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001, rho=0.95, epsilon=0.01))

    print(model.summary())

    return model


def pre_processing(state):
    processed_state = np.uint8(
        resize(rgb2gray(state), (84, 84), mode='constant')*255)

    return processed_state

def save_trained_model(environment, seed, model):
    storing_path = os.path.join(MODELS_PATH, environment, str(seed))
    if not os.path.exists(storing_path):
        os.makedirs(storing_path)

    model.save_weights(storing_path + '/' + 'trained_model.h5')
old_code/experiment_3/q_networks/a2c.py
ADDED
@@ -0,0 +1,104 @@
import os
import sys
import gym
import utils

import numpy as np

from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

from matplotlib import pyplot as plt

class A2CAgent:
    def __init__(self, state_size, action_size):
        self.render = False
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = 1
        self.discount_factor = 0.99
        self.actor_lr = 0.001
        self.critic_lr = 0.005
        self.actor = self.build_actor()
        self.critic = self.build_critic()

    def build_actor(self):
        actor = Sequential()
        actor.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        actor.add(Dense(self.action_size, activation='softmax',
                        kernel_initializer='he_uniform'))
        actor.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=self.actor_lr))

        return actor

    def build_critic(self):
        critic = Sequential()
        critic.add(Dense(24, input_dim=self.state_size, activation='relu',
                         kernel_initializer='he_uniform'))
        critic.add(Dense(self.value_size, activation='linear',
                         kernel_initializer='he_uniform'))
        critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))

        return critic

    def get_action(self, state):
        policy = self.actor.predict(state, batch_size=1).flatten()

        return np.random.choice(self.action_size, 1, p=policy)[0]

    def train_model(self, state, action, reward, next_state, done):
        target = np.zeros((1, self.value_size))
        advantages = np.zeros((1, self.action_size))

        value = self.critic.predict(state)[0]
        next_value = self.critic.predict(next_state)[0]

        if done:
            advantages[0][action] = reward - value
            target[0][0] = reward
        else:
            advantages[0][action] = reward + self.discount_factor * (next_value) - value
            target[0][0] = reward + self.discount_factor * next_value

        self.actor.fit(state, advantages, epochs=1, verbose=0)
        self.critic.fit(state, target, epochs=1, verbose=0)

def run_A2C():
    episodes = 500
    seed = 1
    results = []
    game = 'CartPole-v0'

    env = gym.make(game)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = A2CAgent(state_size, action_size)

    for e in range(episodes):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            agent.train_model(state, action, reward, next_state, done)

            score += reward
            state = next_state

        results.append(score)

    utils.save_trained_model(game, seed, 'A2C', agent.actor)

    plt.plot(results)
    plt.show()

run_A2C()
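`train_model` above is the standard one-step advantage actor-critic update: the critic is regressed towards the TD target and the actor towards the one-step advantage. A small numeric sketch of the two quantities, with made-up values:

```python
# Illustrative numbers only (not taken from a real run).
discount_factor = 0.99
reward, value, next_value = 1.0, 5.0, 6.0

advantage = reward + discount_factor * next_value - value   # 1.94
critic_target = reward + discount_factor * next_value       # 6.94

# For a terminal transition the bootstrap term is dropped:
terminal_advantage = reward - value                          # -4.0
terminal_target = reward                                      # 1.0
```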
old_code/experiment_3/q_networks/buffers/CartPole-v0/1/DQN/memory_buffer.p
ADDED
The diff for this file is too large to render.
See raw diff
old_code/experiment_3/q_networks/ddqn.py
ADDED
@@ -0,0 +1,136 @@
import os
import sys
import gym
import random
import utils
import numpy as np

from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

from matplotlib import pyplot as plt

class DoubleDQNAgent:
    def __init__(self, state_size, action_size):
        self.render = False
        self.load_model = False
        self.state_size = state_size
        self.action_size = action_size
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.train_start = 1000
        self.memory = deque(maxlen=2000)

        self.model = self.build_model()
        self.target_model = self.build_model()

        self.update_target_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train_model(self):
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i in range(batch_size):
            update_input[i] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            update_target[i] = mini_batch[i][3]
            done.append(mini_batch[i][4])

        target = self.model.predict(update_input)
        target_next = self.model.predict(update_target)
        target_val = self.target_model.predict(update_target)

        for i in range(self.batch_size):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                a = np.argmax(target_next[i])
                target[i][action[i]] = reward[i] + self.discount_factor * (
                    target_val[i][a])

        self.model.fit(update_input, target, batch_size=self.batch_size,
                       epochs=1, verbose=0)


def run_DDQN():
    episodes = 500
    seed = 1
    results = []
    game = 'CartPole-v0'

    env = gym.make(game)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DoubleDQNAgent(state_size, action_size)

    for e in range(episodes):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            agent.append_sample(state, action, reward, next_state, done)
            agent.train_model()
            score += reward
            state = next_state

            if done:
                agent.update_target_model()

        results.append(score)

    utils.save_trained_model(game, seed, 'DDQN', agent.model)

    plt.plot(results)
    plt.show()

run_DDQN()
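The only place this file departs from the plain DQN in `dqn.py` below is the bootstrap target: the online network selects the greedy action while the target network evaluates it. A toy comparison of the two targets with made-up Q-values:

```python
import numpy as np

discount_factor, reward = 0.99, 1.0
target_next = np.array([0.8, 1.2])   # online net: selects the action
target_val = np.array([1.0, 0.9])    # target net: evaluates it

a = np.argmax(target_next)                                     # 1
ddqn_target = reward + discount_factor * target_val[a]         # 1.891
dqn_target = reward + discount_factor * np.amax(target_val)    # 1.99
```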
old_code/experiment_3/q_networks/dqn.py
ADDED
@@ -0,0 +1,130 @@
import os
import sys
import gym
import random
import utils
import numpy as np

from collections import deque

from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from matplotlib import pyplot as plt

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.train_start = 1000
        self.memory = deque(maxlen=2000)
        self.model = self.build_model()
        self.target_model = self.build_model()

        self.update_target_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train_model(self):
        if len(self.memory) < self.train_start:
            return

        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i in range(self.batch_size):
            update_input[i] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            update_target[i] = mini_batch[i][3]
            done.append(mini_batch[i][4])

        target = self.model.predict(update_input)

        target_val = self.target_model.predict(update_target)

        for i in range(self.batch_size):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.discount_factor * (
                    np.amax(target_val[i]))

        self.model.fit(update_input, target, batch_size=self.batch_size,
                       epochs=1, verbose=0)

def run_DQN():
    episodes = 500
    seed = 1
    results = []
    game = 'CartPole-v0'

    env = gym.make(game)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQNAgent(state_size, action_size)

    for e in range(episodes):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            agent.append_sample(state, action, reward, next_state, done)
            agent.train_model()

            score += reward
            state = next_state

            if done:
                agent.update_target_model()

        results.append(score)

    utils.save_trained_model(game, seed, 'DQN', agent.model)

    plt.plot(results)
    plt.show()

run_DQN()
old_code/experiment_3/q_networks/prepare_buffer.py
ADDED
@@ -0,0 +1,104 @@
import os
import sys
import gym
import random
import numpy as np
import pickle

from collections import deque

from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from matplotlib import pyplot as plt

WEIGHTS_PATH = './trained_models/CartPole-v0/1/'
BUFFER_PATH = './buffers/CartPole-v0/1/'

class Agent:
    def __init__(self, algorithm, state_size, action_size):
        self.algorithm = algorithm
        self.render = False
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)

        if self.algorithm in ['DQN', 'DDQN', 'DQV']:
            self.model = self.build_model()
            self.model.load_weights(os.path.join(WEIGHTS_PATH, self.algorithm, 'trained_model.h5'))
        else:
            self.model = self.build_actor()
            self.model.load_weights(os.path.join(WEIGHTS_PATH, self.algorithm, 'trained_model.h5'))

    def build_actor(self):
        actor = Sequential()
        actor.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        actor.add(Dense(self.action_size, activation='softmax', kernel_initializer='he_uniform'))

        return actor

    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))

        return model

    def get_action(self, state):
        if self.algorithm == 'A2C':
            policy = self.model.predict(state, batch_size=1).flatten()

            return np.random.choice(self.action_size, 1, p=policy)[0]

        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def save_buffer(self):
        if not os.path.exists(os.path.join(BUFFER_PATH, self.algorithm)):
            os.makedirs(os.path.join(BUFFER_PATH, self.algorithm))

        with open(os.path.join(BUFFER_PATH, self.algorithm, 'memory_buffer.p'), 'wb') as filehandler:
            pickle.dump(self.memory, filehandler)

def fill_buffer(algorithm):
    max_len = 10000
    results = []
    game = 'CartPole-v0'

    env = gym.make(game)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = Agent(algorithm, state_size, action_size)

    while True:
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            agent.append_sample(state, action, reward, next_state, done)

            score += reward
            state = next_state

        if len(agent.memory) > max_len:
            agent.save_buffer()
            break

fill_buffer('DQN')
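The buffer written by `save_buffer` is a pickled `deque` of `(state, action, reward, next_state, done)` tuples. A short sketch of reading it back, assuming the `DQN` buffer produced by the call above:

```python
import pickle

with open('./buffers/CartPole-v0/1/DQN/memory_buffer.p', 'rb') as f:
    memory = pickle.load(f)

print(len(memory))                        # capped by the deque maxlen
state, action, reward, next_state, done = memory[0]
print(state.shape, action, reward, done)  # state is a (1, 4) array for CartPole
```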
old_code/experiment_3/q_networks/train_offline_a2c.py
ADDED
@@ -0,0 +1,123 @@
import os
import sys
import gym
import pickle
import random
import utils

import numpy as np

from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

from matplotlib import pyplot as plt

MEMORY_PATH = './buffers/CartPole-v0/1/A2C/'

class A2CAgent:
    def __init__(self, state_size, action_size):
        self.render = False
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = 1
        self.discount_factor = 0.99
        self.actor_lr = 0.0001
        self.critic_lr = 0.005
        self.actor = self.build_actor()
        self.critic = self.build_critic()
        self.get_memory_buffer()

    def build_actor(self):
        actor = Sequential()
        actor.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        actor.add(Dense(self.action_size, activation='softmax',
                        kernel_initializer='he_uniform'))
        actor.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=self.actor_lr))

        return actor

    def build_critic(self):
        critic = Sequential()
        critic.add(Dense(24, input_dim=self.state_size, activation='relu',
                         kernel_initializer='he_uniform'))
        critic.add(Dense(self.value_size, activation='linear',
                         kernel_initializer='he_uniform'))
        critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))

        return critic

    def get_memory_buffer(self):
        memory_buffer_path = os.path.join(MEMORY_PATH, 'memory_buffer.p')
        with open(memory_buffer_path, 'rb') as f:
            self.memory = pickle.load(f)

    def get_action(self, state):
        policy = self.actor.predict(state, batch_size=1).flatten()

        return np.random.choice(self.action_size, 1, p=policy)[0]

    def train_model(self):
        mini_batch = random.sample(self.memory, 1)

        state = mini_batch[0][0]
        action = mini_batch[0][1]
        reward = mini_batch[0][2]
        next_state = mini_batch[0][3]
        done = mini_batch[0][4]

        target = np.zeros((1, self.value_size))
        advantages = np.zeros((1, self.action_size))

        value = self.critic.predict(state)[0]
        next_value = self.critic.predict(next_state)[0]

        if done:
            advantages[0][action] = reward - value
            target[0][0] = reward
        else:
            advantages[0][action] = reward + self.discount_factor * (next_value) - value
            target[0][0] = reward + self.discount_factor * next_value

        self.actor.fit(state, advantages, epochs=1, verbose=0)
        self.critic.fit(state, target, epochs=1, verbose=0)

def run_A2C():
    episodes = 500
    seed = 2
    results = []
    game = 'CartPole-v0'

    env = gym.make(game)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = A2CAgent(state_size, action_size)

    for e in range(episodes):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            agent.train_model()

            score += reward
            state = next_state

        print(score)
        results.append(score)

    utils.save_offline_results(game, 'A2C', seed, results)

    plt.plot(results)
    plt.show()

run_A2C()
old_code/experiment_3/q_networks/train_offline_ddqn.py
ADDED
@@ -0,0 +1,126 @@
import os
import sys
import gym
import pickle
import random
import utils
import numpy as np

from collections import deque

from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from matplotlib import pyplot as plt

MEMORY_PATH = './buffers/CartPole-v0/1/DDQN/'

class DDQNAgent:
    def __init__(self, state_size, action_size):
        self.render = False
        self.state_size = state_size
        self.action_size = action_size
        self.discount_factor = 0.99
        self.learning_rate = 0.00001
        self.batch_size = 256
        self.model = self.build_model()
        self.target_model = self.build_model()

        self.get_memory_buffer()
        self.update_target_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def get_memory_buffer(self):
        memory_buffer_path = os.path.join(MEMORY_PATH, 'memory_buffer.p')

        with open(memory_buffer_path, 'rb') as f:
            self.memory = pickle.load(f)

        print(len(self.memory))

    def get_action(self, state):
        q_value = self.model.predict(state)
        return np.argmax(q_value[0])

    def train_model(self):
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i in range(batch_size):
            update_input[i] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            update_target[i] = mini_batch[i][3]
            done.append(mini_batch[i][4])

        target = self.model.predict(update_input)
        target_next = self.model.predict(update_target)
        target_val = self.target_model.predict(update_target)

        for i in range(self.batch_size):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                a = np.argmax(target_next[i])
                target[i][action[i]] = reward[i] + self.discount_factor * (
                    target_val[i][a])

        self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)


def run_DDQN():
    episodes = 500
    seed = 2
    results = []
    game = 'CartPole-v0'

    env = gym.make(game)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DDQNAgent(state_size, action_size)

    for e in range(episodes):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            agent.train_model()

            score += reward
            state = next_state

        print(score)
        results.append(score)

    utils.save_offline_results(game, 'DDQN', seed, results)

    plt.plot(results)
    plt.show()

run_DDQN()
old_code/experiment_3/q_networks/train_offline_dqn.py
ADDED
@@ -0,0 +1,124 @@
import os
import sys
import gym
import pickle
import random
import utils

import numpy as np

from collections import deque

from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from matplotlib import pyplot as plt

MEMORY_PATH = './buffers/CartPole-v0/1/DQN/'

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.render = False
        self.state_size = state_size
        self.action_size = action_size
        self.discount_factor = 0.99
        self.learning_rate = 0.00001
        self.batch_size = 256
        self.model = self.build_model()
        self.target_model = self.build_model()

        self.get_memory_buffer()
        self.update_target_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def get_memory_buffer(self):
        memory_buffer_path = os.path.join(MEMORY_PATH, 'memory_buffer.p')

        with open(memory_buffer_path, 'rb') as f:
            self.memory = pickle.load(f)

        print(len(self.memory))

    def get_action(self, state):
        q_value = self.model.predict(state)
        return np.argmax(q_value[0])

    def train_model(self):
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i in range(self.batch_size):
            update_input[i] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            update_target[i] = mini_batch[i][3]
            done.append(mini_batch[i][4])

        target = self.model.predict(update_input)
        target_val = self.target_model.predict(update_target)

        for i in range(self.batch_size):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.discount_factor * (np.amax(target_val[i]))

        self.model.fit(update_input, target, batch_size=self.batch_size,
                       epochs=1, verbose=0)

def run_DQN():
    episodes = 500
    seed = 2
    results = []
    game = 'CartPole-v0'

    env = gym.make(game)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQNAgent(state_size, action_size)

    for e in range(episodes):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            agent.train_model()

            score += reward
            state = next_state

        print(score)
        results.append(score)

    utils.save_offline_results(game, 'DQN', seed, results)

    plt.plot(results)
    plt.show()

run_DQN()
old_code/experiment_3/q_networks/utils.py
ADDED
@@ -0,0 +1,29 @@
import os
import argparse
import pickle
import keras
import numpy as np

STORING_PATH = '../offline_rl_results/'
MODELS_PATH = './trained_models/'

def save_results(environment, approximator, seed, rewards):
    storing_path = os.path.join(STORING_PATH, environment, approximator, str(seed))
    if not os.path.exists(storing_path):
        os.makedirs(storing_path)

    np.save(storing_path + '/' + 'upside_down_rewards.npy', rewards)

def save_trained_model(environment, seed, algorithm, model):
    storing_path = os.path.join(MODELS_PATH, environment, str(seed), algorithm)
    if not os.path.exists(storing_path):
        os.makedirs(storing_path)

    model.save_weights(storing_path + '/' + 'trained_model.h5')

def save_offline_results(environment, algorithm, seed, returns):
    storing_path = os.path.join(STORING_PATH, algorithm, str(seed))
    if not os.path.exists(storing_path):
        os.makedirs(storing_path)

    np.save(storing_path + '/rewards.npy', returns)
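A hypothetical call, matching how the offline training scripts above use these helpers; note that `save_offline_results` builds its path from the algorithm and seed only, so the `environment` argument does not appear on disk:

```python
import numpy as np

returns = [12.0, 31.0, 200.0]                 # made-up episode returns
save_offline_results('CartPole-v0', 'DQN', 2, returns)
# -> ../offline_rl_results/DQN/2/rewards.npy
print(np.load('../offline_rl_results/DQN/2/rewards.npy'))
```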
old_code/experiment_3/upside_down/prepare_offline_buffer.py
ADDED
@@ -0,0 +1,157 @@
import os
import math
import time
import gym
import random
import utils
import keras
import numpy as np

from collections import deque
from matplotlib import pyplot as plt


class ReplayBuffer():
    """
    Thank you: https://github.com/BY571/
    """

    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = []

    def add_sample(self, states, actions, rewards):
        episode = {"states": states, "actions": actions, "rewards": rewards, "summed_rewards": sum(rewards)}
        self.buffer.append(episode)

    def sort(self):
        # sort buffer
        self.buffer = sorted(self.buffer, key=lambda i: i["summed_rewards"], reverse=True)
        # keep the max buffer size
        self.buffer = self.buffer[:self.max_size]

    def get_random_samples(self, batch_size):
        self.sort()

        idxs = np.random.randint(0, len(self.buffer), batch_size)
        batch = [self.buffer[idx] for idx in idxs]

        return batch

    def get_n_best(self, n):
        self.sort()
        return self.buffer[:n]

    def __len__(self):
        return len(self.buffer)

class UpsideDownAgent():
    def __init__(self, environment):
        self.environment = gym.make(environment)
        self.state_size = self.environment.observation_space.shape[0]
        self.action_size = self.environment.action_space.n
        self.memory = ReplayBuffer(700)
        self.last_few = 75
        self.batch_size = 32
        self.command_size = 2  # desired return + desired horizon
        self.desired_return = 1
        self.desired_horizon = 1
        self.horizon_scale = 0.02
        self.return_scale = 0.02
        self.testing_state = 0

        self.behaviour_function = utils.get_functional_behaviour_function(self.state_size, self.command_size, self.action_size, True)

        self.testing_rewards = []

    def get_action(self, observation, command):
        """
        We will sample from the action distribution modeled by the Behavior Function
        """

        action_probs = self.behaviour_function.predict([observation, command])
        action = np.random.choice(np.arange(0, self.action_size), p=action_probs[0])

        return action

    def get_greedy_action(self, observation, command):

        action_probs = self.behaviour_function.predict([observation, command])
        action = np.argmax(action_probs)

        return action

    def sample_exploratory_commands(self):
        best_episodes = self.memory.get_n_best(self.last_few)
        exploratory_desired_horizon = np.mean([len(i["states"]) for i in best_episodes])

        returns = [i["summed_rewards"] for i in best_episodes]
        exploratory_desired_returns = np.random.uniform(np.mean(returns), np.mean(returns) + np.std(returns))

        return [exploratory_desired_returns, exploratory_desired_horizon]

    def generate_offline_episodes(self, environment, e, desired_return, desired_horizon):

        env = gym.make(environment)
        tot_rewards = []
        done = False

        score = 0
        state = env.reset()

        scores = []
        states = []
        actions = []
        rewards = []

        while not done:
            state = np.reshape(state, [1, self.state_size])
            states.append(state)

            observation = state

            command = np.asarray([desired_return * self.return_scale, desired_horizon * self.horizon_scale])
            command = np.reshape(command, [1, len(command)])

            action = self.get_action(observation, command)
            actions.append(action)

            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, self.state_size])

            rewards.append(reward)
            score += reward

            state = next_state

            desired_return -= reward  # Line 8 Algorithm 2
            desired_horizon -= 1  # Line 9 Algorithm 2
            desired_horizon = np.maximum(desired_horizon, 1)

        self.memory.add_sample(states, actions, rewards)

        print('Testing score: {}'.format(score))

    def save_buffer(self, environment, seed):
        utils.save_buffer(environment, seed, self.memory.buffer)

def run_experiment():

    environment = 'CartPole-v0'
    seed = 1

    offline_episodes = 700
    returns = []

    agent = UpsideDownAgent(environment)

    for e in range(offline_episodes):
        tmp_r = []
        r = agent.generate_offline_episodes(environment, e, 200, 200)
        tmp_r.append(r)

    agent.save_buffer(environment, seed)

if __name__ == "__main__":
    run_experiment()
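Each entry of `ReplayBuffer.buffer` is a per-episode dictionary built by `add_sample`; `sort`, `get_n_best` and `get_random_samples` all key on `summed_rewards`. A sketch of the layout with toy values (CartPole states are stored as `(1, 4)` arrays):

```python
import numpy as np

episode = {
    "states": [np.zeros((1, 4)), np.zeros((1, 4))],
    "actions": [0, 1],
    "rewards": [1.0, 1.0],
    "summed_rewards": 2.0,
}
# Sorting mirrors ReplayBuffer.sort:
buffer = sorted([episode], key=lambda i: i["summed_rewards"], reverse=True)
```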
old_code/experiment_3/upside_down/train_agent.py
ADDED
@@ -0,0 +1,248 @@
import os
import math
import time
import gym
import random
import utils
import keras
import numpy as np

from collections import deque
from matplotlib import pyplot as plt


class ReplayBuffer():
    """
    Thank you: https://github.com/BY571/
    """

    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = []

    def add_sample(self, states, actions, rewards):
        episode = {"states": states, "actions": actions, "rewards": rewards, "summed_rewards": sum(rewards)}
        self.buffer.append(episode)

    def sort(self):
        # sort buffer
        self.buffer = sorted(self.buffer, key=lambda i: i["summed_rewards"], reverse=True)
        # keep the max buffer size
        self.buffer = self.buffer[:self.max_size]

    def get_random_samples(self, batch_size):
        self.sort()

        idxs = np.random.randint(0, len(self.buffer), batch_size)
        batch = [self.buffer[idx] for idx in idxs]

        return batch

    def get_n_best(self, n):
        self.sort()
        return self.buffer[:n]

    def __len__(self):
        return len(self.buffer)

class UpsideDownAgent():
    def __init__(self, environment):
        self.environment = gym.make(environment)
        self.state_size = self.environment.observation_space.shape[0]
        self.action_size = self.environment.action_space.n
        self.warm_up_episodes = 50
        self.render = False
        self.memory = ReplayBuffer(700)
        self.last_few = 75
        self.batch_size = 32
        self.command_size = 2  # desired return + desired horizon
        self.desired_return = 1
        self.desired_horizon = 1
        self.horizon_scale = 0.02
        self.return_scale = 0.02
        self.testing_state = 0

        self.behaviour_function = utils.get_functional_behaviour_function(self.state_size, self.command_size, self.action_size, False)

        self.testing_rewards = []
        self.warm_up_buffer()

    def warm_up_buffer(self):

        for i in range(self.warm_up_episodes):
            state = self.environment.reset()
            states = []
            rewards = []
            actions = []
            done = False
            desired_return = 1
            desired_horizon = 1

            while not done:

                state = np.reshape(state, [1, self.state_size])
                states.append(state)

                observation = state

                command = np.asarray([desired_return * self.return_scale, desired_horizon * self.horizon_scale])

                command = np.reshape(command, [1, len(command)])

                action = self.get_action(observation, command)
                actions.append(action)

                next_state, reward, done, info = self.environment.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])

                rewards.append(reward)

                state = next_state

                desired_return -= reward  # Line 8 Algorithm 2
                desired_horizon -= 1  # Line 9 Algorithm 2
                desired_horizon = np.maximum(desired_horizon, 1)

            self.memory.add_sample(states, actions, rewards)

    def get_action(self, observation, command):
        """
        We will sample from the action distribution modeled by the Behavior Function
        """

        action_probs = self.behaviour_function.predict([observation, command])
        action = np.random.choice(np.arange(0, self.action_size), p=action_probs[0])

        return action

    def get_greedy_action(self, observation, command):

        action_probs = self.behaviour_function.predict([observation, command])
        action = np.argmax(action_probs)

        return action

    def train_behaviour_function(self):

        random_episodes = self.memory.get_random_samples(self.batch_size)

        training_observations = np.zeros((self.batch_size, self.state_size))
        training_commands = np.zeros((self.batch_size, 2))

        y = []

        for idx, episode in enumerate(random_episodes):
            T = len(episode['states'])
            t1 = np.random.randint(0, T-1)
            t2 = np.random.randint(t1+1, T)

            state = episode['states'][t1]
            desired_return = sum(episode["rewards"][t1:t2])
            desired_horizon = t2 - t1

            target = episode['actions'][t1]

            training_observations[idx] = state[0]
            training_commands[idx] = np.asarray([desired_return*self.return_scale, desired_horizon*self.horizon_scale])
            y.append(target)

        _y = keras.utils.to_categorical(y)

        self.behaviour_function.fit([training_observations, training_commands], _y, verbose=0)

    def sample_exploratory_commands(self):
        best_episodes = self.memory.get_n_best(self.last_few)
        exploratory_desired_horizon = np.mean([len(i["states"]) for i in best_episodes])

        returns = [i["summed_rewards"] for i in best_episodes]
        exploratory_desired_returns = np.random.uniform(np.mean(returns), np.mean(returns) + np.std(returns))

        return [exploratory_desired_returns, exploratory_desired_horizon]

    def generate_episode(self, environment, e, desired_return, desired_horizon, testing):

        env = gym.make(environment)
        tot_rewards = []
        done = False

        score = 0
        state = env.reset()

        scores = []
        states = []
        actions = []
        rewards = []

        while not done:
            state = np.reshape(state, [1, self.state_size])
            states.append(state)

            observation = state

            command = np.asarray([desired_return * self.return_scale, desired_horizon * self.horizon_scale])
            command = np.reshape(command, [1, len(command)])

            if not testing:
                action = self.get_action(observation, command)
                actions.append(action)
            else:
                action = self.get_greedy_action(observation, command)

            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, self.state_size])

            rewards.append(reward)
            score += reward

            state = next_state

            desired_return -= reward  # Line 8 Algorithm 2
            desired_horizon -= 1  # Line 9 Algorithm 2
            desired_horizon = np.maximum(desired_horizon, 1)

        self.memory.add_sample(states, actions, rewards)

        self.testing_rewards.append(score)

        if testing:
            print('Querying the model ...')
            print('Testing score: {}'.format(score))

        return score

def run_experiment():

    environment = 'CartPole-v0'
    seed = 1
    episodes = 500

    returns = []

    agent = UpsideDownAgent(environment)

    for e in range(episodes):
        for i in range(100):
            agent.train_behaviour_function()

        for i in range(15):
            tmp_r = []
            exploratory_commands = agent.sample_exploratory_commands()  # Line 5 Algorithm 1
            desired_return = exploratory_commands[0]
            desired_horizon = exploratory_commands[1]
            r = agent.generate_episode(environment, e, desired_return, desired_horizon, False)
            tmp_r.append(r)

        print(np.mean(tmp_r))
        returns.append(np.mean(tmp_r))

        exploratory_commands = agent.sample_exploratory_commands()

        agent.generate_episode(environment, 1, 200, 200, True)

    utils.save_results(environment, 'upside_down_agent', seed, returns)
    utils.save_trained_model(environment, seed, agent.behaviour_function)

if __name__ == "__main__":
    run_experiment()
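The heart of `train_behaviour_function` is turning a stored episode segment into a `(state, command) -> action` training pair, with the command scaled by `return_scale` and `horizon_scale`. A toy walk-through of that computation (values invented for illustration):

```python
import numpy as np

return_scale = horizon_scale = 0.02
rewards = [1.0] * 50          # a 50-step CartPole episode
t1, t2 = 10, 40               # random segment, as sampled above

desired_return = sum(rewards[t1:t2])    # 30.0
desired_horizon = t2 - t1               # 30

command = np.asarray([desired_return * return_scale,
                      desired_horizon * horizon_scale])   # [0.6, 0.6]
# The behaviour function is then fit to predict the action taken at t1
# from (state_t1, command) with categorical cross-entropy.
```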
old_code/experiment_3/upside_down/train_offline_agent.py
ADDED
@@ -0,0 +1,196 @@
import os
import math
import time
import gym
import random
import utils
import keras
import numpy as np

from collections import deque
from matplotlib import pyplot as plt

class ReplayBuffer():
    """
    Thank you: https://github.com/BY571/
    """

    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = np.load('./buffers/CartPole-v0/1/memory_buffer.npy')

    def sort(self):
        # sort buffer
        self.buffer = sorted(self.buffer, key=lambda i: i["summed_rewards"], reverse=True)
        # keep the max buffer size
        self.buffer = self.buffer[:self.max_size]

    def get_random_samples(self, batch_size):
        self.sort()

        idxs = np.random.randint(0, len(self.buffer), batch_size)
        batch = [self.buffer[idx] for idx in idxs]

        return batch

    def get_n_best(self, n):
        self.sort()
        return self.buffer[:n]

    def __len__(self):
        return len(self.buffer)


class UpsideDownAgent():
    def __init__(self, environment):
        self.environment = gym.make(environment)
        self.state_size = self.environment.observation_space.shape[0]
        self.action_size = self.environment.action_space.n
        self.memory = ReplayBuffer(700)
        self.last_few = 75
        self.batch_size = 32
        self.command_size = 2  # desired return + desired horizon
        self.desired_return = 1
        self.desired_horizon = 1
        self.horizon_scale = 0.02
        self.return_scale = 0.02
        self.testing_state = 0

        self.behaviour_function = utils.get_functional_behaviour_function(self.state_size, self.command_size, self.action_size, False)

        self.testing_rewards = []

    def get_action(self, observation, command):
        """
        We will sample from the action distribution modeled by the Behavior Function
        """

        action_probs = self.behaviour_function.predict([observation, command])
        action = np.random.choice(np.arange(0, self.action_size), p=action_probs[0])

        return action

    def get_greedy_action(self, observation, command):

        action_probs = self.behaviour_function.predict([observation, command])
        action = np.argmax(action_probs)

        return action

    def train_behaviour_function(self):

        random_episodes = self.memory.get_random_samples(self.batch_size)

        training_observations = np.zeros((self.batch_size, self.state_size))
        training_commands = np.zeros((self.batch_size, 2))

        y = []

        for idx, episode in enumerate(random_episodes):
            T = len(episode['states'])
            t1 = np.random.randint(0, T-1)
            t2 = np.random.randint(t1+1, T)

            state = episode['states'][t1]
            desired_return = sum(episode["rewards"][t1:t2])
            desired_horizon = t2 - t1

            target = episode['actions'][t1]

            training_observations[idx] = state[0]
            training_commands[idx] = np.asarray([desired_return*self.return_scale, desired_horizon*self.horizon_scale])
            y.append(target)

        _y = keras.utils.to_categorical(y)

        self.behaviour_function.fit([training_observations, training_commands], _y, verbose=0)

    def sample_exploratory_commands(self):
        best_episodes = self.memory.get_n_best(self.last_few)
        exploratory_desired_horizon = np.mean([len(i["states"]) for i in best_episodes])

        returns = [i["summed_rewards"] for i in best_episodes]
        exploratory_desired_returns = np.random.uniform(np.mean(returns), np.mean(returns) + np.std(returns))

        return [exploratory_desired_returns, exploratory_desired_horizon]

    def generate_episode(self, environment, e, desired_return, desired_horizon, testing):

        env = gym.make(environment)
        tot_rewards = []
        done = False

        score = 0
        state = env.reset()

        scores = []
        states = []
        actions = []
        rewards = []

        while not done:
            state = np.reshape(state, [1, self.state_size])
            states.append(state)

            observation = state

            command = np.asarray([desired_return * self.return_scale, desired_horizon * self.horizon_scale])
            command = np.reshape(command, [1, len(command)])

            if not testing:
                action = self.get_action(observation, command)
                actions.append(action)
            else:
                action = self.get_greedy_action(observation, command)

            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, self.state_size])

            rewards.append(reward)
            score += reward

            state = next_state

            desired_return -= reward  # Line 8 Algorithm 2
            desired_horizon -= 1  # Line 9 Algorithm 2
            desired_horizon = np.maximum(desired_horizon, 1)

        self.testing_rewards.append(score)

        if testing:
            print('Querying the model ...')
            print('Testing score: {}'.format(score))

        return score

def run_experiment():

    environment = 'CartPole-v0'
    seed = 1

    episodes = 500
    returns = []

    agent = UpsideDownAgent(environment)

    for e in range(episodes):
        for i in range(100):
            agent.train_behaviour_function()

        for i in range(15):
            tmp_r = []
            r = agent.generate_episode(environment, e, 200, 200, False)
            tmp_r.append(r)

        print(np.mean(tmp_r))
        returns.append(np.mean(tmp_r))

        agent.generate_episode(environment, 1, 200, 200, True)

    utils.save_offline_results(environment, 'upside_down_agent', seed, returns)


if __name__ == "__main__":
    run_experiment()
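`ReplayBuffer.__init__` above reads the offline buffer with `np.load`; because the saved buffer is an object array of episode dictionaries, recent NumPy versions refuse to unpickle it unless `allow_pickle=True` is passed. A more defensive load, under that assumption and with the same hard-coded path:

```python
import numpy as np

buffer = np.load('./buffers/CartPole-v0/1/memory_buffer.npy',
                 allow_pickle=True)
print(len(buffer), buffer[0]["summed_rewards"])
```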
old_code/experiment_3/upside_down/utils.py
ADDED
@@ -0,0 +1,131 @@
import os
import argparse
import pickle
import keras
import numpy as np

from keras.layers import Dense, Multiply, Input, Conv2D, Flatten
from keras.models import Sequential, Model
from keras.optimizers import Adam, RMSprop, SGD

from skimage.transform import resize
from skimage.color import rgb2gray

STORING_PATH = './results/'
MODELS_PATH = './trained_models/'
BUFFERS_PATH = './buffers/'

def save_results(environment, approximator, seed, rewards):
    storing_path = os.path.join(STORING_PATH, environment, approximator, str(seed))
    if not os.path.exists(storing_path):
        os.makedirs(storing_path)

    np.save(storing_path + '/' + 'upside_down_rewards.npy', rewards)

def get_functional_behaviour_function(state_size, command_size, action_size, pretrained):
    observation_input = keras.Input(shape=(state_size,))
    linear_layer = Dense(64, activation='sigmoid')(observation_input)

    command_input = keras.Input(shape=(command_size,))
    sigmoidal_layer = Dense(64, activation='sigmoid')(command_input)

    multiplied_layer = Multiply()([linear_layer, sigmoidal_layer])

    layer_1 = Dense(64, activation='relu')(multiplied_layer)
    layer_2 = Dense(64, activation='relu')(layer_1)
    layer_3 = Dense(64, activation='relu')(layer_2)
    layer_4 = Dense(64, activation='relu')(layer_3)
    final_layer = Dense(action_size, activation='softmax')(layer_4)

    model = Model(inputs=[observation_input, command_input], outputs=final_layer)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001))

    if pretrained:
        model.load_weights(os.path.join(MODELS_PATH, 'CartPole-v0', '1', 'trained_model.h5'))

    return model

def get_atari_behaviour_function(action_size):

    print('Getting the model')

    input_state = Input(shape=(84,84,4))

    first_conv = Conv2D(
        32, (8, 8), strides=(4,4), activation='relu')(input_state)
    second_conv = Conv2D(
        64, (4, 4), strides=(2,2), activation='relu')(first_conv)
    third_conv = Conv2D(
        64, (3, 3), strides=(1,1), activation='relu')(second_conv)

    flattened = Flatten()(third_conv)
    dense_layer = Dense(512, activation='relu')(flattened)

    command_input = keras.Input(shape=(2,))
    sigmoidal_layer = Dense(512, activation='sigmoid')(command_input)

    multiplied_layer = Multiply()([dense_layer, sigmoidal_layer])
    final_layer = Dense(256, activation='relu')(multiplied_layer)

    action_layer = Dense(action_size, activation='softmax')(final_layer)

    model = Model(inputs=[input_state, command_input], outputs=action_layer)
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001, rho=0.95, epsilon=0.01))

    print(model.summary())

    return model

def get_catch_behaviour_function(action_size):

    print('Getting the Catch-model')

    input_state = Input(shape=(84,84,4))

    first_conv = Conv2D(
        32, (8, 8), strides=(4,4), activation='relu')(input_state)
    second_conv = Conv2D(
        64, (4, 4), strides=(2,2), activation='relu')(first_conv)
    third_conv = Conv2D(
        64, (3, 3), strides=(1,1), activation='relu')(second_conv)

    flattened = Flatten()(third_conv)
    dense_layer = Dense(512, activation='relu')(flattened)

    command_input = keras.Input(shape=(2,))
    sigmoidal_layer = Dense(512, activation='sigmoid')(command_input)

    multiplied_layer = Multiply()([dense_layer, sigmoidal_layer])
    final_layer = Dense(256, activation='relu')(multiplied_layer)

    action_layer = Dense(action_size, activation='softmax')(final_layer)

    model = Model(inputs=[input_state, command_input], outputs=action_layer)
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001, rho=0.95, epsilon=0.01))

    print(model.summary())

    return model


def pre_processing(state):
    processed_state = np.uint8(
        resize(rgb2gray(state), (84, 84), mode='constant')*255)

    return processed_state

def save_trained_model(environment, seed, model):
|
120 |
+
storing_path = os.path.join(MODELS_PATH, environment, str(seed))
|
121 |
+
if not os.path.exists(storing_path):
|
122 |
+
os.makedirs(storing_path)
|
123 |
+
|
124 |
+
model.save_weights(storing_path + '/' + 'trained_model.h5')
|
125 |
+
|
126 |
+
def save_buffer(environment, seed, memory_buffer):
|
127 |
+
storing_path = os.path.join(BUFFERS_PATH, environment, str(seed))
|
128 |
+
if not os.path.exists(storing_path):
|
129 |
+
os.makedirs(storing_path)
|
130 |
+
|
131 |
+
np.save(os.path.join(storing_path,'memory_buffer.npy'), memory_buffer)
|
old_code/train_atari_agent.py
ADDED
@@ -0,0 +1,321 @@
|
1 |
+
import os
|
2 |
+
import math
|
3 |
+
import time
|
4 |
+
import gym
|
5 |
+
import random
|
6 |
+
import utils
|
7 |
+
import keras
|
8 |
+
import numpy as np
|
9 |
+
|
10 |
+
from collections import deque
|
11 |
+
from matplotlib import pyplot as plt
|
12 |
+
from sklearn.preprocessing import OneHotEncoder
|
13 |
+
|
14 |
+
class ReplayBuffer():
|
15 |
+
"""
|
16 |
+
Thank you: https://github.com/BY571/
|
17 |
+
"""
|
18 |
+
|
19 |
+
def __init__(self, max_size):
|
20 |
+
self.max_size = max_size
|
21 |
+
self.buffer = []
|
22 |
+
|
23 |
+
def add_sample(self, states, actions, rewards):
|
24 |
+
episode = {"states": states, "actions":actions, "rewards": rewards, "summed_rewards":sum(rewards)}
|
25 |
+
self.buffer.append(episode)
|
26 |
+
|
27 |
+
def sort(self):
|
28 |
+
#sort buffer
|
29 |
+
self.buffer = sorted(self.buffer, key = lambda i: i["summed_rewards"],reverse=True)
|
30 |
+
# keep the max buffer size
|
31 |
+
self.buffer = self.buffer[:self.max_size]
|
32 |
+
|
33 |
+
def get_random_samples(self, batch_size):
|
34 |
+
self.sort()
|
35 |
+
idxs = np.random.randint(0, len(self.buffer), batch_size)
|
36 |
+
batch = [self.buffer[idx] for idx in idxs]
|
37 |
+
return batch
|
38 |
+
|
39 |
+
def get_n_best(self, n):
|
40 |
+
self.sort()
|
41 |
+
return self.buffer[:n]
|
42 |
+
|
43 |
+
def __len__(self):
|
44 |
+
return len(self.buffer)
|
45 |
+
|
46 |
+
class UpsideDownAgent():
|
47 |
+
def __init__(self, environment, approximator):
|
48 |
+
self.environment = gym.make(environment)
|
49 |
+
self.approximator = approximator
|
50 |
+
self.state_size = (84, 84, 4)
|
51 |
+
self.action_size = 3
|
52 |
+
self.warm_up_episodes = 1 #50
|
53 |
+
self.render = False
|
54 |
+
self.memory = ReplayBuffer(700)
|
55 |
+
self.last_few = 50
|
56 |
+
self.batch_size = 256
|
57 |
+
self.command_size = 2 # desired return + desired horizon
|
58 |
+
self.desired_return = 1
|
59 |
+
self.desired_horizon = 1
|
60 |
+
self.horizon_scale = 0.02
|
61 |
+
self.return_scale = 0.02
|
62 |
+
|
63 |
+
self.behaviour_function = utils.get_atari_behaviour_function(self.action_size)
|
64 |
+
|
65 |
+
self.testing_rewards = []
|
66 |
+
self.warm_up_buffer()
|
67 |
+
|
68 |
+
def warm_up_buffer(self):
|
69 |
+
print('Warming up')
|
70 |
+
|
71 |
+
for i in range(self.warm_up_episodes):
|
72 |
+
|
73 |
+
states = []
|
74 |
+
rewards = []
|
75 |
+
actions = []
|
76 |
+
|
77 |
+
dead = False
|
78 |
+
done = False
|
79 |
+
desired_return = 1
|
80 |
+
desired_horizon = 1
|
81 |
+
|
82 |
+
step, score, start_life = 0, 0, 5
|
83 |
+
observe = self.environment.reset()
|
84 |
+
|
85 |
+
for _ in range(random.randint(1, 30)):
|
86 |
+
observe, _, _, _ = self.environment.step(1)
|
87 |
+
|
88 |
+
state = utils.pre_processing(observe)
|
89 |
+
history = np.stack((state, state, state, state), axis=2)
|
90 |
+
history = np.reshape([history], (1, 84, 84, 4))
|
91 |
+
|
92 |
+
|
93 |
+
while not done:
|
94 |
+
|
95 |
+
states.append(history)
|
96 |
+
command = np.asarray([desired_return * self.return_scale, desired_horizon * self.horizon_scale])
|
97 |
+
command = np.reshape(command, [1, len(command)])
|
98 |
+
|
99 |
+
action = self.get_action(history, command)
|
100 |
+
actions.append(action)
|
101 |
+
|
102 |
+
if action == 0:
|
103 |
+
real_action = 1
|
104 |
+
elif action == 1:
|
105 |
+
real_action = 2
|
106 |
+
else:
|
107 |
+
real_action = 3
|
108 |
+
|
109 |
+
next_state, reward, done, info = self.environment.step(real_action)
|
110 |
+
next_state = utils.pre_processing(next_state)  # preprocess the frame returned by step
|
111 |
+
next_state = np.reshape([next_state], (1, 84, 84, 1))
|
112 |
+
next_history = np.append(next_state, history[:, :, :, :3], axis = 3)
|
113 |
+
|
114 |
+
rewards.append(reward)
|
115 |
+
|
116 |
+
state = next_state
|
117 |
+
|
118 |
+
if start_life > info['ale.lives']:
|
119 |
+
dead = True
|
120 |
+
start_life = info['ale.lives']
|
121 |
+
|
122 |
+
if dead:
|
123 |
+
dead = False
|
124 |
+
else:
|
125 |
+
history = next_history
|
126 |
+
|
127 |
+
desired_return -= reward # Line 8 Algorithm 2
|
128 |
+
desired_horizon -= 1 # Line 9 Algorithm 2
|
129 |
+
desired_horizon = np.maximum(desired_horizon, 1)
|
130 |
+
|
131 |
+
self.memory.add_sample(states, actions, rewards)
|
132 |
+
|
133 |
+
|
134 |
+
def get_action(self, observation, command):
|
135 |
+
"""
|
136 |
+
We will sample from the action distribution modeled by the Behavior Function
|
137 |
+
"""
|
138 |
+
|
139 |
+
observation = np.float32(observation / 255.0)
|
140 |
+
|
141 |
+
action_probs = self.behaviour_function.predict([observation, command])
|
142 |
+
action = np.random.choice(np.arange(0, self.action_size), p=action_probs[0])
|
143 |
+
|
144 |
+
return action
|
145 |
+
|
146 |
+
def get_greedy_action(self, observation, command):
|
147 |
+
|
148 |
+
action_probs = self.behaviour_function.predict([observation, command])
|
149 |
+
action = np.argmax(action_probs)
|
150 |
+
|
151 |
+
return action
|
152 |
+
|
153 |
+
def train_behaviour_function(self):
|
154 |
+
|
155 |
+
random_episodes = self.memory.get_random_samples(self.batch_size)
|
156 |
+
|
157 |
+
training_observations = np.zeros((self.batch_size, self.state_size[0], self.state_size[1], self.state_size[2]))
|
158 |
+
training_commands = np.zeros((self.batch_size, 2))
|
159 |
+
|
160 |
+
y = []
|
161 |
+
|
162 |
+
for idx, episode in enumerate(random_episodes):
|
163 |
+
T = len(episode['states'])
|
164 |
+
t1 = np.random.randint(0, T-1)
|
165 |
+
t2 = np.random.randint(t1+1, T)
|
166 |
+
|
167 |
+
state = np.float32(episode['states'][t1] / 255.)
|
168 |
+
desired_return = sum(episode["rewards"][t1:t2])
|
169 |
+
desired_horizon = t2 -t1
|
170 |
+
|
171 |
+
target = episode['actions'][t1]
|
172 |
+
|
173 |
+
training_observations[idx] = state[0]
|
174 |
+
training_commands[idx] = np.asarray([desired_return*self.return_scale, desired_horizon*self.horizon_scale])
|
175 |
+
y.append(target)
|
176 |
+
|
177 |
+
_y = keras.utils.to_categorical(y, num_classes=self.action_size)
|
178 |
+
|
179 |
+
self.behaviour_function.fit([training_observations, training_commands], _y, verbose=0)
|
180 |
+
|
181 |
+
|
182 |
+
def sample_exploratory_commands(self):
|
183 |
+
best_episodes = self.memory.get_n_best(self.last_few)
|
184 |
+
exploratory_desired_horizon = np.mean([len(i["states"]) for i in best_episodes])
|
185 |
+
|
186 |
+
returns = [i["summed_rewards"] for i in best_episodes]
|
187 |
+
exploratory_desired_returns = np.random.uniform(np.mean(returns), np.mean(returns)+np.std(returns))
|
188 |
+
|
189 |
+
return [exploratory_desired_returns, exploratory_desired_horizon]
|
190 |
+
|
191 |
+
def generate_episode(self, environment, e, desired_return, desired_horizon, testing):
|
192 |
+
|
193 |
+
env = gym.make(environment)
|
194 |
+
|
195 |
+
tot_rewards = []
|
196 |
+
|
197 |
+
done = False
|
198 |
+
dead = False
|
199 |
+
|
200 |
+
scores = []
|
201 |
+
states = []
|
202 |
+
actions = []
|
203 |
+
rewards = []
|
204 |
+
|
205 |
+
step, score, start_life = 0, 0, 5
|
206 |
+
|
207 |
+
observe = env.reset()
|
208 |
+
for _ in range(random.randint(1, 30)):
|
209 |
+
observe, _, _, _ = env.step(1)
|
210 |
+
|
211 |
+
state = utils.pre_processing(observe)
|
212 |
+
history = np.stack((state, state, state, state), axis=2)
|
213 |
+
history = np.reshape([history], (1, 84, 84, 4))
|
214 |
+
|
215 |
+
while not done:
|
216 |
+
states.append(history)
|
217 |
+
|
218 |
+
command = np.asarray([desired_return * self.return_scale, desired_horizon * self.horizon_scale])
|
219 |
+
command = np.reshape(command, [1, len(command)])
|
220 |
+
|
221 |
+
if not testing:
|
222 |
+
action = self.get_action(history, command)
|
223 |
+
actions.append(action)
|
224 |
+
else:
|
225 |
+
action = self.get_greedy_action(history, command)
|
226 |
+
|
227 |
+
if action == 0:
|
228 |
+
real_action = 1
|
229 |
+
elif action == 1:
|
230 |
+
real_action = 2
|
231 |
+
else:
|
232 |
+
real_action = 3
|
233 |
+
|
234 |
+
next_state, reward, done, info = env.step(real_action)
|
235 |
+
next_state = utils.pre_processing(next_state)  # preprocess the frame returned by step
|
236 |
+
next_state = np.reshape([next_state], (1, 84, 84, 1))
|
237 |
+
next_history = np.append(next_state, history[:, :, :, :3], axis = 3)
|
238 |
+
|
239 |
+
clipped_reward = np.clip(reward, -1, 1)
|
240 |
+
rewards.append(clipped_reward)
|
241 |
+
|
242 |
+
score += reward
|
243 |
+
|
244 |
+
if start_life > info['ale.lives']:
|
245 |
+
dead = True
|
246 |
+
start_life = info['ale.lives']
|
247 |
+
|
248 |
+
if dead:
|
249 |
+
dead = False
|
250 |
+
else:
|
251 |
+
history = next_history
|
252 |
+
|
253 |
+
desired_return -= reward # Line 8 Algorithm 2
|
254 |
+
desired_horizon -= 1 # Line 9 Algorithm 2
|
255 |
+
desired_horizon = np.maximum(desired_horizon, 1)
|
256 |
+
|
257 |
+
self.memory.add_sample(states, actions, rewards)
|
258 |
+
|
259 |
+
self.testing_rewards.append(score)
|
260 |
+
|
261 |
+
if testing:
|
262 |
+
print('Querying the model ...')
|
263 |
+
print('Testing score: {}'.format(score))
|
264 |
+
|
265 |
+
return score
|
266 |
+
|
267 |
+
def run_experiment():
|
268 |
+
|
269 |
+
import argparse
|
270 |
+
|
271 |
+
parser = argparse.ArgumentParser()
|
272 |
+
|
273 |
+
parser.add_argument('--approximator', type=str, default='neural_network')
|
274 |
+
parser.add_argument('--environment', type=str, default='PongDeterministic-v4')
|
275 |
+
parser.add_argument('--seed', type=int, default=1)
|
276 |
+
|
277 |
+
args = parser.parse_args()
|
278 |
+
|
279 |
+
approximator = args.approximator
|
280 |
+
environment = args.environment
|
281 |
+
seed = args.seed
|
282 |
+
|
283 |
+
episodes = 1500
|
284 |
+
returns = []
|
285 |
+
|
286 |
+
agent = UpsideDownAgent(environment, approximator)
|
287 |
+
|
288 |
+
for e in range(episodes):
|
289 |
+
|
290 |
+
print("Episode {}".format(e))
|
291 |
+
|
292 |
+
for i in range(100):
|
293 |
+
agent.train_behaviour_function()
|
294 |
+
|
295 |
+
print("Finished training B!")
|
296 |
+
|
297 |
+
for i in range(15):
|
298 |
+
tmp_r = []
|
299 |
+
exploratory_commands = agent.sample_exploratory_commands() # Line 5 Algorithm 1
|
300 |
+
desired_return = exploratory_commands[0]
|
301 |
+
desired_horizon = exploratory_commands[1]
|
302 |
+
r = agent.generate_episode(environment, e, desired_return, desired_horizon, False)
|
303 |
+
tmp_r.append(r)
|
304 |
+
|
305 |
+
print(np.mean(tmp_r))
|
306 |
+
returns.append(np.mean(tmp_r))
|
307 |
+
|
308 |
+
exploratory_commands = agent.sample_exploratory_commands()
|
309 |
+
|
310 |
+
#agent.generate_episode(environment, 1, 200, 200, True)
|
311 |
+
|
312 |
+
utils.save_results(environment, approximator, seed, returns)
|
313 |
+
|
314 |
+
if approximator == 'neural_network':
|
315 |
+
utils.save_trained_model(environment, seed, agent.behaviour_function)
|
316 |
+
|
317 |
+
plt.plot(returns)
|
318 |
+
plt.show()
|
319 |
+
|
320 |
+
if __name__ == "__main__":
|
321 |
+
run_experiment()
|
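The exploratory command sampling used above (the `Line 5 Algorithm 1` comment) follows the logic of `sample_exploratory_commands`; the standalone function below is a sketch of that logic only, with no dependency on the agent class.

```python
import numpy as np

# Sketch of the exploratory command sampling above: the desired horizon is the
# mean length of the best stored episodes, and the desired return is drawn
# uniformly from [mean, mean + std] of their summed rewards.
def sample_exploratory_commands(best_episodes):
    desired_horizon = np.mean([len(ep["states"]) for ep in best_episodes])
    returns = [ep["summed_rewards"] for ep in best_episodes]
    desired_return = np.random.uniform(np.mean(returns),
                                       np.mean(returns) + np.std(returns))
    return desired_return, desired_horizon
```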
old_code/utils.py
ADDED
@@ -0,0 +1,121 @@
|
1 |
+
import os
|
2 |
+
import argparse
|
3 |
+
import pickle
|
4 |
+
import keras
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
from keras.layers import Dense, Multiply, Input, Conv2D, Flatten
|
8 |
+
from keras.models import Sequential, Model
|
9 |
+
from keras.optimizers import Adam, RMSprop, SGD
|
10 |
+
|
11 |
+
from skimage.transform import resize
|
12 |
+
from skimage.color import rgb2gray
|
13 |
+
|
14 |
+
STORING_PATH = './results/'
|
15 |
+
MODELS_PATH = './trained_models/'
|
16 |
+
|
17 |
+
def save_results(environment, approximator, seed, rewards):
|
18 |
+
storing_path = os.path.join(STORING_PATH, environment, approximator, str(seed))
|
19 |
+
if not os.path.exists(storing_path):
|
20 |
+
os.makedirs(storing_path)
|
21 |
+
|
22 |
+
np.save(storing_path + '/' + 'upside_down_rewards.npy', rewards)
|
23 |
+
|
24 |
+
def get_functional_behaviour_function(state_size, command_size, action_size):
|
25 |
+
observation_input = keras.Input(shape=(state_size,))
|
26 |
+
linear_layer = Dense(64, activation='sigmoid')(observation_input)
|
27 |
+
|
28 |
+
command_input = keras.Input(shape=(command_size,))
|
29 |
+
sigmoidal_layer = Dense(64, activation='sigmoid')(command_input)
|
30 |
+
|
31 |
+
multiplied_layer = Multiply()([linear_layer, sigmoidal_layer])
|
32 |
+
|
33 |
+
layer_1 = Dense(64, activation='relu')(multiplied_layer)
|
34 |
+
layer_2 = Dense(64, activation='relu')(layer_1)
|
35 |
+
layer_3 = Dense(64, activation='relu')(layer_2)
|
36 |
+
layer_4 = Dense(64, activation='relu')(layer_3)
|
37 |
+
final_layer = Dense(action_size, activation='softmax')(layer_4)
|
38 |
+
|
39 |
+
model = Model(inputs=[observation_input, command_input], outputs=final_layer)
|
40 |
+
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001))
|
41 |
+
|
42 |
+
return model
|
43 |
+
|
44 |
+
def get_atari_behaviour_function(action_size):
|
45 |
+
|
46 |
+
print('Getting the model')
|
47 |
+
|
48 |
+
input_state = Input(shape=(84,84,4))
|
49 |
+
|
50 |
+
first_conv = Conv2D(
|
51 |
+
32, (8, 8), strides=(4,4), activation='relu')(input_state)
|
52 |
+
second_conv = Conv2D(
|
53 |
+
64, (4, 4), strides=(2,2), activation='relu')(first_conv)
|
54 |
+
third_conv = Conv2D(
|
55 |
+
64, (3, 3), strides=(1,1), activation='relu')(second_conv)
|
56 |
+
|
57 |
+
flattened = Flatten()(third_conv)
|
58 |
+
dense_layer = Dense(512, activation='relu')(flattened)
|
59 |
+
|
60 |
+
command_input = keras.Input(shape=(2,))
|
61 |
+
sigmoidal_layer = Dense(512, activation='sigmoid')(command_input)
|
62 |
+
|
63 |
+
multiplied_layer = Multiply()([dense_layer, sigmoidal_layer])
|
64 |
+
final_layer = Dense(256, activation='relu')(multiplied_layer)
|
65 |
+
|
66 |
+
action_layer = Dense(action_size, activation='softmax')(final_layer)
|
67 |
+
|
68 |
+
model = Model(inputs=[input_state, command_input], outputs=action_layer)
|
69 |
+
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001, rho=0.95, epsilon=0.01))
|
70 |
+
|
71 |
+
|
72 |
+
print(model.summary())
|
73 |
+
|
74 |
+
return model
|
75 |
+
|
76 |
+
def get_catch_behaviour_function(action_size):
|
77 |
+
|
78 |
+
print('Getting the Catch-model')
|
79 |
+
|
80 |
+
input_state = Input(shape=(84,84,4))
|
81 |
+
|
82 |
+
first_conv = Conv2D(
|
83 |
+
32, (8, 8), strides=(4,4), activation='relu')(input_state)
|
84 |
+
second_conv = Conv2D(
|
85 |
+
64, (4, 4), strides=(2,2), activation='relu')(first_conv)
|
86 |
+
third_conv = Conv2D(
|
87 |
+
64, (3, 3), strides=(1,1), activation='relu')(second_conv)
|
88 |
+
|
89 |
+
flattened = Flatten()(third_conv)
|
90 |
+
dense_layer = Dense(512, activation='relu')(flattened)
|
91 |
+
|
92 |
+
command_input = keras.Input(shape=(2,))
|
93 |
+
sigmoidal_layer = Dense(512, activation='sigmoid')(command_input)
|
94 |
+
|
95 |
+
multiplied_layer = Multiply()([dense_layer, sigmoidal_layer])
|
96 |
+
final_layer = Dense(256, activation='relu')(multiplied_layer)
|
97 |
+
|
98 |
+
action_layer = Dense(action_size, activation='softmax')(final_layer)
|
99 |
+
|
100 |
+
model = Model(inputs=[input_state, command_input], outputs=action_layer)
|
101 |
+
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001, rho=0.95, epsilon=0.01))
|
102 |
+
|
103 |
+
|
104 |
+
print(model.summary())
|
105 |
+
|
106 |
+
return model
|
107 |
+
|
108 |
+
|
109 |
+
def pre_processing(state):
|
110 |
+
processed_state = np.uint8(
|
111 |
+
resize(rgb2gray(state), (84, 84), mode='constant')*255)
|
112 |
+
|
113 |
+
return processed_state
|
114 |
+
|
115 |
+
def save_trained_model(environment, seed, model):
|
116 |
+
storing_path = os.path.join(MODELS_PATH, environment, str(seed))
|
117 |
+
if not os.path.exists(storing_path):
|
118 |
+
os.makedirs(storing_path)
|
119 |
+
|
120 |
+
model.save_weights(storing_path + '/' + 'trained_model.h5')
|
121 |
+
|
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
udrl/__main__.py
ADDED
@@ -0,0 +1,238 @@
|
1 |
+
from .agent import UpsideDownAgent, AgentHyper
|
2 |
+
from .policies import SklearnPolicy, NeuralPolicy
|
3 |
+
from .catch import CatchAdaptor
|
4 |
+
from dataclasses import dataclass, asdict
|
5 |
+
import gymnasium as gym
|
6 |
+
from tqdm import trange
|
7 |
+
import numpy as np
|
8 |
+
import warnings
|
9 |
+
import argparse
|
10 |
+
from udrl.cli import (
|
11 |
+
with_meta,
|
12 |
+
create_argparse_dict,
|
13 |
+
create_experiment_from_args,
|
14 |
+
dataclass_non_defaults_to_string,
|
15 |
+
apply,
|
16 |
+
)
|
17 |
+
from pathlib import Path
|
18 |
+
import json
|
19 |
+
import torch
|
20 |
+
import random as rnd
|
21 |
+
|
22 |
+
|
23 |
+
@dataclass
|
24 |
+
class UDRLExperiment:
|
25 |
+
"""Configuration for an Upside-Down Reinforcement Learning experiment."""
|
26 |
+
|
27 |
+
env_name: str = with_meta(
|
28 |
+
"CartPole-v0", "Name of the Gym environment to use "
|
29 |
+
)
|
30 |
+
estimator_name: str = with_meta(
|
31 |
+
"ensemble.RandomForestClassifier",
|
32 |
+
"neural for the NN or a fully qualified name of the "
|
33 |
+
"scikit-learn estimator class "
|
34 |
+
"for the policy",
|
35 |
+
)
|
36 |
+
seed: int = with_meta(42, "Random seed for reproducibility")
|
37 |
+
|
38 |
+
max_episode: int = with_meta(500, "Maximum number of training episodes ")
|
39 |
+
collect_iter: int = with_meta(
|
40 |
+
15, "Number of episodes to collect between training steps "
|
41 |
+
)
|
42 |
+
train_per_iter: int = with_meta(
|
43 |
+
100, "Number of train iteration for each collected episode "
|
44 |
+
)
|
45 |
+
batch_size: int = with_meta(
|
46 |
+
0,
|
47 |
+
"Batch size for training the policy."
|
48 |
+
"If batch_size <= 0, use the entire replay buffer",
|
49 |
+
)
|
50 |
+
|
51 |
+
warm_up: int = with_meta(
|
52 |
+
50, "Number of initial random episodes to populate the replay buffer"
|
53 |
+
)
|
54 |
+
memory_size: int = with_meta(700, "Maximum size of the replay buffer")
|
55 |
+
last_few: int = with_meta(
|
56 |
+
75,
|
57 |
+
"Number of recent episodes to consider for exploratory command sampling",
|
58 |
+
)
|
59 |
+
testing_period: int = with_meta(
|
60 |
+
10, "After how many training loop we perform the testing of the agent"
|
61 |
+
)
|
62 |
+
|
63 |
+
horizon_scale: float = with_meta(
|
64 |
+
0.02, "Scaling factor for desired horizon in commands "
|
65 |
+
)
|
66 |
+
return_scale: float = with_meta(
|
67 |
+
0.02, "Scaling factor for desired return in commands"
|
68 |
+
)
|
69 |
+
|
70 |
+
epsilon: float = with_meta(
|
71 |
+
0.2, "Exploration rate for epsilon-greedy action selection"
|
72 |
+
)
|
73 |
+
save_desired: bool = with_meta(
|
74 |
+
False, "Save desired_horizon and desired_return during training"
|
75 |
+
)
|
76 |
+
|
77 |
+
final_testing: bool = with_meta(
|
78 |
+
True, "Whether to perform final testing after training "
|
79 |
+
)
|
80 |
+
final_testing_sample: int = with_meta(
|
81 |
+
100, "Number of episodes to evaluate during final testing "
|
82 |
+
)
|
83 |
+
final_desired_return: int = with_meta(
|
84 |
+
200, "Desired return for final testing episodes"
|
85 |
+
)
|
86 |
+
final_desired_horizon: int = with_meta(
|
87 |
+
200, "Desired horizon for final testing episodes "
|
88 |
+
)
|
89 |
+
save_policy: bool = with_meta(True, "Whether to save the trained policy ")
|
90 |
+
save_learning_infos: bool = with_meta(
|
91 |
+
True, "Whether to save the learning infos"
|
92 |
+
)
|
93 |
+
|
94 |
+
|
95 |
+
def dump_dict(data, file_path):
|
96 |
+
with open(file_path, "w") as file:
|
97 |
+
json.dump(data, file, indent=4)
|
98 |
+
|
99 |
+
|
100 |
+
def run_experiment(conf: UDRLExperiment):
|
101 |
+
"""Runs an Upside-Down Reinforcement Learning experiment.
|
102 |
+
|
103 |
+
Parameters
|
104 |
+
----------
|
105 |
+
conf : UDRLExperiment
|
106 |
+
Configuration for the experiment.
|
107 |
+
|
108 |
+
Returns
|
109 |
+
-------
|
110 |
+
None
|
111 |
+
|
112 |
+
Notes
|
113 |
+
-----
|
114 |
+
* Trains an agent using the specified policy and environment.
|
115 |
+
* Collects episodes of experience and updates the policy.
|
116 |
+
* Optionally performs final testing,saves the policy and learning infos.
|
117 |
+
"""
|
118 |
+
torch.manual_seed(conf.seed)
|
119 |
+
np.random.seed(conf.seed)
|
120 |
+
rnd.seed(conf.seed)
|
121 |
+
|
122 |
+
toy_env = (
|
123 |
+
CatchAdaptor(dense=True)
|
124 |
+
if conf.env_name == "catch"
|
125 |
+
else gym.make(conf.env_name)
|
126 |
+
)
|
127 |
+
if conf.estimator_name == "neural":
|
128 |
+
policy = NeuralPolicy(
|
129 |
+
toy_env.observation_space.shape[0],
|
130 |
+
action_size=toy_env.action_space.n,
|
131 |
+
)
|
132 |
+
else:
|
133 |
+
policy = SklearnPolicy(
|
134 |
+
epsilon=conf.epsilon,
|
135 |
+
estimator_name=conf.estimator_name,
|
136 |
+
action_size=toy_env.action_space.n,
|
137 |
+
)
|
138 |
+
agent = UpsideDownAgent(
|
139 |
+
conf=apply(AgentHyper, asdict(conf)),
|
140 |
+
policy=policy,
|
141 |
+
)
|
142 |
+
epi_bar = trange(conf.max_episode)
|
143 |
+
|
144 |
+
returns = []
|
145 |
+
test_returns = []
|
146 |
+
infos = []
|
147 |
+
desired_returns = []
|
148 |
+
desired_horizons = []
|
149 |
+
test_reward_mean = 0
|
150 |
+
test_reward_std = 0
|
151 |
+
for e in epi_bar:
|
152 |
+
metric = []
|
153 |
+
for _ in range(conf.train_per_iter):
|
154 |
+
info = agent.train()
|
155 |
+
metric.append(info["metric"])
|
156 |
+
infos.append(info)
|
157 |
+
|
158 |
+
episodic_rewards = []
|
159 |
+
for _ in range(conf.collect_iter):
|
160 |
+
r, dr, dh = agent.collect_episode(
|
161 |
+
*agent.sample_exploratory_commands()
|
162 |
+
)
|
163 |
+
episodic_rewards.append(r)
|
164 |
+
desired_returns.extend(dr)
|
165 |
+
desired_horizons.extend(dh)
|
166 |
+
|
167 |
+
ep_r_mean = np.mean(episodic_rewards)
|
168 |
+
ep_r_std = np.std(episodic_rewards)
|
169 |
+
returns.append((ep_r_mean, ep_r_std))
|
170 |
+
|
171 |
+
if e % conf.testing_period == 0:
|
172 |
+
test_reward = [
|
173 |
+
agent.collect_episode(
|
174 |
+
conf.final_desired_return,
|
175 |
+
conf.final_desired_horizon,
|
176 |
+
test=True,
|
177 |
+
store_episode=False,
|
178 |
+
)[0]
|
179 |
+
for _ in range(conf.final_testing_sample)
|
180 |
+
]
|
181 |
+
test_reward_mean = np.mean(test_reward)
|
182 |
+
test_reward_std = np.std(test_reward)
|
183 |
+
test_returns.append((test_reward_mean, test_reward_std))
|
184 |
+
|
185 |
+
epi_bar.set_postfix(
|
186 |
+
{
|
187 |
+
"mean": test_reward_mean,
|
188 |
+
"std": test_reward_std,
|
189 |
+
"mean_m": np.mean(metric),
|
190 |
+
"std_m": np.std(metric),
|
191 |
+
}
|
192 |
+
)
|
193 |
+
|
194 |
+
exp_name = dataclass_non_defaults_to_string(conf)
|
195 |
+
base_path = Path("data") / conf.env_name / exp_name / str(conf.seed)
|
196 |
+
base_path.mkdir(parents=True, exist_ok=True)
|
197 |
+
final_res = {}
|
198 |
+
if conf.final_testing:
|
199 |
+
print("Start Testing...")
|
200 |
+
final_r = [
|
201 |
+
agent.collect_episode(
|
202 |
+
conf.final_desired_return,
|
203 |
+
conf.final_desired_horizon,
|
204 |
+
test=True,
|
205 |
+
store_episode=False,
|
206 |
+
)[0]
|
207 |
+
for _ in trange(conf.final_testing_sample)
|
208 |
+
]
|
209 |
+
final_res["test_mean"] = np.mean(final_r)
|
210 |
+
final_res["test_std"] = np.std(final_r)
|
211 |
+
print(f"Final result:\n{np.mean(final_r)} +- {np.std(final_r)}")
|
212 |
+
|
213 |
+
dump_dict(asdict(conf) | final_res, str(base_path / "conf.json"))
|
214 |
+
if conf.save_policy:
|
215 |
+
agent.policy.save(str(base_path / "policy"))
|
216 |
+
|
217 |
+
if conf.save_learning_infos:
|
218 |
+
np.save(str(base_path / "train_rewards.npy"), returns)
|
219 |
+
np.save(str(base_path / "test_rewards.npy"), test_returns)
|
220 |
+
np.save(str(base_path / "desired_returns.npy"), desired_returns)
|
221 |
+
np.save(str(base_path / "desired_horizons.npy"), desired_horizons)
|
222 |
+
dump_dict(infos, str(base_path / "learning_infos.json"))
|
223 |
+
|
224 |
+
|
225 |
+
warnings.simplefilter("ignore", DeprecationWarning)
|
226 |
+
warnings.simplefilter("ignore", FutureWarning)
|
227 |
+
parser = argparse.ArgumentParser(
|
228 |
+
description="Runs an Upside-Down Reinforcement Learning experiment."
|
229 |
+
"NOTE: Default values are for the CartPole env with RandomForestClassifier"
|
230 |
+
)
|
231 |
+
arguments = create_argparse_dict(UDRLExperiment)
|
232 |
+
for k, v in arguments.items():
|
233 |
+
parser.add_argument(k, **v)
|
234 |
+
args = parser.parse_args()
|
235 |
+
conf = create_experiment_from_args(args, UDRLExperiment)
|
236 |
+
print(conf)
|
237 |
+
|
238 |
+
run_experiment(conf)
|
udrl/agent.py
ADDED
@@ -0,0 +1,180 @@
|
1 |
+
from dataclasses import dataclass
|
2 |
+
import gymnasium as gym
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
from .catch import CatchAdaptor
|
6 |
+
from .policies import ABCPolicy
|
7 |
+
from .buffer import ReplayBuffer
|
8 |
+
|
9 |
+
|
10 |
+
@dataclass
|
11 |
+
class AgentHyper:
|
12 |
+
"""Hyperparameters for an agent interacting with an environment.
|
13 |
+
|
14 |
+
Parameters
|
15 |
+
----------
|
16 |
+
env_name : str
|
17 |
+
Name of the environment the agent interacts with.
|
18 |
+
warm_up : int, optional
|
19 |
+
Number of initial steps before training begins (default: 50).
|
20 |
+
memory_size : int, optional
|
21 |
+
Maximum size of the agent's experience replay memory (default: 700).
|
22 |
+
last_few : int, optional
|
23 |
+
Number of recent experiences to prioritize in training (default: 75).
|
24 |
+
batch_size : int, optional
|
25 |
+
Number of experiences sampled from memory for each training update
|
26 |
+
(default: 32).
|
27 |
+
horizon_scale : float, optional
|
28 |
+
Scaling factor for the horizon length in reinforcement learning
|
29 |
+
(default: 0.02).
|
30 |
+
return_scale : float, optional
|
31 |
+
Scaling factor for rewards or returns in reinforcement learning
|
32 |
+
(default: 0.02).
|
33 |
+
"""
|
34 |
+
|
35 |
+
env_name: str
|
36 |
+
warm_up: int = 50
|
37 |
+
memory_size: int = 700
|
38 |
+
last_few: int = 75
|
39 |
+
batch_size: int = 32
|
40 |
+
|
41 |
+
horizon_scale: float = 0.02
|
42 |
+
return_scale: float = 0.02
|
43 |
+
|
44 |
+
|
45 |
+
class UpsideDownAgent:
|
46 |
+
"""An agent that interacts with an environment using an
|
47 |
+
Upside-Down Reinforcement Learning approach.
|
48 |
+
|
49 |
+
Parameters
|
50 |
+
----------
|
51 |
+
conf : AgentHyper
|
52 |
+
Hyperparameters for the agent.
|
53 |
+
policy : ABCPolicy
|
54 |
+
A policy object used by the agent to select actions.
|
55 |
+
|
56 |
+
Attributes
|
57 |
+
----------
|
58 |
+
environment : gym.Env
|
59 |
+
The Gym environment the agent interacts with.
|
60 |
+
state_size : int
|
61 |
+
The size of the state space in the environment.
|
62 |
+
memory : ReplayBuffer
|
63 |
+
The replay buffer used to store experiences for training.
|
64 |
+
policy : ABCPolicy
|
65 |
+
The policy object used by the agent to select actions.
|
66 |
+
|
67 |
+
Methods
|
68 |
+
-------
|
69 |
+
collect_episode(desired_return=1, desired_horizon=1, random=False,
|
70 |
+
store_episode=True, test=False)
|
71 |
+
Collects an episode of experience from the environment.
|
72 |
+
sample_exploratory_commands()
|
73 |
+
Samples exploratory commands based on past experiences.
|
74 |
+
train()
|
75 |
+
Trains the agent's policy using experiences from the replay buffer.
|
76 |
+
"""
|
77 |
+
|
78 |
+
def __init__(self, conf: AgentHyper, policy: ABCPolicy):
|
79 |
+
self.conf = conf
|
80 |
+
self.environment = (
|
81 |
+
CatchAdaptor(dense=True)
|
82 |
+
if conf.env_name == "catch"
|
83 |
+
else gym.make(conf.env_name)
|
84 |
+
)
|
85 |
+
self.state_size = self.environment.observation_space.shape[0]
|
86 |
+
self.memory = ReplayBuffer(conf.memory_size)
|
87 |
+
self.policy = policy
|
88 |
+
for x in range(conf.warm_up):
|
89 |
+
self.collect_episode(random=True)
|
90 |
+
|
91 |
+
def collect_episode(
|
92 |
+
self,
|
93 |
+
desired_return: int = 1,
|
94 |
+
desired_horizon: int = 1,
|
95 |
+
random: bool = False,
|
96 |
+
store_episode: bool = True,
|
97 |
+
test: bool = False,
|
98 |
+
):
|
99 |
+
state, _ = self.environment.reset()
|
100 |
+
epochs = []
|
101 |
+
horizons = []
|
102 |
+
returns = []
|
103 |
+
cum_rew = 0
|
104 |
+
tru, ter = False, False
|
105 |
+
|
106 |
+
while not (tru or ter):
|
107 |
+
state = np.expand_dims(state, axis=0)
|
108 |
+
command = np.array(
|
109 |
+
[
|
110 |
+
desired_return * self.conf.return_scale,
|
111 |
+
desired_horizon * self.conf.horizon_scale,
|
112 |
+
]
|
113 |
+
)
|
114 |
+
command = np.expand_dims(command, axis=0)
|
115 |
+
action = (
|
116 |
+
self.environment.action_space.sample()
|
117 |
+
if random
|
118 |
+
else self.policy(state, command, test)
|
119 |
+
)
|
120 |
+
next_state, reward, tru, ter, _ = self.environment.step(action)
|
121 |
+
|
122 |
+
epochs.append([state, action, reward])
|
123 |
+
cum_rew += reward
|
124 |
+
horizons.append(desired_horizon)
|
125 |
+
returns.append(desired_return)
|
126 |
+
|
127 |
+
state = next_state
|
128 |
+
# Line 8 Algorithm 2
|
129 |
+
desired_return -= reward
|
130 |
+
# Line 9 Algorithm 2
|
131 |
+
desired_horizon = max(desired_horizon - 1, 1)
|
132 |
+
if store_episode:
|
133 |
+
self.memory.add_sample(*list(zip(*epochs)))
|
134 |
+
return cum_rew, returns, horizons
|
135 |
+
|
136 |
+
def sample_exploratory_commands(self):
|
137 |
+
best_ep = self.memory.get_n_best(self.conf.last_few)
|
138 |
+
expl_desired_horizon = np.mean([len(i["states"]) for i in best_ep])
|
139 |
+
|
140 |
+
returns = [i["summed_rewards"] for i in best_ep]
|
141 |
+
expl_desired_returns = np.random.uniform(
|
142 |
+
np.mean(returns), np.mean(returns) + np.std(returns)
|
143 |
+
)
|
144 |
+
|
145 |
+
return [expl_desired_returns, expl_desired_horizon]
|
146 |
+
|
147 |
+
def train(self):
|
148 |
+
batch_size = self.conf.batch_size
|
149 |
+
if self.conf.batch_size <= 0:
|
150 |
+
batch_size = len(self.memory.buffer)
|
151 |
+
|
152 |
+
random_episodes = self.memory.get_random_samples(batch_size)
|
153 |
+
|
154 |
+
training_states = np.zeros((batch_size, self.state_size))
|
155 |
+
training_commands = np.zeros((batch_size, 2))
|
156 |
+
|
157 |
+
actions = []
|
158 |
+
|
159 |
+
for idx, episode in enumerate(random_episodes):
|
160 |
+
T = len(episode["states"])
|
161 |
+
t1 = np.random.randint(0, T - 1)
|
162 |
+
# t2 = np.random.randint(t1 + 1, T)
|
163 |
+
t2 = T
|
164 |
+
|
165 |
+
state = episode["states"][t1]
|
166 |
+
desired_return = sum(episode["rewards"][t1:t2])
|
167 |
+
desired_horizon = t2 - t1
|
168 |
+
|
169 |
+
action = episode["actions"][t1]
|
170 |
+
|
171 |
+
training_states[idx] = state[0]
|
172 |
+
training_commands[idx] = np.array(
|
173 |
+
[
|
174 |
+
desired_return * self.conf.return_scale,
|
175 |
+
desired_horizon * self.conf.horizon_scale,
|
176 |
+
]
|
177 |
+
)
|
178 |
+
actions.append(action)
|
179 |
+
|
180 |
+
return self.policy.train(training_states, training_commands, actions)
|
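A minimal driving loop for `UpsideDownAgent`, condensed from how `udrl/__main__.py` wires things together; the concrete hyperparameter values below are illustrative assumptions, not recommendations.

```python
# Minimal usage sketch (values are illustrative; CartPole-v0 has 2 actions).
from udrl.agent import UpsideDownAgent, AgentHyper
from udrl.policies import SklearnPolicy

agent = UpsideDownAgent(
    conf=AgentHyper(env_name="CartPole-v0", warm_up=10, batch_size=32),
    policy=SklearnPolicy(
        epsilon=0.2,
        estimator_name="ensemble.RandomForestClassifier",
        action_size=2,
    ),
)
for _ in range(100):
    agent.train()  # one supervised update on (state, command) -> action
    agent.collect_episode(*agent.sample_exploratory_commands())
```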
udrl/buffer.py
ADDED
@@ -0,0 +1,70 @@
|
1 |
+
import random
|
2 |
+
|
3 |
+
|
4 |
+
class ReplayBuffer:
|
5 |
+
"""A replay buffer for storing and sampling experiences.
|
6 |
+
Thank you: https://github.com/BY571/
|
7 |
+
|
8 |
+
Parameters
|
9 |
+
----------
|
10 |
+
max_size : int
|
11 |
+
The maximum number of experiences the buffer can store.
|
12 |
+
|
13 |
+
Attributes
|
14 |
+
----------
|
15 |
+
max_size : int
|
16 |
+
The maximum number of experiences the buffer can store.
|
17 |
+
buffer : list
|
18 |
+
The list storing the experiences.
|
19 |
+
|
20 |
+
Methods
|
21 |
+
-------
|
22 |
+
add_sample(states, actions, rewards)
|
23 |
+
Adds an episode of experience to the buffer and sorts the buffer
|
24 |
+
by summed rewards in descending order.
|
25 |
+
|
26 |
+
sort()
|
27 |
+
Sorts the buffer by summed rewards in descending order and keeps only
|
28 |
+
the top `max_size` experiences.
|
29 |
+
|
30 |
+
get_random_samples(batch_size)
|
31 |
+
Returns a random sample of `batch_size` experiences from the buffer.
|
32 |
+
|
33 |
+
get_n_best(n)
|
34 |
+
Returns the `n` experiences with the highest summed rewards.
|
35 |
+
|
36 |
+
__len__()
|
37 |
+
Returns the current number of experiences in the buffer.
|
38 |
+
"""
|
39 |
+
|
40 |
+
def __init__(self, max_size):
|
41 |
+
self.max_size = max_size
|
42 |
+
self.buffer = []
|
43 |
+
|
44 |
+
def add_sample(self, states, actions, rewards):
|
45 |
+
episode = {
|
46 |
+
"states": states,
|
47 |
+
"actions": actions,
|
48 |
+
"rewards": rewards,
|
49 |
+
"summed_rewards": sum(rewards),
|
50 |
+
}
|
51 |
+
self.buffer.append(episode)
|
52 |
+
self.sort()
|
53 |
+
|
54 |
+
def sort(self):
|
55 |
+
# sort buffer
|
56 |
+
self.buffer = sorted(
|
57 |
+
self.buffer, key=lambda i: i["summed_rewards"], reverse=True
|
58 |
+
)
|
59 |
+
# keep the max buffer size
|
60 |
+
self.buffer = self.buffer[: self.max_size]
|
61 |
+
|
62 |
+
def get_random_samples(self, batch_size):
|
63 |
+
return random.sample(self.buffer, batch_size)
|
64 |
+
|
65 |
+
def get_n_best(self, n):
|
66 |
+
self.sort()
|
67 |
+
return self.buffer[:n]
|
68 |
+
|
69 |
+
def __len__(self):
|
70 |
+
return len(self.buffer)
|
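A small usage sketch for `ReplayBuffer`: each sample is a whole episode, and the buffer keeps only the `max_size` episodes with the highest summed reward. The toy observations below are placeholders.

```python
from udrl.buffer import ReplayBuffer

buf = ReplayBuffer(max_size=3)
# One sample per episode: parallel lists of states, actions and rewards.
buf.add_sample(states=[[0.0], [0.1]], actions=[0, 1], rewards=[0.0, 1.0])
buf.add_sample(states=[[0.2]], actions=[2], rewards=[5.0])

best = buf.get_n_best(1)           # episodes sorted by "summed_rewards", best first
batch = buf.get_random_samples(1)  # uniform sample (without replacement) of episodes
print(best[0]["summed_rewards"])   # 5.0
```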
udrl/catch/__init__.py
ADDED
@@ -0,0 +1,35 @@
|
1 |
+
from .adptor import CatchAdaptor
|
2 |
+
from .core import CatchEnv
|
3 |
+
|
4 |
+
env_names = [
|
5 |
+
"base",
|
6 |
+
"small_paddle",
|
7 |
+
"random_background",
|
8 |
+
"hardest",
|
9 |
+
"discrete_background",
|
10 |
+
]
|
11 |
+
|
12 |
+
env_names = ["catch_" + x for x in env_names]
|
13 |
+
|
14 |
+
|
15 |
+
def make_catch_conf(env_name: str):
|
16 |
+
base_args = {
|
17 |
+
"random_background": False,
|
18 |
+
"discrete_background": False,
|
19 |
+
"paddle_size": 5,
|
20 |
+
}
|
21 |
+
match env_name:
|
22 |
+
case "catch_small_paddle":
|
23 |
+
base_args["paddle_size"] = 2
|
24 |
+
case "catch_discrete_background":
|
25 |
+
base_args["random_background"] = True
|
26 |
+
base_args["discrete_background"] = True
|
27 |
+
case "catch_random_background":
|
28 |
+
base_args["random_background"] = True
|
29 |
+
case "catch_hardest":
|
30 |
+
base_args["random_background"] = True
|
31 |
+
base_args["paddle_size"] = 2
|
32 |
+
return base_args
|
33 |
+
|
34 |
+
|
35 |
+
__all__ = ["CatchEnv", "CatchAdaptor", "make_catch_conf", "env_names"]
|
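A short sketch of how the factory above is meant to be combined with `CatchAdaptor`; `catch_small_paddle` is one of the names in `env_names`.

```python
from udrl.catch import CatchAdaptor, make_catch_conf

conf = make_catch_conf("catch_small_paddle")  # {'random_background': False,
                                              #  'discrete_background': False,
                                              #  'paddle_size': 2}
env = CatchAdaptor(**conf)                    # 84x84 image observations
```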
udrl/catch/adptor.py
ADDED
@@ -0,0 +1,126 @@
|
1 |
+
from typing import Any, Dict, Tuple
|
2 |
+
|
3 |
+
import gymnasium as gym
|
4 |
+
import numpy as np
|
5 |
+
from gymnasium import spaces
|
6 |
+
from numpy.typing import NDArray
|
7 |
+
|
8 |
+
from .core import CatchEnv
|
9 |
+
from .renderer import Renderer
|
10 |
+
|
11 |
+
|
12 |
+
class CatchAdaptor(gym.Env):
|
13 |
+
"""Adapts the CatchEnv game to the OpenAI Gym interface.
|
14 |
+
|
15 |
+
This class provides a wrapper for the CatchEnv game,
|
16 |
+
making it compatible with the Gymnasium environment framework.
|
17 |
+
It handles action and observation space definitions, rendering,
|
18 |
+
and environment interaction.
|
19 |
+
|
20 |
+
Parameters
|
21 |
+
----------
|
22 |
+
render : bool, optional
|
23 |
+
If True or "human", renders the environment in a human-viewable window.
|
24 |
+
If "rgb_array", renders the environment to an RGB array.
|
25 |
+
Default is False.
|
26 |
+
numpy_type : str, optional
|
27 |
+
The NumPy data type for the observation array. Default is "float32".
|
28 |
+
**catch_kwargs
|
29 |
+
Additional keyword arguments to pass to the CatchEnv constructor.
|
30 |
+
"""
|
31 |
+
|
32 |
+
def __init__(
|
33 |
+
self, render: bool = False, numpy_type: str = "float32", **catch_kwargs
|
34 |
+
):
|
35 |
+
super().__init__()
|
36 |
+
self.catch = CatchEnv(**catch_kwargs)
|
37 |
+
self.np_type = numpy_type
|
38 |
+
self.action_space = spaces.Discrete(3)
|
39 |
+
self.obs_shape = (84, 84)
|
40 |
+
self.dense = catch_kwargs.get("dense", None)
|
41 |
+
if self.dense:
|
42 |
+
self.observation_space = spaces.Box(
|
43 |
+
np.array([0, 0, 0]), np.array([21, 21, 21]), dtype=np.uint8
|
44 |
+
)
|
45 |
+
else:
|
46 |
+
self.observation_space = spaces.Box(
|
47 |
+
low=0, high=255, shape=self.obs_shape, dtype=np.uint8
|
48 |
+
)
|
49 |
+
self.render_mode = render
|
50 |
+
if self.render_mode:
|
51 |
+
self.GUI = Renderer(self.obs_shape)
|
52 |
+
|
53 |
+
def step(
|
54 |
+
self, action: int
|
55 |
+
) -> Tuple[NDArray, float, bool, bool, Dict[str, Any]]:
|
56 |
+
"""Run one timestep of the environment's dynamics.
|
57 |
+
|
58 |
+
Parameters
|
59 |
+
----------
|
60 |
+
action : int
|
61 |
+
The action to take in the environment
|
62 |
+
(0: move left, 1: move right, 2: stay).
|
63 |
+
|
64 |
+
Returns
|
65 |
+
-------
|
66 |
+
observation : np.ndarray
|
67 |
+
The agent's observation of the current environment.
|
68 |
+
reward : float
|
69 |
+
The amount of reward returned after the previous action.
|
70 |
+
terminated : bool
|
71 |
+
Whether the episode has ended.
|
72 |
+
truncated : bool
|
73 |
+
Whether the episode was truncated.
|
74 |
+
info : dict
|
75 |
+
Contains auxiliary diagnostic information.
|
76 |
+
"""
|
77 |
+
state, reward, done = self.catch.step(action)
|
78 |
+
self.state = state
|
79 |
+
if self.render_mode:
|
80 |
+
self.render()
|
81 |
+
|
82 |
+
# terminated vs truncated: see gymnasium documentation
|
83 |
+
# https://gymnasium.farama.org/api/env/
|
84 |
+
# in this environment we do not have a difference between the two.
|
85 |
+
obs = state
|
86 |
+
if not self.dense:
|
87 |
+
obs = np.reshape(obs, self.obs_shape).astype(self.np_type)
|
88 |
+
|
89 |
+
return (
|
90 |
+
obs,
|
91 |
+
reward,
|
92 |
+
done, # terminated
|
93 |
+
done, # truncated
|
94 |
+
{}, # empty info
|
95 |
+
)
|
96 |
+
|
97 |
+
def reset(self, **_) -> Tuple[NDArray, Dict[str, Any]]:
|
98 |
+
"""Resets the environment to an initial state and
|
99 |
+
returns the initial observation.
|
100 |
+
|
101 |
+
Returns
|
102 |
+
-------
|
103 |
+
observation : np.ndarray
|
104 |
+
The initial observation.
|
105 |
+
info : dict
|
106 |
+
Contains auxiliary diagnostic information.
|
107 |
+
"""
|
108 |
+
obs = self.catch.reset()
|
109 |
+
if not self.dense:
|
110 |
+
obs = np.reshape(obs, self.obs_shape)
|
111 |
+
return obs, {}
|
112 |
+
|
113 |
+
def render(self):
|
114 |
+
"""Renders the environment.
|
115 |
+
|
116 |
+
If the 'render' parameter is set,
|
117 |
+
this method will display the environment
|
118 |
+
either in a human-viewable window or as an RGB array.
|
119 |
+
"""
|
120 |
+
if self.render_mode:
|
121 |
+
self.GUI(self.state)
|
122 |
+
|
123 |
+
def close(self):
|
124 |
+
"""Closes the renderer if it is active."""
|
125 |
+
if self.render_mode:
|
126 |
+
self.GUI.quit()
|
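A minimal interaction loop against the Gymnasium interface implemented above; with `dense=True` the observation is the `[ball_x, ball_y, paddle_x]` triple rather than an image.

```python
from udrl.catch import CatchAdaptor

env = CatchAdaptor(dense=True)          # dense observations: [ball_x, ball_y, paddle_x]
obs, info = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # 0: move left, 1: move right, 2: stay
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
env.close()
```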
udrl/catch/core.py
ADDED
@@ -0,0 +1,190 @@
|
1 |
+
from dataclasses import dataclass, field
|
2 |
+
from typing import Tuple
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
from numpy.typing import NDArray
|
6 |
+
from skimage.transform import resize
|
7 |
+
|
8 |
+
|
9 |
+
@dataclass
|
10 |
+
class CatchEnv:
|
11 |
+
"""A simple 2D Catch environment for reinforcement learning.
|
12 |
+
|
13 |
+
This environment simulates a game where the agent controls a paddle
|
14 |
+
at the bottom of the screen and tries to catch a falling ball.
|
15 |
+
The state is represented as an image, and the actions are discrete
|
16 |
+
movements of the paddle.
|
17 |
+
|
18 |
+
Attributes
|
19 |
+
----------
|
20 |
+
paddle_size: int, default=5
|
21 |
+
The size of the paddle in pixels.
|
22 |
+
random_background: bool, default=False
|
23 |
+
Whether to use a random background image.
|
24 |
+
discrete_background: bool, default=False
|
25 |
+
If True and random_background is True,
|
26 |
+
the background will be chosen from a discrete set of values.
|
27 |
+
scale_value: int, default=255
|
28 |
+
The scaling factor for the image values.
|
29 |
+
"""
|
30 |
+
|
31 |
+
paddle_size: int = 5
|
32 |
+
random_background: bool = False
|
33 |
+
discrete_background: bool = False
|
34 |
+
scale_value: int = 255
|
35 |
+
dense: bool = False
|
36 |
+
|
37 |
+
size: int = field(init=False, default_factory=lambda: 21)
|
38 |
+
scale_factor: int = field(init=False, default_factory=lambda: 4)
|
39 |
+
image: np.ndarray = field(init=False)
|
40 |
+
background: np.ndarray = field(init=False)
|
41 |
+
left_paddle_offset: int = field(init=False)
|
42 |
+
right_paddle_offset: int = field(init=False)
|
43 |
+
|
44 |
+
def __post_init__(self):
|
45 |
+
"""Initializes internal environment variables after object creation."""
|
46 |
+
self.final_size = (
|
47 |
+
self.size * self.scale_factor,
|
48 |
+
self.size * self.scale_factor,
|
49 |
+
)
|
50 |
+
self.default_size = (self.size, self.size)
|
51 |
+
if self.random_background:
|
52 |
+
if self.discrete_background:
|
53 |
+
self.background = np.random.choice(
|
54 |
+
np.linspace(0, 0.5, 10),
|
55 |
+
size=self.final_size,
|
56 |
+
)
|
57 |
+
else:
|
58 |
+
self.background = resize(
|
59 |
+
np.random.choice(
|
60 |
+
np.linspace(0, 0.999, 10),
|
61 |
+
size=self.default_size,
|
62 |
+
),
|
63 |
+
self.final_size,
|
64 |
+
)
|
65 |
+
|
66 |
+
self.image = np.zeros(self.default_size)
|
67 |
+
self.left_paddle_offset = self.paddle_size // 2
|
68 |
+
self.right_paddle_offset = self.left_paddle_offset + (
|
69 |
+
self.paddle_size % 2
|
70 |
+
)
|
71 |
+
|
72 |
+
self.actions = {
|
73 |
+
0: lambda self=self: max(self.pos - 2, self.left_paddle_offset),
|
74 |
+
1: lambda self=self: min(
|
75 |
+
self.pos + 2, self.size - self.right_paddle_offset - 1
|
76 |
+
),
|
77 |
+
2: lambda self=self: self.pos,
|
78 |
+
}
|
79 |
+
|
80 |
+
def _update_ball(self):
|
81 |
+
"""Updates the position of the ball in the environment.
|
82 |
+
|
83 |
+
This method updates the ball's position based on its current velocity
|
84 |
+
and checks for collisions with the walls.
|
85 |
+
If a collision occurs, the ball's velocity is reversed appropriately.
|
86 |
+
"""
|
87 |
+
self.image[self.bally, self.ballx] = 0
|
88 |
+
self.ballx += self.vx
|
89 |
+
self.bally += self.vy
|
90 |
+
if self.ballx > self.size - 1:
|
91 |
+
self.ballx -= 2 * (self.ballx - (self.size - 1))
|
92 |
+
self.vx *= -1
|
93 |
+
elif self.ballx < 0:
|
94 |
+
self.ballx -= 2 * self.ballx
|
95 |
+
self.vx *= -1
|
96 |
+
self.image[self.bally, self.ballx] = 1
|
97 |
+
|
98 |
+
def _update_paddle(self):
|
99 |
+
"""Updates the position of the paddle in the environment.
|
100 |
+
|
101 |
+
This method clears the previous position of the paddle and
|
102 |
+
redraws it at its new position based on the current `self.pos` value.
|
103 |
+
"""
|
104 |
+
self.image[-5].fill(0)
|
105 |
+
left_pos = self.pos - self.left_paddle_offset
|
106 |
+
right_pos = self.pos + self.right_paddle_offset
|
107 |
+
|
108 |
+
self.image[
|
109 |
+
-5,
|
110 |
+
left_pos:right_pos,
|
111 |
+
] = np.ones(self.paddle_size)
|
112 |
+
|
113 |
+
def _compute_terminal(self):
|
114 |
+
"""Determines if the episode is terminal and calculates the reward.
|
115 |
+
|
116 |
+
This method checks if the ball has reached the bottom of the screen,
|
117 |
+
indicating the end of an episode. If so, it calculates a reward based
|
118 |
+
on whether the ball was caught by the paddle.
|
119 |
+
|
120 |
+
Returns
|
121 |
+
-------
|
122 |
+
reward : int
|
123 |
+
The reward for the current timestep
|
124 |
+
(1 if the ball is caught, 0 otherwise).
|
125 |
+
terminal : bool
|
126 |
+
Whether the episode has ended.
|
127 |
+
"""
|
128 |
+
terminal = self.bally == self.size - 5
|
129 |
+
reward = terminal and (
|
130 |
+
-self.left_paddle_offset
|
131 |
+
<= self.ballx - self.pos
|
132 |
+
<= self.right_paddle_offset
|
133 |
+
)
|
134 |
+
return int(reward), terminal
|
135 |
+
|
136 |
+
def step(self, action: int) -> Tuple[NDArray, int, bool]:
|
137 |
+
"""Takes a step in the environment.
|
138 |
+
|
139 |
+
Parameters
|
140 |
+
----------
|
141 |
+
action: int
|
142 |
+
The action to take: 0 (move left), 1 (move right), or 2 (stay).
|
143 |
+
|
144 |
+
Returns
|
145 |
+
-------
|
146 |
+
image: np.ndarray
|
147 |
+
The rendered image of the environment.
|
148 |
+
reward: int
|
149 |
+
The reward obtained after taking the action.
|
150 |
+
terminal: bool
|
151 |
+
Whether the episode has ended.
|
152 |
+
"""
|
153 |
+
self.pos = self.actions[action]()
|
154 |
+
self._update_ball()
|
155 |
+
self._update_paddle()
|
156 |
+
|
157 |
+
image = resize(
|
158 |
+
self.image,
|
159 |
+
(self.size * self.scale_factor, self.size * self.scale_factor),
|
160 |
+
)
|
161 |
+
image[image != 0] = 1
|
162 |
+
if self.random_background:
|
163 |
+
mask = image == 0
|
164 |
+
image[mask] = self.background[mask]
|
165 |
+
if self.dense:
|
166 |
+
return (
|
167 |
+
[self.ballx, self.bally, self.pos],
|
168 |
+
*self._compute_terminal(),
|
169 |
+
)
|
170 |
+
return (image * self.scale_value, *self._compute_terminal())
|
171 |
+
|
172 |
+
def reset(self) -> NDArray:
|
173 |
+
"""Resets the environment to its initial state.
|
174 |
+
|
175 |
+
Returns
|
176 |
+
-------
|
177 |
+
image: np.ndarray
|
178 |
+
The initial rendered image of the environment.
|
179 |
+
"""
|
180 |
+
self.image = np.zeros((self.size, self.size))
|
181 |
+
self.pos = np.random.randint(
|
182 |
+
self.left_paddle_offset, self.size - self.right_paddle_offset
|
183 |
+
)
|
184 |
+
self.vx = np.random.randint(5) - 2
|
185 |
+
self.vy = 1
|
186 |
+
self.ballx, self.bally = np.random.randint(self.size), 4
|
187 |
+
self.image[self.bally, self.ballx] = 1
|
188 |
+
self._update_paddle()
|
189 |
+
|
190 |
+
return self.step(2)[0]
|
udrl/catch/renderer.py
ADDED
@@ -0,0 +1,65 @@
|
1 |
+
from dataclasses import dataclass, field
|
2 |
+
from os import environ
|
3 |
+
from typing import Tuple
|
4 |
+
|
5 |
+
import pygame
|
6 |
+
from numpy.typing import NDArray
|
7 |
+
|
8 |
+
environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1"
|
9 |
+
|
10 |
+
|
11 |
+
@dataclass
|
12 |
+
class Renderer:
|
13 |
+
"""A renderer for visualizing the CatchEnv game using Pygame.
|
14 |
+
|
15 |
+
This class initializes a Pygame screen and provides a method to render the
|
16 |
+
CatchEnv game state as an image onto the screen.
|
17 |
+
|
18 |
+
Attributes
|
19 |
+
----------
|
20 |
+
size : Tuple[int, int]
|
21 |
+
The size of the environment to render (height, width).
|
22 |
+
scale_factor : int, default=5
|
23 |
+
The scaling factor for the rendered image.
|
24 |
+
"""
|
25 |
+
|
26 |
+
size: Tuple[int, int]
|
27 |
+
scale_factor: int = 5
|
28 |
+
|
29 |
+
screen: pygame.surface.Surface = field(init=False)
|
30 |
+
|
31 |
+
def __post_init__(self):
|
32 |
+
"""Initializes the Pygame display after object creation."""
|
33 |
+
pygame.init()
|
34 |
+
self.screen = pygame.display.set_mode(
|
35 |
+
(
|
36 |
+
self.size[1] * self.scale_factor,
|
37 |
+
self.size[0] * self.scale_factor,
|
38 |
+
)
|
39 |
+
)
|
40 |
+
|
41 |
+
def quit(self):
|
42 |
+
"""Quits the Pygame display."""
|
43 |
+
pygame.quit()
|
44 |
+
|
45 |
+
def __call__(self, image: NDArray):
|
46 |
+
"""Renders the CatchEnv game state onto the Pygame screen.
|
47 |
+
|
48 |
+
Parameters
|
49 |
+
----------
|
50 |
+
image : np.ndarray
|
51 |
+
A 2D NumPy array representing the game state. The array should have
|
52 |
+
values that correspond to pixel intensities or colors.
|
53 |
+
"""
|
54 |
+
|
55 |
+
scaled_size = (
|
56 |
+
image.shape[0] * self.scale_factor,
|
57 |
+
image.shape[1] * self.scale_factor,
|
58 |
+
)
|
59 |
+
scaled_image = pygame.transform.scale(
|
60 |
+
pygame.surfarray.make_surface(image.T), scaled_size
|
61 |
+
)
|
62 |
+
|
63 |
+
# Blit (copy) the scaled image onto the screen
|
64 |
+
self.screen.blit(scaled_image, (0, 0))
|
65 |
+
pygame.display.flip()
|
udrl/cli.py
ADDED
@@ -0,0 +1,192 @@
|
1 |
+
import argparse
|
2 |
+
import dataclasses
|
3 |
+
import inspect as i
|
4 |
+
from typing import Callable, Dict, Any
|
5 |
+
from dataclasses import fields, is_dataclass
|
6 |
+
|
7 |
+
|
8 |
+
def sel_args(kw: Dict[str, Any], fun: Callable) -> Dict[str, Any]:
|
9 |
+
"""
|
10 |
+
Selects keyword arguments relevant to a function.
|
11 |
+
|
12 |
+
Parameters
|
13 |
+
----------
|
14 |
+
kw : Dict[str, Any]
|
15 |
+
A dictionary of keyword arguments.
|
16 |
+
fun : Callable
|
17 |
+
The function for which arguments are to be selected.
|
18 |
+
|
19 |
+
Returns
|
20 |
+
-------
|
21 |
+
Dict[str, Any]
|
22 |
+
A new dictionary containing only the keyword arguments
|
23 |
+
that are valid parameters for the given function.
|
24 |
+
"""
|
25 |
+
return {
|
26 |
+
k: v for k, v in kw.items() if k in list(i.signature(fun).parameters)
|
27 |
+
}
|
28 |
+
|
29 |
+
|
30 |
+
def apply(fun: Callable, kw: Dict[str, Any]) -> Any:
|
31 |
+
"""
|
32 |
+
Applies a function with selected keyword arguments.
|
33 |
+
|
34 |
+
Parameters
|
35 |
+
----------
|
36 |
+
fun : Callable
|
37 |
+
The function to apply.
|
38 |
+
kw : Dict[str, Any]
|
39 |
+
A dictionary of keyword arguments.
|
40 |
+
|
41 |
+
Returns
|
42 |
+
-------
|
43 |
+
Any
|
44 |
+
The result of calling the function with the selected keyword arguments.
|
45 |
+
"""
|
46 |
+
return fun(**sel_args(kw, fun))
|
47 |
+
|
48 |
+
|
49 |
+
def create_argparse_dict(dataclass_cls):
|
50 |
+
"""
|
51 |
+
Creates an argument parser dictionary configuration from a dataclass.
|
52 |
+
|
53 |
+
This function examines the fields of a dataclass and generates a dictionary
|
54 |
+
that can be used to configure an argparse.ArgumentParser.
|
55 |
+
It handles boolean fields with special actions, sets default values,
|
56 |
+
includes help messages with defaults, and supports optional choices
|
57 |
+
and required arguments based on metadata.
|
58 |
+
|
59 |
+
Parameters
|
60 |
+
----------
|
61 |
+
dataclass_cls : type
|
62 |
+
The dataclass type to create the argument parser dictionary from.
|
63 |
+
|
64 |
+
Returns
|
65 |
+
-------
|
66 |
+
Dict[str, Dict[str, Any]]
|
67 |
+
A dictionary mapping argument names to dictionaries containing
|
68 |
+
argparse configuration options.
|
69 |
+
"""
|
70 |
+
result = {}
|
71 |
+
for field in dataclasses.fields(dataclass_cls):
|
72 |
+
if not field.init:
|
73 |
+
continue
|
74 |
+
arg_name = f"--{field.name.replace('_', '-')}"
|
75 |
+
if field.type == bool:
|
76 |
+
result[arg_name] = dict(
|
77 |
+
action=argparse.BooleanOptionalAction,
|
78 |
+
default=field.default,
|
79 |
+
)
|
80 |
+
continue
|
81 |
+
result[arg_name] = {
|
82 |
+
"type": field.type,
|
83 |
+
"default": (
|
84 |
+
field.default
|
85 |
+
if not dataclasses.is_dataclass(field.type)
|
86 |
+
else None
|
87 |
+
),
|
88 |
+
"help": f"{field.metadata.get('help', '')}"
|
89 |
+
f"(default: {field.default})",
|
90 |
+
}
|
91 |
+
if choices := field.metadata.get("choices", None):
|
92 |
+
result[arg_name]["choices"] = choices
|
93 |
+
if required := field.metadata.get("required", None):
|
94 |
+
result[arg_name]["required"] = required
|
95 |
+
return result
|
96 |
+
|
97 |
+
|
98 |
+
def create_experiment_from_args(
|
99 |
+
args: argparse.Namespace, dataclass: Callable[..., Any]
|
100 |
+
) -> Any:
|
101 |
+
"""
|
102 |
+
Creates an experiment instance from parsed command-line arguments.
|
103 |
+
|
104 |
+
Parameters
|
105 |
+
----------
|
106 |
+
args : argparse.Namespace
|
107 |
+
An argparse Namespace object containing parsed command-line arguments.
|
108 |
+
dataclass : Callable[..., Any]
|
109 |
+
A dataclass constructor that takes keyword arguments corresponding
|
110 |
+
to experiment parameters.
|
111 |
+
|
112 |
+
Returns
|
113 |
+
-------
|
114 |
+
Any
|
115 |
+
An instance of the dataclass initialized with the parsed arguments.
|
116 |
+
"""
|
117 |
+
return apply(
|
118 |
+
dataclass,
|
119 |
+
{
|
120 |
+
k.replace("--", "").replace("-", "_"): v
|
121 |
+
for k, v in vars(args).items()
|
122 |
+
},
|
123 |
+
)
|
124 |
+
|
125 |
+
|
126 |
+
def with_meta(default: Any, help: str, **kwargs):
|
127 |
+
"""
|
128 |
+
Creates a dataclass field with default value, help string,
|
129 |
+
and additional metadata.
|
130 |
+
|
131 |
+
This function simplifies the creation of dataclass fields by
|
132 |
+
providing a convenient way to set a default value,
|
133 |
+
a help string, and other metadata attributes for a field.
|
134 |
+
|
135 |
+
Parameters
|
136 |
+
----------
|
137 |
+
default : Any
|
138 |
+
The default value for the field. If callable,
|
139 |
+
it's treated as a default factory.
|
140 |
+
help : str
|
141 |
+
The help string describing the field's purpose.
|
142 |
+
**kwargs
|
143 |
+
Additional keyword arguments to be included in the field's metadata.
|
144 |
+
|
145 |
+
Returns
|
146 |
+
-------
|
147 |
+
dataclasses.Field
|
148 |
+
A dataclass Field object with the specified default,
|
149 |
+
help, and metadata.
|
150 |
+
"""
|
151 |
+
args: Dict[str, Any] = {"metadata": {"help": help, **kwargs}}
|
152 |
+
if callable(default):
|
153 |
+
args["default_factory"] = default
|
154 |
+
else:
|
155 |
+
args["default"] = default
|
156 |
+
return dataclasses.field(**args)
|
157 |
+
|
158 |
+
|
159 |
+
def dataclass_non_defaults_to_string(data_obj):
|
160 |
+
"""Converts non-default values of a dataclass object's fields to a string,
|
161 |
+
excluding 'seed' and 'env_name'.
|
162 |
+
|
163 |
+
Parameters
|
164 |
+
----------
|
165 |
+
data_obj : dataclass object
|
166 |
+
The dataclass object to process.
|
167 |
+
|
168 |
+
Returns
|
169 |
+
-------
|
170 |
+
str
|
171 |
+
A string representation of non-default field values, or "base" if all
|
172 |
+
fields have default values
|
173 |
+
(excluding the 'seed' and 'env_name' attributes).
|
174 |
+
|
175 |
+
Raises
|
176 |
+
------
|
177 |
+
TypeError
|
178 |
+
If the input is not a dataclass object.
|
179 |
+
"""
|
180 |
+
if not is_dataclass(data_obj):
|
181 |
+
raise TypeError("Input must be a dataclass object.")
|
182 |
+
|
183 |
+
non_defaults = []
|
184 |
+
for field in fields(data_obj):
|
185 |
+
if field.name == "seed" or field.name == "env_name":
|
186 |
+
continue
|
187 |
+
if getattr(data_obj, field.name) != field.default:
|
188 |
+
non_defaults.append(
|
189 |
+
field.name + str(getattr(data_obj, field.name))
|
190 |
+
)
|
191 |
+
|
192 |
+
return "_".join(non_defaults) or "base"
|
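Together, `with_meta`, `create_argparse_dict`, and `create_experiment_from_args` let a dataclass double as the CLI definition. A sketch of the intended wiring, using a made-up `ToyConf` (the real experiment dataclass lives elsewhere in the package and is not shown in this diff):

```python
import argparse
from dataclasses import dataclass

from udrl.cli import (
    with_meta,
    create_argparse_dict,
    create_experiment_from_args,
)


@dataclass
class ToyConf:
    # Hypothetical config, used only for illustration.
    env_name: str = with_meta("CartPole-v0", "Name of the Gym environment")
    max_episode: int = with_meta(500, "Maximum training episodes")
    render: bool = with_meta(False, "Render the environment")


# Build an ArgumentParser from the dataclass fields and their metadata.
parser = argparse.ArgumentParser()
for name, kwargs in create_argparse_dict(ToyConf).items():
    parser.add_argument(name, **kwargs)

args = parser.parse_args(["--max-episode", "100", "--render"])
conf = create_experiment_from_args(args, ToyConf)
print(conf)  # ToyConf(env_name='CartPole-v0', max_episode=100, render=True)
```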
udrl/data_proc.py
ADDED
@@ -0,0 +1,51 @@
from pathlib import Path
import numpy as np
import json
import csv

naming = {
    "neural": "NN",
    "ensemble.ExtraTreesClassifier": "ET",
    "ensemble.RandomForestClassifier": "RF",
}

if __name__ == "__main__":
    path = Path("data")
    csvs_path = path / "csvs"
    csvs_path.mkdir(parents=True, exist_ok=True)
    for env in path.iterdir():
        all_paths = list(set([p.parent for p in env.rglob("*.npy")]))
        if not all_paths:
            continue
        toy_rewards = np.load(all_paths[0] / "train_rewards.npy")
        data = {"episode": list(range(len(toy_rewards)))}
        estimators = {
            "neural": ([], [], [], []),
            "ensemble.ExtraTreesClassifier": ([], [], [], []),
            "ensemble.RandomForestClassifier": ([], [], [], []),
        }
        for exp in all_paths:
            print(exp)
            rewards = np.load(exp / "train_rewards.npy")

            with open((exp / "conf.json"), "r") as f:
                conf = json.load(f)

            estimators[conf["estimator_name"]][0].append(list(rewards[:, 0]))
            estimators[conf["estimator_name"]][1].append(list(rewards[:, 1]))
            estimators[conf["estimator_name"]][2].append(conf["test_mean"])
            estimators[conf["estimator_name"]][3].append(conf["test_std"])

        for k, v in estimators.items():
            data[naming[k] + "_mean"] = [
                "{:.2f}".format(np.mean(x)) for x in zip(*v[0])
            ]
            data[naming[k] + "_std"] = [
                "{:.2f}".format(np.std(x)) for x in zip(*v[0])
            ]
            print(f"{k}:{env.name}-> {np.median(v[2])} +- {np.median(v[3])}")

        with open(csvs_path / f"{env.name}.csv", "w") as f:
            w = csv.writer(f)
            w.writerow(data.keys())
            w.writerows(zip(*data.values()))
udrl/inference.py
ADDED
@@ -0,0 +1,122 @@
import matplotlib.pyplot as plt
import numpy as np
from .policies import SklearnPolicy, NeuralPolicy
from .agent import UpsideDownAgent, AgentHyper
from pathlib import Path
from collections import Counter
from pprint import pprint  # needed for the pprint(res) call below
from tqdm import trange


def get_common(base, env, conf, seed):

    path = base / env / conf / seed

    if not path.exists():
        print("Cannot find path")
        return None, None
    algo_name = (
        "NN" if "neural" in conf else ("ET" if "Extra" in conf else "RT")
    )

    des_ret = np.load(str(path / "desired_returns.npy")).astype(int)
    des_hor = np.load(str(path / "desired_horizons.npy")).astype(int)
    # rew = np.load(str(path / "train_rewards.npy")).astype(int)[:, 0]

    te = []
    prev = -np.inf
    for i, x in enumerate(des_hor):
        if prev < x:
            te.append(i)
        prev = x

    init_des_ret = des_ret[te]
    init_des_hor = des_hor[te]

    mean_des_ret = []
    mean_des_hor = []
    tmp_r = []

    tmp_h = []
    for i, (ret, hor) in enumerate(zip(init_des_ret, init_des_hor)):
        tmp_r.append(ret)
        tmp_h.append(hor)
        if i % 15 == 0:
            mean_des_hor.append(np.mean(tmp_h))
            mean_des_ret.append(np.mean(tmp_r))
            tmp_r = []
            tmp_h = []

    common_hor = Counter(init_des_hor[-1500:]).most_common()[0][0]
    common_ret = Counter(init_des_ret[-1500:]).most_common()[0][0]
    print(f"{env}:{algo_name}.horizon-> {common_hor}")
    print(f"{env}:{algo_name}.return-> {common_ret}")
    return common_ret, common_hor


def test_desired(base, env, conf, des_ret, des_hor):

    algo_name = (
        "NN" if "neural" in conf else ("ET" if "Extra" in conf else "RT")
    )
    if des_hor is None or des_ret is None:
        print(f"Invalid desired for {env}:{algo_name}")
        return
    for path in (base / env / conf).iterdir():
        if "neural" in conf:
            policy = NeuralPolicy.load(str(path / "policy"))
        else:
            policy = SklearnPolicy.load(str(path / "policy"))

        hyper = AgentHyper(env, warm_up=0)

        agent = UpsideDownAgent(hyper, policy)

        final_r = [
            agent.collect_episode(
                des_ret,
                des_hor,
                test=True,
                store_episode=False,
            )[0]
            for _ in range(100)
        ]
        print(
            f"{env}:{algo_name}:{path.name}:r.{des_ret}:h.{des_hor}"
            f" -> {np.median(final_r):.2f} +- {np.std(final_r):.2f}"
            f",max {np.max(final_r):.2f},min {np.min(final_r):.2f}"
        )


base = Path("/home/vimmoos/upside_down_rl/data")
confs = {
    "NN": "estimator_nameneural_batch_size256_warm_up260",
    "ET": "estimator_nameensemble.ExtraTreesClassifier_train_per_iter1",
    "RT": "train_per_iter1",
}
envs = ["LunarLander-v2", "Acrobot-v1"]
seeds = [str(45), str(46)]

res = {}


for env in envs:
    res[env] = {}
    for algo_name, conf in confs.items():
        res[env][algo_name] = {}
        for seed in seeds:
            ret, hor = get_common(base, env, conf + "_save_desiredTrue", seed)
            res[env][algo_name][seed] = (ret, hor)


pprint(res)

for env, algos in res.items():
    for algo, seeds in algos.items():
        for _, vals in seeds.items():
            test_desired(base, env, confs[algo], *vals)


# plt.plot(mean_des_ret)
# plt.plot(mean_des_hor)
# plt.plot(rew)
# plt.show()
udrl/plot.py
ADDED
@@ -0,0 +1,189 @@
from .policies import SklearnPolicy
from .agent import UpsideDownAgent, AgentHyper
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from itertools import zip_longest, tee
from tqdm import tqdm, trange
import imageio


def calculate_ep_feat_importance(
    episode, agent, desired_return, desired_horizon
):
    ep_features = []

    for state, _, reward in zip(*episode.values()):
        command = np.array(
            [
                desired_return * agent.conf.return_scale,
                desired_horizon * agent.conf.horizon_scale,
            ]
        )
        command = np.expand_dims(command, axis=0)
        ext_state = np.concatenate((state, command), axis=1)

        feature_importances = {}

        for t in agent.policy.estimator.estimators_:
            branch = np.array(t.decision_path(ext_state).todense(), dtype=bool)
            imp = t.tree_.impurity[branch[0]]
            for f, i in zip(
                t.tree_.feature[branch[0]][:-1], imp[:-1] - imp[1:]
            ):
                feature_importances.setdefault(f, []).append(i)

        # Line 8 Algorithm 2
        desired_return -= reward
        # Line 9 Algorithm 2
        desired_horizon = max(desired_horizon - 1, 1)

        summed_importances = [
            sum(feature_importances[k])
            for k in range(len(feature_importances.keys()))
        ]
        ep_features.append(summed_importances)
    return ep_features


def summarize_episodes_feat(
    episodes_feat, summarize_funs: list = [np.mean, np.std]
):
    return [
        [
            [
                fun(list(data))
                for fun, data in zip(
                    summarize_funs,
                    tee(
                        (s for s in state if s is not None),
                        len(summarize_funs),
                    ),
                )
            ]
            for state in zip_longest(*ep)
        ]
        for ep in zip_longest(*episodes_feat, fillvalue=[])
    ]


def calculate_features_importance(
    path: Path,
    env: str,
    desired_return: int,
    desired_horizon: int,
    horizon_scale: float,
    return_scale: float,
    redundancy: int = 100,
):
    policy = SklearnPolicy.load(str(path / "policy"))
    hyper = AgentHyper(
        env,
        warm_up=0,
        horizon_scale=horizon_scale,
        return_scale=return_scale,
    )

    agent = UpsideDownAgent(hyper, policy)

    for _ in trange(redundancy, desc="Collect Data"):
        agent.collect_episode(desired_return, desired_horizon, test=True)

    episodes = [
        {k: v for k, v in ep.items() if k != "summed_rewards"}
        for ep in agent.memory.buffer
    ]

    episodes_feat = [
        calculate_ep_feat_importance(
            ep, agent, desired_return, desired_horizon
        )
        for ep in tqdm(episodes, desc="Calculate importance features")
    ]

    feature_importances = summarize_episodes_feat(episodes_feat)
    return feature_importances


def example_plot(feature_importances):
    for idx, state_feat in tqdm(
        enumerate(feature_importances),
        desc="Plotting",
        total=len(feature_importances),
    ):
        x = np.arange(len(state_feat))

        plt.figure()
        plt.title(f"Cartpole-v0 State {idx}")
        plt.bar(x, [x[0] for x in state_feat], yerr=[x[1] for x in state_feat])

        plt.xticks(
            x,
            [
                *[f"feature-{index}" for index in range(len(state_feat) - 2)],
                r"$d_t^{r}$",
                r"$d_t^{h}$",
            ],
        )
        plt.savefig(f"data/example_plot2/importances_state_{idx}")
        plt.close()


def create_gif_from_plots(
    image_filenames, output_filename="animation.gif", duration=0.5
):
    """Creates a GIF from a list of image filenames."""

    images = [imageio.imread(filename) for filename in image_filenames]
    imageio.mimsave(output_filename, images, duration=duration)


base_path = Path("data")
env = "CartPole-v0"
estimator = "ExtraTreesClassifier"
seed = str(42)
conf_name = "estimator_nameensemble.ExtraTreesClassifier_train_per_iter1"
desired_return = 200
desired_horizon = 200

path = base_path / env / conf_name / seed


res = calculate_features_importance(
    path, env, desired_return, desired_horizon, 0.02, 0.02
)
example_plot(res)

image_filenames = [
    f"data/example_plot2/importances_state_{idx}.png"
    for idx in range(len(res))
]

create_gif_from_plots(image_filenames)


# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.cluster import KMeans, HDBSCAN
# from sklearn.decomposition import PCA

# # Assuming you have your data in a numpy array 'data'
# data = np.array(res)[:, :, 0]

# # 1. Apply K-Means clustering
# kmeans = HDBSCAN()
# kmeans.fit(data)
# labels = kmeans.labels_

# # 2. Dimensionality Reduction for visualization (PCA)
# pca = PCA(n_components=2)  # Reduce to 2 dimensions for plotting
# data_pca = pca.fit_transform(data)

# # 3. Plotting
# plt.figure(figsize=(10, 8))
# plt.scatter(data_pca[:, 0], data_pca[:, 1], c=labels, cmap="viridis")
# plt.title("K-Means Clustering Visualization")
# plt.xlabel("Principal Component 1")
# plt.ylabel("Principal Component 2")
# plt.colorbar()
# plt.show()
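`create_gif_from_plots` is a thin wrapper around `imageio.mimsave`. Because importing `udrl.plot` would also execute the module-level experiment code above, the sketch below (mine, with throwaway filenames) reproduces the same call pattern standalone:

```python
# Standalone illustration of the imread/mimsave pattern used by create_gif_from_plots.
import imageio
import matplotlib

matplotlib.use("Agg")  # render off-screen, no display needed
import matplotlib.pyplot as plt

filenames = []
for i in range(2):
    plt.figure()
    plt.bar(range(3), [i + 1, 2, 3])
    plt.title(f"frame {i}")
    fname = f"/tmp/frame_{i}.png"
    plt.savefig(fname)
    plt.close()
    filenames.append(fname)

# Same two calls that create_gif_from_plots performs internally.
images = [imageio.imread(f) for f in filenames]
imageio.mimsave("/tmp/animation.gif", images, duration=0.5)
```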
udrl/policies.py
ADDED
@@ -0,0 +1,364 @@
from dataclasses import dataclass, field
from typing import Dict, Any, Union
from abc import ABC
import importlib
from pickle import dump, load


from sklearn.exceptions import NotFittedError
from sklearn.base import BaseEstimator
from sklearn.metrics import classification_report
import numpy as np

import torch
from torch import nn
from torch.distributions import Categorical


class ABCPolicy(ABC):
    """An abstract base class for defining agent policies.

    Methods
    -------
    __call__(state, command, test)
        Selects an action based on the given state and command.

        Parameters
        ----------
        state : np.array
            The current state of the environment.
        command : np.array
            The command or goal provided to the policy.
        test : bool
            Whether the policy is being used in a testing scenario.

        Returns
        -------
        int or np.array
            The selected action.

    train(states, commands, actions)
        Trains the policy using the provided experiences.

        Parameters
        ----------
        states : np.array
            A batch of states.
        commands : np.array
            A batch of corresponding commands.
        actions : np.array
            A batch of corresponding actions taken.

        Returns
        -------
        Dict[str, Any]
            A dictionary containing training metrics or other information.
            It MUST contain the key "metric"

    save(path)
        Saves the policy to the specified path.

        Parameters
        ----------
        path : str
            The path to save the policy to.

    load(path)
        Loads the policy from the specified path.

        Parameters
        ----------
        path : str
            The path to load the policy from.
    """

    def __call__(
        self,
        state: np.array,
        command: np.array,
        test: bool,
    ) -> Union[int, np.array]: ...

    def train(
        self,
        states: np.array,
        commands: np.array,
        actions: np.array,
    ) -> Dict[str, Any]: ...

    def save(self, path: str): ...
    def load(path: str): ...


@dataclass
class SklearnPolicy(ABCPolicy):
    """A policy using a scikit-learn estimator for action selection.

    Parameters
    ----------
    epsilon : float
        Exploration rate for epsilon-greedy action selection.
    action_size : int
        The number of possible actions in the environment.
    estimator_name : str
        The fully qualified name of the scikit-learn estimator class
        (e.g., 'ensemble.RandomForestClassifier').
    estimator_kwargs : Dict[str, Any], optional
        Keyword arguments to pass to the estimator constructor (default: {}).

    Attributes
    ----------
    estimator : BaseEstimator
        The initialized scikit-learn estimator.

    Methods
    -------
    __call__(state, command, test)
        Selects an action based on the given state and command,
        using the estimator or epsilon-greedy exploration.

    train(states, commands, actions)
        Trains the estimator using the provided experiences.

    save(path)
        Saves the policy (including the estimator) to a pickle file.

    load(path)
        Loads the policy (including the estimator) from a pickle file.
    """

    epsilon: float
    action_size: int
    estimator_name: str
    estimator_kwargs: Dict[str, Any] = field(default_factory=dict)
    estimator: BaseEstimator = field(init=False)

    def __post_init__(self):
        module, clf_name = self.estimator_name.split(".")
        module = importlib.import_module("sklearn." + module)
        self.estimator = getattr(module, clf_name)(
            **self.estimator_kwargs,
        )

    def __call__(
        self,
        state: np.array,
        command: np.array,
        test: bool,
    ):
        input_state = np.concatenate((state, command), axis=1)
        actions = None
        try:
            actions = self.estimator.predict(input_state)
        except NotFittedError:
            ...

        if not test and (actions is None or np.random.rand() <= self.epsilon):
            return np.random.choice(self.action_size)
        return actions[0]

    def train(
        self,
        states: np.array,
        commands: np.array,
        actions: np.array,
    ):
        input_state = np.concatenate((states, commands), axis=1)
        self.estimator.fit(input_state, actions)
        pred = self.estimator.predict(input_state)
        report = classification_report(actions, pred, output_dict=True)
        report["metric"] = report["accuracy"]
        return report

    def save(self, path: str):
        with open(path + ".pkl", "wb") as f:
            dump(self, f)

    def load(path: str):
        with open(path + ".pkl", "rb") as f:
            policy = load(f)
        return policy


class BehaviorNet(nn.Module):
    """
    A neural network module designed to model agent behavior based on state
    and command inputs.

    Parameters
    ----------
    state_size : int
        Dimensionality of the state input.
    action_size : int
        Dimensionality of the action output.
    command_size : int
        Dimensionality of the command input.
    hidden_size : int, optional
        Number of neurons in the hidden layers. Defaults to 64.

    Returns
    -------
    torch.Tensor
        A probability distribution over actions,
        shaped (batch_size, action_size).
    """

    def __init__(
        self,
        state_size: int,
        action_size: int,
        command_size: int,
        hidden_size: int = 64,
    ):
        super().__init__()
        self.state_entry = nn.Sequential(
            nn.Linear(state_size, hidden_size), nn.Sigmoid()
        )
        self.command_entry = nn.Sequential(
            nn.Linear(command_size, hidden_size), nn.Sigmoid()
        )
        self.model = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size),
            nn.Softmax(dim=-1),
        )

    def forward(self, state, command):
        state_out = self.state_entry(state)
        command_out = self.command_entry(command)
        out = state_out * command_out
        return self.model(out)


@dataclass
class NeuralPolicy(ABCPolicy):
    """
    A policy that uses a neural network to map states and commands to actions.

    Parameters
    ----------
    state_size : int
        The dimensionality of the state input.
    action_size : int
        The dimensionality of the action output.
    command_size : int, optional
        The dimensionality of the command input. Defaults to 2.
    hidden_size : int, optional
        The number of neurons in the hidden layers of the neural network.
        Defaults to 64.
    device : str, optional
        The device on which to run the neural network.
        Can be "auto" (to automatically select CUDA if available, else CPU),
        or a valid torch device string. Defaults to "cpu".
    loss : nn.Module, optional
        The loss function class used for training.
        Defaults to `nn.CrossEntropyLoss`.

    Attributes
    ----------
    estimator : nn.Module
        The neural network used to estimate the action probabilities.
    loss : nn.Module
        The instantiated loss function used for training.
    optim : torch.optim.Adam
        The optimizer used for training.

    Methods
    -------
    __call__(state, command, test)
        Selects an action based on the given state and command

    train(states, commands, actions)
        Trains the estimator using the provided experiences.

    save(path)
        Saves the policy.

    load(path)
        Loads the policy.
    """

    state_size: int
    action_size: int
    command_size: int = 2
    hidden_size: int = 64
    # NOTE: GPU may be drastically slower for small batch_size
    device: str = "cpu"
    loss: nn.Module = nn.CrossEntropyLoss
    estimator: nn.Module = field(init=False)

    def __post_init__(self):
        self.estimator = BehaviorNet(
            self.state_size,
            self.action_size,
            self.command_size,
            self.hidden_size,
        )
        if self.device == "auto":
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu"
            )
        self.estimator.to(self.device)

        self.loss = self.loss()
        self.optim = torch.optim.Adam(self.estimator.parameters())

    def __call__(
        self,
        state: np.array,
        command: np.array,
        test: bool,
    ):
        state = torch.FloatTensor(state).to(self.device)
        command = torch.FloatTensor(command).to(self.device)
        action_probs = self.estimator(state, command)
        if test:
            return torch.argmax(action_probs).item()
        return Categorical(action_probs).sample().item()

    def train(
        self,
        states: np.array,
        commands: np.array,
        actions: np.array,
    ):
        states = torch.FloatTensor(states).to(self.device)
        commands = torch.FloatTensor(commands).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)

        pred = self.estimator(states, commands)
        self.optim.zero_grad()
        loss = self.loss(pred, actions)
        loss.backward()
        self.optim.step()
        return {"metric": loss.item()}

    def save(self, path: str):
        torch.save(
            {
                "model": self.estimator.state_dict(),
                "optim": self.optim.state_dict(),
                "state_size": self.state_size,
                "action_size": self.action_size,
                "command_size": self.command_size,
                "hidden_size": self.hidden_size,
            },
            path + ".pth",
        )

    def load(path: str):
        saved_dict = torch.load(path + ".pth")
        policy = NeuralPolicy(
            saved_dict["state_size"],
            saved_dict["action_size"],
            saved_dict["command_size"],
            saved_dict["hidden_size"],
        )
        policy.estimator.load_state_dict(saved_dict["model"])
        policy.optim.load_state_dict(saved_dict["optim"])
        return policy
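A quick sanity check of the `SklearnPolicy` interface with synthetic data (my sketch, sizes chosen arbitrarily; the real training loop lives in `udrl/agent.py`):

```python
import numpy as np
from udrl.policies import SklearnPolicy

state_size, action_size, n = 4, 2, 64
policy = SklearnPolicy(
    epsilon=0.1,
    action_size=action_size,
    estimator_name="ensemble.RandomForestClassifier",
    estimator_kwargs={"n_estimators": 10},
)

# Before the first fit, predict() raises NotFittedError and training-time
# calls (test=False) fall back to a uniformly random action.
states = np.random.randn(n, state_size)
commands = np.random.rand(n, 2)          # scaled (desired_return, desired_horizon)
actions = np.random.randint(action_size, size=n)

report = policy.train(states, commands, actions)
print("training accuracy:", report["metric"])

action = policy(states[:1], commands[:1], test=True)
print("greedy action:", action)
```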
udrl/test.py
ADDED
@@ -0,0 +1,137 @@
# import gymnasium as gym
# import pygame
# import numpy as np


# def normalize_value(value, is_bounded, low=None, high=None):
#     if is_bounded:
#         return (value - low) / (high - low)
#     else:
#         return 0.5 * (np.tanh(value / 2) + 1)


# def draw_bar(screen, start, value, max_length, color, height=20):
#     bar_length = value * max_length
#     pygame.draw.rect(screen, color, (*start, bar_length, height))
#     pygame.draw.rect(
#         screen, (0, 0, 0), (*start, max_length, height), 2
#     )  # Border
#     mid_x = start[0] + max_length / 2
#     pygame.draw.line(
#         screen, (0, 0, 0), (mid_x, start[1]), (mid_x, start[1] + height), 2
#     )


# def visualize_environment(screen, state, env):
#     screen_width, screen_height = screen.get_size()
#     screen.fill((255, 255, 255))

#     # Visualize environment-specific elements
#     if env.spec.id.startswith("CartPole"):
#         cart_x = int(state[0] * 50 + screen_width // 2)
#         cart_y = screen_height - 100
#         pole_angle = state[2]
#         pygame.draw.rect(screen, (0, 0, 0), (cart_x - 30, cart_y - 15, 60, 30))
#         pygame.draw.line(
#             screen,
#             (0, 0, 0),
#             (cart_x, cart_y),
#             (
#                 cart_x + int(np.sin(pole_angle) * 100),
#                 cart_y - int(np.cos(pole_angle) * 100),
#             ),
#             6,
#         )
#     elif env.spec.id.startswith("Acrobot"):
#         center_x, center_y = screen_width // 2, screen_height // 2
#         l1, l2 = 100, 100  # Length of links
#         s0, s1 = state[0], state[1]  # sin(theta1), sin(theta2)
#         c0, c1 = state[2], state[3]  # cos(theta1), cos(theta2)
#         x0, y0 = center_x, center_y
#         x1 = x0 + l1 * s0
#         y1 = y0 + l1 * c0
#         x2 = x1 + l2 * s1
#         y2 = y1 + l2 * c1
#         pygame.draw.line(screen, (0, 0, 0), (x0, y0), (x1, y1), 6)
#         pygame.draw.line(screen, (0, 0, 0), (x1, y1), (x2, y2), 6)
#         pygame.draw.circle(screen, (0, 0, 255), (int(x0), int(y0)), 10)
#         pygame.draw.circle(screen, (0, 255, 0), (int(x1), int(y1)), 10)
#         pygame.draw.circle(screen, (255, 0, 0), (int(x2), int(y2)), 10)
#     # Add more environment-specific visualizations here as needed

#     # Draw bars for each state dimension
#     num_dims = env.observation_space.shape[0]
#     bar_colors = [
#         (255, 0, 0),
#         (0, 255, 0),
#         (0, 0, 255),
#         (255, 255, 0),
#         (255, 0, 255),
#         (0, 255, 255),
#     ]
#     bar_starts = [(50, 50 + i * 70) for i in range(num_dims)]
#     max_length = 300

#     for i, (start, color) in enumerate(zip(bar_starts, bar_colors)):
#         is_bounded = not (
#             env.observation_space.high[i] > 100
#         ) and not np.isinf(env.observation_space.low[i] < -100)
#         normalized_value = normalize_value(
#             state[i],
#             is_bounded,
#             env.observation_space.low[i],
#             env.observation_space.high[i],
#         )
#         draw_bar(screen, start, normalized_value, max_length, color)

#         # Draw labels
#         font = pygame.font.Font(None, 30)
#         text = font.render(f"Dim {i}: {state[i]:.2f}", True, (0, 0, 0))
#         screen.blit(text, (start[0], start[1] - 30))

#         # Add description of bar representation
#         if is_bounded:
#             desc = f"(Range: {env.observation_space.low[i]:.2f} to {env.observation_space.high[i]:.2f})"
#         else:
#             desc = "(Unbounded: Center is 0, edges are ±∞)"
#         desc_text = pygame.font.Font(None, 24).render(
#             desc, True, (100, 100, 100)
#         )
#         screen.blit(desc_text, (start[0], start[1] + 25))

#     pygame.display.flip()


# def run_visualization(env_name):
#     pygame.init()
#     screen = pygame.display.set_mode((800, 600))
#     pygame.display.set_caption(f"{env_name} Visualization")

#     env = gym.make(env_name)
#     state, _ = env.reset()

#     clock = pygame.time.Clock()

#     running = True
#     while running:
#         visualize_environment(screen, state, env)
#         action = env.action_space.sample()
#         state, reward, done, truncated, info = env.step(action)

#         if done or truncated:
#             state, _ = env.reset()

#         for event in pygame.event.get():
#             if event.type == pygame.QUIT:
#                 running = False

#         clock.tick(60)  # Limit to 60 FPS

#     env.close()
#     pygame.quit()


# # Example usage
# # run_visualization("CartPole-v1")
# # Uncomment the line below to run Acrobot visualization
# run_visualization("Acrobot-v1")
udrl/viz.py
ADDED
@@ -0,0 +1,310 @@
import gymnasium as gym
import pygame
import numpy as np
from .policies import SklearnPolicy
from .agent import UpsideDownAgent, AgentHyper
from pathlib import Path
import json


def normalize_value(value, is_bounded, low=None, high=None):
    return (value - low) / (high - low)


def draw_bar(screen, start, value, max_length, color, height=20, mid=True):
    bar_length = value * max_length
    pygame.draw.rect(screen, color, (*start, bar_length, height))
    pygame.draw.rect(screen, (0, 0, 0), (*start, max_length, height), 2)
    if mid:
        mid_x = start[0] + max_length / 2
        pygame.draw.line(
            screen, (0, 0, 0), (mid_x, start[1]), (mid_x, start[1] + height), 2
        )


def create_button(text, position, size):
    font = pygame.font.Font(None, 36)
    button_rect = pygame.Rect(position, size)
    text_surf = font.render(text, True, (0, 0, 0))
    text_rect = text_surf.get_rect(center=button_rect.center)
    return button_rect, text_surf, text_rect


def visualize_environment(
    screen,
    state,
    env,
    env_surface,
    paused,
    feature_importances,
    epoch,
    max_epoch=200,
):
    screen_width, screen_height = screen.get_size()
    screen.fill((255, 255, 255))
    screen.blit(env_surface, (0, 0))

    num_dims = len(feature_importances)
    bar_colors = [
        (255, 0, 0),  # Red
        (0, 255, 0),  # Green
        (0, 0, 255),  # Blue
        (255, 255, 0),  # Yellow
        (255, 0, 255),  # Magenta
        (0, 255, 255),  # Cyan
        (128, 128, 0),  # Olive
        (0, 128, 128),  # Teal
        (128, 0, 0),  # Maroon
        (0, 128, 0),  # Dark Green
        (0, 0, 128),  # Navy
        (128, 128, 128),  # Gray
        (192, 192, 192),  # Light Gray
        (255, 165, 0),  # Orange
        (255, 192, 203),  # Pink
    ]

    bar_starts = [
        (screen_width - 350, 50 + i * 70) for i in range(num_dims + 1)
    ]
    max_length = 300

    for i, (start, color) in enumerate(zip(bar_starts, bar_colors)):
        if i == 0:
            normalized_value = epoch / max_epoch
            draw_bar(
                screen, start, normalized_value, max_length, color, mid=False
            )
            font = pygame.font.Font(None, 30)
            text = font.render(
                f"Epoch {epoch}",
                True,
                (0, 0, 0),
            )
            screen.blit(text, (start[0], start[1] - 30))
            continue
        i -= 1
        normalized_value = feature_importances[i] / 30
        draw_bar(screen, start, normalized_value, max_length, color)

        font = pygame.font.Font(None, 30)
        what = f"Importance {i}"
        if len(feature_importances) - i == 2:
            what = "Desired Return"
        if len(feature_importances) - i == 1:
            what = "Desired Horizon"

        text = font.render(
            f"{what}: {feature_importances[i]:.2f}", True, (0, 0, 0)
        )
        screen.blit(text, (start[0], start[1] - 30))

        desc = "(Range: 0 to 30)"
        desc_text = pygame.font.Font(None, 24).render(
            desc, True, (100, 100, 100)
        )
        screen.blit(desc_text, (start[0], start[1] + 25))

    button_width, button_height = 100, 50
    reset_button, reset_text, reset_text_rect = create_button(
        "Reset", (10, screen_height - 60), (button_width, button_height)
    )
    pause_play_button, pause_play_text, pause_play_text_rect = create_button(
        "Pause" if not paused else "Play",
        (120, screen_height - 60),
        (button_width, button_height),
    )
    next_button, next_text, next_text_rect = create_button(
        "Next", (230, screen_height - 60), (button_width, button_height)
    )
    save_button, save_text, save_text_rect = create_button(
        "Save", (340, screen_height - 60), (button_width, button_height)
    )

    pygame.draw.rect(screen, (200, 200, 200), reset_button)
    pygame.draw.rect(screen, (200, 200, 200), pause_play_button)
    pygame.draw.rect(screen, (200, 200, 200), next_button)
    pygame.draw.rect(screen, (200, 200, 200), save_button)
    screen.blit(reset_text, reset_text_rect)
    screen.blit(pause_play_text, pause_play_text_rect)
    screen.blit(next_text, next_text_rect)
    screen.blit(save_text, save_text_rect)

    pygame.display.flip()
    return reset_button, pause_play_button, next_button, save_button


def run_visualization(
    env_name,
    agent,
    init_desired_return,
    init_desired_horizon,
    max_epoch,
    base_path,
):
    base_path = (
        Path(base_path) / env_name / agent.policy.estimator.__str__()[:-2]
    )
    base_path.mkdir(parents=True, exist_ok=True)
    desired_return = init_desired_return
    desired_horizon = init_desired_horizon

    pygame.init()
    screen = pygame.display.set_mode((1000, 800))
    pygame.display.set_caption(f"{env_name} Visualization")

    env = gym.make(env_name, render_mode="rgb_array")
    state, _ = env.reset()

    clock = pygame.time.Clock()
    epoch = 0
    save_index = 0

    running = True
    paused = False
    step = False
    while running:

        env_render = env.render()
        env_surface = pygame.surfarray.make_surface(env_render.swapaxes(0, 1))
        if not paused or step:
            command = np.array(
                [
                    desired_return * agent.conf.return_scale,
                    desired_horizon * agent.conf.horizon_scale,
                ]
            )
            command = np.expand_dims(command, axis=0)
            state = np.expand_dims(state, axis=0)

            action = agent.policy(state, command, True)

            ext_state = np.concatenate((state, command), axis=1)

            state, reward, done, truncated, info = env.step(action)

            feature_importances = {
                idx: [] for idx in range(ext_state.shape[1])
            }

            for t in agent.policy.estimator.estimators_:
                branch = np.array(
                    t.decision_path(ext_state).todense(), dtype=bool
                )
                imp = t.tree_.impurity[branch[0]]

                for f, i in zip(
                    t.tree_.feature[branch[0]][:-1], imp[:-1] - imp[1:]
                ):
                    feature_importances.setdefault(f, []).append(i)

            # Line 8 Algorithm 2
            desired_return -= reward
            # Line 9 Algorithm 2
            desired_horizon = max(desired_horizon - 1, 1)

            summed_importances = [
                sum(feature_importances.get(k, [0.001]))
                for k in range(len(feature_importances.keys()))
            ]

            epoch += 1

        reset_button, pause_play_button, next_button, save_button = (
            visualize_environment(
                screen,
                state,
                env,
                env_surface,
                paused,
                summed_importances,
                epoch,
                max_epoch,
            )
        )

        if done or truncated:
            state, _ = env.reset()
            desired_horizon = init_desired_horizon
            desired_return = init_desired_return
            epoch = 0

        step = False
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False
            elif event.type == pygame.MOUSEBUTTONDOWN:
                if reset_button.collidepoint(event.pos):
                    state, _ = env.reset()

                    desired_horizon = init_desired_horizon
                    desired_return = init_desired_return
                    epoch = 0
                elif pause_play_button.collidepoint(event.pos):
                    paused = not paused
                elif (
                    next_button.collidepoint(event.pos) and paused
                ):  # Only when paused
                    step = True
                elif save_button.collidepoint(event.pos):
                    pygame.image.save(
                        env_surface,
                        str(base_path / f"env_image_{save_index}.png"),
                    )
                    with open(
                        str(base_path / f"info_{save_index}.json"), "w"
                    ) as f:
                        json.dump(
                            {
                                "state": {
                                    i: str(val) for i, val in enumerate(state)
                                },
                                "feature": {
                                    i: str(val)
                                    for i, val in enumerate(summed_importances)
                                },
                                "action": str(action),
                                "reward": str(reward),
                                "desired_return": str(desired_return + reward),
                                "desired_horizon": str(desired_horizon + 1),
                            },
                            f,
                            indent=4,
                        )

                    save_index += 1
        clock.tick(5)

    env.close()
    pygame.quit()


# LunarLander-v2:RT:43:r.57:h.102 -> -92.03 +- 81.51,max 36.37,min -327.94
# Acrobot-v1:RT:44:r.-79:h.82 -> -79.00 +- 47.01,max -64.00,min -500.00


base_path = Path("data")
# env = "CartPole-v0"
env = "Acrobot-v1"
# env = "LunarLander-v2"
estimator = "RandomForestClassifier"
seed = str(44)
conf_name = "train_per_iter1"
desired_return = -79
desired_horizon = 82
max_epoch = 500

path = base_path / env / conf_name / seed

policy = SklearnPolicy.load(str(path / "policy"))
hyper = AgentHyper(
    env,
    warm_up=0,
    # horizon_scale=horizon_scale,
    # return_scale=return_scale,
)

agent = UpsideDownAgent(hyper, policy)

run_visualization(
    env, agent, desired_return, desired_horizon, max_epoch, "data/viz_examples"
)