Last active: November 14, 2016
OpenAI - CartPole-v0 - 19 EPISODES BEFORE SOLVE
I have used some aggressive measures to solve the environment within 19 episodes, including:
1) A large batch_size for experience replay.
2) 30 epochs of training on every batch.
3) Zero probability of taking a random action (no exploration).
With this approach, the NN starts to "know" how to balance around episodes 10-15.
Keras was used for creating and training the NN.
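For context, the per-step training target computed in the script below is a standard one-step Q-learning (temporal-difference) update applied to the network's own prediction; with learning_rate = 1 it reduces to the plain Bellman target. A minimal sketch, using the same variable names as the script:

    # prediction[0][action] is the current Q(s, a) estimate for the chosen action;
    # ofv is max over a' of Q(s', a'), predicted for the next state.
    update = prediction[0][action] + learning_rate * (reward + gamma * ofv - prediction[0][action])
    # with learning_rate = 1 this collapses to:
    update = reward + gamma * ofv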
import gym
import os
os.environ['KERAS_BACKEND'] = 'theano'
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
import numpy as np
import pandas as pd
import random

# Hyperparameters
learning_rate = 1      # TD step size; at 1 the update collapses to the plain Bellman target
gamma = 0.96           # discount factor
max_episode = 9000.
max_state = 200.       # CartPole-v0 episodes cap at 200 steps
batch_size = 200
mem_size = 200         # replay memory holds at most 200 recent transitions
monitor = False

def base_model():
    # Small fully connected network: 4 state inputs -> 2 Q-values (one per action)
    model = Sequential()
    model.add(Dense(16, input_dim=4, activation='linear'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(2, activation='linear'))
    model.compile(loss='mse', optimizer='Adam')
    return model

model = base_model()
# model = keras.models.load_model('my_model.h5')

env = gym.make('CartPole-v0')
if monitor:
    env.monitor.start('/tmp/cartpole-experiment-1', force=True)

success_count = 0  # bug fix: must be initialised before the first episode reaching 195
for i_episode in range(int(max_episode)):
    observation = env.reset()
    # random_rate = (max_episode - i_episode) / (max_episode * 20)
    random_rate = 0  # exploration disabled: always act greedily
    d = []           # replay memory
    step_count = 0
    print i_episode
    acc_reward = 0
    for t in range(int(max_state + 1)):
        env.render()
        prediction = model.predict(observation.reshape(1, 4))
        if random.random() < random_rate:
            action = env.action_space.sample()
        else:
            action = np.argmax(prediction)
        observation2, reward, done, info = env.step(action)
        # Highest predicted Q-value of the next state
        ofv = np.max(model.predict(observation2.reshape(1, 4)))
        if done and t < 199:
            # Episode ended early: punish the failing action heavily
            print "done"
            reward = -800
            update = -800
        else:
            reward = 1
            acc_reward += 1
            # One-step Q-learning (TD) target for the chosen action
            update = prediction[0][action] + learning_rate * (reward + gamma * ofv - prediction[0][action])
        prediction[0][action] = update
        d.append([observation, prediction])
        if len(d) == mem_size + 1:
            d = d[1:]  # evict the oldest transition once memory is full
        if step_count >= 5 or done:
            # Train on a random batch from the replay memory every 5 steps
            ds = np.array(random.sample(d, batch_size if len(d) >= batch_size else len(d)))
            X, Y = [], []
            for x, y in ds:
                X.append(x)
                Y.append(y)
            X, Y = np.array(X), np.array(Y).reshape(batch_size if len(d) >= batch_size else len(d), 2)
            model.fit(X, Y, nb_epoch=30, verbose=0)  # 30 epochs per batch (Keras 1.x API)
            step_count = 0
        step_count += 1
        observation = observation2
        if done:
            if acc_reward <= 195:
                success_count = 0  # streak of successful episodes broken
            print("Episode finished after {} timesteps".format(t + 1))
            break
    if acc_reward >= 195:
        success_count += 1
        if success_count > 100:
            break  # 100+ consecutive episodes at >= 195 reward: consider it solved

model.save('my_model.h5')
if monitor:
    env.monitor.close()
    gym.upload('/tmp/cartpole-experiment-1', api_key='')
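As an aside, the commented-out random_rate schedule would have restored epsilon-greedy exploration, decaying linearly from 0.05 at episode 0 toward 0 at max_episode. A sketch of that variant, reusing the script's names:

    # linearly decaying exploration rate: 1/20 = 0.05 at episode 0, ~0 near the end
    random_rate = (max_episode - i_episode) / (max_episode * 20)
    if random.random() < random_rate:
        action = env.action_space.sample()                             # explore
    else:
        action = np.argmax(model.predict(observation.reshape(1, 4)))   # exploit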
Also, I've never seen learning_rate and gamma used like this; can you explain this: