@louisng5
Last active November 14, 2016 10:33
OpenAI - CartPole-v0 - 19 EPISODES BEFORE SOLVE
I used some aggressive measures to solve in 19 episodes, including:
1) A large batch_size for experience replay.
2) 30 training epochs on every replay batch.
3) Zero probability of taking a random action (no exploration).
With this approach, the NN starts to "know" how to balance around episode 10-15.
Keras was used to create and train the NN.
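
The core of points 1) and 2) is the replay-and-refit step. As a minimal sketch (the names d, batch_size, and model follow the full script below; d stores [observation, target Q-values] pairs):

# Sample up to batch_size stored pairs and refit the network hard.
batch = random.sample(d, min(batch_size, len(d)))
X = np.array([obs for obs, q in batch])               # inputs, shape (n, 4)
Y = np.array([q for obs, q in batch]).reshape(-1, 2)  # targets, shape (n, 2)
model.fit(X, Y, nb_epoch=30, verbose=0)               # Keras 1.x API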
import gym
import os
os.environ['KERAS_BACKEND'] = 'theano'
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
import numpy as np
import pandas as pd
import random
learning_rate = 1  # alpha in the Q-update; 1 means the target fully replaces the old value
gamma = 0.96       # discount factor
max_episode = 9000.
max_state = 200.   # CartPole-v0 caps episodes at 200 steps
batch_size = 200   # replay batch size
mem_size = 200     # replay memory capacity
monitor = False    # set True to record with the (legacy) gym monitor
def base_model():
    model = Sequential()
    model.add(Dense(16, input_dim=4, activation='linear'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(2, activation='linear'))  # one Q-value per action
    model.compile(loss='mse', optimizer='Adam')
    return model
model = base_model()
# model = keras.models.load_model('my_model.h5')
env = gym.make('CartPole-v0')
if monitor:
    env.monitor.start('/tmp/cartpole-experiment-1', force=True)
success_count = 0  # consecutive episodes with acc_reward >= 195
for i_episode in range(int(max_episode)):
    observation = env.reset()
    # random_rate = (max_episode - i_episode) / (max_episode * 20)
    random_rate = 0  # no exploration: always act greedily
    d = []  # replay memory of [observation, target Q-values] pairs
    step_count = 0
    print i_episode
    acc_reward = 0
    for t in range(int(max_state + 1)):
        env.render()
        prediction = model.predict(observation.reshape(1, 4))
        if random.random() < random_rate:
            action = env.action_space.sample()
        else:
            action = np.argmax(prediction)
        observation2, reward, done, info = env.step(action)
        # Value of the next state: best predicted Q-value over both actions
        ofv = np.max(model.predict(observation2.reshape(1, 4)))
        if done and t < 199:
            # The pole fell before the 200-step cap: heavy penalty
            print "done"
            reward = -800
            update = -800
        else:
            reward = 1
            acc_reward += 1
            # Q-learning update toward the Bellman target
            update = prediction[0][action] + learning_rate * (reward + gamma * ofv - prediction[0][action])
        prediction[0][action] = update
        d.append([observation, prediction])
        if len(d) == mem_size + 1:
            d = d[1:]  # memory full: drop the oldest entry
        if step_count >= 5 or done:
            # Experience replay: sample up to batch_size entries and refit
            ds = random.sample(d, batch_size if len(d) >= batch_size else len(d))
            X, Y = [], []
            for x, y in ds:
                X.append(x)
                Y.append(y)
            X, Y = np.array(X), np.array(Y).reshape(len(ds), 2)
            model.fit(X, Y, nb_epoch=30, verbose=0)
            step_count = 0
        step_count += 1
        observation = observation2
        if done:
            if acc_reward <= 195:
                success_count = 0
            print("Episode finished after {} timesteps".format(t + 1))
            break
    if acc_reward >= 195:
        success_count += 1
        if success_count > 100:
            break  # solved: enough consecutive successful episodes
    model.save('my_model.h5')
if monitor:
    env.monitor.close()
    gym.upload('/tmp/cartpole-experiment-1', api_key='')
@ryanpeach

Also, I've never seen learning rate and gamma used like this. Can you explain it?

update = prediction[0][action] + learning_rate * (reward + gamma * ofv - prediction[0][action])
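
For reference, this is the standard tabular Q-learning update applied to the network's predicted Q-values:

Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))

where prediction[0][action] is Q(s, a), ofv is max_a' Q(s', a'), and learning_rate is alpha. Since learning_rate = 1 here, the old value cancels and the update reduces to the Bellman target, reward + gamma * ofv.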
