# -------------------------------
# DQN for CartPole in OpenAI Gym
# Author: Flood Sung
# Date: 2016.6.27
# All rights reserved
# -------------------------------
import gym
import tensorflow as tf
import numpy as np
import random
import os
from collections import deque

# Hyper Parameters for DQN
GAMMA = 0.9            # discount factor for target Q
INITIAL_EPSILON = 0.5  # starting value of epsilon
FINAL_EPSILON = 0.01   # final value of epsilon
REPLAY_SIZE = 10000    # experience replay buffer size
BATCH_SIZE = 32        # size of minibatch

class DQN():
    # DQN Agent
    def __init__(self, env):
        # init experience replay
        self.replay_buffer = deque()
        # init some parameters
        self.time_step = 0
        self.epsilon = INITIAL_EPSILON
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.create_Q_network()
        self.create_training_method()

        # Init session
        self.session = tf.InteractiveSession()
        self.session.run(tf.initialize_all_variables())

        # loading networks; Saver.save() does not create the checkpoint
        # directory, so make sure it exists before training starts
        self.saver = tf.train.Saver()
        if not os.path.exists("saved_networks"):
            os.makedirs("saved_networks")
        checkpoint = tf.train.get_checkpoint_state("saved_networks")
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.session, checkpoint.model_checkpoint_path)
            print "Successfully loaded:", checkpoint.model_checkpoint_path
        else:
            print "Could not find old network weights"

        global summary_writer
        # expand '~' explicitly: SummaryWriter treats the path literally
        summary_writer = tf.train.SummaryWriter(os.path.expanduser('~/logs'), graph=self.session.graph)

    def create_Q_network(self):
        # network weights
        W1 = self.weight_variable([self.state_dim, 20])
        b1 = self.bias_variable([20])
        W2 = self.weight_variable([20, self.action_dim])
        b2 = self.bias_variable([self.action_dim])
        # input layer
        self.state_input = tf.placeholder("float", [None, self.state_dim])
        # hidden layer
        h_layer = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
        # Q Value layer
        self.Q_value = tf.matmul(h_layer, W2) + b2

    def create_training_method(self):
        self.action_input = tf.placeholder("float", [None, self.action_dim])  # one-hot representation
        self.y_input = tf.placeholder("float", [None])
        # Q value of the action that was actually taken
        Q_action = tf.reduce_sum(tf.mul(self.Q_value, self.action_input), reduction_indices=1)
        self.cost = tf.reduce_mean(tf.square(self.y_input - Q_action))
        tf.scalar_summary("loss", self.cost)
        global merged_summary_op
        merged_summary_op = tf.merge_all_summaries()
        self.optimizer = tf.train.AdamOptimizer(0.0001).minimize(self.cost)

    def perceive(self, state, action, reward, next_state, done):
        one_hot_action = np.zeros(self.action_dim)
        one_hot_action[action] = 1
        self.replay_buffer.append((state, one_hot_action, reward, next_state, done))
        if len(self.replay_buffer) > REPLAY_SIZE:
            self.replay_buffer.popleft()
        if len(self.replay_buffer) > BATCH_SIZE:
            self.train_Q_network()

    def train_Q_network(self):
        self.time_step += 1
        # Step 1: obtain random minibatch from replay memory
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        # Step 2: calculate y (the TD target)
        y_batch = []
        Q_value_batch = self.Q_value.eval(feed_dict={self.state_input: next_state_batch})
        for i in range(0, BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * np.max(Q_value_batch[i]))

        self.optimizer.run(feed_dict={
            self.y_input: y_batch,
            self.action_input: action_batch,
            self.state_input: state_batch
            })
        summary_str = self.session.run(merged_summary_op, feed_dict={
            self.y_input: y_batch,
            self.action_input: action_batch,
            self.state_input: state_batch
            })
        summary_writer.add_summary(summary_str, self.time_step)

        # save network every 1000 iterations
        if self.time_step % 1000 == 0:
            self.saver.save(self.session, 'saved_networks/' + 'network' + '-dqn', global_step=self.time_step)

    def egreedy_action(self, state):
        Q_value = self.Q_value.eval(feed_dict={
            self.state_input: [state]
            })[0]
        # anneal epsilon before choosing the action; placed after the
        # returns it would never execute
        self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / 10000
        if random.random() <= self.epsilon:
            return random.randint(0, self.action_dim - 1)
        else:
            return np.argmax(Q_value)

    def action(self, state):
        return np.argmax(self.Q_value.eval(feed_dict={
            self.state_input: [state]
            })[0])

    def weight_variable(self, shape):
        initial = tf.truncated_normal(shape)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)

# ---------------------------------------------------------
# Hyper Parameters
ENV_NAME = 'CartPole-v0'
EPISODE = 10000  # Episode limitation
STEP = 300       # Step limitation in an episode
TEST = 10        # number of test episodes run every 100 training episodes

def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)

    for episode in xrange(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in xrange(STEP):
            action = agent.egreedy_action(state)  # e-greedy action for training
            next_state, reward, done, _ = env.step(action)
            # shaped reward for the agent: penalize failure, small living bonus
            reward_agent = -1 if done else 0.1
            agent.perceive(state, action, reward_agent, next_state, done)
            state = next_state
            if done:
                break

        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in xrange(TEST):
                state = env.reset()
                for j in xrange(STEP):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print 'episode:', episode, 'Evaluation Average Reward:', ave_reward
            if ave_reward >= 200:
                break

    # save results for uploading
    env.monitor.start('gym_results/CartPole-v0-experiment-1', force=True)
    for i in xrange(100):
        state = env.reset()
        for j in xrange(200):
            env.render()
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
    env.monitor.close()

if __name__ == '__main__':
    main()
@aferral you should mkdir saved_networks in your directory first.
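For anyone who prefers to handle that from the script itself, a small guard at startup works too (a sketch; saved_networks matches the path hardcoded in the gist):

import os

if not os.path.exists('saved_networks'):
    os.makedirs('saved_networks')  # Saver.save() does not create missing directories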
reward_agent is assigned but never used
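Good catch. Given the comment "# Define reward for agent", the intent was presumably to train on the shaped reward rather than the raw env reward, i.e. the perceive call in main() should read (my reading of the intent, not confirmed by the author):

reward_agent = -1 if done else 0.1  # penalize the terminal step, small living bonus otherwise
agent.perceive(state, action, reward_agent, next_state, done)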
Seeing this Placeholder error when I run it; any idea why?
File "/mnt/hgfs/Python_Scripts/dqn.py", line 112, in train_Q_network
self.state_input : state_batch
File "/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 710, in run
run_metadata_ptr)
File "/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 908, in _run
feed_dict_string, options, run_metadata)
File "/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 958, in _do_run
target_list, options, run_metadata)
File "/home/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 978, in _do_call
raise type(e)(node_def, op, message)
InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype float
[[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
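One likely cause, if you are re-running the script inside the same IPython/Spyder session: the ops live in the default graph, so a second run makes merge_all_summaries() pick up the loss summary from the first run, whose placeholders are never fed. A sketch of a workaround (tf.reset_default_graph() is a standard TensorFlow call; placing it before DQN(env) is my assumption):

import tensorflow as tf

tf.reset_default_graph()  # drop any ops left over from a previous run in this process
env = gym.make(ENV_NAME)
agent = DQN(env)

Running the file as a fresh process (python dqn.py) avoids the problem entirely.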
The code at line 129 is never reached.
If you don't believe it, add a log statement there and check whether it ever prints.
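Confirmed: both branches return before the decay line, so epsilon never moves from 0.5. A minimal fix is to anneal before choosing the action (same logic otherwise):

def egreedy_action(self, state):
    Q_value = self.Q_value.eval(feed_dict={self.state_input: [state]})[0]
    # decay must happen before the returns, or it is dead code
    self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / 10000
    if random.random() <= self.epsilon:
        return random.randint(0, self.action_dim - 1)
    return np.argmax(Q_value)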
After episode 0 I'm getting this error:
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors.NotFoundError: saved_networks/network-dqn-1000.tempstate1749451825602796753
[[Node: save/save = SaveSlices[T=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/save/tensor_names, save/save/shapes_and_slices, Variable, Variable/Adam, Variable/Adam_1, Variable_1, Variable_1/Adam, Variable_1/Adam_1, Variable_2, Variable_2/Adam, Variable_2/Adam_1, Variable_3, Variable_3/Adam, Variable_3/Adam_1, beta1_power, beta2_power)]]
Caused by op u'save/save', defined at:
Is it just me?