'''Solves Pong with Policy Gradients in Tensorflow.'''
# written October 2016 by Sam Greydanus
# inspired by karpathy's gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
import numpy as np
import gym
import tensorflow as tf

# hyperparameters
n_obs = 80 * 80          # dimensionality of observations
h = 200                  # number of hidden layer neurons
n_actions = 3            # number of available actions
learning_rate = 1e-3
gamma = .99              # discount factor for reward
decay = 0.99             # decay rate for RMSProp gradients
save_path = 'models/pong.ckpt'

# gamespace
env = gym.make("Pong-v0") # environment info
observation = env.reset()
prev_x = None
xs, rs, ys = [], [], []
running_reward = None
reward_sum = 0
episode_number = 0

# initialize model
tf_model = {}
with tf.variable_scope('layer_one', reuse=False):
    xavier_l1 = tf.truncated_normal_initializer(mean=0, stddev=1./np.sqrt(n_obs), dtype=tf.float32)
    tf_model['W1'] = tf.get_variable("W1", [n_obs, h], initializer=xavier_l1)
with tf.variable_scope('layer_two', reuse=False):
    xavier_l2 = tf.truncated_normal_initializer(mean=0, stddev=1./np.sqrt(h), dtype=tf.float32)
    tf_model['W2'] = tf.get_variable("W2", [h, n_actions], initializer=xavier_l2)

# tf operations
def tf_discount_rewards(tf_r): # tf_r ~ [game_steps,1]
    discount_f = lambda a, v: a*gamma + v
    tf_r_reverse = tf.scan(discount_f, tf.reverse(tf_r, [True, False]))
    tf_discounted_r = tf.reverse(tf_r_reverse, [True, False])
    return tf_discounted_r

def tf_policy_forward(x): # x ~ [1,D]
    h = tf.matmul(x, tf_model['W1'])
    h = tf.nn.relu(h)
    logp = tf.matmul(h, tf_model['W2'])
    p = tf.nn.softmax(logp)
    return p

# downsampling
def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
    I = I[35:195]    # crop
    I = I[::2,::2,0] # downsample by factor of 2
    I[I == 144] = 0  # erase background (background type 1)
    I[I == 109] = 0  # erase background (background type 2)
    I[I != 0] = 1    # everything else (paddles, ball) just set to 1
    return I.astype(np.float).ravel()

# tf placeholders
tf_x = tf.placeholder(dtype=tf.float32, shape=[None, n_obs], name="tf_x")
tf_y = tf.placeholder(dtype=tf.float32, shape=[None, n_actions], name="tf_y")
tf_epr = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="tf_epr")

# tf reward processing (need tf_discounted_epr for policy gradient wizardry)
tf_discounted_epr = tf_discount_rewards(tf_epr)
tf_mean, tf_variance = tf.nn.moments(tf_discounted_epr, [0], shift=None, name="reward_moments")
tf_discounted_epr -= tf_mean
tf_discounted_epr /= tf.sqrt(tf_variance + 1e-6)

# tf optimizer op
tf_aprob = tf_policy_forward(tf_x)
loss = tf.nn.l2_loss(tf_y - tf_aprob)
optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=decay)
tf_grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), grad_loss=tf_discounted_epr)
train_op = optimizer.apply_gradients(tf_grads)

# tf graph initialization
sess = tf.InteractiveSession()
tf.initialize_all_variables().run()

# try load saved model
saver = tf.train.Saver(tf.all_variables())
load_was_success = True # yes, I'm being optimistic
try:
    save_dir = '/'.join(save_path.split('/')[:-1])
    ckpt = tf.train.get_checkpoint_state(save_dir)
    load_path = ckpt.model_checkpoint_path
    saver.restore(sess, load_path)
except:
    print "no saved model to load. starting new session"
    load_was_success = False
else:
    print "loaded model: {}".format(load_path)
    saver = tf.train.Saver(tf.all_variables())
    episode_number = int(load_path.split('-')[-1])

# training loop
while True:
    # if True: env.render()

    # preprocess the observation, set input to network to be difference image
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(n_obs)
    prev_x = cur_x

    # stochastically sample a policy from the network
    feed = {tf_x: np.reshape(x, (1, -1))}
    aprob = sess.run(tf_aprob, feed) ; aprob = aprob[0,:]
    action = np.random.choice(n_actions, p=aprob)
    label = np.zeros_like(aprob) ; label[action] = 1

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action+1)
    reward_sum += reward

    # record game history
    xs.append(x) ; ys.append(label) ; rs.append(reward)

    if done:
        # update running reward
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01

        # parameter update
        feed = {tf_x: np.vstack(xs), tf_epr: np.vstack(rs), tf_y: np.vstack(ys)}
        _ = sess.run(train_op, feed)

        # print progress console
        if episode_number % 10 == 0:
            print 'ep {}: reward: {}, mean reward: {:3f}'.format(episode_number, reward_sum, running_reward)
        else:
            print '\tep {}: reward: {}'.format(episode_number, reward_sum)

        # bookkeeping
        xs, rs, ys = [], [], [] # reset game history
        episode_number += 1     # the Next Episode
        observation = env.reset() # reset env
        reward_sum = 0
        if episode_number % 50 == 0:
            saver.save(sess, save_path, global_step=episode_number)
            print "SAVED MODEL #{}".format(episode_number)
@sahnimanas this is the core of how policy gradients work. tf_discounted_epr is a TensorFlow tensor that holds the discounted episode rewards. It's a vector of positive and negative scalar values. We multiply this vector into the gradients to encourage particular actions (where tf_discounted_epr is positive) or discourage them (where tf_discounted_epr is negative).
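For intuition, here is a minimal NumPy sketch of what tf_discount_rewards computes (this sketch is mine, not part of the gist): actions taken just before a lost point inherit a negative weight, and actions just before a won point inherit a positive weight.

import numpy as np

gamma = 0.99  # same discount factor as the gist

def discount_rewards(r):
    # Work backwards through the episode, mirroring tf_discount_rewards above.
    discounted = np.zeros_like(r, dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(r))):
        running_add = running_add * gamma + r[t]
        discounted[t] = running_add
    return discounted

# toy episode: three "no reward" frames, then the opponent scores (-1)
print(discount_rewards(np.array([0., 0., 0., -1.])))
# -> [-0.970299  -0.9801  -0.99  -1.]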
Regarding the loss, should it be loss = -tf.reduce_sum(tf.log(tf_aprob) * tf_y * tf_discounted_epr, axis=1)?
(Alternatively, this could work too: loss = tf.losses.log_loss(labels=tf_y, predictions=tf_aprob, weights=tf_discounted_epr).)
@sahnimanas since there are only positive rewards (i.e. you get a point if you win) and no negative rewards (i.e. you don't lose a point if you lose), we standardize the discounted rewards: subtract the mean and divide by the standard deviation, so roughly half of the values end up positive and half negative.
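To make that concrete, here is a rough NumPy equivalent of the tf.nn.moments / subtract / divide block in the gist (the helper name is mine, for illustration only):

import numpy as np

def standardize(discounted_r, eps=1e-6):
    # Shift to zero mean and scale to unit variance, as the tf.nn.moments block does.
    return (discounted_r - np.mean(discounted_r)) / np.sqrt(np.var(discounted_r) + eps)

# even if every raw return is positive, roughly half come out negative after standardizing
returns = np.array([0.970299, 0.9801, 0.99, 1.0])
print(standardize(returns))  # two negative entries, two positive entries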
Should we take the L2 loss (loss = tf.nn.l2_loss(tf_y - tf_aprob)) or the cross-entropy loss?
(As far as I can tell, most policy gradient implementations minimize a cross-entropy loss.)
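For what it's worth, a sketch of that cross-entropy variant (my code, assuming a TF 1.x-style API and reusing tf_aprob, tf_y, and tf_discounted_epr from the graph defined in the gist) would weight the log-probability of the sampled action by the standardized return and minimize that directly, instead of routing tf_discounted_epr through grad_loss:

# sketch only: replaces the l2_loss + grad_loss trick in the gist,
# assumes the placeholders and ops defined above are already built
log_probs = tf.log(tf_aprob + 1e-10)                       # [batch, n_actions]
chosen_log_prob = tf.reduce_sum(log_probs * tf_y, axis=1)  # [batch]
advantage = tf.reshape(tf_discounted_epr, [-1])            # [batch]
pg_loss = -tf.reduce_mean(chosen_log_prob * advantage)

optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=decay)
train_op = optimizer.minimize(pg_loss)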
@jjstrydom this implementation is not optimized for speed or for GPUs. It's supposed to be extremely minimalist. I'd suggest using a different implementation if you want to scale up.