Used normalized advantage functions (NAF) from this paper:
Continuous Deep Q-Learning with Model-based Acceleration
Shixiang Gu, Timothy Lillicrap, Ilya Sutskever, Sergey Levine
http://arxiv.org/abs/1603.00748
Refer to the code below for hyperparameters.
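In NAF the Q-function is decomposed as Q(x, u) = V(x) + A(x, u), where the advantage A(x, u) = -1/2 (u - mu(x))^T P(x) (u - mu(x)) is quadratic in the action and maximal at u = mu(x), and P(x) = L(x) L(x)^T is a positive-definite matrix produced by the network. The greedy action is therefore simply mu(x), which is what makes Q-learning tractable with continuous actions. The code below follows this structure; the single-actuator case is special-cased with scalars, and the constant 1/2 is dropped, which only rescales the learned P.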
import argparse
import gym
from gym.spaces import Box, Discrete
from keras.models import Model
from keras.layers import Input, Dense, Lambda, Reshape, merge
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras import backend as K
import theano.tensor as T
import numpy as np
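
# NOTE: this script targets the Keras 1.x API (merge, init=, Model(input=..., output=...))
# with the Theano backend, and is written for Python 2 (xrange, print statements).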

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, default=100)
parser.add_argument('--hidden_size', type=int, default=100)
parser.add_argument('--layers', type=int, default=2)
parser.add_argument('--batch_norm', action="store_true", default=False)
parser.add_argument('--no_batch_norm', action="store_false", dest="batch_norm")
parser.add_argument('--min_train', type=int, default=10)
parser.add_argument('--train_repeat', type=int, default=10)
parser.add_argument('--gamma', type=float, default=0.9)
parser.add_argument('--tau', type=float, default=0.001)
parser.add_argument('--episodes', type=int, default=200)
parser.add_argument('--max_timesteps', type=int, default=200)
parser.add_argument('--activation', choices=['tanh', 'relu'], default='tanh')
parser.add_argument('--optimizer', choices=['adam', 'rmsprop'], default='adam')
parser.add_argument('--optimizer_lr', type=float, default=0.001)
parser.add_argument('--noise_decay', choices=['linear', 'exp', 'fixed'], default='linear')
parser.add_argument('--fixed_noise', type=float, default=0.1)
parser.add_argument('--display', action='store_true', default=True)
parser.add_argument('--no_display', dest='display', action='store_false')
parser.add_argument('--gym_monitor')
parser.add_argument('environment')
args = parser.parse_args()

env = gym.make(args.environment)
assert isinstance(env.observation_space, Box)
assert isinstance(env.action_space, Box)
assert len(env.action_space.shape) == 1
num_actuators = env.action_space.shape[0]

if args.gym_monitor:
    env.monitor.start(args.gym_monitor)
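
# NAF building blocks:
#   L maps the raw network output to the factor of the positive matrix
#     (only the exponentiation is implemented here; lower-triangular masking is a TODO),
#   P builds the positive state-dependent term P = L L^T,
#   A is the quadratic advantage -(u - mu)^T P (u - mu),
#   Q recombines state value and advantage, Q = V + A.
# The single-actuator case uses scalar shortcuts instead of batched matrix products.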
if num_actuators == 1:
    def L(x):
        return K.exp(x)

    def P(x):
        return x*x

    def A(t):
        m, p, u = t
        return -(u - m)**2 * p

    def Q(t):
        v, a = t
        return v + a
else:
    def L(x):
        # TODO: batching
        #return T.nlinalg.alloc_diag(K.exp(T.nlinalg.ExtractDiag(view=True)(x))) + T.tril(x, k=-1)
        return K.exp(x)

    def P(x):
        return K.batch_dot(x, K.permute_dimensions(x, (0,2,1)))

    def A(t):
        m, p, u = t
        return -K.batch_dot(K.batch_dot(K.permute_dimensions(u - m, (0,2,1)), p), u - m)

    def Q(t):
        v, a = t
        return v + a
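
# createLayers builds the NAF network: a shared stack of hidden layers on the state input x,
# with three heads: the state value v, the greedy action m (mu), and the entries l of the
# matrix L. The action input u only enters when the advantage a and the Q-value q are assembled.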
def createLayers():
    x = Input(shape=env.observation_space.shape, name='x')
    u = Input(shape=env.action_space.shape, name='u')
    if args.batch_norm:
        h = BatchNormalization()(x)
    else:
        h = x
    for i in xrange(args.layers):
        h = Dense(args.hidden_size, activation=args.activation, name='h'+str(i+1))(h)
        if args.batch_norm and i != args.layers - 1:
            h = BatchNormalization()(h)
    v = Dense(1, init='uniform', name='v')(h)
    m = Dense(num_actuators, init='uniform', name='m')(h)
    l = Dense(num_actuators**2, name='l0')(h)
    l = Reshape((num_actuators, num_actuators))(l)
    l = Lambda(L, output_shape=(num_actuators, num_actuators), name='l')(l)
    p = Lambda(P, output_shape=(num_actuators, num_actuators), name='p')(l)
    a = merge([m, p, u], mode=A, output_shape=(None, num_actuators,), name="a")
    q = merge([v, a], mode=Q, output_shape=(None, num_actuators,), name="q")
    return x, u, m, v, q

x, u, m, v, q = createLayers()
_mu = K.function([K.learning_phase(), x], m)
mu = lambda x: _mu([0] + [x])

model = Model(input=[x,u], output=q)
model.summary()

if args.optimizer == 'adam':
    optimizer = Adam(args.optimizer_lr)
elif args.optimizer == 'rmsprop':
    optimizer = RMSprop(args.optimizer_lr)
else:
    assert False
model.compile(optimizer=optimizer, loss='mse')
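
# A second copy of the same architecture serves as the target network: V(s') for the TD
# targets is computed from it, and its weights track the online model via soft updates (tau).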
x, u, m, v, q = createLayers()
_V = K.function([K.learning_phase(), x], v)
V = lambda x: _V([0] + [x])
#q_f = K.function([x, u], q)

target_model = Model(input=[x,u], output=q)
target_model.set_weights(model.get_weights())
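
# Replay memory: plain Python lists that keep every transition observed so far.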
prestates = []
actions = []
rewards = []
poststates = []
terminals = []

total_reward = 0
for i_episode in xrange(args.episodes):
    observation = env.reset()
    #print "initial state:", observation
    episode_reward = 0
    for t in xrange(args.max_timesteps):
        if args.display:
            env.render()

        x = np.array([observation])
        u = mu(x)
        if args.noise_decay == 'linear':
            noise = 1. / (i_episode + 1)
        elif args.noise_decay == 'exp':
            noise = 10 ** -i_episode
        elif args.noise_decay == 'fixed':
            noise = args.fixed_noise
        else:
            assert False
        #print "noise:", noise
        action = u[0] + np.random.randn(num_actuators) * noise
        #print "action:", action

        prestates.append(observation)
        actions.append(action)
        #print "prestate:", observation

        observation, reward, done, info = env.step(action)
        episode_reward += reward
        #print "reward:", reward
        #print "poststate:", observation

        rewards.append(reward)
        poststates.append(observation)
        terminals.append(done)
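
        # Once enough transitions are stored, sample random minibatches, regress Q(s, a)
        # towards the TD target r + gamma * V(s') from the target network, and soft-update
        # the target network towards the online network.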
        if len(prestates) > args.min_train:
            for k in xrange(args.train_repeat):
                if len(prestates) > args.batch_size:
                    indexes = np.random.choice(len(prestates), size=args.batch_size)
                else:
                    indexes = range(len(prestates))

                v = V(np.array(poststates)[indexes])
                # terminal transitions contribute only the immediate reward
                y = np.array(rewards)[indexes] + args.gamma * np.squeeze(v) * (1 - np.array(terminals)[indexes])
                model.train_on_batch([np.array(prestates)[indexes], np.array(actions)[indexes]], y)

                weights = model.get_weights()
                target_weights = target_model.get_weights()
                for i in xrange(len(weights)):
                    target_weights[i] = args.tau * weights[i] + (1 - args.tau) * target_weights[i]
                target_model.set_weights(target_weights)

        if done:
            break

    episode_reward = episode_reward / float(t + 1)
    print "Episode {} finished after {} timesteps, average reward {}".format(i_episode + 1, t + 1, episode_reward)
    total_reward += episode_reward

print "Average reward per episode {}".format(total_reward / args.episodes)

if args.gym_monitor:
    env.monitor.close()
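
Run the script with a Gym environment id as the positional argument, e.g. a continuous-control task such as Pendulum-v0 (the environment must have Box observation and action spaces). All hyperparameters are optional flags with the defaults shown in the argparse section; use --no_display to disable rendering and --gym_monitor <dir> to record results.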