import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
import json, sys, os
from os import path
import random
from collections import deque

#####################################################################################################
## Algorithm

# Deep Deterministic Policy Gradient (DDPG)
# An off-policy actor-critic algorithm that uses additive exploration noise (e.g. an Ornstein-Uhlenbeck process) on top
# of a deterministic policy to generate experiences (s, a, r, s'). It uses minibatches of these experiences from replay
# memory to update the actor (policy) and critic (Q function) parameters.
# Neural networks are used for function approximation.
# Slowly-changing "target" networks are used to improve stability and encourage convergence.
# Parameter updates are made via Adam.
# Assumes continuous action spaces!
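#
# For reference, the updates implemented below (standard DDPG notation: Q = critic, mu = actor, primes = slow targets):
#   critic target    y = r + gamma * Q'(s', mu'(s'))           (y = r if s' is terminal)
#   critic loss      L = mean[(y - Q(s, a))^2] + L2 penalty
#   actor update     maximize mean[Q(s, mu(s))]  <=>  minimize -mean[Q(s, mu(s))]
#   target updates   theta' <- tau*theta + (1 - tau)*theta'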
#####################################################################################################
## Setup

env_to_use = 'Pendulum-v0'

# hyperparameters
gamma = 0.99                # reward discount factor
h1_actor = 8                # hidden layer 1 size for the actor
h2_actor = 8                # hidden layer 2 size for the actor
h3_actor = 8                # hidden layer 3 size for the actor
h1_critic = 8               # hidden layer 1 size for the critic
h2_critic = 8               # hidden layer 2 size for the critic
h3_critic = 8               # hidden layer 3 size for the critic
lr_actor = 1e-3             # learning rate for the actor
lr_critic = 1e-3            # learning rate for the critic
lr_decay = 1                # learning rate decay (per episode)
l2_reg_actor = 1e-6         # L2 regularization factor for the actor
l2_reg_critic = 1e-6        # L2 regularization factor for the critic
dropout_actor = 0           # dropout rate for actor (0 = no dropout)
dropout_critic = 0          # dropout rate for critic (0 = no dropout)
num_episodes = 15000        # number of episodes
max_steps_ep = 10000        # default max number of steps per episode (unless env has a lower hardcoded limit)
tau = 1e-2                  # soft target update rate
train_every = 1             # number of steps to run the policy (and collect experience) before updating network weights
replay_memory_capacity = int(1e5)   # capacity of experience replay memory
minibatch_size = 1024       # size of minibatch from experience replay memory for updates
initial_noise_scale = 0.1   # scale of the exploration noise process (1.0 is the range of each action dimension)
noise_decay = 0.99          # decay rate (per episode) of the scale of the exploration noise process
exploration_mu = 0.0        # mu parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_theta = 0.15    # theta parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_sigma = 0.2     # sigma parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
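# (The noise process is advanced by one Euler step per environment step, with dt = 1, inside the training loop below.)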
# game parameters
env = gym.make(env_to_use)
state_dim = np.prod(np.array(env.observation_space.shape))  # Get total number of dimensions in state
action_dim = np.prod(np.array(env.action_space.shape))      # Assuming continuous action space

# set seeds to 0
env.seed(0)
np.random.seed(0)

# prepare monitoring
outdir = '/tmp/ddpg-agent-results'
env = wrappers.Monitor(env, outdir, force=True)

def writefile(fname, s):
    with open(path.join(outdir, fname), 'w') as fh: fh.write(s)

info = {}
info['env_id'] = env.spec.id
info['params'] = dict(
    gamma = gamma,
    h1_actor = h1_actor,
    h2_actor = h2_actor,
    h3_actor = h3_actor,
    h1_critic = h1_critic,
    h2_critic = h2_critic,
    h3_critic = h3_critic,
    lr_actor = lr_actor,
    lr_critic = lr_critic,
    lr_decay = lr_decay,
    l2_reg_actor = l2_reg_actor,
    l2_reg_critic = l2_reg_critic,
    dropout_actor = dropout_actor,
    dropout_critic = dropout_critic,
    num_episodes = num_episodes,
    max_steps_ep = max_steps_ep,
    tau = tau,
    train_every = train_every,
    replay_memory_capacity = replay_memory_capacity,
    minibatch_size = minibatch_size,
    initial_noise_scale = initial_noise_scale,
    noise_decay = noise_decay,
    exploration_mu = exploration_mu,
    exploration_theta = exploration_theta,
    exploration_sigma = exploration_sigma
)

np.set_printoptions(threshold=np.nan)
replay_memory = deque(maxlen=replay_memory_capacity)  # deque with maxlen drops the oldest experience in O(1) once capacity is reached

def add_to_memory(experience):
    replay_memory.append(experience)

def sample_from_memory(minibatch_size):
    return random.sample(replay_memory, minibatch_size)
#####################################################################################################
## Tensorflow

tf.reset_default_graph()

# placeholders
state_ph = tf.placeholder(dtype=tf.float32, shape=[None, state_dim])
action_ph = tf.placeholder(dtype=tf.float32, shape=[None, action_dim])
reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None, state_dim])
is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None])  # indicators (go into target computation)
is_training_ph = tf.placeholder(dtype=tf.bool, shape=())  # for dropout

# episode counter
episodes = tf.Variable(0.0, trainable=False, name='episodes')
episode_inc_op = episodes.assign_add(1)
# will use this to initialize both the actor network and its slowly-changing target network with the same structure
def generate_actor_network(s, trainable, reuse):
    hidden = tf.layers.dense(s, h1_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
    hidden_drop = tf.layers.dropout(hidden, rate = dropout_actor, training = trainable & is_training_ph)
    hidden_2 = tf.layers.dense(hidden_drop, h2_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse)
    hidden_drop_2 = tf.layers.dropout(hidden_2, rate = dropout_actor, training = trainable & is_training_ph)
    hidden_3 = tf.layers.dense(hidden_drop_2, h3_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse)
    hidden_drop_3 = tf.layers.dropout(hidden_3, rate = dropout_actor, training = trainable & is_training_ph)
    actions_unscaled = tf.layers.dense(hidden_drop_3, action_dim, trainable = trainable, name = 'dense_3', reuse = reuse)
    actions = env.action_space.low + tf.nn.sigmoid(actions_unscaled)*(env.action_space.high - env.action_space.low)  # bound the actions to the valid range
    return actions

# actor network
with tf.variable_scope('actor'):
    # Policy's outputted action for each state_ph (for generating actions and training the critic)
    actions = generate_actor_network(state_ph, trainable = True, reuse = False)

# slow target actor network
with tf.variable_scope('slow_target_actor', reuse=False):
    # Slow target policy's outputted action for each next_state_ph (for training the critic)
    # use stop_gradient to treat the output values as constant targets when doing backprop
    slow_target_next_actions = tf.stop_gradient(generate_actor_network(next_state_ph, trainable = False, reuse = False))
# will use this to initialize both the critic network and its slowly-changing target network with the same structure
def generate_critic_network(s, a, trainable, reuse):
    state_action = tf.concat([s, a], axis=1)
    hidden = tf.layers.dense(state_action, h1_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
    hidden_drop = tf.layers.dropout(hidden, rate = dropout_critic, training = trainable & is_training_ph)
    hidden_2 = tf.layers.dense(hidden_drop, h2_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse)
    hidden_drop_2 = tf.layers.dropout(hidden_2, rate = dropout_critic, training = trainable & is_training_ph)
    hidden_3 = tf.layers.dense(hidden_drop_2, h3_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse)
    hidden_drop_3 = tf.layers.dropout(hidden_3, rate = dropout_critic, training = trainable & is_training_ph)
    q_values = tf.layers.dense(hidden_drop_3, 1, trainable = trainable, name = 'dense_3', reuse = reuse)
    return q_values

# critic network
with tf.variable_scope('critic') as scope:
    # Critic applied to state_ph and a given action (for training critic)
    q_values_of_given_actions = generate_critic_network(state_ph, action_ph, trainable = True, reuse = False)
    # Critic applied to state_ph and the current policy's outputted actions for state_ph (for training actor via deterministic policy gradient)
    q_values_of_suggested_actions = generate_critic_network(state_ph, actions, trainable = True, reuse = True)

# slow target critic network
with tf.variable_scope('slow_target_critic', reuse=False):
    # Slow target critic applied to slow target actor's outputted actions for next_state_ph (for training critic)
    slow_q_values_next = tf.stop_gradient(generate_critic_network(next_state_ph, slow_target_next_actions, trainable = False, reuse = False))
# isolate vars for each network
actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
slow_target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_actor')
critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
slow_target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_critic')

# update values for slowly-changing targets towards current actor and critic
update_slow_target_ops = []
for i, slow_target_actor_var in enumerate(slow_target_actor_vars):
    update_slow_target_actor_op = slow_target_actor_var.assign(tau*actor_vars[i] + (1-tau)*slow_target_actor_var)
    update_slow_target_ops.append(update_slow_target_actor_op)
for i, slow_target_var in enumerate(slow_target_critic_vars):
    update_slow_target_critic_op = slow_target_var.assign(tau*critic_vars[i] + (1-tau)*slow_target_var)
    update_slow_target_ops.append(update_slow_target_critic_op)
update_slow_targets_op = tf.group(*update_slow_target_ops, name='update_slow_targets')

# One step TD targets y_i for (s,a) from experience replay
# = r_i + gamma*Q_slow(s',mu_slow(s')) if s' is not terminal
# = r_i if s' is terminal
targets = tf.expand_dims(reward_ph, 1) + tf.expand_dims(is_not_terminal_ph, 1) * gamma * slow_q_values_next

# 1-step temporal difference errors
td_errors = targets - q_values_of_given_actions

# critic loss function (mean-square value error with regularization)
critic_loss = tf.reduce_mean(tf.square(td_errors))
for var in critic_vars:
    if 'bias' not in var.name:
        critic_loss += l2_reg_critic * 0.5 * tf.nn.l2_loss(var)

# critic optimizer
critic_train_op = tf.train.AdamOptimizer(lr_critic*lr_decay**episodes).minimize(critic_loss)

# actor loss function (negative mean Q-values under current policy, with regularization)
actor_loss = -1*tf.reduce_mean(q_values_of_suggested_actions)
for var in actor_vars:
    if 'bias' not in var.name:
        actor_loss += l2_reg_actor * 0.5 * tf.nn.l2_loss(var)

# actor optimizer
# the gradient of the mean Q-values wrt actor params is the deterministic policy gradient (keeping critic params fixed)
actor_train_op = tf.train.AdamOptimizer(lr_actor*lr_decay**episodes).minimize(actor_loss, var_list=actor_vars)
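# (Backprop through the critic into the actor computes grad_a Q(s,a)|a=mu(s) chained with grad_theta mu(s) automatically,
# which is the two-factor deterministic policy gradient; restricting var_list to actor_vars keeps the critic's
# parameters fixed during this update.)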
# initialize session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

#####################################################################################################
## Training

total_steps = 0
for ep in range(num_episodes):

    total_reward = 0
    steps_in_ep = 0

    # Initialize exploration noise process
    noise_process = np.zeros(action_dim)
    noise_scale = (initial_noise_scale * noise_decay**ep) * (env.action_space.high - env.action_space.low)

    # Initial state
    observation = env.reset()
    if ep%10 == 0: env.render()

    for t in range(max_steps_ep):

        # choose action based on deterministic policy
        action_for_state, = sess.run(actions,
            feed_dict = {state_ph: observation[None], is_training_ph: False})

        # add temporally-correlated exploration noise to action (using an Ornstein-Uhlenbeck process)
        # print(action_for_state)
        # OU update (Euler step with dt = 1): X <- X + theta*(mu - X) + sigma*N(0,1)
        noise_process += exploration_theta*(exploration_mu - noise_process) + exploration_sigma*np.random.randn(action_dim)
        # print(noise_scale*noise_process)
        action_for_state += noise_scale*noise_process
        # take step
        next_observation, reward, done, _info = env.step(action_for_state)
        if ep%10 == 0: env.render()
        total_reward += reward

        add_to_memory((observation, action_for_state, reward, next_observation,
            # is next_observation a terminal state?
            # 0.0 if done and not env.env._past_limit() else 1.0))
            0.0 if done else 1.0))

        # update network weights to fit a minibatch of experience
        if total_steps%train_every == 0 and len(replay_memory) >= minibatch_size:

            # grab N (s,a,r,s') tuples from replay memory
            minibatch = sample_from_memory(minibatch_size)

            # update the critic and actor params using mean-square value error and deterministic policy gradient, respectively
            _, _ = sess.run([critic_train_op, actor_train_op],
                feed_dict = {
                    state_ph: np.asarray([elem[0] for elem in minibatch]),
                    action_ph: np.asarray([elem[1] for elem in minibatch]),
                    reward_ph: np.asarray([elem[2] for elem in minibatch]),
                    next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                    is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch]),
                    is_training_ph: True})

            # update slow actor and critic targets towards current actor and critic
            _ = sess.run(update_slow_targets_op)

        observation = next_observation
        total_steps += 1
        steps_in_ep += 1

        if done:
            # Increment episode counter
            _ = sess.run(episode_inc_op)
            break

    print('Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'%(ep, total_reward, steps_in_ep, noise_scale))

# Finalize and upload results
writefile('info.json', json.dumps(info))
env.close()
gym.upload(outdir)
Hi, thanks for sharing. For the actor loss here,
actor_loss = -1*tf.reduce_mean(q_values_of_suggested_actions), it looks like you are only considering the contribution of the critic. But in the original paper, the deterministic policy gradient has two factors:
\nabla_{\theta^\mu} J \approx \mathbb{E}_{s_t \sim \rho^\beta}\!\left[ \nabla_{\theta^\mu} Q(s,a|\theta^Q)\big|_{s=s_t,\, a=\mu(s_t|\theta^\mu)} \right] = \mathbb{E}_{s_t \sim \rho^\beta}\!\left[ \nabla_a Q(s,a|\theta^Q)\big|_{s=s_t,\, a=\mu(s_t)} \, \nabla_{\theta^\mu} \mu(s|\theta^\mu)\big|_{s=s_t} \right]
In your implementation, the contribution of the actor network seems to be missing.
@mehdimashayekhi This is not the case. q_values_of_suggested_actions passes actions through the critic network, where actions come from a forward pass through the actor network. This is where the contribution of the actor network comes from.
@lerrytang, var_list is restricted to the actor's variables in the actor update step, so the critic's parameters are held fixed during that update.
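For anyone who wants the chain rule spelled out in the graph: minimizing actor_loss over actor_vars is equivalent to the explicit two-factor form from the paper, since backprop differentiates through the critic into the actor. Below is a minimal sketch against the tensors already defined in the gist (q_values_of_suggested_actions, actions, actor_vars, minibatch_size); it omits the L2 term, and dq_da, actor_grads, and explicit_actor_train_op are names introduced here just for illustration.
# dQ/da at a = mu(s); tf.gradients sums over the minibatch
dq_da = tf.gradients(q_values_of_suggested_actions, actions)[0]
# chain rule: gradient of -mean(Q(s, mu(s))) w.r.t. the actor's parameters
actor_grads = tf.gradients(actions, actor_vars, grad_ys=-dq_da / minibatch_size)
explicit_actor_train_op = tf.train.AdamOptimizer(lr_actor*lr_decay**episodes).apply_gradients(
    list(zip(actor_grads, actor_vars)))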