An agent that can play tic tac toe
import numpy as np


class Agent(object):
    def __init__(self, exploration_rate=0.33, learning_rate=0.5, discount_factor=0.01):
        """
        An agent is a problem solver.
        It should perform actions like:
        - plotting a symbol on the tic-tac-toe board if the cell is vacant.
        - remembering which states are more profitable than others.
        - exploring to find better states.
        - exploiting known states for maximum profit.

        params:
        - exploration_rate: a floating point number < 1
          which defines the agent's probability to explore.
        - learning_rate: used for assessing the value of intermediate
          states during temporal difference learning.
        - discount_factor: the factor by which a reward is reduced
          as it is passed back to intermediate states.

        See the self-play sketch after the class for how these pieces fit together.
        """
        self.states = {}
        # Maps a serialized board (see serialize_board) to a 3x3 matrix of action values.
        self.state_order = []
        # The order in which the agent progressed through states, so that
        # discounted rewards can be assigned to earlier states.
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
    @staticmethod
    def serialize_board(board):
        """
        Convert the matrix
        [
            [1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
        ]
        to the form "123456789", i.e. flatten and stringify.
        """
        serialized_board = board.flatten()
        return ''.join([str(i) for i in serialized_board.tolist()])
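    # Illustrative example (symbol values are arbitrary; 0 marks a vacant cell):
    #   Agent.serialize_board(np.array([[1, 2, 0], [0, 1, 2], [2, 0, 1]]))
    #   returns "120012201"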
    def get_serious(self):
        """
        Quit exploring states and start exploiting.
        Use this if you want to play with the agent.
        """
        self.exploration_rate = 0
    def learn_by_temporal_difference(self, reward, new_state_key, state_key):
        """
        Implementation of the temporal difference formula.
        https://en.wikipedia.org/wiki/Temporal_difference_learning
        https://detailed.af/reinforcement/
        """
        old_state = self.states.get(state_key, np.zeros((3, 3)))
        return self.learning_rate * ((reward * self.states[new_state_key]) - old_state)
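    # Note: the update above is this gist's variant, in which scaling the successor
    # state's value matrix by the reward roughly plays the role of the bootstrapped
    # target. For reference, a textbook TD(0)/Q-learning update (not what this
    # code does) looks like:
    #   Q(s, a) <- Q(s, a) + learning_rate * (reward + gamma * max_a' Q(s', a') - Q(s, a))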
    def set_state(self, old_board, action):
        """
        Store the action performed for a given state.
        """
        state_key = Agent.serialize_board(old_board)
        self.state_order.append((state_key, action))
    def on_reward(self, reward):
        """
        Assign rewards to actions performed on intermediate states.
        """
        new_state_key, new_action = self.state_order.pop()
        # Get the latest state and the action performed on it that led to the reward.
        self.states[new_state_key] = np.zeros((3, 3))
        # Initialize the value of this state with a zero matrix.
        self.states[new_state_key][new_action] = reward
        # Assign the reward to the action taken in this state.
        while self.state_order:
            # While there is a stack of earlier states (reached through actions performed):
            state_key, action = self.state_order.pop()
            # get the state and the action performed on it,
            reward *= self.discount_factor
            # and reduce the original reward (self.discount_factor is a number < 1).
            # Implementation of the value function:
            if state_key in self.states:
                # If this state was encountered in an earlier game, adjust its previous value.
                reward += self.learn_by_temporal_difference(reward, new_state_key, state_key).item(new_action)
                self.states[state_key][action] = reward
            else:
                # If this state was not encountered before, assign it the discounted reward as its value.
                self.states[state_key] = np.zeros((3, 3))
                reward = self.learn_by_temporal_difference(reward, new_state_key, state_key).item(new_action)
                self.states[state_key][action] = reward
            new_state_key = state_key
            new_action = action
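    # Net effect: the move made right before the reward receives the full reward,
    # while each earlier move in the stack receives a progressively smaller,
    # discounted share, adjusted by the temporal-difference term above.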
    def select_move(self, board):
        """
        Choose between exploration and exploitation.
        Epsilon-greedy implementation of the policy.
        http://home.deib.polimi.it/restelli/MyWebSite/pdf/rl5.pdf
        http://tokic.com/www/tokicm/publikationen/papers/AdaptiveEpsilonGreedyExploration.pdf
        """
        state_key = Agent.serialize_board(board)
        exploration = np.random.random() < self.exploration_rate
        print('explore' if exploration or state_key not in self.states else 'exploit')
        action = self.explore_board(board) \
            if exploration or state_key not in self.states \
            else self.exploit_board(state_key)
        print(action)
        self.set_state(board, action)
        return action
    def explore_board(self, board):
        """
        Find an empty cell on the board at random.
        """
        zero_x, zero_y = np.where(board == 0)
        vacant_cells = [(x, y) for x, y in zip(zero_x, zero_y)]
        randomly_selected_vacant_cell = np.random.choice(len(vacant_cells))
        return vacant_cells[randomly_selected_vacant_cell]
    def exploit_board(self, state_key):
        """
        Find the best action for the given state.
        """
        state_values = self.states[state_key]
        # For the current state, get the matrix of accumulated rewards.
        print('State rewards')
        print(state_values)
        best_actions_x, best_actions_y = np.where(state_values == state_values.max())
        # Find the coordinates which correspond to the highest reward.
        best_value_indices = [(x, y) for x, y in zip(best_actions_x, best_actions_y)]
        select_index = np.random.choice(len(best_value_indices))
        return best_value_indices[select_index]
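
A minimal self-play sketch of how the Agent class above could be trained and then used. The winner helper, the reward values (1 for a win, -1 for a loss, 0.5 for a draw), and the episode count are illustrative assumptions, not part of the gist; note that select_move prints its choice on every call, so a long run will be chatty.

import numpy as np


def winner(board):
    # Hypothetical helper: return the symbol (1 or 2) that has three in a row, else 0.
    lines = list(board) + list(board.T) + [board.diagonal(), np.fliplr(board).diagonal()]
    for line in lines:
        if line[0] != 0 and np.all(line == line[0]):
            return int(line[0])
    return 0


agent_one, agent_two = Agent(), Agent()

for episode in range(10000):  # episode count is arbitrary
    board = np.zeros((3, 3), dtype=int)
    players = [(agent_one, 1), (agent_two, 2)]
    for turn in range(9):
        agent, symbol = players[turn % 2]
        x, y = agent.select_move(board)
        board[x, y] = symbol
        if winner(board) == symbol:
            agent.on_reward(1)                          # illustrative reward for a win
            players[(turn + 1) % 2][0].on_reward(-1)    # illustrative penalty for a loss
            break
    else:
        agent_one.on_reward(0.5)                        # illustrative reward for a draw
        agent_two.on_reward(0.5)

agent_one.get_serious()  # stop exploring before playing against it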