An agent that can play tic tac toe
import numpy as np


class Agent(object):
    def __init__(self, exploration_rate=0.33, learning_rate=0.5, discount_factor=0.01):
        """
        An agent is a problem solver.
        It should perform actions like:
        - plotting a symbol on the tic-tac-toe board if the cell is vacant.
        - remembering which states are more profitable than others.
        - exploring to find better states.
        - exploiting known states for maximum profit.

        params:
        - exploration_rate: a floating point number < 1
          which defines the agent's probability to explore.
        - learning_rate: used for assessing the value of intermediate
          states during temporal difference learning.
        - discount_factor: the factor by which a reward is reduced
          as it is passed back to intermediate states.

        See the self-play sketch after the class for how these pieces fit together.
        """
        self.states = {}
        # Maps a serialized board (see serialize_board) to a 3x3 matrix of action values.
        self.state_order = []
        # The order in which the agent progressed through states, so that
        # discounted rewards can be assigned to earlier states.
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
    @staticmethod
    def serialize_board(board):
        """
        Convert the matrix
        [
            [1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
        ]
        to the form "123456789", i.e. flatten and stringify.
        """
        serialized_board = board.flatten()
        return ''.join([str(i) for i in serialized_board.tolist()])
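    # Illustrative example (symbol values are arbitrary; 0 marks a vacant cell):
    #   Agent.serialize_board(np.array([[1, 2, 0], [0, 1, 2], [2, 0, 1]]))
    #   returns "120012201"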
    def get_serious(self):
        """
        Quit exploring states and start exploiting.
        Use this if you want to play with the agent.
        """
        self.exploration_rate = 0
    def learn_by_temporal_difference(self, reward, new_state_key, state_key):
        """
        Implementation of the temporal difference formula.
        https://en.wikipedia.org/wiki/Temporal_difference_learning
        https://detailed.af/reinforcement/
        """
        old_state = self.states.get(state_key, np.zeros((3, 3)))
        return self.learning_rate * ((reward * self.states[new_state_key]) - old_state)
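    # Note: the update above is this gist's variant, in which scaling the successor
    # state's value matrix by the reward roughly plays the role of the bootstrapped
    # target. For reference, a textbook TD(0)/Q-learning update (not what this
    # code does) looks like:
    #   Q(s, a) <- Q(s, a) + learning_rate * (reward + gamma * max_a' Q(s', a') - Q(s, a))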
    def set_state(self, old_board, action):
        """
        Store the action performed for a given state.
        """
        state_key = Agent.serialize_board(old_board)
        self.state_order.append((state_key, action))
    def on_reward(self, reward):
        """
        Assign rewards to actions performed on intermediate states.
        """
        new_state_key, new_action = self.state_order.pop()
        # Get the latest state and the action performed on it that led to the reward.
        self.states[new_state_key] = np.zeros((3, 3))
        # Initialize the value of this state with a zero matrix.
        self.states[new_state_key][new_action] = reward
        # Assign the reward to the action taken in this state.
        while self.state_order:
            # While there is a stack of earlier states (reached through actions performed):
            state_key, action = self.state_order.pop()
            # get the state and the action performed on it,
            reward *= self.discount_factor
            # and reduce the original reward (self.discount_factor is a number < 1).
            # Implementation of the value function:
            if state_key in self.states:
                # If this state was encountered in an earlier game, adjust its previous value.
                reward += self.learn_by_temporal_difference(reward, new_state_key, state_key).item(new_action)
                self.states[state_key][action] = reward
            else:
                # If this state was not encountered before, assign it the discounted reward as its value.
                self.states[state_key] = np.zeros((3, 3))
                reward = self.learn_by_temporal_difference(reward, new_state_key, state_key).item(new_action)
                self.states[state_key][action] = reward
            new_state_key = state_key
            new_action = action
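    # Net effect: the move made right before the reward receives the full reward,
    # while each earlier move in the stack receives a progressively smaller,
    # discounted share, adjusted by the temporal-difference term above.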
    def select_move(self, board):
        """
        Choose between exploration and exploitation.
        Epsilon-greedy implementation of the policy.
        http://home.deib.polimi.it/restelli/MyWebSite/pdf/rl5.pdf
        http://tokic.com/www/tokicm/publikationen/papers/AdaptiveEpsilonGreedyExploration.pdf
        """
        state_key = Agent.serialize_board(board)
        exploration = np.random.random() < self.exploration_rate
        print('explore' if exploration or state_key not in self.states else 'exploit')
        action = self.explore_board(board) \
            if exploration or state_key not in self.states \
            else self.exploit_board(state_key)
        print(action)
        self.set_state(board, action)
        return action
    def explore_board(self, board):
        """
        Find an empty cell on the board at random.
        """
        zero_x, zero_y = np.where(board == 0)
        vacant_cells = [(x, y) for x, y in zip(zero_x, zero_y)]
        randomly_selected_vacant_cell = np.random.choice(len(vacant_cells))
        return vacant_cells[randomly_selected_vacant_cell]
    def exploit_board(self, state_key):
        """
        Find the best action for the given state.
        """
        state_values = self.states[state_key]
        # For the current state, get the matrix of accumulated rewards.
        print('State rewards')
        print(state_values)
        best_actions_x, best_actions_y = np.where(state_values == state_values.max())
        # Find the coordinates which correspond to the highest reward.
        best_value_indices = [(x, y) for x, y in zip(best_actions_x, best_actions_y)]
        select_index = np.random.choice(len(best_value_indices))
        return best_value_indices[select_index]
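
A minimal self-play sketch of how the Agent class above could be trained and then used. The winner helper, the reward values (1 for a win, -1 for a loss, 0.5 for a draw), and the episode count are illustrative assumptions, not part of the gist; note that select_move prints its choice on every call, so a long run will be chatty.

import numpy as np


def winner(board):
    # Hypothetical helper: return the symbol (1 or 2) that has three in a row, else 0.
    lines = list(board) + list(board.T) + [board.diagonal(), np.fliplr(board).diagonal()]
    for line in lines:
        if line[0] != 0 and np.all(line == line[0]):
            return int(line[0])
    return 0


agent_one, agent_two = Agent(), Agent()

for episode in range(10000):  # episode count is arbitrary
    board = np.zeros((3, 3), dtype=int)
    players = [(agent_one, 1), (agent_two, 2)]
    for turn in range(9):
        agent, symbol = players[turn % 2]
        x, y = agent.select_move(board)
        board[x, y] = symbol
        if winner(board) == symbol:
            agent.on_reward(1)                          # illustrative reward for a win
            players[(turn + 1) % 2][0].on_reward(-1)    # illustrative penalty for a loss
            break
    else:
        agent_one.on_reward(0.5)                        # illustrative reward for a draw
        agent_two.on_reward(0.5)

agent_one.get_serious()  # stop exploring before playing against it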