Simple Python implementation of the value iteration algorithm in Reinforcement Learning.
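For reference, the update the code below performs on each sweep is the standard Bellman optimality backup, written here in the notation of the function's parameters:

```latex
% One sweep of value iteration: back up every state with its best action,
% and stop once the largest change across states falls below theta.
V_{k+1}(s) = \max_{a \in A} \sum_{s' \in S} P(s, a, s') \, \bigl[ R(s, a, s') + \gamma \, V_k(s') \bigr]
```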
import numpy as np


def value_iteration(S, A, P, R, gamma, theta):
    '''
    Parameters:
        S: states. A dictionary mapping each state name to its index.
        A: actions. A dictionary mapping each action name to its index.
        P: transition probabilities. P[previous_state, action, next_state] = probability.
        R: rewards. R[previous_state, action, next_state] = reward.
        gamma: discount factor.
        theta: stop iterating once the largest value change in a sweep is smaller than theta.
    Output:
        the value function of each state
    '''
    # Initialize V(s) = 0 for every state in S (any arbitrary initialization works).
    V = {}
    for key in S.keys():
        V[key] = 0
    iteration_cnt = 0
    while True:
        iteration_cnt = iteration_cnt + 1
        delta = 0
        # Sweep over all states and update each state value in place.
        for (sk, s) in S.items():
            v_previous = V[sk]
            # Find the best one-step backup over all actions for state s.
            # Note: actions with an all-zero row in P back up to a value of 0.
            v_best = float("-inf")
            for a in A.values():
                # ns and nsk are the next state's index and key.
                v = np.sum([P[s, a, ns] * (R[s, a, ns] + gamma * V[nsk]) for (nsk, ns) in S.items()])
                v_best = max(v, v_best)
            # Update the value of state s.
            V[sk] = v_best
            # Track the largest change seen in this sweep.
            delta = max(delta, np.abs(V[sk] - v_previous))
        if delta < theta:
            print("Total iterations = %d" % iteration_cnt)
            break
    return V


if __name__ == '__main__':
    # States
    S = {
        "Good": 0,
        "Bad": 1
    }
    # Actions
    A = {
        "Produce": 0,
        "Inactive": 1,
        "Repair": 2
    }
    # Transition probabilities P(s, a, s'); unspecified entries stay 0.
    P = np.zeros((len(S), len(A), len(S)))
    P[S["Good"], A["Produce"], S["Good"]] = 0.8
    P[S["Good"], A["Produce"], S["Bad"]] = 0.2
    P[S["Good"], A["Inactive"], S["Good"]] = 1
    P[S["Bad"], A["Repair"], S["Good"]] = 0.8
    P[S["Bad"], A["Repair"], S["Bad"]] = 0.2
    # Rewards R(s, a, s')
    R = np.zeros((len(S), len(A), len(S)))
    R[S["Good"], A["Produce"], S["Good"]] = 1
    R[S["Good"], A["Produce"], S["Bad"]] = 0
    R[S["Good"], A["Inactive"], S["Good"]] = 0
    R[S["Bad"], A["Repair"], S["Good"]] = 0
    R[S["Bad"], A["Repair"], S["Bad"]] = -10
    # Discount factor
    gamma = 0.9
    # Run value iteration
    V = value_iteration(S, A, P, R, gamma, 1e-8)
    print(V)
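The gist returns only the value function. As a usage note, here is a minimal sketch of how a greedy policy could be read off the converged values, reusing the S, A, P, R, gamma, and V names defined above; the greedy_policy helper below is an illustrative addition, not something defined in the gist itself.

```python
import numpy as np

def greedy_policy(S, A, P, R, gamma, V):
    """For each state, pick the action whose one-step backup of V is largest."""
    policy = {}
    for sk, s in S.items():
        best_a, best_v = None, float("-inf")
        for ak, a in A.items():
            # Same one-step backup as inside value_iteration, but per action.
            v = np.sum([P[s, a, ns] * (R[s, a, ns] + gamma * V[nsk])
                        for nsk, ns in S.items()])
            if v > best_v:
                best_a, best_v = ak, v
        policy[sk] = best_a
    return policy

# Example usage, assuming V was produced by value_iteration above:
# print(greedy_policy(S, A, P, R, gamma, V))
```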