@Peng-YM
Last active March 26, 2020 09:30
Simple Python implementation of the value iteration algorithm in Reinforcement Learning.
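Each sweep of the loop below applies the standard Bellman optimality backup to every state, stopping once no state's value changes by more than theta:

V(s) \leftarrow \max_{a \in A} \sum_{s' \in S} P(s, a, s') \left[ R(s, a, s') + \gamma V(s') \right]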
import numpy as np


def value_iteration(S, A, P, R, gamma, theta):
    '''
    Parameters:
        S: states. A dictionary mapping each state name to its index.
        A: actions. A dictionary mapping each action name to its index.
        P: transition probabilities. P[previous_state, action, next_state] = probability.
        R: rewards. R[previous_state, action, next_state] = reward.
        gamma: discount factor.
        theta: stop iterating once the largest change in the value function is smaller than theta.
    Output:
        the value function of each state
    '''
    # Initialize V to zero for every state in S
    V = {}
    for key in S.keys():
        V[key] = 0
    iteration_cnt = 1
    while True:
        iteration_cnt = iteration_cnt + 1
        delta = 0
        # Update the value function of each state
        for (sk, s) in S.items():
            v_previous = V[sk]
            # Find the best state-value for state s
            v_best = float("-inf")
            for a in A.values():
                # ns and nsk are the next state and its key
                v = np.sum([P[s, a, ns] * (R[s, a, ns] + gamma * V[nsk])
                            for (nsk, ns) in S.items()])
                v_best = max(v, v_best)
            # Update
            V[sk] = v_best
            # Track the largest change in this sweep
            delta = max(delta, np.abs(V[sk] - v_previous))
        if delta < theta:
            print("Total iteration = %d" % iteration_cnt)
            break
    return V
if __name__ == '__main__':
    # States
    S = {
        "Good": 0,
        "Bad": 1
    }
    # Actions
    A = {
        "Produce": 0,
        "Inactive": 1,
        "Repair": 2
    }
    # Transition probabilities P(s, a, s')
    P = np.zeros((len(S), len(A), len(S)))
    P[S["Good"], A["Produce"], S["Good"]] = 0.8
    P[S["Good"], A["Produce"], S["Bad"]] = 0.2
    P[S["Good"], A["Inactive"], S["Good"]] = 1
    P[S["Bad"], A["Repair"], S["Good"]] = 0.8
    P[S["Bad"], A["Repair"], S["Bad"]] = 0.2
    # Rewards R(s, a, s')
    R = np.zeros((len(S), len(A), len(S)))
    R[S["Good"], A["Produce"], S["Good"]] = 1
    R[S["Good"], A["Produce"], S["Bad"]] = 0
    R[S["Good"], A["Inactive"], S["Good"]] = 0
    R[S["Bad"], A["Repair"], S["Good"]] = 0
    R[S["Bad"], A["Repair"], S["Bad"]] = -10
    # Discount factor
    gamma = 0.9
    # Run value iteration
    V = value_iteration(S, A, P, R, gamma, 1e-8)
    print(V)
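Value iteration returns only the state values; the greedy policy can be recovered with one more sweep of the same one-step lookahead. The helper below is a minimal sketch, not part of the original gist: extract_policy is a hypothetical name, and it assumes the same S/A/P/R conventions as above.

def extract_policy(S, A, P, R, gamma, V):
    # Hypothetical helper (not in the original gist): for each state, pick
    # the action maximizing the one-step lookahead value under the converged V.
    policy = {}
    for (sk, s) in S.items():
        q_best, a_best = float("-inf"), None
        for (ak, a) in A.items():
            q = np.sum([P[s, a, ns] * (R[s, a, ns] + gamma * V[nsk])
                        for (nsk, ns) in S.items()])
            if q > q_best:
                q_best, a_best = q, ak
        policy[sk] = a_best
    return policy

With the example MDP above, this yields {'Good': 'Produce', 'Bad': 'Repair'}.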
Peng-YM commented Mar 26, 2020

Sample output:

Total iteration = 130
{'Good': 2.959999926929049, 'Bad': 0.15999993391412004}
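As a quick sanity check (not part of the original comment), substituting these values back into the Bellman backup for the greedy actions reproduces them to within theta:

V(\text{Good}) = 0.8 (1 + 0.9 \cdot 2.96) + 0.2 (0 + 0.9 \cdot 0.16) = 2.9312 + 0.0288 = 2.96
V(\text{Bad}) = 0.8 (0 + 0.9 \cdot 2.96) + 0.2 (-10 + 0.9 \cdot 0.16) = 2.1312 - 1.9712 = 0.16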
