Simple Python implementation of the value iteration algorithm in Reinforcement Learning.
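For reference, the update the code below performs on each sweep is the standard Bellman optimality backup, written here in the notation of the function's parameters:

```latex
% One sweep of value iteration: back up every state with its best action,
% and stop once the largest change across states falls below theta.
V_{k+1}(s) = \max_{a \in A} \sum_{s' \in S} P(s, a, s') \, \bigl[ R(s, a, s') + \gamma \, V_k(s') \bigr]
```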
import numpy as np


def value_iteration(S, A, P, R, gamma, theta):
    '''
    Parameters:
        S: states. A dictionary mapping each state name to its index.
        A: actions. A dictionary mapping each action name to its index.
        P: transition probabilities. P[previous_state, action, next_state] = probability.
        R: rewards. R[previous_state, action, next_state] = reward.
        gamma: discount factor.
        theta: stop iterating once the largest value change in a sweep is smaller than theta.
    Output:
        the value function of each state
    '''
    # Initialize V(s) = 0 for every state in S (any arbitrary initialization works).
    V = {}
    for key in S.keys():
        V[key] = 0
    iteration_cnt = 0
    while True:
        iteration_cnt = iteration_cnt + 1
        delta = 0
        # Sweep over all states and update each state value in place.
        for (sk, s) in S.items():
            v_previous = V[sk]
            # Find the best one-step backup over all actions for state s.
            # Note: actions with an all-zero row in P back up to a value of 0.
            v_best = float("-inf")
            for a in A.values():
                # ns and nsk are the next state's index and key.
                v = np.sum([P[s, a, ns] * (R[s, a, ns] + gamma * V[nsk]) for (nsk, ns) in S.items()])
                v_best = max(v, v_best)
            # Update the value of state s.
            V[sk] = v_best
            # Track the largest change seen in this sweep.
            delta = max(delta, np.abs(V[sk] - v_previous))
        if delta < theta:
            print("Total iterations = %d" % iteration_cnt)
            break
    return V


if __name__ == '__main__':
    # States
    S = {
        "Good": 0,
        "Bad": 1
    }
    # Actions
    A = {
        "Produce": 0,
        "Inactive": 1,
        "Repair": 2
    }
    # Transition probabilities P(s, a, s'); unspecified entries stay 0.
    P = np.zeros((len(S), len(A), len(S)))
    P[S["Good"], A["Produce"], S["Good"]] = 0.8
    P[S["Good"], A["Produce"], S["Bad"]] = 0.2
    P[S["Good"], A["Inactive"], S["Good"]] = 1
    P[S["Bad"], A["Repair"], S["Good"]] = 0.8
    P[S["Bad"], A["Repair"], S["Bad"]] = 0.2
    # Rewards R(s, a, s')
    R = np.zeros((len(S), len(A), len(S)))
    R[S["Good"], A["Produce"], S["Good"]] = 1
    R[S["Good"], A["Produce"], S["Bad"]] = 0
    R[S["Good"], A["Inactive"], S["Good"]] = 0
    R[S["Bad"], A["Repair"], S["Good"]] = 0
    R[S["Bad"], A["Repair"], S["Bad"]] = -10
    # Discount factor
    gamma = 0.9
    # Run value iteration
    V = value_iteration(S, A, P, R, gamma, 1e-8)
    print(V)
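The gist returns only the value function. As a usage note, here is a minimal sketch of how a greedy policy could be read off the converged values, reusing the S, A, P, R, gamma, and V names defined above; the greedy_policy helper below is an illustrative addition, not something defined in the gist itself.

```python
import numpy as np

def greedy_policy(S, A, P, R, gamma, V):
    """For each state, pick the action whose one-step backup of V is largest."""
    policy = {}
    for sk, s in S.items():
        best_a, best_v = None, float("-inf")
        for ak, a in A.items():
            # Same one-step backup as inside value_iteration, but per action.
            v = np.sum([P[s, a, ns] * (R[s, a, ns] + gamma * V[nsk])
                        for nsk, ns in S.items()])
            if v > best_v:
                best_a, best_v = ak, v
        policy[sk] = best_a
    return policy

# Example usage, assuming V was produced by value_iteration above:
# print(greedy_policy(S, A, P, R, gamma, V))
```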