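# value_iteration.py
#
# Solves the Maze environment by value iteration: sweep the Bellman optimality
# backup over all states, recover Q(s, a) from the converged state values,
# extract the greedy policy, save the Q-table, and plot the resulting path.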
import sys
folder_url = "./"
sys.path.append(folder_url)  # make the local modules importable
from evaluation import *
from maze import *  # the star imports supply Maze and the ACTMAP slip-action table
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
## Value Iteration
def get_Q(state, action, V_values):
    """One-step Bellman backup: expected return of taking `action` in `state`.

    Uses the module-level `env` and `discount` defined below.
    """
    Q = 0
    reward_list = []
    # With probability env.slip the agent slips and executes ACTMAP[action]
    # instead; enumerate both outcomes with deterministic (slip_chance=False) steps.
    slip_action = ACTMAP[action]
    slip_reward, slip_next_state, _ = env.step(state, slip_action, slip_chance=False)
    reward_list.append((slip_reward, slip_next_state, env.slip))
    reward, next_state, _ = env.step(state, action, slip_chance=False)
    reward_list.append((reward, next_state, 1 - env.slip))
    for reward, next_state, probability in reward_list:
        Q += probability * (reward + discount * V_values[next_state])
    return Q
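# get_Q evaluates
#     Q(s, a) = sum_{s'} P(s' | s, a) * (R(s, a, s') + discount * V(s'))
# where the transition has exactly two outcomes: the intended action with
# probability 1 - env.slip, and the slipped action ACTMAP[action] with
# probability env.slip.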
# Hyperparameters
discount = 0.9  # discount factor gamma
epochs = 1000   # number of full value-iteration sweeps

# Initialize
env = Maze()
V_values = np.zeros(env.snum)              # state values V(s)
Q_values = np.zeros((env.snum, env.anum))  # action values Q(s, a)
optimal_policy = np.zeros(env.snum)        # greedy action index per state
# Value iteration: repeatedly apply the Bellman optimality backup
for i in range(epochs):
    print('i', i)
    for state in range(env.snum):
        # Leave the goal cell at V = 0; state // 8 recovers the cell index
        # (the low bits of a state presumably encode item flags in maze.py).
        if env.idx2cell[int(state / 8)] == env.goal_pos:
            continue
        V = float('-inf')
        for action in range(env.anum):
            Q = get_Q(state, action, V_values)
            V = max(V, Q)
        V_values[state] = V
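# Note: a fixed number of sweeps is a simple choice; a common alternative (not
# in the original script) is to track delta = max_s |V_new(s) - V_old(s)| per
# sweep and stop once delta falls below a small tolerance such as 1e-8.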
# Recover Q(s, a) from the converged values, then act greedily per state.
for state in range(env.snum):
    for action in range(env.anum):
        Q_values[state, action] = get_Q(state, action, V_values)

for state in range(env.snum):
    optimal_policy[state] = np.argmax(Q_values[state, :])
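# Once V has converged to V*, the greedy policy with respect to Q* is optimal.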
# Save the Q-table (the results/ directory must already exist)
np.save(folder_url + 'results/Q_values', Q_values)
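# np.save appends the .npy extension automatically, so the array can be
# reloaded later with np.load(folder_url + 'results/Q_values.npy').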
# Roll out the greedy policy from the start state and plot the path on the map
state = 0
done = False
while not done:
    action = int(optimal_policy[state])
    reward, next_state, done = env.step(state, action, slip_chance=False)
    print('state:', state, 'action:', ["UP", "DOWN", "LEFT", "RIGHT"][action], 'reward:', reward)
    env.plot(state, action)
    state = next_state
print('final state:', state, 'reward:', reward)
env.plot(state, action)