Q-Learning
Q-Learning决策:
用Q Table记录每一个行为的值,作为自己的行为准则,在行动中根据环境的反馈更新行为准则
Q-Learning更新:
Q(S1,A2)估计值 = Q(S1,A2)
Q(S1,A2)现实值 = R+γ*max{Q(S2,A1),Q(S2,A2)}
R为在S1执行A2到达S2时获得的实际奖励值,max{Q(S2,A1),Q(S2,A2)}是对S2下各动作Q值的最大估计,γ为衰减率
差距 = Q(S1,A2)现实值 - Q(S1,A2)估计值
新Q(S1,A2) = 老Q(S1,A2) + α * 差距
Q-Learning整体算法:
Initialize Q(s,a) arbitrarily
Repeat(for each episode)
Initialize s
Repeat(for each step of episode):
Choose a from s using policy derived from Q(e.g., e-greedy)
Take action a, observe r,s'
Q(s,a) <-- Q(s,a) + α[r + γ·max_{a'} Q(s',a') - Q(s,a)]
s <-- s'
until s is terminal
Epsilon greedy 是用在决策上的一种策略,比如 epsilon = 0.9 时,就说明有 90% 的概率按照 Q Table 的最优值选择行为,10% 的概率随机选择行为。alpha 是学习率。
Q-Learning例子
导入模块/参数设置
import numpy as np
import pandas as pd
import time
# Hyper-parameters and display settings for the 1-D example.
N_STATES = 6 # width of the 1-D world (number of cells)
ACTIONS = ['left', 'right'] # actions available to the explorer
EPSILON = 0.9 # greediness: threshold compared against a uniform draw in choose_action
ALPHA = 0.1 # learning rate
GAMMA = 0.9 # reward discount factor
MAX_EPISODES = 13 # maximum number of episodes
FRESH_TIME = 0.3 # pause between rendered moves, in seconds (passed to time.sleep)
定义Q表
def build_q_table(n_states, actions):
    """Create a zero-initialised Q-table.

    Args:
        n_states: Number of rows (one per state).
        actions: Sequence of action names; each becomes a column label.

    Returns:
        A ``pandas.DataFrame`` of shape ``(n_states, len(actions))`` filled
        with zeros.
    """
    zero_values = np.zeros((n_states, len(actions)))
    return pd.DataFrame(zero_values, columns=actions)
定义动作
def choose_action(state, q_table):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    Args:
        state: Integer row position of the current state in ``q_table``.
        q_table: DataFrame of Q-values whose columns are action names.

    Returns:
        The name of the chosen action (a column label of ``q_table`` when
        acting greedily, otherwise a random element of ``ACTIONS``).
    """
    state_actions = q_table.iloc[state, :]  # Q-values of every action in this state
    # A state counts as unexplored only when *every* action value is still 0.
    # (``state_actions.all() == 0`` would be true as soon as any value is 0.)
    unexplored = (state_actions == 0).all()
    if np.random.uniform() > EPSILON or unexplored:
        # Explore: random action.
        action_name = np.random.choice(ACTIONS)
    else:
        # Exploit: idxmax returns the column *label*; argmax would return an
        # integer position, which never equals 'left' / 'right'.
        action_name = state_actions.idxmax()
    return action_name
环境反馈
def get_env_feedback(S, A):
    """Apply action ``A`` in state ``S`` and return ``(next_state, reward)``.

    Moving right from cell ``N_STATES - 2`` ends the episode with reward 1 and
    next state ``'terminal'``; every other move yields reward 0. Moving left
    from cell 0 leaves the agent where it is.
    """
    if A != 'right':
        # Move left; cell 0 is the wall, so the agent stays put there.
        next_state = S if S == 0 else S - 1
        return next_state, 0

    # Move right.
    if S == N_STATES - 2:
        return 'terminal', 1
    return S + 1, 0
环境更新
def update_env(S, episode, step_counter):
    """Redraw the 1-D world on the current console line.

    Args:
        S: Current state; either an integer cell index or ``'terminal'``.
        episode: Zero-based episode index (printed one-based).
        step_counter: Number of steps taken in this episode.

    When ``S`` is ``'terminal'`` the episode summary is shown for two seconds
    and then erased; otherwise the track is drawn with ``'o'`` at cell ``S``
    and the function pauses for ``FRESH_TIME`` seconds.
    """
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        # Overwrite the whole summary with blanks; a single space would leave
        # most of it on screen, where it would mix with the next frame.
        print('\r{}'.format(' ' * len(interaction)), end='')
    else:
        # e.g. '-----T' for N_STATES = 6: empty cells followed by the target.
        env_list = ['-']*(N_STATES-1) + ['T']
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)
Q-Learning算法更新
一个让agent走迷宫的例子
def update():
    """Run 100 training episodes on the global ``env`` with the global ``RL``.

    Each step renders the environment, asks ``RL`` for an action based on the
    stringified observation, applies it, and lets ``RL`` learn from the
    resulting (state, action, reward, next state) transition. When all
    episodes are finished it prints ``'game over'`` and destroys ``env``.
    """
    for _ in range(100):
        # Start every episode from the environment's initial observation.
        state = env.reset()
        finished = False
        while not finished:
            # Refresh the visualisation.
            env.render()
            # Let the agent choose an action for the current observation.
            action = RL.choose_action(str(state))
            # Apply the action; the environment reports the next observation,
            # the reward, and whether the episode has ended.
            next_state, reward, finished = env.step(action)
            # Learn from this transition.
            RL.learn(str(state), action, reward, str(next_state))
            # Carry the new observation into the next iteration.
            state = next_state
    # All episodes done: report and close the environment window.
    print('game over')
    env.destroy()
if __name__ == "__main__":
    # Create the environment and the learner; update() reads both as globals.
    env = Maze()
    RL = QLearningTable(actions=[*range(env.n_actions)])
    # Register update() to run after a delay of 100, then hand control to the
    # environment's main loop.
    # NOTE(review): presumably a Tk-style after(ms, callback) -- confirm against Maze.
    env.after(100, update)
    env.mainloop()
Q-Learning思维决策
构建Q-Learning类
import numpy as np
import pandas as pd
class QLearningTable:
    """Tabular Q-learning agent whose Q-table grows as new states are seen.

    Rows of ``q_table`` are state labels, columns are the actions passed to
    the constructor, and every newly encountered state starts with all-zero
    action values.
    """

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        """Store the hyper-parameters and create an empty Q-table.

        Args:
            actions: List of action identifiers (used as column labels).
            learning_rate: Step size applied to each temporal-difference error.
            reward_decay: Discount factor applied to the next state's best value.
            e_greedy: Probability of taking the greedy action in choose_action.
        """
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation):
        """Return an action for ``observation`` using an epsilon-greedy rule.

        With probability ``self.epsilon`` one of the highest-valued actions is
        returned (ties broken uniformly at random); otherwise a uniformly
        random action is returned. Unknown observations are added to the table.
        """
        self.check_state_exist(observation)
        if np.random.uniform() < self.epsilon:
            # Greedy: several actions may share the maximum, so pick among them
            # at random instead of always favouring the first column.
            state_action = self.q_table.loc[observation, :]
            best_actions = state_action[state_action == np.max(state_action)].index
            action = np.random.choice(best_actions)
        else:
            # Explore: any action.
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_):
        """Update Q(s, a) from one observed transition.

        Args:
            s: Label of the state the action was taken in.
            a: Action taken (a column label of the Q-table).
            r: Reward received.
            s_: Label of the resulting state; the literal ``'terminal'`` marks
                the end of an episode and contributes no future value.
        """
        # Register both states so learn() also works when it is called before
        # choose_action() has seen ``s``.
        self.check_state_exist(s)
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            # Bootstrap from the best action value of the next state.
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)

    def check_state_exist(self, state):
        """Add an all-zero row for ``state`` if the Q-table has none yet.

        ``state`` is used as a row label via ``.loc``; callers in this file
        pass strings.
        """
        if state not in self.q_table.index:
            # DataFrame.append was removed in pandas 2.0; enlarging through
            # .loc works on old and new versions alike. Float zeros keep the
            # columns floating-point so later fractional updates fit.
            self.q_table.loc[state] = [0.0] * len(self.actions)