Good reference: Reinforcement Learning: An Introduction (Sutton & Barto)
Agent-environment interface
What are the differences between supervised learning and reinforcement learning?
Markov property
\[P(s_{t+1}|a_t,s_t,\ldots,a_0,s_0) = P(s_{t+1}|a_t,s_t)\]
How good is it to be in state \(s\) and execute action \(a\)?
\[Q(s, a) = \mathbb{E}\left[ \sum_{k=0}^\infty \gamma^k r_{t+k+1} \vert s, a \right] = \mathbb{E}\left[ r_{t+1} + \gamma Q(s_{t+1}, a_{t+1}) \vert s, a \right]\]
\(Q\) is usually a table!
Return model: discounted infinite horizon
\(R_t = \sum_{i = 0}^\infty \gamma^i r_{t+1+i}\),
The discounting factor \(\gamma \in (0, 1)\) should be close to 1!
Source: Sutton & Barto
Source: Sutton & Barto

Random Walk
# IPython magic: pulls numpy and matplotlib names into the global namespace.
%pylab inline
Populating the interactive namespace from numpy and matplotlib
# Action and state spaces of the 1-D random-walk MDP.  The two goal
# states at either end of the chain are terminal.
A = ["left", "right"]
S = ["left_goal", "A", "B", "C", "D", "E", "right_goal"]
def state_transition(state, action): # Deterministic, unknown to the agent
    """Move one slot left or right along the chain; unknown actions
    leave the state unchanged."""
    offset = {"left": -1, "right": 1}.get(action, 0)
    return S[S.index(state) + offset]
def reward(state, action, next_state): # Deterministic, unknown to the agent
    """Reward of 1 for entering the right goal, 0 everywhere else."""
    return 1 if next_state == "right_goal" else 0
def initial_state():
    """Every episode of the random walk starts in the central state."""
    return "C"
def finished(state):
    """An episode terminates in either goal state."""
    return state == "left_goal" or state == "right_goal"
class GreedyPolicy(object):
    """Deterministic policy: always pick the action with the highest
    estimated Q-value."""

    def __init__(self, Q):
        self.Q = Q

    def __call__(self, state):
        best = argmax(self.Q(state))
        return A[best]
class Softmax(object):
    """Stochastic policy: sample actions from a Boltzmann (softmax)
    distribution over Q-values.

    Parameters
    ----------
    Q : callable
        Maps a state to an array of action values.
    tau : float
        Temperature; large tau -> near-uniform, small tau -> near-greedy.
    """
    def __init__(self, Q, tau):
        self.Q = Q
        self.tau = tau
    def __call__(self, state):
        p = exp(self.Q(state)/self.tau)
        p /= p.sum()
        # You could also use random.choice() in NumPy >= 1.7
        # for the following lines:
        x = random.rand()
        # Hoist the cumulative sum out of the loop (it was recomputed
        # once per action before), and fall back to the last action so
        # float rounding in cumsum() can never make us return None.
        c = p.cumsum()
        for i in range(c.shape[0]):
            if c[i] >= x: return A[i]
        return A[-1]
class EpsilonGreedyPolicy(object):
    """Greedy action selection with probability 1 - epsilon, otherwise
    a uniformly random action (exploration)."""

    def __init__(self, Q, epsilon):
        self.Q = Q
        self.epsilon = epsilon

    def __call__(self, state):
        explore = random.rand() < self.epsilon
        if explore:
            return A[random.randint(0, len(A))]
        return A[argmax(self.Q(state))]
class QTable(object):
    """Tabular action-value function Q(s, a), initialized to zero.

    The state and action spaces are stored on the instance instead of
    being read from the module-level S and A (as the original did), so
    the table keeps working even after those globals are rebound later
    in this file.
    """
    def __init__(self, S, A):
        self.S = list(S)
        self.A = list(A)
        self.Q = zeros((len(self.S), len(self.A)))

    def __call__(self, state, action=None):
        """Return the whole value row for `state`, or a single Q(s, a)."""
        s = self.S.index(state)
        if action is None:
            return self.Q[s]
        return self.Q[s, self.A.index(action)]

    def correct(self, state, action, target, alpha):
        """Move Q(s, a) a step of size alpha toward `target` (TD update)."""
        s, a = self.S.index(state), self.A.index(action)
        self.Q[s, a] += alpha * (target - self.Q[s, a])
def q_learning(alpha, gamma, epsilon, n_episodes, Q):
    """Off-policy TD control (Watkins' Q-learning).

    Acts with an epsilon-greedy behavior policy derived from Q and
    updates Q toward the target r + gamma * max_a' Q(s', a').

    Parameters
    ----------
    alpha : float       learning rate
    gamma : float       discount factor
    epsilon : float     exploration rate of the behavior policy
    n_episodes : int    number of episodes to run
    Q : QTable-like     value function with __call__ and correct()

    Returns the updated Q (modified in place as well).
    """
    # Build the behavior policy once, not once per environment step as
    # the original did.
    policy = EpsilonGreedyPolicy(Q, epsilon)
    for episode in range(n_episodes):
        state = initial_state()
        while not finished(state):
            action = policy(state)
            next_state = state_transition(state, action)
            r = reward(state, action, next_state)
            # Q-learning target: bootstrap with the greedy value of s'.
            Q.correct(state, action, r + gamma * numpy.max(Q(next_state)), alpha)
            state = next_state
    return Q
# Visualize how the Q-table evolves with the number of training episodes.
# figsize is passed as a keyword (the bare figsize() helper was an old
# IPython-pylab extra; L104-style keyword use is the portable form).
figure(figsize=(10, 6))
for i in range(6):
    n_episodes = 100*i
    setp(subplot(3, 2, i+1), xticks=(), yticks=(), title="Episode #%d" % n_episodes,
         xlabel="State", ylabel="Action")
    # Note that epsilon = 0.4 is very high!
    imshow(q_learning(alpha=0.1, gamma=0.95, epsilon=0.4, n_episodes=n_episodes,
                      Q=QTable(S, A)).Q.T, cmap=cm.gray, interpolation="nearest")
from IPython.display import YouTubeVideo
YouTubeVideo("0_xeAD6ZgVU")
# print as a function call so this cell runs on Python 2 and 3 alike.
print(YouTubeVideo("0_xeAD6ZgVU").src)
http://www.youtube.com/embed/0_xeAD6ZgVU
# IPython magic: pulls numpy and matplotlib names into the global namespace.
%pylab inline
# Fix the RNG seed so the stochastic episodes below are reproducible.
random.seed(0)
Populating the interactive namespace from numpy and matplotlib
Centers \(c_i\) can be placed, for example, evenly spaced over the input domain:
# Place n_rbfs Gaussian basis-function centers evenly on [0, 1].
n_rbfs = 5
centers = linspace(0, 1, n_rbfs)
Usually the widths \(h_i\) are all equal.
# One width per center: duplicate the last spacing so the array length
# matches `centers`, then scale everything down by a factor of 0.05.
spacing = diff(centers)
widths = 0.05*hstack((spacing, spacing[-1]))
def rbf_features(x):
    """Evaluate all Gaussian radial basis functions at scalar input x.

    Returns an array of shape (n_rbfs,).  NOTE(review): `widths` enters
    the denominator unsquared, so it plays the role of a variance-like
    parameter rather than a standard deviation.
    """
    return exp(-0.5*(x-centers)**2/widths)
# Evaluate the feature map on a dense grid and form a random linear
# combination of the basis functions.
n_samples = 100
X = linspace(0, 1, n_samples)
# Design matrix of shape (n_samples, n_rbfs).
Phi = array([rbf_features(x) for x in X])
w = random.randn(n_rbfs)
y = (Phi * w).sum(axis=1)
# Plot each basis function and their weighted sum side by side.
figure(figsize=(10, 5))
subplot(1, 2, 1)
title("Radial Basis Functions")
# range instead of xrange: consistent with the rest of the file and
# compatible with Python 3.
for n in range(n_rbfs):
    plot(X, Phi[:, n])
subplot(1, 2, 2)
title("Weighted Sum of RBFs")
r = plot(X, y)
# Fit the RBF weights to noisy samples of a sine wave with ordinary
# least squares: w = argmin_w ||Phi w - t||^2.
t = sin(2*pi*X) + random.randn(X.shape[0]) * 0.1
w = linalg.lstsq(Phi, t)[0]
y = sum(Phi * w, axis=1)
title("w = (" + (", ".join(["%.2f"] * len(w)) % tuple(w)) + ")")
plot(X, t, "o")
r = plot(X, y, lw=5)
# Zero out the second half of the targets and refit: the local support
# of the RBFs keeps the change confined to the affected region.
t[50:] = 0
w = linalg.lstsq(Phi, t)[0]
y = sum(Phi * w, axis=1)
title("w = (" + (", ".join(["%.2f"] * len(w)) % tuple(w)) + ")")
plot(X, t, "o")
r = plot(X, y, lw=5)
# Continuous random walk: the state space is now the unit interval,
# discretized here only so we can tabulate/visualize Q later.
S = linspace(0, 1, 100)
def initial_state():
    """Episodes start in the middle of the unit interval."""
    return 0.5
def finished(state):
    """Terminal as soon as the state leaves the open interval (0, 1)."""
    if state <= 0.0:
        return True
    return state >= 1.0
def state_transition(state, action): # Stochastic, unknown to the agent
    """Take a noisy step of size 0.15..0.20 in the chosen direction;
    unknown actions leave the state unchanged."""
    if action == "left":
        return state - (0.15+random.rand()*0.05)
    if action == "right":
        return state + (0.15+random.rand()*0.05)
    return state
def reward(state, action, next_state): # Deterministic, unknown to the agent
    """1 when the agent crosses the right boundary, otherwise 0."""
    return 1 if next_state >= 1.0 else 0
class QRBF(object):
    """Linear Q-function approximation with Gaussian RBF features.

    One weight vector per action; Q(s, a) = W[a] . phi(s).  The action
    list is stored on the instance instead of being read from the
    module-level A (as the original did), so the approximator does not
    depend on that global staying unchanged.
    """
    def __init__(self, n_features, S, A):
        # Centers spread evenly over [0, max(S)]; one width per center
        # (the last spacing is duplicated to match lengths).
        self.c = linspace(0, max(S), n_features)
        self.h = 0.1*diff(self.c)
        self.h = hstack((self.h, self.h[-1]))
        self.A = list(A)
        self.W = zeros((len(self.A), n_features))

    def features(self, state):
        """Gaussian RBF feature vector phi(state) of shape (n_features,)."""
        return exp(-0.5*(state-self.c)**2/self.h)

    def __call__(self, state, action=None):
        """Return all action values for `state`, or a single Q(s, a)."""
        Q = self.W.dot(self.features(state))
        if action is None:
            return Q
        return Q[self.A.index(action)]

    def correct(self, state, action, target, alpha):
        """Gradient step of size alpha on (Q(s, a) - target)^2 / 2."""
        a = self.A.index(action)
        phi = self.features(state)
        y = phi.dot(self.W[a])
        self.W[a] -= alpha * phi * (y - target)
# Train the RBF approximator on the continuous walk, then evaluate and
# visualize the learned Q on a coarse grid (every 5th state).
Q = q_learning(alpha=0.02, gamma=0.9, epsilon=0.4, n_episodes=100,
               Q=QRBF(5, S, A))
figure(figsize=(12, 5))
# Integer division (//): under Python 3, / on ints yields a float,
# which is invalid as an array shape or index.
Qtable = zeros((len(A), len(S) // 5))
for i in range(0, len(S), 5):
    for j in range(len(A)):
        Qtable[j, i // 5] = Q(S[i], A[j])
setp(gca(), xticks=(), yticks=())
r = imshow(Qtable, cmap=cm.gray, interpolation="nearest")
However, there are successful applications:
TD-Gammon, a Self-Teaching Backgammon Program Achieves Master-Level Play, Tesauro: MLP (1 hidden layer, 40-80 hidden nodes):
Playing Atari with Deep Reinforcement Learning: Deep-Q-Networks (2 convolutional hidden layers, 1 fully connected hidden layer)