Learn AI Series):Exercise 1: Multi-armed bandit tournament -- run five algorithms (EpsilonGreedy at 0.1 and 0.01, DecayingEpsilonGreedy, UCB with c=2, and Gaussian Thompson Sampling) across 20 randomly-generated 10-armed environments and tally average regret, optimal-pull percentage, and per-environment wins.
import numpy as np
class BanditEnvironment:
def __init__(self, k=10, seed=42):
self.rng = np.random.RandomState(seed)
self.q_true = self.rng.normal(0, 1, size=k)
self.k = k
self.best_arm = int(np.argmax(self.q_true))
def pull(self, arm):
return self.q_true[arm] + np.random.normal(0, 1)
class EpsilonGreedy:
def __init__(self, k, epsilon=0.1):
self.k, self.epsilon = k, epsilon
self.q = np.zeros(k)
self.n = np.zeros(k)
def choose_arm(self):
if np.random.random() < self.epsilon:
return np.random.randint(self.k)
return int(np.argmax(self.q))
def update(self, arm, reward):
self.n[arm] += 1
self.q[arm] += (reward - self.q[arm]) / self.n[arm]
class DecayingEpsilonGreedy(EpsilonGreedy):
def __init__(self, k, initial_eps=1.0, decay=0.999):
super().__init__(k, initial_eps)
self.decay = decay
def update(self, arm, reward):
super().update(arm, reward)
self.epsilon *= self.decay
class UCB:
def __init__(self, k, c=2.0):
self.k, self.c = k, c
self.q = np.zeros(k)
self.n = np.zeros(k)
self.t = 0
def choose_arm(self):
self.t += 1
for arm in range(self.k):
if self.n[arm] == 0:
return arm
bonus = self.c * np.sqrt(np.log(self.t) / self.n)
return int(np.argmax(self.q + bonus))
def update(self, arm, reward):
self.n[arm] += 1
self.q[arm] += (reward - self.q[arm]) / self.n[arm]
class GaussianThompson:
def __init__(self, k):
self.k = k
self.mu = np.zeros(k)
self.lam = np.ones(k)
self.alpha = np.ones(k)
self.beta = np.ones(k)
def choose_arm(self):
samples = np.zeros(self.k)
for i in range(self.k):
tau = np.random.gamma(self.alpha[i], 1.0 / self.beta[i])
sigma = 1.0 / np.sqrt(tau * self.lam[i])
samples[i] = np.random.normal(self.mu[i], sigma)
return int(np.argmax(samples))
def update(self, arm, reward):
old_mu, old_lam = self.mu[arm], self.lam[arm]
self.lam[arm] += 1
self.mu[arm] = (old_lam * old_mu + reward) / self.lam[arm]
self.alpha[arm] += 0.5
self.beta[arm] += (0.5 * old_lam
* (reward - old_mu) ** 2 / self.lam[arm])
def make_agent(name, k):
return {
"eps-0.1": lambda: EpsilonGreedy(k, 0.1),
"eps-0.01": lambda: EpsilonGreedy(k, 0.01),
"decay": lambda: DecayingEpsilonGreedy(k),
"ucb": lambda: UCB(k, c=2.0),
"thompson": lambda: GaussianThompson(k),
}[name]()
class BanditTournament:
NAMES = ["eps-0.1", "eps-0.01", "decay", "ucb", "thompson"]
def __init__(self, n_envs=20, n_steps=10000, k=10):
self.n_envs, self.n_steps, self.k = n_envs, n_steps, k
def run(self):
regret = {n: [] for n in self.NAMES}
opt = {n: [] for n in self.NAMES}
wins = {n: 0 for n in self.NAMES}
gaps = []
for seed in range(self.n_envs):
env = BanditEnvironment(self.k, seed=seed)
sq = np.sort(env.q_true)
gaps.append((seed, sq[-1] - sq[-2]))
this_env = {}
for name in self.NAMES:
np.random.seed(1000 + seed)
agent = make_agent(name, self.k)
optimal = env.q_true[env.best_arm]
total, hits = 0.0, 0
for _ in range(self.n_steps):
arm = agent.choose_arm()
r = env.pull(arm)
agent.update(arm, r)
total += optimal - r
hits += (arm == env.best_arm)
regret[name].append(total)
opt[name].append(100 * hits / self.n_steps)
this_env[name] = total
wins[min(this_env, key=this_env.get)] += 1
print(f"{'algorithm':12s}{'avg regret':>12s}"
f"{'optimal%':>10s}{'wins':>6s}")
for name in self.NAMES:
print(f"{name:12s}{np.mean(regret[name]):12.1f}"
f"{np.mean(opt[name]):9.1f}%{wins[name]:6d}")
hardest = min(gaps, key=lambda g: g[1])
print(f"\nHardest env: seed {hardest[0]} (gap={hardest[1]:.3f})")
BanditTournament(n_envs=20, n_steps=10000).run()
The key insight: averaged over 20 environments, Thompson Sampling wins the most of them and posts the lowest mean regret, with UCB a close second. Both crush the fixed-epsilon agents, because a fixed epsilon keeps burning pulls on random arms forever -- even after it already knows the answer. EpsilonGreedy(0.01) does well when the arms are well-separated (barely needs to explore) but occasionally gets unlucky and never finds the best arm, which inflates its average regret.
Exercise 2: Contextual bandit news recommender -- LinUCB over 5 article categories with 6-dimensional user features, learning online from sigmoid click feedback, compared against a random baseline.
import numpy as np
class LinUCB:
def __init__(self, n_arms, d, alpha=1.0):
self.n_arms, self.alpha = n_arms, alpha
self.A = [np.eye(d) for _ in range(n_arms)]
self.b = [np.zeros(d) for _ in range(n_arms)]
def choose_arm(self, ctx):
scores = np.zeros(self.n_arms)
for a in range(self.n_arms):
A_inv = np.linalg.inv(self.A[a])
theta = A_inv @ self.b[a]
scores[a] = ctx @ theta + self.alpha * np.sqrt(ctx @ A_inv @ ctx)
return int(np.argmax(scores))
def update(self, arm, ctx, reward):
self.A[arm] += np.outer(ctx, ctx)
self.b[arm] += reward * ctx
class NewsRecommender:
CATS = ["tech", "sports", "politics", "cooking", "finance"]
def __init__(self, seed=42):
self.rng = np.random.RandomState(seed)
self.d = 6 # age, gender, morning, weekday, mobile, returning
self.weights = self.rng.randn(len(self.CATS), self.d)
self.agent = LinUCB(len(self.CATS), self.d, alpha=0.5)
def click_prob(self, arm, ctx):
return 1.0 / (1.0 + np.exp(-(self.weights[arm] @ ctx)))
def run(self, n=10000):
first, last = [], []
counts = np.zeros(len(self.CATS), dtype=int)
rand_clicks = 0
for t in range(n):
ctx = self.rng.randn(self.d)
arm = self.agent.choose_arm(ctx)
click = int(self.rng.random() < self.click_prob(arm, ctx))
self.agent.update(arm, ctx, click)
counts[arm] += 1
if t < 1000:
first.append(click)
if t >= n - 1000:
last.append(click)
rand_arm = self.rng.randint(len(self.CATS))
rand_clicks += int(self.rng.random() < self.click_prob(rand_arm, ctx))
print(f"CTR first 1000: {np.mean(first):.3f}")
print(f"CTR last 1000: {np.mean(last):.3f}")
print(f"Random CTR: {rand_clicks / n:.3f}")
print("Per-article selection counts:")
for name, c in zip(self.CATS, counts):
print(f" {name:9s}: {c}")
NewsRecommender().run()
The CTR in the final 1000 visits edges above the first 1000 -- LinUCB learns fast, so it's already doing well early -- but the headline number is the gap against the random baseline, which sits stuck around 0.50 while the agent runs well above 0.80. That gap is the whole point: random ignores the context, LinUCB reads the user features and matches each visitor to the article most likely to land. LinUCB is the algorithm behind a huge chunk of real recommendation traffic, so this little demo is closer to production than it looks ;-)
Exercise 3: Non-stationary bandit detector -- a sliding-window epsilon-greedy agent that watches for change-points and temporarily boosts exploration when an arm's mean shifts, compared to a fixed-epsilon agent with no detection.
import numpy as np
def make_q(phase, k=10):
"""One clearly-best arm; which arm is best rotates each phase."""
q = np.full(k, -0.5)
q[phase % k] = 2.0
return q
class FixedEpsilon:
"""Full-history epsilon-greedy -- the baseline that cannot forget."""
def __init__(self, k, epsilon=0.1):
self.k, self.epsilon = k, epsilon
self.q = np.zeros(k)
self.n = np.zeros(k)
def choose_arm(self, t):
if np.random.random() < self.epsilon:
return np.random.randint(self.k)
return int(np.argmax(self.q))
def update(self, arm, reward, t):
self.n[arm] += 1
self.q[arm] += (reward - self.q[arm]) / self.n[arm]
class ChangeDetector:
"""Sliding-window epsilon-greedy with a two-window change test
and a temporary exploration boost when a shift is detected."""
def __init__(self, k, window=50, base_eps=0.1):
self.k, self.window, self.base_eps = k, window, base_eps
self.history = [[] for _ in range(k)]
self.boost_until = -1
self.detections = []
def epsilon(self, t):
return 0.5 if t < self.boost_until else self.base_eps
def choose_arm(self, t):
if np.random.random() < self.epsilon(t):
return np.random.randint(self.k)
est = np.full(self.k, np.inf)
for a in range(self.k):
recent = self.history[a][-self.window:]
if recent:
est[a] = np.mean(recent)
return int(np.argmax(est))
def update(self, arm, reward, t):
h = self.history[arm]
h.append(reward)
# Compare this arm's most recent window against the one before it
if len(h) >= 2 * self.window and t % 20 == 0:
recent = np.mean(h[-self.window:])
older = np.mean(h[-2 * self.window:-self.window])
if abs(recent - older) > 1.0:
self.detections.append((t, arm))
self.boost_until = t + 100
def simulate(agent, n_steps=5000, k=10, shift=1000):
rng = np.random.RandomState(0)
total_regret = 0.0
for t in range(n_steps):
q = make_q(t // shift, k) # best arm rotates every `shift` steps
arm = agent.choose_arm(t)
reward = q[arm] + rng.normal(0, 0.5)
agent.update(arm, reward, t)
total_regret += np.max(q) - q[arm]
return total_regret
det = ChangeDetector(10)
det_regret = simulate(det)
fixed_regret = simulate(FixedEpsilon(10))
print("Detected changes (step, arm):")
for step, arm in det.detections[:8]:
print(f" step {step}, arm {arm}")
print(f"\nSliding-window + detection regret: {det_regret:.1f}")
print(f"Fixed full-history regret: {fixed_regret:.1f}")
The full-history agent is a disaster here: once it's convinced arm 0 is best, its sample mean is anchored by hundreds of old pulls, so even after arm 0 stops being good it keeps going back -- racking up roughly four times the regret. The sliding-window agent forgets old data, and the two-window change test fires right after each shift (you'll see detections at ~1040, ~2020, ~3020, ~4040), boosting epsilon to 0.5 for 100 steps so it re-locks onto the new best arm fast. The threshold of 1.0 is a knob: too low and you re-explore on noise, too high and you miss real changes. Tuning that knob is the whole game in non-stationary settings.
Right, episode 104. Last time we built up the whole bandit toolkit -- epsilon-greedy, UCB, Thompson Sampling -- and I promised we'd start putting the mathematical framework on a firmer footing. Bandits were RL with the difficulty dial turned all the way down: one decision, immediate reward, no states. But the real world has states. Where you are determines what you can do, and what you do now determines where you end up next. That sequential structure is exactly what we stripped out for bandits, and now we're going to put it back.
Back in episode #102 we defined the Markov Decision Process -- states, actions, transition probabilities, rewards, and a discount factor gamma. We even built a little grid world and let a Q-learning agent stumble its way to the goal. Today we're going to solve that same kind of problem, but properly, with full knowledge of the rules. This is dynamic programming (DP), and it's where the core ideas of RL crystalize.
Here's the catch, and it's a big one: DP assumes you know everything about the environment. Every state, every action, every transition probability, every reward. Think of it as the "open-book exam" version of RL -- you've got the answer key (the model) right in front of you, and the only question is how to use it. That sounds unrealistic, and for a lot of real problems it is. But almost every algorithm we build after this is, at heart, a clever way of doing dynamic programming without the answer key. So you really want to understand the open-book version first ;-)
Everything in RL value theory hangs off one idea, and it's so simple it almost feels like cheating. The value of where you are equals the reward you're about to get, plus the (discounted) value of where you end up next. That's it. That recursive self-reference is the Bellman equation, named after Richard Bellman who coined the term "dynamic programming" in the 1950s (partly, the story goes, because the name sounded impressive enough to keep his research funding safe -- a man after my own heart).
For a policy pi, the state value function V^pi(s) satisfies:
V^pi(s) = SUM_a pi(a|s) * SUM_s' P(s'|s,a) * [R(s,a,s') + gamma * V^pi(s')]
In words: stand in state s, average over the actions your policy might take (weighted by pi(a|s)), then average over the states you might land in (weighted by the transition probabilities P), and for each of those add the immediate reward plus the discounted value of continuing onward from s'. The value of here is defined in terms of the value of there. It looks circular, and it sort of is -- the trick is that we can solve the circularity by iterating, which is exactly what we'll do in a moment.
Let's make it concrete with a 4x4 grid world. The two opposite corners are terminal (think of them as the exits), every step costs you -1, and the agent just wants out as fast as possible:
import numpy as np
class GridWorldMDP:
"""4x4 grid world with fully known dynamics."""
def __init__(self, size=4):
self.size = size
self.n_states = size * size
self.n_actions = 4 # up, right, down, left
self.terminal_states = {0, self.n_states - 1} # opposite corners
self.action_deltas = {0: (-1, 0), 1: (0, 1),
2: (1, 0), 3: (0, -1)}
def state_to_pos(self, s):
return s // self.size, s % self.size
def pos_to_state(self, row, col):
return row * self.size + col
def get_transition(self, state, action):
"""Returns (next_state, reward, done).
Deterministic transitions to keep things readable."""
if state in self.terminal_states:
return state, 0.0, True
row, col = self.state_to_pos(state)
dr, dc = self.action_deltas[action]
new_row = max(0, min(self.size - 1, row + dr))
new_col = max(0, min(self.size - 1, col + dc))
next_state = self.pos_to_state(new_row, new_col)
reward = -1.0 # every step hurts a little
done = next_state in self.terminal_states
return next_state, reward, done
Bumping into a wall just leaves you where you are (the max/min clamp the position), and you still pay your -1 for the privilege. The optimal play is obvious to a human -- walk the shortest path to the nearest exit -- but the agent doesn't know that yet. It only knows the rules. Our job is to turn the rules into a value for every cell.
Given a fixed policy pi, policy evaluation answers the question "what is V^pi(s) for every state?" We don't solve the linear system directly (you could, it's just a system of linear equations) -- in stead we iterate. Start with V(s) = 0 everywhere, then sweep through the states applying the Bellman equation again and again. Each sweep pushes value information one step further outward, and the whole thing is garanteed to converge to the true V^pi:
def policy_evaluation(mdp, policy, gamma=1.0, theta=1e-6):
"""Compute V^pi by iterating the Bellman equation.
policy: array (n_states, n_actions) of action probabilities
theta: convergence threshold
"""
V = np.zeros(mdp.n_states)
while True:
delta = 0.0
for s in range(mdp.n_states):
if s in mdp.terminal_states:
continue
v_old = V[s]
v_new = 0.0
for a in range(mdp.n_actions):
next_s, reward, done = mdp.get_transition(s, a)
next_value = 0.0 if done else V[next_s]
v_new += policy[s, a] * (reward + gamma * next_value)
V[s] = v_new
delta = max(delta, abs(v_old - v_new))
if delta < theta:
break
return V
mdp = GridWorldMDP(size=4)
random_policy = np.ones((mdp.n_states, mdp.n_actions)) / mdp.n_actions
V = policy_evaluation(mdp, random_policy)
print("Values under the random policy:")
for row in range(mdp.size):
for col in range(mdp.size):
s = mdp.pos_to_state(row, col)
print(f"{V[s]:6.1f}", end=" ")
print()
Under the random policy -- shuffle off in a random direction with 25% probability each -- the cells far from any exit end up deeply negative. That makes sense: a drunkard's walk from the middle of the grid takes a long time to stumble into a corner, and every wasted step costs -1. Cells right next to an exit sit close to zero, because escape is one lucky step away.
The theta parameter is just our patience: once no state's value changes by more than 0.000001 in a full sweep, we call it converged. Notice the delta = max(delta, abs(...)) line -- that's how we measure "did anything still move?" across the whole sweep. Nota bene: with gamma=1.0 (no discounting) this only converges because the episode is garanteed to terminate; with continuing tasks you'd need gamma < 1 to keep the sum finite, exactly as we saw with the discount factor in episode #102.
Knowing V^pi is nice, but the point of having values is to act better. Policy improvement does the obvious thing: in each state, instead of blindly following pi, look one step ahead and ask "which action gives me the best immediate-reward-plus-next-value?" Pick that one. This is the greedy policy with respect to V:
def policy_improvement(mdp, V, gamma=1.0):
"""Build the greedy policy from a value function."""
new_policy = np.zeros((mdp.n_states, mdp.n_actions))
for s in range(mdp.n_states):
if s in mdp.terminal_states:
new_policy[s] = 1.0 / mdp.n_actions # irrelevant at exits
continue
action_values = np.zeros(mdp.n_actions)
for a in range(mdp.n_actions):
next_s, reward, done = mdp.get_transition(s, a)
next_value = 0.0 if done else V[next_s]
action_values[a] = reward + gamma * next_value
best_action = int(np.argmax(action_values))
new_policy[s, best_action] = 1.0 # all probability on the winner
return new_policy
The policy improvement theorem is the quiet hero here: the greedy policy is always at least as good as the one you started with, in every state, garanteed. No hand-waving. If V^pi says you're sitting on a value of -10 and stepping right lands you in a cell worth -5, then stepping right is worth -1 + gamma*(-5) = -6, which beats -10. Greedy notices that and grabs it. Do this in every state at once and you can only improve (or, if you were already optimal, stay put). That guarantee is what makes the next algorithm work.
Now we just bolt the two pieces together. Policy iteration alternates evaluation and improvement and refuses to stop untill the policy stops changing:
def policy_iteration(mdp, gamma=1.0):
"""Find the optimal policy by alternating evaluation and improvement."""
policy = np.ones((mdp.n_states, mdp.n_actions)) / mdp.n_actions
iteration = 0
while True:
iteration += 1
V = policy_evaluation(mdp, policy, gamma)
new_policy = policy_improvement(mdp, V, gamma)
if np.allclose(policy, new_policy):
print(f"Policy iteration converged in {iteration} iterations")
break
policy = new_policy
return policy, V
optimal_policy, optimal_V = policy_iteration(mdp)
arrows = ["^", ">", "v", "<"]
print("\nOptimal policy:")
for row in range(mdp.size):
for col in range(mdp.size):
s = mdp.pos_to_state(row, col)
if s in mdp.terminal_states:
print(" T ", end=" ")
else:
best_a = int(np.argmax(optimal_policy[s]))
print(f" {arrows[best_a]} ", end=" ")
print()
print("\nOptimal values:")
for row in range(mdp.size):
for col in range(mdp.size):
s = mdp.pos_to_state(row, col)
print(f"{optimal_V[s]:6.1f}", end=" ")
print()
Here's the part that surprised me the first time I saw it: policy iteration converges in a tiny number of outer loops -- typically 3 to 5 for a grid world, and that count barely grows as the grid gets bigger. The expensive bit is hidden inside policy evaluation (which itself runs many Bellman sweeps). The improvement step is cheap and decisive -- a couple of greedy passes and the policy locks into the optimum. The arrows you print should all point sensibly toward the nearest exit, and the values should read like a distance map: -1 next to a corner, more negative as you move away.
Policy iteration has a slightly wasteful smell to it -- we run policy evaluation all the way to convergence just to throw most of that precision away on the next greedy step. Why fully evaluate a policy you're about to replace? Value iteration says: don't. Take a single Bellman backup, but instead of averaging over the policy's actions, take the max over actions. That one change folds evaluation and improvement into a single update:
def value_iteration(mdp, gamma=1.0, theta=1e-6):
"""Find optimal values directly via the Bellman optimality equation."""
V = np.zeros(mdp.n_states)
iteration = 0
while True:
iteration += 1
delta = 0.0
for s in range(mdp.n_states):
if s in mdp.terminal_states:
continue
v_old = V[s]
action_values = np.zeros(mdp.n_actions)
for a in range(mdp.n_actions):
next_s, reward, done = mdp.get_transition(s, a)
next_value = 0.0 if done else V[next_s]
action_values[a] = reward + gamma * next_value
V[s] = np.max(action_values) # the max IS the improvement
delta = max(delta, abs(v_old - V[s]))
if delta < theta:
print(f"Value iteration converged in {iteration} iterations")
break
policy = policy_improvement(mdp, V, gamma)
return policy, V
optimal_policy_vi, optimal_V_vi = value_iteration(mdp)
The difference is one operator. Policy evaluation computes V(s) = SUM_a pi(a|s) * [R + gamma*V(s')] -- an expectation under the current policy. Value iteration computes V(s) = max_a [R + gamma*V(s')] -- the best you could do. That max is the Bellman optimality equation, and because it picks the best action every sweep, it improves the policy and evaluates it in the same breath. You never store an explicit policy until the very end, when you read it off the converged values.
Having said that, the two methods compute the same answer -- run both on this grid and optimal_V and optimal_V_vi agree to within theta. Value iteration usually wins on big problems because it skips the expensive inner evaluation loop; policy iteration sometimes wins when evaluation is cheap and you want the policy to lock in after a handful of decisive improvements. Horses for courses.
A fair question, because we keep waving the word "garanteed" around. The reason is that the Bellman operator is a contraction mapping with factor gamma. In plain terms: every time you apply a full Bellman backup, the distance between your current value estimate and the true value function shrinks by at least a factor of gamma. Shrink something repeatedly by a constant factor below 1 and it collapses toward a single fixed point -- and that fixed point is the value function we're after. It's the same reason compound interest in reverse eats a number down to nothing. With gamma=1.0 we lean on guaranteed termination instead, but the moment you set gamma < 1 the contraction argument gives you convergence for free, no matter how you initialize V.
That's not just trivia. The contraction property is the theoretical guarantee underneath value-based RL, and it keeps mattering long after we leave the open-book setting -- it's why bootstrapped updates from incomplete experience still pull estimates in the right direction.
Dynamic programming needs the full model: every state, every transition probability, every reward, all known in advance. That confines it to problems where two things hold:
For a 4x4 grid (16 states), DP is instant. For 100x100 (10,000 states), still fine. But point it at an Atari frame -- on the order of 10^8 possible pixel configurations per screen -- and it falls flat on its face. You cannot enumerate that table, let alone sweep it. And for most interesting problems you don't even have the transition probabilities; the universe doesn't hand you P(s'|s,a) on a plate.
The methods we'll build next are precisely the escape from these two limitations. Monte Carlo methods drop the need for a model entirely -- they let the agent play out whole episodes and average the actual returns it sees, no transition probabilities required. Temporal difference methods go a step further and update value estimates from a single step of experience, blending Monte Carlo's learning-from-reality with DP's bootstrapping-from-its-own-estimates. And eventually we swap the lookup table for a neural network, so the very same Bellman ideas can cope with state spaces too vast to ever write down.
But -- and this is the whole reason DP comes first -- every one of those methods is doing dynamic programming in disguise. The Bellman equation, policy evaluation, policy improvement, the optimality operator: they all carry straight over. The algorithms change to cope with missing models and giant state spaces; the underlying ideas do not budge an inch.
Exercise 1: Build a stochastic grid world solver. Extend GridWorldMDP so that get_transition is probabilistic: the chosen action succeeds with probability p=0.8, and with probability 0.2 the agent slips to one of the two perpendicular directions (0.1 each). This means get_transition should return a list of (prob, next_state, reward, done) tuples instead of a single outcome. Rewrite policy_evaluation to sum over those outcomes. Run value iteration with gamma=0.9 on the 4x4 grid and compare the optimal values and policy against the deterministic version. The values should be more negative (slipping wastes steps) and the policy should sometimes steer away from walls to avoid slipping into them.
Exercise 2: Build a gamma sweep experiment. Using the deterministic 4x4 grid world, run value iteration for gamma in [0.5, 0.7, 0.9, 0.99, 1.0]. For each gamma, print: (a) the number of iterations to convergence, (b) the optimal value of the start-adjacent cell, (c) whether the optimal policy (the arrows) changes at all. You should find the policy is identical across all gammas for this simple shortest-path problem (the optimal route doesn't depend on patience here), but the values and the iteration counts shift -- lower gamma converges faster because the contraction is stronger.
Exercise 3: Build a policy iteration vs value iteration profiler. Add a counter to both policy_iteration and value_iteration that tallies the total number of Bellman backups performed (one backup = one evaluation of reward + gamma*next_value for a single state-action pair). Run both on grid worlds of size 4, 8, and 16 (gamma=0.9) and print a table of total backups for each method at each size. Confirm that policy iteration needs few outer iterations but many backups per evaluation, while value iteration spreads its backups more evenly -- and see which one does fewer total backups as the grid grows.