Learn AI Series: Exercise 1: YOLO annotation format converter.
import json
import xml.etree.ElementTree as ET
class AnnotationConverter:
    """Translate bounding boxes between the VOC, YOLO and COCO layouts.

    Boxes are plain dicts. A class name maps to the integer id given by
    its position in ``class_names``.
    """

    def __init__(self, class_names):
        self.class_names = class_names
        # name -> index lookup used when writing YOLO class ids
        self.class_to_id = {
            label: index for index, label in enumerate(class_names)
        }

    def voc_to_yolo(self, voc_annotations, img_w, img_h):
        """Turn absolute VOC corners into normalized YOLO boxes.

        Each input dict carries name/xmin/ymin/xmax/ymax; each output
        dict carries class_id/cx/cy/w/h, all divided by the image size.
        """

        def convert(box):
            left, top = box["xmin"], box["ymin"]
            right, bottom = box["xmax"], box["ymax"]
            label_id = self.class_to_id[box["name"]]
            return {
                "class_id": label_id,
                "cx": ((left + right) / 2.0) / img_w,
                "cy": ((top + bottom) / 2.0) / img_h,
                "w": (right - left) / img_w,
                "h": (bottom - top) / img_h,
            }

        return [convert(box) for box in voc_annotations]

    def yolo_to_coco(self, yolo_annotations,
                     img_w, img_h, image_id=0):
        """Turn normalized YOLO boxes into absolute COCO records.

        The COCO bbox is [x, y, width, height] with (x, y) the top-left
        corner; the record id is the position in the input list.
        """
        records = []
        for index, box in enumerate(yolo_annotations):
            width_px = box["w"] * img_w
            height_px = box["h"] * img_h
            left = box["cx"] * img_w - width_px / 2
            top = box["cy"] * img_h - height_px / 2
            records.append({
                "id": index,
                "image_id": image_id,
                "category_id": box["class_id"],
                "bbox": [left, top, width_px, height_px],
                "area": width_px * height_px,
            })
        return records

    def coco_to_voc(self, coco_annotations):
        """Turn COCO [x, y, w, h] boxes back into VOC corner dicts."""
        converted = []
        for record in coco_annotations:
            left, top, width, height = record["bbox"]
            converted.append({
                "name": self.class_names[record["category_id"]],
                "xmin": left,
                "ymin": top,
                "xmax": left + width,
                "ymax": top + height,
            })
        return converted

    def round_trip_test(self, voc_anns, img_w, img_h):
        """Run VOC -> YOLO -> COCO -> VOC and assert nothing changed.

        Coordinates must agree within 1e-6 and names must be equal;
        a summary line is printed on success.
        """
        recovered = self.coco_to_voc(
            self.yolo_to_coco(
                self.voc_to_yolo(voc_anns, img_w, img_h),
                img_w, img_h,
            )
        )
        for orig, rec in zip(voc_anns, recovered):
            for key in ("xmin", "ymin", "xmax", "ymax"):
                assert abs(orig[key] - rec[key]) < 1e-6, (
                    f"Mismatch: {key} "
                    f"{orig[key]} vs {rec[key]}"
                )
            assert orig["name"] == rec["name"]
        print(f"Round-trip OK: {len(voc_anns)} boxes match")
# Test with 5 sample annotations
# Class ids follow list order: cat=0, dog=1, car=2.
conv = AnnotationConverter(["cat", "dog", "car"])
# VOC-style boxes: absolute pixel corners (xmin, ymin, xmax, ymax).
test_anns = [
    {"name": "cat", "xmin": 50, "ymin": 30,
     "xmax": 200, "ymax": 180},
    {"name": "dog", "xmin": 300, "ymin": 100,
     "xmax": 450, "ymax": 350},
    {"name": "car", "xmin": 10, "ymin": 400,
     "xmax": 250, "ymax": 550},
    {"name": "cat", "xmin": 500, "ymin": 20,
     "xmax": 600, "ymax": 120},
    {"name": "dog", "xmin": 150, "ymin": 250,
     "xmax": 320, "ymax": 380},
]
# Round-trip through all three formats for a 640x480 image; any
# coordinate or name mismatch raises AssertionError.
conv.round_trip_test(test_anns, 640, 480)
The round trip works because all three formats encode the same information -- just in different coordinate systems. VOC uses absolute corner coordinates, YOLO uses normalized center-width-height, COCO uses absolute top-left plus width-height. As long as you don't lose precision during the conversions, you get back exactly what you started with. In practice the main source of bugs is mixing up which format a dataset ships in -- PASCAL VOC XML vs COCO JSON vs YOLO text files -- and quietly feeding the wrong coordinates into your training pipeline.
Exercise 2: Multi-scale detection simulator.
import numpy as np
class MultiScaleDetector:
    """Demonstrate SSD-style multi-scale detection.

    A synthetic scene of 8 boxes (2 large, 3 medium, 3 small) is matched
    against three feature-map levels. A level "detects" an object only
    when the object's centre falls inside the level's grid AND the level
    is the best size match for the object, so every single level misses
    part of the scene.

    Args:
        img_size: side length in pixels of the square synthetic image.
        seed: seed for the private ``np.random.RandomState``.
    """

    # (count, min side, exclusive max side, label) per object group,
    # generated in this order.
    _SCENE_SPEC = (
        (2, 100, 140, "large"),
        (3, 40, 80, "medium"),
        (3, 15, 25, "small"),
    )
    # An object is matched to the level where it spans about this many
    # grid cells.
    _TARGET_CELLS = 4

    def __init__(self, img_size=300, seed=42):
        self.img_size = img_size
        self.rng = np.random.RandomState(seed)
        self.levels = [
            {"name": "38x38", "grid": 38, "stride": 8},
            {"name": "19x19", "grid": 19, "stride": 16},
            {"name": "10x10", "grid": 10, "stride": 30},
        ]

    def generate_scene(self):
        """Return 8 random objects at 3 scales.

        Each object is ``{"box": [x1, y1, x2, y2], "scale": label}`` and
        lies fully inside the image. Per object the draws are width,
        height, x, y, in that order.
        """
        objects = []
        for count, low, high, scale in self._SCENE_SPEC:
            for _ in range(count):
                w = self.rng.randint(low, high)
                h = self.rng.randint(low, high)
                x = self.rng.randint(0, self.img_size - w)
                y = self.rng.randint(0, self.img_size - h)
                objects.append({
                    "box": [x, y, x + w, y + h],
                    "scale": scale,
                })
        return objects

    def assign_to_level(self, obj):
        """Return the name of the level whose stride best fits the object.

        The object's longer side is divided by each stride; the level
        whose result is closest to ``_TARGET_CELLS`` wins (first level
        wins ties). Returns None when there are no levels.
        """
        x1, y1, x2, y2 = obj["box"]
        obj_size = max(x2 - x1, y2 - y1)
        best = min(
            self.levels,
            key=lambda lvl: abs(
                obj_size / lvl["stride"] - self._TARGET_CELLS
            ),
            default=None,
        )
        return None if best is None else best["name"]

    def detect_at_level(self, objects, level):
        """Return the objects this single level can detect.

        An object counts as detected when its centre maps to a valid
        grid cell of ``level`` and ``level`` is the one chosen by
        ``assign_to_level``. The centre test alone is true for every
        in-image object at every level, so the size match is what makes
        levels differ.
        """
        stride = level["stride"]
        grid = level["grid"]
        detected = []
        for obj in objects:
            x1, y1, x2, y2 = obj["box"]
            col = int((x1 + x2) / 2 / stride)
            row = int((y1 + y2) / 2 / stride)
            in_grid = 0 <= col < grid and 0 <= row < grid
            if in_grid and self.assign_to_level(obj) == level["name"]:
                detected.append(obj)
        return detected

    def run(self):
        """Generate a scene and print the per-level detection report."""
        objects = self.generate_scene()
        print(f"Generated {len(objects)} objects:")
        for o in objects:
            x1, y1, x2, y2 = o["box"]
            size = max(x2 - x1, y2 - y1)
            lvl = self.assign_to_level(o)
            print(f" {o['scale']:<7} size={size:>3} "
                  f"-> best level: {lvl}")
        # Detection is deterministic, so compute it once per level and
        # reuse it for both report sections.
        per_level = [
            (level, self.detect_at_level(objects, level))
            for level in self.levels
        ]
        print("\nPer-level detection:")
        for level, det in per_level:
            print(f" {level['name']}: detects "
                  f"{len(det)}/{len(objects)}")
        print("\nSingle-scale misses:")
        for level, det in per_level:
            missed = [o for o in objects if o not in det]
            if missed:
                scales = [m["scale"] for m in missed]
                print(f" {level['name']} misses: "
                      f"{scales}")
# Run the simulation with the constructor defaults (300px image,
# seed 42); run() prints the whole report to stdout.
sim = MultiScaleDetector()
sim.run()
The key result: no single feature map level catches everything. The 10x10 map (stride 30) detects large objects well but completely misses small ones -- a 20-pixel object barely spans a single cell. The 38x38 map (stride 8) catches small objects but its receptive field is too narrow for large ones. Only by combining all three levels do you get full coverage. This is exactly why SSD and Feature Pyramid Networks predict at multiple scales -- and why modern YOLO versions adopted the same approach starting with v3.
Exercise 3: Detection model benchmarking framework.
import numpy as np
def compute_iou(box1, box2):
    """Return the intersection-over-union of two [x1, y1, x2, y2] boxes.

    Non-overlapping boxes give 0. The union is floored at 1e-6 so that
    two zero-area boxes do not divide by zero.
    """
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    inter = max(0, x2 - x1) * max(0, y2 - y1)
    a1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    a2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = a1 + a2 - inter
    return inter / max(union, 1e-6)


def compute_ap(preds, gt_boxes, iou_thresh):
    """Average precision for one image and one class.

    Args:
        preds: list of ``{"box": [x1, y1, x2, y2], "score": float}``.
        gt_boxes: list of ground-truth ``[x1, y1, x2, y2]`` boxes.
        iou_thresh: minimum IoU for a prediction to count as a match.

    Returns:
        AP in [0, 1] as a float; 0.0 when there are no ground-truth
        boxes or no predictions.

    Predictions are visited by descending score. Each is compared with
    its highest-IoU ground-truth box and is a true positive only if
    that IoU reaches the threshold and the box is not already claimed.
    AP is the area under the precision envelope (all-point
    interpolation) and starts at recall 0, so a perfect detector scores
    exactly 1.0 regardless of how many boxes the image contains.
    """
    ranked = sorted(preds, key=lambda p: p["score"], reverse=True)
    if not gt_boxes or not ranked:
        return 0.0

    tp = np.zeros(len(ranked))
    matched = set()
    for i, p in enumerate(ranked):
        best_iou, best_j = 0, -1
        for j, g in enumerate(gt_boxes):
            iou = compute_iou(p["box"], g)
            if iou > best_iou:
                best_iou, best_j = iou, j
        if best_iou >= iou_thresh and best_j not in matched:
            tp[i] = 1
            matched.add(best_j)
    fp = 1 - tp  # every prediction is exactly one of TP / FP

    tp_cum = np.cumsum(tp)
    fp_cum = np.cumsum(fp)
    prec = tp_cum / (tp_cum + fp_cum)
    rec = tp_cum / len(gt_boxes)

    # Sentinels anchor the curve at recall 0 and recall 1; without the
    # first one the area before the first recall point is lost.
    mrec = np.concatenate(([0.0], rec, [1.0]))
    mpre = np.concatenate(([0.0], prec, [0.0]))
    # Precision envelope: at each recall use the best precision reached
    # at that recall or any higher one.
    mpre = np.maximum.accumulate(mpre[::-1])[::-1]
    steps = np.nonzero(mrec[1:] != mrec[:-1])[0]
    return float(np.sum((mrec[steps + 1] - mrec[steps]) * mpre[steps + 1]))
class DetectorBenchmark:
    """Score three synthetic detectors on a random ground-truth set.

    All randomness comes from one seeded ``np.random.RandomState``, so
    results depend on the order in which detectors are evaluated.
    """

    def __init__(self, num_images=20, seed=42):
        self.rng = np.random.RandomState(seed)
        self.dataset = self._generate(num_images)

    def _random_box(self):
        """Draw one [x1, y1, x2, y2] box (draw order: x, y, w, h)."""
        left = self.rng.randint(10, 250)
        top = self.rng.randint(10, 250)
        width = self.rng.randint(30, 80)
        height = self.rng.randint(30, 80)
        return [left, top, left + width, top + height]

    def _generate(self, n):
        """Build ``n`` images, each a list of 1 to 4 random boxes."""
        images = []
        for _ in range(n):
            count = self.rng.randint(1, 5)
            images.append([self._random_box() for _ in range(count)])
        return images

    def perfect_detector(self, gt_boxes):
        """Echo every ground-truth box with score 1.0."""
        predictions = []
        for truth in gt_boxes:
            predictions.append({"box": list(truth), "score": 1.0})
        return predictions

    def noisy_detector(self, gt_boxes):
        """Find every box, each coordinate shifted by -10..9 pixels."""
        predictions = []
        for truth in gt_boxes:
            # draw the jitter before the score to keep the RNG sequence
            jitter = self.rng.randint(-10, 10, 4)
            shifted = [coord + delta for coord, delta in zip(truth, jitter)]
            confidence = self.rng.uniform(0.5, 0.99)
            predictions.append({"box": shifted, "score": confidence})
        return predictions

    def poor_detector(self, gt_boxes):
        """Find about half the boxes with -30..29 px jitter, plus junk.

        At least one random 50x50 false positive is appended per image
        (one per three ground-truth boxes, rounded down, minimum one).
        """
        predictions = []
        for truth in gt_boxes:
            if self.rng.random() >= 0.5:
                continue  # this object is missed
            jitter = self.rng.randint(-30, 30, 4)
            shifted = [coord + delta for coord, delta in zip(truth, jitter)]
            confidence = self.rng.uniform(0.3, 0.8)
            predictions.append({"box": shifted, "score": confidence})
        # Add false positives
        extra = max(1, len(gt_boxes) // 3)
        for _ in range(extra):
            left = self.rng.randint(0, 200)
            top = self.rng.randint(0, 200)
            confidence = self.rng.uniform(0.2, 0.6)
            predictions.append({
                "box": [left, top, left + 50, top + 50],
                "score": confidence,
            })
        return predictions

    def run(self):
        """Print mean AP at IoU 0.5 and 0.75 for each detector."""
        detectors = (
            ("perfect", self.perfect_detector),
            ("noisy", self.noisy_detector),
            ("poor", self.poor_detector),
        )
        print(f"{'Detector':<10} {'mAP50':>8} {'mAP75':>8}")
        print("-" * 28)
        for name, fn in detectors:
            # predictions are produced once per image and scored twice
            results = [(fn(truth), truth) for truth in self.dataset]
            m50 = np.mean(
                [compute_ap(found, truth, 0.5) for found, truth in results]
            )
            m75 = np.mean(
                [compute_ap(found, truth, 0.75) for found, truth in results]
            )
            print(f"{name:<10} {m50:>8.3f} {m75:>8.3f}")
# Build the default benchmark (20 images, seed 42) and print the
# mAP50 / mAP75 table for the three synthetic detectors.
bench = DetectorBenchmark()
bench.run()
The perfect detector gets AP=1.0 at both thresholds (obviously). The noisy detector does well at IoU=0.5 but drops at IoU=0.75 because its randomly-perturbed boxes often fall below the stricter threshold. The poor detector performs badly everywhere, but the gap between mAP50 and mAP75 is even larger -- sloppy localization gets punished hard by the stricter metric. This is exactly why the COCO benchmark uses mAP50-95 instead of just mAP50: it differentiates detectors that draw tight boxes from detectors that draw sloppy ones.
Here we go. The last two episodes were all about object detection -- drawing bounding boxes around things. Episode #78 built the foundations (IoU, NMS, R-CNN family), and episode #79 covered the modern single-shot detectors (YOLO, SSD, FCOS, CenterNet) that finally made real-time detection practical. A bounding box tells you "there's a cat somewhere inside this rectangle." That's extremely useful, but it's also... kind of crude. The box includes a bunch of background pixels that aren't cat at all. A rectangle can't represent the actual shape of a cat curled up on a couch, or a person's silhouette, or the exact boundary of a road.
Image segmentation goes one level deeper. Instead of drawing rectangles, it labels every single pixel. Is this pixel part of the cat? Is this one? What about this one? The output is a full-resolution mask -- the same dimensions as the input image, but instead of RGB color values, each pixel holds a class label or an instance ID. When a self-driving car needs to know the exact shape of the road surface (not a boxy approximation), when a surgeon needs to see the precise boundary of a tumour on a scan, when a photo editing tool needs to select a person's silhouette pixel by pixel -- that's segmentation ;-)
This is arguably the most computationally demanding visual understanding task. Classification produces one number per image. Detection produces a handful of boxes. Segmentation produces one decision for every pixel -- for a 640x480 image that's 307,200 independent predictions. The architectures need to be clever about maintaining spatial precision while still capturing the high-level semantic understanding of what things are.
Before we build anything, let's get the terminology straight. There are three distinct segmentation tasks, and they require different architectures:
Semantic segmentation: classify every pixel into a category. Sky, road, car, person, tree -- every pixel gets a label. But if two cars are parked next to each other, both just get labeled "car." You can't tell them apart.
Instance segmentation: detect individual objects AND produce a pixel mask for each one. Now those two parked cars are "car_1" and "car_2," each with its own separate mask. But background stuff (sky, road) isn't segmented -- only countable "things."
Panoptic segmentation: the unified approach. Background "stuff" (sky, road, grass) gets semantic labels. Foreground "things" (cars, people, animals) get instance labels. Every pixel accounted for, objects separated.
Input: [Two cats sitting on a green couch]
Semantic: [cat, cat, couch, couch, wall, wall]
(both cats share the same label)
Instance: [cat_1, cat_2, -, -, -, -]
(cats separated, background ignored)
Panoptic: [cat_1, cat_2, couch, couch, wall, wall]
(everything labeled, cats separated)
For most practical applications, semantic segmentation is the starting point. Medical imaging (segment the tumour from healthy tissue), autonomous driving (segment road, sidewalk, lane markings), satellite imagery (segment buildings, vegetation, water). Instance segmentation matters when you need to count or track individual objects -- how many people are in this frame, and where exactly is each one?
Here's the fundamental tension that makes segmentation architecturally interesting. We covered this back in episode #45: CNNs progressively shrink spatial dimensions through pooling and strided convolutions. A ResNet backbone takes a 224x224 input and produces a 7x7 feature map by the final layer. That 7x7 representation is semantically rich -- it knows what objects are present, roughly where they are, what the scene looks like. But it has lost about 97% of the spatial resolution along each axis (224 -> 7), which is more than 99.9% of the original pixel positions. The information about precise boundaries, thin structures, fine-grained edges -- gone.
Classification doesn't care. It just needs the semantic information from that 7x7 map to say "this image contains a cat."
Detection mostly doesn't care either. The bounding box coordinates get regressed from the feature map, and a few pixels of imprecision in the box is tolerable.
Segmentation cares deeply. You need the output to be the same resolution as the input. Every pixel needs a prediction. So the core architectural question becomes: how do you recover the spatial detail that the encoder threw away?
There are two main strategies:
Encoder-decoder: contract down (lose spatial detail, gain semantic detail), then expand back up (recover spatial detail using the semantic understanding). This is the U-Net approach.
Dilated/atrous convolutions: never lose the resolution in the first place. Use convolutions with gaps between kernel elements to increase the receptive field without reducing spatial dimensions. This is the DeepLab approach.
Both work. Both are still used. Let's build them.
U-Net (Ronneberger et al., 2015) was originally designed for biomedical image segmentation where you often have very few annotated images. Its architecture is beautifully simple: a contracting encoder path, an expanding decoder path, and skip connections that pipe fine-grained features from corresponding encoder levels directly into the decoder.
Encoder (contracting) Decoder (expanding)
Input 256x256 Output 256x256
| conv+pool ^ upconv+concat
128x128 ======================== 128x128
| conv+pool ^ upconv+concat
64x64 ======================== 64x64
| conv+pool ^ upconv+concat
32x32 ======================== 32x32
| conv+pool ^ upconv+concat
16x16 ======================== 16x16
| conv
16x16 (bottleneck)
The name comes from the shape: it looks like the letter U. The left side goes down (encoder), the bottom is the bottleneck, the right side goes up (decoder), and the horizontal lines are skip connections.
Why are the skip connections so important? Without them, the decoder has to reconstruct all spatial detail from the compressed bottleneck alone. That's like asking someone to draw a detailed portrait from a one-sentence description. With skip connections, the decoder gets the original high-resolution features from the encoder AND the semantic understanding from the bottleneck. It can combine both -- using the encoder features for precise boundary information and the bottleneck features for knowing what class each region belongs to.
import torch
import torch.nn as nn
class UNetBlock(nn.Module):
    """Conv3x3 -> BatchNorm -> ReLU, applied twice.

    The first convolution maps ``in_ch`` to ``out_ch`` channels and the
    second keeps ``out_ch``; padding=1 leaves height and width unchanged.
    """

    def __init__(self, in_ch, out_ch):
        super().__init__()
        stages = []
        for stage_in in (in_ch, out_ch):
            stages.extend([
                nn.Conv2d(stage_in, out_ch, 3, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(inplace=True),
            ])
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        features = self.conv(x)
        return features
class UNet(nn.Module):
    """Full U-Net for semantic segmentation.

    Four encoder stages (64, 128, 256, 512 channels), each followed by
    2x2 max pooling, a 1024-channel bottleneck, and four decoder stages
    that upsample with a stride-2 transposed convolution and concatenate
    the matching encoder output (skip connection) on the channel axis.

    Args:
        in_channels: number of channels of the input image.
        num_classes: number of score maps produced per pixel.

    ``forward`` takes a (B, in_channels, H, W) tensor and returns raw
    class scores of shape (B, num_classes, H, W). H and W do not have
    to be multiples of 16, but must be at least 16 so that four
    poolings leave a non-empty bottleneck.
    """

    def __init__(self, in_channels=3, num_classes=21):
        super().__init__()
        # Encoder (contracting path)
        self.enc1 = UNetBlock(in_channels, 64)
        self.enc2 = UNetBlock(64, 128)
        self.enc3 = UNetBlock(128, 256)
        self.enc4 = UNetBlock(256, 512)
        self.pool = nn.MaxPool2d(2)
        # Bottleneck
        self.bottleneck = UNetBlock(512, 1024)
        # Decoder (expanding path). Each dec block takes twice the
        # channels its up-conv emits because the skip is concatenated.
        self.up4 = nn.ConvTranspose2d(1024, 512, 2, stride=2)
        self.dec4 = UNetBlock(1024, 512)
        self.up3 = nn.ConvTranspose2d(512, 256, 2, stride=2)
        self.dec3 = UNetBlock(512, 256)
        self.up2 = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec2 = UNetBlock(256, 128)
        self.up1 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec1 = UNetBlock(128, 64)
        # Final classification layer
        self.out_conv = nn.Conv2d(64, num_classes, 1)

    @staticmethod
    def _merge(upsampled, skip):
        """Concatenate an upsampled map with its encoder skip tensor.

        MaxPool2d(2) floors odd sizes, so after a 2x up-conv the map can
        be one pixel smaller than the skip. Only in that case is it
        resized to the skip's height and width; matching sizes pass
        through untouched.
        """
        if upsampled.shape[-2:] != skip.shape[-2:]:
            upsampled = nn.functional.interpolate(
                upsampled, size=skip.shape[-2:],
                mode="bilinear", align_corners=False,
            )
        return torch.cat([upsampled, skip], dim=1)

    def forward(self, x):
        # Encode
        e1 = self.enc1(x)
        e2 = self.enc2(self.pool(e1))
        e3 = self.enc3(self.pool(e2))
        e4 = self.enc4(self.pool(e3))
        # Bottleneck
        b = self.bottleneck(self.pool(e4))
        # Decode with skip connections
        d4 = self.dec4(self._merge(self.up4(b), e4))
        d3 = self.dec3(self._merge(self.up3(d4), e3))
        d2 = self.dec2(self._merge(self.up2(d3), e2))
        d1 = self.dec1(self._merge(self.up1(d2), e1))
        return self.out_conv(d1)
# Test
# Smoke test: push one random 256x256 RGB image through a 21-class
# U-Net and confirm the output keeps the input's height and width.
# The model is never switched to eval(), so it runs in training mode.
model = UNet(in_channels=3, num_classes=21)
x = torch.randn(1, 3, 256, 256)
out = model(x)
print(f"Input: {x.shape}")
# torch.Size([1, 3, 256, 256])
print(f"Output: {out.shape}")
# torch.Size([1, 21, 256, 256])
# 21 class scores per pixel
Notice the channel dimensions in the decoder blocks: UNetBlock(1024, 512) for dec4. The input is 1024 because we're concatenating the upsampled bottleneck output (512 channels) with the encoder skip connection (512 channels) along the channel dimension. That's torch.cat([self.up4(b), e4], dim=1) -- the dim=1 is the channel axis.
The ConvTranspose2d (also called transposed convolution or sometimes "deconvolution" -- though that name is technically wrong) is how we upsample. It's the learned counterpart to pooling: where pooling reduces spatial dimensions by 2x, transposed convolution increases them by 2x. You can think of it as a convolution that inserts zeros between input elements and then applies a regular convolution on the expanded grid. The network learns the upsample filter weights during training, rather than using a fixed interpolation like bilinear resize.
Since we're building architectures that upsample, it's worth understanding what ConvTranspose2d actually does under the hood:
import torch
import torch.nn as nn
# Regular convolution: shrinks spatial dims
# Output side = floor((8 + 2*1 - 3) / 2) + 1 = 4
conv = nn.Conv2d(1, 1, kernel_size=3,
                 stride=2, padding=1)
x = torch.randn(1, 1, 8, 8)
print(f"Conv: {x.shape} -> {conv(x).shape}")
# (1,1,8,8) -> (1,1,4,4) -- halved
# Transposed convolution: grows spatial dims
# Output side = (4 - 1) * 2 + 2 = 8; kernel == stride, so output
# patches do not overlap
tconv = nn.ConvTranspose2d(1, 1, kernel_size=2,
                           stride=2)
y = torch.randn(1, 1, 4, 4)
print(f"TConv: {y.shape} -> {tconv(y).shape}")
# (1,1,4,4) -> (1,1,8,8) -- doubled
# Alternative: bilinear interpolation + 1x1 conv
# Some architectures prefer this because transposed
# convolutions can produce checkerboard artifacts
# The Upsample layer has no weights; only the 1x1 conv is learned.
up = nn.Upsample(scale_factor=2, mode="bilinear",
                 align_corners=True)
proj = nn.Conv2d(1, 1, 1)
z = torch.randn(1, 1, 4, 4)
print(f"Bilinear: {z.shape} -> "
      f"{proj(up(z)).shape}")
# (1,1,4,4) -> (1,1,8,8) -- also doubled
The checkerboard artifact problem with transposed convolutions is a well-known issue. When the kernel size is not evenly divisible by the stride, the overlapping output regions get uneven contributions, creating a visible grid pattern. Using kernel_size=2 with stride=2 (like we do in U-Net) avoids this. Some modern architectures skip transposed convolutions entirely and use bilinear upsampling followed by a regular convolution -- same effect, no artifacts, slightly slower.
Regular cross-entropy works for segmentation -- you just apply it per-pixel instead of per-image. But there's a catch: class imbalance. In a typical driving scene, 60% of pixels might be "road," 20% "sky," and the remaining 20% split across cars, pedestrians, signs, buildings, and a dozen other classes. A lazy model can get 80% accuracy by just predicting "road" or "sky" everywhere and completely ignoring rare but critical classes like pedestrians.
Dice loss directly optimizes the overlap metric, which is what we actually care about:
import torch
import torch.nn.functional as F
def dice_loss(predictions, targets, num_classes,
              smooth=1.0, ignore_index=None):
    """Soft Dice loss for segmentation, averaged over classes.

    Args:
        predictions: (B, C, H, W) raw logits.
        targets: (B, H, W) integer class labels.
        num_classes: number of classes to average over (classes
            0 .. num_classes-1).
        smooth: added to numerator and denominator so a class absent
            from both prediction and target yields dice = 1, not 0/0.
        ignore_index: optional label value (e.g. 255) whose pixels are
            excluded from every sum. ``None`` (default) keeps all
            pixels, matching the previous behaviour.

    Returns:
        Scalar tensor: mean over classes of ``1 - dice``. Sums are
        taken over the whole batch, not per image.
    """
    probs = F.softmax(predictions, dim=1)
    # 1.0 where the pixel takes part in the loss, 0.0 where ignored
    valid = None
    if ignore_index is not None:
        valid = (targets != ignore_index).float()

    total_loss = 0.0
    for c in range(num_classes):
        pred_c = probs[:, c]
        target_c = (targets == c).float()
        if valid is not None:
            # Without this, probability mass predicted on ignored
            # pixels would inflate the denominator for every class.
            pred_c = pred_c * valid
            target_c = target_c * valid
        intersection = (pred_c * target_c).sum()
        union = pred_c.sum() + target_c.sum()
        dice = (2.0 * intersection + smooth) / (union + smooth)
        total_loss += 1.0 - dice
    return total_loss / num_classes
The smooth parameter (also called Laplace smoothing) prevents division by zero when a class isn't present in the image. Without it, a class with zero ground truth pixels would give 0/0.
Focal loss (Lin et al., 2017) is another approach to the imbalance problem. Instead of down-weighting the loss for frequent classes, it down-weights the loss for easy examples. A pixel that the model confidently classifies correctly gets almost zero loss. A pixel the model is uncertain about gets high loss. This forces the model to focus its learning capacity on the hard cases:
def focal_loss(predictions, targets, gamma=2.0,
               alpha=0.25, ignore_index=None):
    """Focal loss: reduce loss for easy examples.

    Args:
        predictions: (B, C, H, W) raw logits.
        targets: (B, H, W) integer class labels.
        gamma: focusing exponent; 0 reduces to alpha * cross-entropy.
        alpha: constant scale applied to every pixel's loss.
        ignore_index: optional label value (e.g. 255) whose pixels are
            dropped before averaging. ``None`` (default) averages over
            all pixels.

    Returns:
        Scalar tensor: mean of ``alpha * (1 - p_t)**gamma * CE`` over
        the contributing pixels (0 if every pixel is ignored).
    """
    if ignore_index is None:
        ce_loss = F.cross_entropy(
            predictions, targets, reduction="none"
        )
    else:
        ce_loss = F.cross_entropy(
            predictions, targets, reduction="none",
            ignore_index=ignore_index,
        )
        # Drop ignored pixels so they neither enter the mean nor hit
        # the (1 - p_t) ** gamma term at p_t == 1.
        ce_loss = ce_loss[targets != ignore_index]
        if ce_loss.numel() == 0:
            # Keep a graph-connected zero so backward() still works.
            return predictions.sum() * 0.0
    # p_t = probability assigned to the correct class. CE is -log(p_t),
    # so p_t = exp(-CE); no second softmax and no gather on labels that
    # may lie outside [0, C).
    p_t = (-ce_loss).exp()
    # Modulating factor: (1-p_t)^gamma
    # High p_t (easy example) -> low factor
    # Low p_t (hard example) -> high factor
    focal_weight = (1.0 - p_t) ** gamma
    loss = alpha * focal_weight * ce_loss
    return loss.mean()
In practice, a combined loss often works best: total_loss = cross_entropy + dice_loss. The cross-entropy provides stable gradients early in training, while the Dice loss pushes for better overlap on small classes.
Putting the loss functions together with a real training loop:
def train_segmentation(model, dataloader, optimizer,
                       num_classes, device="cuda",
                       ignore_index=255):
    """One epoch of segmentation training.

    Args:
        model: module mapping (B, 3, H, W) images to
            (B, num_classes, H, W) logits; expected to already live on
            ``device`` (this function only moves the batches).
        dataloader: iterable of ``(images, masks)`` batches; masks are
            (B, H, W) integer class labels. ``len()`` is not required.
        optimizer: optimizer over the model's parameters.
        num_classes: number of classes, forwarded to ``dice_loss``.
        device: device the batches are moved to.
        ignore_index: label skipped by the cross-entropy term.

    Returns:
        Mean combined loss (cross-entropy + Dice) per batch as a float,
        or 0.0 when the dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for images, masks in dataloader:
        # images: (B, 3, H, W) normalized floats
        # masks: (B, H, W) integer class labels
        images = images.to(device)
        masks = masks.to(device)
        predictions = model(images)
        # predictions: (B, num_classes, H, W)
        # Combined loss
        ce = F.cross_entropy(
            predictions, masks, ignore_index=ignore_index
        )
        # NOTE: dice_loss is called with all pixels here; only the
        # cross-entropy term skips ignore_index.
        dl = dice_loss(predictions, masks, num_classes)
        loss = ce + dl
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    if num_batches == 0:
        # Nothing was trained; avoid dividing by zero.
        return 0.0
    return total_loss / num_batches
The ignore_index=255 tells cross-entropy to skip pixels labeled 255 -- a common convention for boundary pixels or unlabeled regions in datasets like PASCAL VOC and Cityscapes. Annotating perfect boundaries between objects is hard, so datasets often leave a thin border as "don't care."
The standard segmentation metric is mean Intersection over Union (mIoU). Same IoU concept from detection (episode #78), but applied to pixel masks instead of bounding boxes:
import numpy as np
def compute_miou(predictions, targets, num_classes,
                 ignore_index=None):
    """Mean IoU across all classes.

    Args:
        predictions: (N, H, W) predicted class per pixel.
        targets: (N, H, W) ground truth class per pixel.
        num_classes: classes 0 .. num_classes-1 are evaluated.
        ignore_index: optional label value (e.g. 255); pixels whose
            ground truth equals it are excluded from every class.
            ``None`` (default) evaluates all pixels.

    Returns:
        Mean of the per-class IoUs as a float. Classes absent from both
        prediction and ground truth are skipped; 0.0 if none remain.
    """
    valid = None
    if ignore_index is not None:
        valid = targets != ignore_index

    ious = []
    for c in range(num_classes):
        pred_mask = (predictions == c)
        gt_mask = (targets == c)
        if valid is not None:
            # A prediction on an unlabeled pixel is neither right nor
            # wrong, so it must not enlarge the union.
            pred_mask = pred_mask & valid
            gt_mask = gt_mask & valid
        union = (pred_mask | gt_mask).sum()
        if union == 0:
            # Class not present in GT or predictions
            # Skip (don't penalize for absent classes)
            continue
        intersection = (pred_mask & gt_mask).sum()
        ious.append(intersection / union)
    return float(np.mean(ious)) if ious else 0.0
# Simulated example: 2 images, 3 classes,
# 8x8 pixels each
np.random.seed(42)
gt = np.random.randint(0, 3, (2, 8, 8))
pred = gt.copy()
# Introduce some errors
# Image 0: shift the top-left 2x2 patch to the next class (mod 3),
# so all four pixels are guaranteed wrong.
pred[0, :2, :2] = (pred[0, :2, :2] + 1) % 3
# Image 1: force a 2x2 patch to class 0; only pixels whose ground
# truth was not already 0 become errors.
pred[1, 4:6, 4:6] = 0
miou = compute_miou(pred, gt, num_classes=3)
print(f"mIoU: {miou:.3f}")
# High because most pixels match
State-of-the-art mIoU on Cityscapes (urban driving, 19 classes) is around 84-86%. On ADE20K (150 classes, much more diverse) it's around 55-60%. The difficulty scales dramatically with the number of classes and the diversity of scenes -- 150 classes including rare ones like "escalator" and "chandelier" is a much harder problem than 19 classes of road stuff.
We covered Faster R-CNN in episode #78 -- the two-stage detector that proposes regions with an RPN and then classifies them. Mask R-CNN (He et al., 2017) adds a third branch: for each detected object, in addition to predicting the class label and bounding box, it predicts a pixel-level binary mask.
Faster R-CNN:
Image -> Backbone -> FPN -> RPN -> RoI Align
-> Class + Box heads
Mask R-CNN adds:
-> RoI Align -> Mask head (small FCN)
-> 28x28 binary mask per instance
The critical upgrade from Faster R-CNN is RoI Align replacing RoI Pool. RoI Pool quantizes floating-point region coordinates to integer grid positions, which introduces misalignment of up to half a pixel. For bounding box regression, half a pixel doesn't matter. For pixel-accurate masks, it absolutely does. RoI Align uses bilinear interpolation at the exact floating-point coordinates, preserving sub-pixel spatial alignment.
import torch
import torchvision
from torchvision.models.detection import (
maskrcnn_resnet50_fpn_v2,
MaskRCNN_ResNet50_FPN_V2_Weights,
)
# Load pretrained Mask R-CNN
weights = MaskRCNN_ResNet50_FPN_V2_Weights.DEFAULT
model = maskrcnn_resnet50_fpn_v2(weights=weights)
model.eval()
# Prepare an image
preprocess = weights.transforms()
# Random stand-in for a real photo. randint's upper bound is
# exclusive, so 256 is needed to cover the full uint8 range 0..255.
image = torch.randint(
    0, 256, (3, 480, 640), dtype=torch.uint8
)
# The detection models take a list of per-image tensors.
batch = [preprocess(image)]
# Inference
with torch.no_grad():
    predictions = model(batch)[0]
# Results: boxes, labels, scores, masks
print(f"Detected objects: {len(predictions['scores'])}")
for i in range(min(5, len(predictions["scores"]))):
    score = predictions["scores"][i].item()
    if score < 0.5:
        # NOTE(review): stopping at the first low score assumes the
        # detections are sorted by descending score -- confirm.
        break
    label = predictions["labels"][i].item()
    # masks has a singleton channel axis; index 0 selects the 2-D map
    mask = predictions["masks"][i, 0]
    box = predictions["boxes"][i].tolist()
    # soft mask -> binary mask at 0.5, then count foreground pixels
    pixels = (mask > 0.5).sum().item()
    print(f" Object {i}: class={label}, "
          f"score={score:.2f}, "
          f"mask={pixels} pixels")
Each mask is output at 28x28 resolution (the internal mask head resolution) and then resized to match the object's bounding box in the original image. The masks are soft probabilities -- threshold at 0.5 for a binary mask. Mask R-CNN's mask quality is genuinely impressive, even for complex shapes like people with outstretched arms or bicycles with thin spokes.
Instead of the encode-then-decode strategy, DeepLabV3 (Chen et al., 2017) takes a different approach: don't lose the resolution in the first place. It uses dilated (atrous) convolutions that increase the receptive field without reducing spatial dimensions:
Standard 3x3 convolution (rate=1):
[x x x]
[x x x] 3x3 receptive field
[x x x] 9 parameters
Dilated 3x3 convolution (rate=2):
[x . x . x]
[. . . . .]
[x . x . x] 5x5 receptive field
[. . . . .] 9 parameters (same!)
[x . x . x]
Dilated 3x3 convolution (rate=4):
[x . . . x . . . x]
9x9 receptive field, still 9 parameters
The dots represent gaps -- the kernel is "inflated" with zeros. You get a much larger receptive field (the network "sees" more context at each position) with zero additional parameters. The tradeoff is that you're sampling sparsely, so you might miss fine details between the kernel elements. But for segmentation where you need both local precision and global context, this is a very effective compromise.
DeepLabV3's key component is ASPP (Atrous Spatial Pyramid Pooling): apply multiple dilated convolutions at different rates in parallel and combine their outputs. This captures context at multiple scales simultaneously:
import torch
import torch.nn as nn
import torch.nn.functional as F
class ASPP(nn.Module):
    """Atrous Spatial Pyramid Pooling.

    Five parallel branches read the same feature map: a 1x1 conv,
    three 3x3 convs dilated by 6, 12 and 18, and a global-average-pool
    branch. Their outputs are concatenated on the channel axis and
    projected back to ``out_channels`` with a 1x1 conv. Height and
    width are preserved.
    """

    def __init__(self, in_channels, out_channels=256):
        super().__init__()
        # 1x1 convolution (no dilation)
        self.conv1x1 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )
        # 3x3 at different dilation rates
        self.conv_r6, self.conv_r12, self.conv_r18 = (
            self._atrous_conv(in_channels, out_channels, rate=r)
            for r in (6, 12, 18)
        )
        # Global average pooling branch (collapses H x W to 1 x 1)
        self.global_pool = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, out_channels, 1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )
        # Combine all branches
        self.project = nn.Sequential(
            nn.Conv2d(out_channels * 5, out_channels, 1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def _atrous_conv(self, in_ch, out_ch, rate):
        """Build a dilated 3x3 conv + BN + ReLU branch.

        padding == dilation keeps the spatial size of a 3x3 kernel.
        """
        dilated = nn.Conv2d(in_ch, out_ch, 3,
                            padding=rate, dilation=rate)
        return nn.Sequential(
            dilated,
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        height, width = x.shape[2:]
        local_branches = (
            self.conv1x1, self.conv_r6,
            self.conv_r12, self.conv_r18,
        )
        outputs = [branch(x) for branch in local_branches]
        # Global pool -> upsample to match spatial size
        pooled = F.interpolate(
            self.global_pool(x), size=(height, width),
            mode="bilinear", align_corners=True,
        )
        outputs.append(pooled)
        # Concatenate all 5 branches
        return self.project(torch.cat(outputs, dim=1))
# Test
aspp = ASPP(in_channels=2048)
# Switch to eval mode: with a single image, the BatchNorm that follows
# global average pooling sees one value per channel, which PyTorch
# rejects in training mode ("Expected more than 1 value per channel").
aspp.eval()
feat = torch.randn(1, 2048, 32, 32)
# Shape check only, so no gradients are needed.
with torch.no_grad():
    out = aspp(feat)
print(f"ASPP: {feat.shape} -> {out.shape}")
# (1, 2048, 32, 32) -> (1, 256, 32, 32)
# Spatial size preserved, channels reduced
The five parallel branches each see the same spatial positions but with different amounts of context. Rate 6 captures local structure, rate 18 captures broader scene context, and the global average pooling branch captures the entire image's statistics. The concatenation and 1x1 projection let the network learn which scale of context matters most for each pixel.
In practice, using pretrained DeepLabV3 from torchvision is straightforward:
from torchvision.models.segmentation import (
deeplabv3_resnet101,
DeepLabV3_ResNet101_Weights,
)
weights = DeepLabV3_ResNet101_Weights.DEFAULT
model = deeplabv3_resnet101(weights=weights)
model.eval()
preprocess = weights.transforms()
# Random stand-in for a real photo. randint's upper bound is
# exclusive, so 256 is needed to cover the full uint8 range 0..255.
img = torch.randint(
    0, 256, (3, 480, 640), dtype=torch.uint8
)
# Segmentation models take one batched tensor: add the batch axis.
batch = preprocess(img).unsqueeze(0)
with torch.no_grad():
    output = model(batch)["out"]
# (1, 21, H, W) -- 21 PASCAL VOC classes
# argmax over the class axis gives one label per pixel; squeeze drops
# the batch axis of size 1.
preds = output.argmax(dim=1).squeeze().numpy()
print(f"Prediction map shape: {preds.shape}")
print(f"Unique classes found: {set(preds.flat)}")
# Class mapping (PASCAL VOC)
# The list index is the class id found in `preds`.
class_names = [
    "background", "aeroplane", "bicycle", "bird",
    "boat", "bottle", "bus", "car", "cat", "chair",
    "cow", "dining_table", "dog", "horse",
    "motorbike", "person", "potted_plant", "sheep",
    "sofa", "train", "tv_monitor",
]
SAM (Kirillov et al., 2023) represents a paradigm shift in segmentation, mirroring the same foundation model revolution we saw in NLP with GPT and BERT (episodes #58-59). Instead of training a new segmentation model for each specific task, SAM is a general-purpose segmentation engine. You provide a prompt -- a point click, a bounding box, or text -- and SAM generates the mask. It was trained on 11 million images with 1.1 billion mask annotations, the largest segmentation dataset ever assembled.
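The architecture has three parts: an image encoder (a Vision Transformer that turns the image into an embedding), a prompt encoder (which embeds points, boxes, or text), and a lightweight mask decoder (which combines the image embedding with the prompt embedding to predict masks).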
The key insight is that the image encoder runs once, and then you can try different prompts interactively. Click on the cat, get the cat mask. Click on the couch, get the couch mask. Same image embedding, different prompts. This makes SAM feel instantaneous in interactive use.
import numpy as np
from segment_anything import (
    SamAutomaticMaskGenerator,
    SamPredictor,
    sam_model_registry,
)

# Load SAM (download weights separately)
sam = sam_model_registry["vit_h"](
    checkpoint="sam_vit_h.pth"
)
predictor = SamPredictor(sam)

# Encode image once; every predict() call below reuses this embedding.
# NOTE(review): image_rgb is not defined in this snippet -- it is presumably
# an H x W x 3 uint8 array in RGB channel order (the default format of
# SamPredictor.set_image); confirm against the image-loading code.
predictor.set_image(image_rgb)

# Prompt with a point: click on what you want
# Point coordinates are (x, y) in pixels, one row per point.
input_point = np.array([[500, 300]])
input_label = np.array([1])  # 1=foreground
masks, scores, logits = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    multimask_output=True,
)
# Returns 3 masks at different granularities
# (whole object, part, subpart)
best_mask = masks[scores.argmax()]
print(f"Best score: {scores.max():.3f}")
print(f"Mask pixels: {best_mask.sum()}")

# Prompt with a bounding box
# Box is given in XYXY format: (x_min, y_min, x_max, y_max).
box = np.array([100, 50, 400, 350])
masks, scores, logits = predictor.predict(
    box=box,
    multimask_output=False,
)

# Segment everything automatically
mask_gen = SamAutomaticMaskGenerator(sam)
all_masks = mask_gen.generate(image_rgb)
print(f"Found {len(all_masks)} objects")

# Report the five largest masks by pixel area.
for m in sorted(all_masks,
                key=lambda x: x["area"],
                reverse=True)[:5]:
    print(f" Area: {m['area']:>8d}px, "
          f"IoU: {m['predicted_iou']:.3f}, "
          f"Stability: {m['stability_score']:.3f}")
SAM's multimask_output=True mode returns three masks at different granularity levels -- think of it as "did you mean the whole person, just their shirt, or just the collar?" The stability_score measures how consistent the mask is under small changes to the cutoff used to binarize the predicted mask logits: it is the IoU between the masks obtained with a slightly higher and a slightly lower threshold. High stability means the model is confident about the boundary.
This mirrors what happened with LLMs. We went from training a separate NLP model for sentiment, translation, summarization, QA, and every other task... to having GPT/BERT handle everything through prompting. SAM does the same for segmentation. Instead of "medical segmentation model," "satellite segmentation model," "product segmentation model," you use SAM as the universal segmentation engine and customize through prompts. The field is converging on foundation models for vision, just as it did for language.
| Task | Architecture | When to use |
|---|---|---|
| Semantic segmentation | U-Net, DeepLabV3 | Pixel-level classification (medical, driving, satellite) |
| Instance segmentation | Mask R-CNN | Need to count/track individual objects |
| Interactive segmentation | SAM | Point-and-click, any domain, zero-shot |
| Panoptic segmentation | Panoptic FPN, Mask2Former | Need both stuff and things labeled |
| Real-time segmentation | BiSeNet, STDC | Edge devices, video at 30+ FPS |
For most new projects in 2024+, SAM is the starting point. If you need real-time performance on an edge device, go with a lightweight specialized model. If you need task-specific accuracy beyond what SAM provides, fine-tune a U-Net or DeepLabV3 on your domain data.
Segmentation gives us pixel-level understanding. We now know what is in an image (classification), where it is (detection), and exactly which pixels belong to it (segmentation). But there's another dimension of visual understanding we haven't touched yet -- understanding the spatial structure and pose of objects. A person isn't just a blob of pixels labeled "person." They have arms, legs, joints, and those joints move in specific ways. Tracking that skeletal structure opens up entirely new applications, from sports analytics to AR to gesture control.
Exercise 1: Build a segmentation mask visualization toolkit. Create a class SegmentationVisualizer that: (a) takes a prediction array of shape (H, W) with integer class labels and a class-to-color mapping dictionary, (b) generates a colorized mask image where each class gets its assigned RGB color, (c) implements an overlay method that blends the colorized mask with the original image at a configurable alpha transparency (default 0.5), (d) implements a class_statistics method that computes and prints for each class: pixel count, percentage of total image area, and whether the class is present, (e) implements an iou_per_class method that takes a ground truth array and computes per-class IoU plus mIoU. Test with a synthetic 200x200 prediction map containing 4 classes and verify that mIoU is 1.0 when prediction equals ground truth.
Exercise 2: Implement a simplified U-Net with configurable depth. Create a class FlexibleUNet(nn.Module) that: (a) takes a depth parameter (2 to 5) controlling how many encoder/decoder levels to build, (b) takes a base_channels parameter (default 64) for the first encoder level -- each subsequent level doubles the channels, (c) dynamically creates the encoder blocks, decoder blocks, upsampling layers, and skip connections based on the depth parameter using nn.ModuleList, (d) includes a parameter_count method that returns the total number of trainable parameters, (e) prints the per-level channel dimensions and feature map sizes during the first forward pass. Build U-Nets at depth 2, 3, 4, and 5, run a dummy 256x256 input through each, and print a comparison table of depth vs parameter count vs output shape. Verify that output spatial dimensions always match input spatial dimensions regardless of depth.
Exercise 3: Build a Dice coefficient monitor for tracking segmentation training progress. Create a class DiceMonitor that: (a) implements update(predictions, targets) that takes batches of segmentation predictions (B, C, H, W) logits and targets (B, H, W) integer labels and accumulates true positives, false positives, and false negatives per class across the entire epoch, (b) implements compute() that returns per-class Dice coefficients and the mean Dice, (c) implements reset() to clear accumulators at the start of each epoch, (d) implements report() that prints a formatted table of per-class Dice scores with a visual bar chart using ASCII characters. Simulate 10 training batches with improving predictions (start with random noise, gradually shift toward ground truth) and show how the Dice scores increase. Verify that perfect predictions give Dice=1.0 for all classes.