Image Representation and Color Spaces

bash

pip install pillow==11.0.0 numpy==2.1.2 opencv-python==4.10.0.84 \
    matplotlib==3.9.2 torch==2.5.1 torchvision==0.20.1

python

from PIL import Image
import numpy as np

# Image.open only identifies the file; it does not promise a pixel mode.
# A JPEG can be grayscale ("L") or CMYK, so convert explicitly to make the
# (H, W, 3) layout below hold for any input. The `with` block releases the
# file handle once the pixels have been decoded by convert().
with Image.open("cat.jpg") as src:
    img = src.convert("RGB")            # PIL Image, 'RGB' mode
arr = np.array(img)
print(arr.shape, arr.dtype)             # (H, W, 3), uint8
print(arr.min(), arr.max())             # within 0..255; exactly 0 and 255 only if the image has pure black and white

python

# Colour pixels are stored channel-last in red-green-blue order (PIL convention),
# so indexing one (row, col) position yields a length-3 vector.
top_left = arr[0, 0]
print(top_left)                         # [R, G, B] for the top-left pixel

# Mode "L" is PIL's single-channel luminance image: the channel axis disappears.
gray = img.convert("L")
gray_arr = np.array(gray)
print(gray_arr.shape)                   # (H, W)

python · editable · runnable

import numpy as np
import matplotlib.pyplot as plt

# Synthesize a 64x64 float RGB image with values in [0, 1]:
# two colour ramps as background, then two flat-coloured shapes on top.
H = W = 64
col_ramp = np.linspace(0, 1, W)
row_ramp = np.linspace(0, 1, H)
xx, yy = np.meshgrid(col_ramp, row_ramp)   # xx varies along columns, yy along rows

img = np.zeros((H, W, 3))
img[..., 0] = xx                     # red grows from left to right
img[..., 1] = 0.25                   # constant, faint green
img[..., 2] = yy                     # blue grows from top to bottom

# Shapes overwrite the background: an axis-aligned square via slicing,
# and a disk via a boolean mask on squared distance from (0.7, 0.7).
img[8:24, 8:24] = [1.0, 0.1, 0.1]
dist_sq = (xx - 0.7) ** 2 + (yy - 0.7) ** 2
disk = dist_sq < 0.02
img[disk] = [0.1, 0.9, 0.2]

# Split channels (views, not copies) and mix them into a single gray plane.
r, g, b = (img[..., c] for c in range(3))
gray = 0.299 * r + 0.587 * g + 0.114 * b

print("image shape:", img.shape)
print("one channel:", r.shape)
print("grayscale = 0.299 R + 0.587 G + 0.114 B")
# One sample pixel inside each shape, to compare colour against gray value.
for label, (row, col) in (("red-square pixel", (16, 16)), ("green-disk pixel", (45, 45))):
    print(label, np.round(img[row, col], 2), "has gray value", round(float(gray[row, col]), 3))

# Left: the colour image. Right: the red channel alone through the "hot" colormap.
fig, axes = plt.subplots(1, 2, figsize=(9, 4))
panels = (
    (img, None, "RGB image built from numbers"),
    (r, "hot", "Red channel as heat"),
)
for ax, (pixels, cmap, title) in zip(axes, panels):
    ax.imshow(pixels, cmap=cmap)
    ax.set_title(title)
    ax.axis("off")
fig.tight_layout()

import numpy as np
import matplotlib.pyplot as plt

# Synthesize a 64x64 float RGB image with values in [0, 1]:
# two colour ramps as background, then two flat-coloured shapes on top.
H = W = 64
col_ramp = np.linspace(0, 1, W)
row_ramp = np.linspace(0, 1, H)
xx, yy = np.meshgrid(col_ramp, row_ramp)   # xx varies along columns, yy along rows

img = np.zeros((H, W, 3))
img[..., 0] = xx                     # red grows from left to right
img[..., 1] = 0.25                   # constant, faint green
img[..., 2] = yy                     # blue grows from top to bottom

# Shapes overwrite the background: an axis-aligned square via slicing,
# and a disk via a boolean mask on squared distance from (0.7, 0.7).
img[8:24, 8:24] = [1.0, 0.1, 0.1]
dist_sq = (xx - 0.7) ** 2 + (yy - 0.7) ** 2
disk = dist_sq < 0.02
img[disk] = [0.1, 0.9, 0.2]

# Split channels (views, not copies) and mix them into a single gray plane.
r, g, b = (img[..., c] for c in range(3))
gray = 0.299 * r + 0.587 * g + 0.114 * b

print("image shape:", img.shape)
print("one channel:", r.shape)
print("grayscale = 0.299 R + 0.587 G + 0.114 B")
# One sample pixel inside each shape, to compare colour against gray value.
for label, (row, col) in (("red-square pixel", (16, 16)), ("green-disk pixel", (45, 45))):
    print(label, np.round(img[row, col], 2), "has gray value", round(float(gray[row, col]), 3))

# Left: the colour image. Right: the red channel alone through the "hot" colormap.
fig, axes = plt.subplots(1, 2, figsize=(9, 4))
panels = (
    (img, None, "RGB image built from numbers"),
    (r, "hot", "Red channel as heat"),
)
for ax, (pixels, cmap, title) in zip(axes, panels):
    ax.imshow(pixels, cmap=cmap)
    ax.set_title(title)
    ax.axis("off")
fig.tight_layout()

Space	Channels	When you'd use it
RGB	R, G, B	Default; what most pretrained models expect
BGR	B, G, R	OpenCV's default — silently wrong color when displayed as RGB
HSV	Hue, Saturation, Value	Color-based filtering, augmentations like "shift hue"
Lab	Lightness, a, b	Perceptually uniform; image-similarity metrics
YCbCr	Luminance + chroma	JPEG compression, video codecs

python

import cv2
bgr = cv2.imread("cat.jpg")             # OpenCV reads as BGR!
# imread does not raise on a missing or undecodable file: it returns None,
# and the failure would otherwise only surface as an obscure cvtColor error.
if bgr is None:
    raise FileNotFoundError("cv2.imread could not read 'cat.jpg'")
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)   # reorder channels for PIL / matplotlib / models
hsv = cv2.cvtColor(rgb, cv2.COLOR_RGB2HSV)   # conversion code must match the source order (RGB here)
lab = cv2.cvtColor(rgb, cv2.COLOR_RGB2LAB)

dtype	Range	Where you meet it
uint8	0 … 255	JPEG, PNG, everything on disk and on screens
uint16	0 … 65 535	Medical DICOM (typically 12-16 bits used), RAW camera files, microscopy
float32	usually [0, 1] or normalized	What models consume; the default compute dtype
float16 / bfloat16	as float32	Mixed-precision training and inference; half the memory

python

# Cast first so the arithmetic happens in float32, then rescale in place
# from the 8-bit range [0, 255] to [0, 1].
img_f = arr.astype(np.float32)
img_f /= 255.0
lo, hi = img_f.min(), img_f.max()
print(lo, hi)                           # 0.0, 1.0

python

import torch
# numpy and PIL keep channels last: (H, W, C)
print(arr.shape)                        # (224, 224, 3)

# PyTorch keeps channels first: (C, H, W).
# from_numpy wraps the array; permute only reorders the axes.
hwc = torch.from_numpy(arr)
ten = hwc.permute(2, 0, 1)
print(ten.shape)                        # (3, 224, 224)

# The torchvision v2 route spells out both steps: layout, then dtype/scaling.
from torchvision.transforms import v2 as T
to_float_chw = T.Compose([
    T.ToImage(),                            # PIL/ndarray → CHW tensor
    T.ToDtype(torch.float32, scale=True),   # uint8 [0,255] → float32 [0,1]
])
tensor = to_float_chw(img)

python

from PIL import Image

# Read: open() is lazy and leaves the pixel data on disk;
# load() forces the actual decode.
img = Image.open("input.jpg")
img.load()

# Write: the output format is named explicitly and matches the file extension.
img.save("output.png", format="PNG")                # PNG (lossless)
img.save("output.jpg", format="JPEG", quality=92)   # JPEG with quality

# Decode the file directly into a tensor without going through PIL.
from torchvision.io import decode_image
ten = decode_image("input.jpg")         # uint8 tensor, (C, H, W), RGB

python

import matplotlib.pyplot as plt

# Three side-by-side panels: the colour image, its grayscale version, and the
# red channel alone. Single-channel data needs an explicit gray colormap.
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
panels = (
    (arr, None, "RGB"),
    (np.array(gray), "gray", "Gray"),
    (arr[..., 0], "gray", "R channel"),
)
for a, (pixels, cmap, title) in zip(ax, panels):
    a.imshow(pixels, cmap=cmap)
    a.set_title(title)
    a.axis("off")
plt.show()

python

# Top-left 100×100 patch
patch = arr[:100, :100]

# Center crop
H, W = arr.shape[:2]
s = 224
# Clamp at 0: for an image smaller than s, (H - s) // 2 is negative, and a
# negative slice start silently counts from the far edge instead of the top/left.
top = max((H - s) // 2, 0)
left = max((W - s) // 2, 0)
center = arr[top:top + s, left:left + s]

# Mask: where red is dominant.
# Widen to int16 before adding the margin: arr is uint8, so `green + 30`
# wraps past 255 (e.g. 240 + 30 -> 14) and would flag bright yellow or white
# pixels as red-dominant.
# NOTE: only green is compared against red here; blue is not considered.
red = arr[..., 0].astype(np.int16)
green = arr[..., 1].astype(np.int16)
mask = (red > 150) & (red > green + 30)
print(mask.shape, mask.dtype)           # (H, W), bool

python

import torch
from torchvision.io import decode_image
from torchvision.transforms import v2 as T

# Per-channel statistics used to standardize the input.
# NOTE(review): these look like the usual ImageNet mean/std — confirm they
# match what `model` was trained with.
CHANNEL_MEAN = [0.485, 0.456, 0.406]
CHANNEL_STD = [0.229, 0.224, 0.225]

# Each step is annotated with the shape / dtype / range it produces.
steps = [
    T.Resize(256),                                     # (3, 256, W') uint8   [0, 255]
    T.CenterCrop(224),                                 # (3, 224, 224) uint8  [0, 255]
    T.ToDtype(torch.float32, scale=True),              # float32 [0, 1]
    T.Normalize(mean=CHANNEL_MEAN, std=CHANNEL_STD),   # float32, per-channel ~N(0, 1)
]
pipeline = T.Compose(steps)

x = decode_image("cat.jpg")             # (3, H, W)   uint8    [0, 255]  RGB
x = pipeline(x)                         # (3, 224, 224) float32, mean ≈ 0

# Indexing with None prepends the batch axis (same as unsqueeze(0)).
batch = x[None]                         # (1, 3, 224, 224) — models want NCHW
logits = model(batch)                   # (1, num_classes) float32

1. An Image Is a Numpy Array

2. RGB and Grayscale

3. The Color Space Zoo

4. Bit Depth and Range

5. Gamma, sRGB, and a Word on HDR

6. Channel-First vs Channel-Last

7. Reading and Writing Images

8. Visualizing Images

9. Indexing and Slicing

10. Tensors End-to-End: From JPEG Bytes to Logits

11. Exercises