25 GPU Training
Let’s put everything together and train a model on the GPU.
25.1 Full Training Example
```python
import tensorweaver as tw
from tensorweaver.nn import Sequential, Linear, ReLU
from tensorweaver.optim import Adam
from tensorweaver.data import TensorDataset, DataLoader
# Import path for cross_entropy assumed; adjust to wherever your framework defines it
from tensorweaver.nn import cross_entropy

# Enable GPU
tw.use_gpu()

# Create data (will be on GPU)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2
)

# Normalize
mean, std = X_train.mean(0), X_train.std(0)
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

# Create tensors (on GPU due to use_gpu())
X_train_t = tw.Tensor(X_train)
X_test_t = tw.Tensor(X_test)
y_train_t = tw.Tensor(y_train)
y_test_t = tw.Tensor(y_test)

# Model (parameters on GPU)
model = Sequential(
    Linear(4, 32),
    ReLU(),
    Linear(32, 16),
    ReLU(),
    Linear(16, 3)
)
optimizer = Adam(model.parameters(), lr=0.01)

# DataLoader
train_dataset = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Training loop
for epoch in range(100):
    model.train()
    for batch_x, batch_y in train_loader:
        logits = model(batch_x)
        loss = cross_entropy(logits, batch_y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    if epoch % 20 == 0:
        model.eval()
        test_logits = model(X_test_t)
        test_preds = test_logits.data.argmax(axis=-1)
        acc = (test_preds == y_test_t.data).mean()
        print(f"Epoch {epoch}: test_acc={acc:.2%}")
```
25.2 Benchmarking CPU vs GPU

```python
import time

import tensorweaver as tw
from tensorweaver.nn import Sequential, Linear, ReLU
from tensorweaver.optim import Adam
from tensorweaver.data import TensorDataset, DataLoader
# As before, the cross_entropy import path is assumed
from tensorweaver.nn import cross_entropy


def train_epoch(model, optimizer, loader):
    for batch_x, batch_y in loader:
        logits = model(batch_x)
        loss = cross_entropy(logits, batch_y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


def benchmark_training(use_gpu=False, epochs=10):
    # Set device
    if use_gpu:
        tw.use_gpu()
    else:
        tw.use_cpu()

    # Setup
    model = Sequential(
        Linear(784, 256),
        ReLU(),
        Linear(256, 128),
        ReLU(),
        Linear(128, 10)
    )
    optimizer = Adam(model.parameters())

    # Fake MNIST-like data
    X = tw.Tensor(tw.get_backend().randn(1000, 784))
    y = tw.Tensor(tw.get_backend().array(
        list(range(10)) * 100  # Fake labels
    ))
    dataset = TensorDataset(X, y)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)

    # Warmup
    train_epoch(model, optimizer, loader)

    # Benchmark
    start = time.perf_counter()
    for _ in range(epochs):
        train_epoch(model, optimizer, loader)
    elapsed = time.perf_counter() - start

    return elapsed


cpu_time = benchmark_training(use_gpu=False)
gpu_time = benchmark_training(use_gpu=True)

print(f"CPU: {cpu_time:.2f}s")
print(f"GPU: {gpu_time:.2f}s")
print(f"Speedup: {cpu_time/gpu_time:.1f}x")
```
25.3 When to Use GPU

| Scenario | Recommendation |
|---|---|
| Small data (<1000 samples) | CPU |
| Large matrices (>1000×1000) | GPU |
| Transformers | Always GPU |
| Debugging | CPU (easier) |
| Production inference | Depends on latency needs |
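If you want the rule of thumb in code, here is a hypothetical helper using this chapter's `tw.use_gpu()` / `tw.use_cpu()` switches (the thresholds mirror the table and are not hard limits; `select_device` is not part of the framework):

```python
import tensorweaver as tw

def select_device(n_samples, n_features):
    """Hypothetical helper: pick a device from the rough thresholds above."""
    # Small problems rarely amortize kernel-launch and transfer overhead.
    if n_samples < 1000 and n_features < 1000:
        tw.use_cpu()
    else:
        tw.use_gpu()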
25.4 Memory Considerations
GPUs have limited memory:
```python
# Check memory usage (if using the CuPy backend)
import cupy as cp

mempool = cp.get_default_memory_pool()
print(f"Used: {mempool.used_bytes() / 1e9:.2f} GB")
print(f"Total: {mempool.total_bytes() / 1e9:.2f} GB")  # bytes held by the pool, not the GPU's capacity
```
Tips for large models:

- Reduce batch size: Trade speed for memory (see the gradient-accumulation sketch after this list)
- Gradient checkpointing: Recompute activations
- Mixed precision: Use float16 where possible
- Model parallelism: Split across GPUs
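For the first tip, here is a minimal sketch of gradient accumulation, assuming (as in the training loop above) that gradients add up across `backward()` calls until `optimizer.zero_grad()`, and that the loss tensor supports scalar division:

```python
# Hedged sketch: several small batches per optimizer step keeps peak memory low
# while the effective batch size stays large.
accum_steps = 4  # effective batch size = batch_size * accum_steps

for step, (batch_x, batch_y) in enumerate(train_loader):
    logits = model(batch_x)
    loss = cross_entropy(logits, batch_y) / accum_steps  # scale so summed grads match one big batch
    loss.backward()                                      # grads accumulate until zero_grad()
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```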
25.5 Mixed Precision (Advanced)
Use float16 for speed:
```python
import cunumeric as cn  # import name assumed for the cuNumeric library
# CuNumericBackend is the GPU backend from the backend abstraction built earlier in this part


class MixedPrecisionBackend(CuNumericBackend):
    """Use float16 for forward, float32 for gradients."""

    def array(self, data, dtype=None):
        dtype = dtype or cn.float16  # Default to float16
        return cn.array(data, dtype=dtype)

    def to_float32(self, x):
        return x.astype(cn.float32)
```
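The backend above only changes how arrays are stored. Full mixed-precision training usually also keeps a float32 "master" copy of the parameters and applies updates there; here is a minimal NumPy-only sketch of that pattern (not tied to any TensorWeaver API):

```python
import numpy as np

# Hedged sketch of the "master weights" pattern behind mixed precision:
# compute in float16, but keep and update a float32 copy of the parameters.
rng = np.random.default_rng(0)
x = rng.standard_normal((64, 4)).astype(np.float16)
y = rng.standard_normal((64, 1)).astype(np.float16)

master_w = np.zeros((4, 1), dtype=np.float32)    # float32 master copy

for _ in range(100):
    w16 = master_w.astype(np.float16)            # cast down for the forward pass
    err = x @ w16 - y                            # float16 compute
    grad = (x.T @ err) / len(x)                  # float16 gradient
    master_w -= 0.1 * grad.astype(np.float32)    # update the float32 master copy
```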
25.6 Part VII Complete!

Tip
Milestone: You can now train on GPU!
- ✓ cuNumeric for GPU computing
- ✓ Backend abstraction design
- ✓ Same code on CPU/GPU
- ✓ Significant speedups
Your framework is now hardware-accelerated!
25.7 Summary
- cuNumeric provides drop-in GPU acceleration
- Backend abstraction keeps code portable
- 10-50x speedups for large models
- Use GPU for large data, CPU for small/debugging
Next: The grand finale — building GPT.