16 Layer and Module
Our Part IV code works, but it is messy: parameters are created and collected by hand. Let's build clean abstractions.
16.1 The Problem
Our MLP has scattered parameters:
# Ugly: parameters everywhere
W1 = Tensor(..., requires_grad=True)
b1 = Tensor(..., requires_grad=True)
W2 = Tensor(..., requires_grad=True)
b2 = Tensor(..., requires_grad=True)
# Must manually collect all parameters
optimizer = Adam([W1, b1, W2, b2, ...])

PyTorch solves this elegantly:
# Clean: model manages its own parameters
model = nn.Sequential(
    nn.Linear(4, 16),
    nn.ReLU(),
    nn.Linear(16, 3)
)
optimizer = Adam(model.parameters())  # Automatic!

16.2 The Module Base Class
class Module:
    """Base class for all neural network modules."""

    def __init__(self):
        self._parameters = {}  # name -> Tensor
        self._modules = {}     # name -> Module
        self.training = True

    def forward(self, *args, **kwargs):
        """Define the computation."""
        raise NotImplementedError

    def __call__(self, *args, **kwargs):
        """Make module callable."""
        return self.forward(*args, **kwargs)

    def parameters(self):
        """Return all parameters (recursive)."""
        params = list(self._parameters.values())
        for module in self._modules.values():
            params.extend(module.parameters())
        return params

    def train(self):
        """Set training mode."""
        self.training = True
        for module in self._modules.values():
            module.train()

    def eval(self):
        """Set evaluation mode."""
        self.training = False
        for module in self._modules.values():
            module.eval()
Note
Code Reference: See src/tensorweaver/layers/layer.py for the full implementation.
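Even before adding any magic, the base class is usable: a subclass can fill in _parameters and _modules by hand. Below is a minimal, hypothetical sketch (it assumes the Tensor class and the np import from earlier chapters); the explicit bookkeeping it does is exactly what the next section automates.

class ManualLinear(Module):
    """Hypothetical layer that registers its parameter by hand."""
    def __init__(self):
        super().__init__()
        # Explicit registration -- the next section makes this automatic
        self._parameters["weight"] = Tensor(
            np.random.randn(4, 8), requires_grad=True
        )

    def forward(self, x):
        return x @ self._parameters["weight"]

layer = ManualLinear()
print(len(layer.parameters()))            # 1
y = layer(Tensor(np.random.randn(2, 4)))  # __call__ dispatches to forward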
16.3 Automatic Parameter Registration
The magic: __setattr__ automatically registers parameters and submodules:
class Module:
    def __setattr__(self, name, value):
        if isinstance(value, Tensor) and value.requires_grad:
            # Register as parameter
            self._parameters[name] = value
        elif isinstance(value, Module):
            # Register as submodule
            self._modules[name] = value
        super().__setattr__(name, value)

Now this works automatically (as long as super().__init__() runs first, so that _parameters and _modules exist before any attributes are assigned):
class MyModule(Module):
    def __init__(self):
        super().__init__()
        self.weight = Tensor(..., requires_grad=True)  # Auto-registered!
        self.linear = Linear(10, 5)                    # Auto-registered!
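A quick sanity check that the recursion in parameters() sees both kinds of registration (a sketch; it assumes the Linear layer defined in the next section and that the ... placeholder above is replaced with real data):

m = MyModule()
print(len(m.parameters()))  # 3: weight, plus linear.weight and linear.bias
print(list(m._modules))     # ['linear']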
16.4 The Linear Layer (Proper)

class Linear(Module):
    """Linear transformation: y = xW + b"""

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        # Xavier initialization
        scale = np.sqrt(2.0 / (in_features + out_features))
        self.weight = Tensor(
            np.random.randn(in_features, out_features) * scale,
            requires_grad=True
        )
        if bias:
            self.bias = Tensor(np.zeros(out_features), requires_grad=True)
        else:
            self.bias = None

    def forward(self, x):
        out = x @ self.weight
        if self.bias is not None:
            out = out + self.bias
        return out
Note
Code Reference: See src/tensorweaver/layers/linear.py for the implementation.
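Because None is neither a Tensor nor a Module, the bias=False branch registers nothing: only the weight ends up in _parameters. A quick check using the class above:

no_bias = Linear(8, 4, bias=False)
print(len(no_bias.parameters()))  # 1: only the weight is tracked
print(no_bias.bias is None)       # True: stored as a plain attribute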
16.5 Using Linear
linear = Linear(4, 8)
# Parameters are automatically tracked
print(f"Parameters: {len(linear.parameters())}") # 2 (weight + bias)
# Forward pass
x = Tensor(np.random.randn(32, 4))
y = linear(x)
print(f"Output shape: {y.shape}") # (32, 8)16.6 Building the MLP with Modules
16.6 Building the MLP with Modules

class MLP(Module):
    """Multi-layer Perceptron."""

    def __init__(self, input_size, hidden_size, output_size, dropout=0.2):
        super().__init__()
        self.fc1 = Linear(input_size, hidden_size)
        self.ln1 = LayerNorm(hidden_size)
        self.dropout = Dropout(dropout)
        self.fc2 = Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.ln1(x)
        x = relu(x)
        if self.training:
            x = self.dropout(x)
        x = self.fc2(x)
        return x

Usage is now clean:
model = MLP(4, 16, 3)
optimizer = Adam(model.parameters())  # Collects all automatically!

for epoch in range(epochs):
    model.train()
    logits = model(x_train)
    loss = cross_entropy(logits, y_train)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

model.eval()
predictions = model(x_test)
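Because train() and eval() recurse through _modules, switching the top-level model also flips the flag that MLP.forward() checks before applying dropout. A quick check, continuing the example above:

model.eval()
print(model.training, model.dropout.training)  # False False
model.train()
print(model.training, model.dropout.training)  # True True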
16.7 Nested Modules Work Automatically

class DeepMLP(Module):
    def __init__(self):
        super().__init__()
        self.block1 = MLP(4, 16, 8)  # MLP is itself a Module
        self.block2 = MLP(8, 16, 3)

    def forward(self, x):
        x = self.block1(x)
        x = relu(x)
        x = self.block2(x)
        return x

model = DeepMLP()
print(f"Total parameters: {len(model.parameters())}")
# Recursively collects from block1 and block2
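One caveat before moving on: __setattr__ only catches Tensor and Module values, so layers hidden inside a plain Python list are never registered, and parameters() silently misses them. A hypothetical sketch of the pitfall; the container modules in the next chapter exist precisely to fix this:

class BrokenStack(Module):
    """Layers stored in a plain list are invisible to parameters()."""
    def __init__(self):
        super().__init__()
        self.layers = [Linear(4, 8), Linear(8, 3)]  # a list, not a Module

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

broken = BrokenStack()
print(len(broken.parameters()))  # 0 -- nothing was registered!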
16.8 Common Module Methods

class Module:
    def num_parameters(self):
        """Count total parameters."""
        return sum(p.data.size for p in self.parameters())

    def to_dict(self):
        """Get module structure as dict."""
        return {
            'class': self.__class__.__name__,
            'parameters': {k: v.shape for k, v in self._parameters.items()},
            'modules': {k: v.to_dict() for k, v in self._modules.items()}
        }

    def __repr__(self):
        lines = [f"{self.__class__.__name__}("]
        for name, module in self._modules.items():
            lines.append(f"  ({name}): {module}")
        lines.append(")")
        return "\n".join(lines)

Example output (assuming leaf layers such as Linear override __repr__ to show their own configuration):
print(model)
# MLP(
#   (fc1): Linear(in_features=4, out_features=16)
#   (ln1): LayerNorm(normalized_shape=16)
#   (dropout): Dropout(p=0.2)
#   (fc2): Linear(in_features=16, out_features=3)
# )

print(f"Parameters: {model.num_parameters():,}")
# Parameters: 195
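to_dict() gives a plain-data view of the same structure, which is handy for logging or serialization. For a single layer (a sketch, assuming Tensor.shape returns the underlying NumPy shape tuple):

layer = Linear(4, 8)
print(layer.to_dict())
# {'class': 'Linear',
#  'parameters': {'weight': (4, 8), 'bias': (8,)},
#  'modules': {}}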
16.9 Summary
- Module base class manages parameters and submodules
- Automatic registration via __setattr__
- Recursive parameters() collects from all children
- train()/eval() propagates to all submodules
- Clean, composable architecture
Next: Container modules for flexible architectures.