Part of my Zero to AI Researcher / Engineer Course
Part 1: Getting Started - Your First Neural Network
import numpy as np
import matplotlib.pyplot as plt
# Create simple dataset - XOR problem
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])
print(f"Input shape: {X.shape}") # (4, 2)
print(f"Output shape: {y.shape}") # (4, 1)
print(f"Dataset:")
for i in range(len(X)):
    print(f"  {X[i]} -> {y[i][0]}")  # Print each input pair and its expected output
What happened: We created the XOR dataset - a classic problem that no single linear decision boundary can separate, which is exactly why it needs a network with a hidden layer.
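To see why a straight line is not enough, here is a minimal sketch of my own (not part of the course code) that trains a single sigmoid unit - no hidden layer - on the same four points. Don't worry about the update rule yet; it is exactly the gradient descent step we derive in Parts 3 and 4.
# A single sigmoid unit: prediction = sigmoid(X @ w + b).
# With no hidden layer the decision boundary is a straight line, so XOR cannot be learned.
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

rng = np.random.default_rng(0)
w = rng.normal(scale=0.5, size=(2, 1))
b = np.zeros((1, 1))

for epoch in range(5000):
    pred = sigmoid(X @ w + b)                    # forward pass of the linear model
    grad = 2 * (pred - y) * pred * (1 - pred)    # MSE + sigmoid chain rule
    w -= 1.0 * (X.T @ grad) / len(X)             # gradient descent step on the weights
    b -= 1.0 * grad.mean(axis=0, keepdims=True)  # and on the bias

final_pred = sigmoid(X @ w + b)
print("Linear model predictions:", final_pred.round(2).flatten())
print("Linear model MSE:", np.mean((final_pred - y) ** 2).round(3))  # stays far from 0 - no line separates XOR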
Part 2: Understanding the Math - Forward Pass
Basic Network Architecture
# Network architecture: 2 -> 4 -> 1 (input -> hidden -> output)
input_size = 2 # Number of input features (x and y coordinates)
hidden_size = 4 # Number of neurons in hidden layer
output_size = 1 # Number of output values (0 or 1)
# Initialize weights and biases
np.random.seed(42)
W1 = np.random.randn(input_size, hidden_size) * 0.5 # Weights: connection strengths between layers
b1 = np.zeros((1, hidden_size)) # Biases: adjustable offsets for each neuron
W2 = np.random.randn(hidden_size, output_size) * 0.5 # Weights for output layer
b2 = np.zeros((1, output_size)) # Bias for output layer
print(f"W1 shape: {W1.shape}") # (2, 4)
print(f"b1 shape: {b1.shape}") # (1, 4)
print(f"W2 shape: {W2.shape}") # (4, 1)
print(f"b2 shape: {b2.shape}") # (1, 1)
OPTIONAL: Understanding Matrix Shapes in Neural Networks
# Let's trace through the shapes step by step
print("Shape flow through network:")
print(f"Input X: {X.shape}") # (4, 2)
print(f"Weights W1: {W1.shape}") # (2, 4)
print(f"X @ W1: {(X @ W1).shape}") # (4, 4)
print(f"Bias b1: {b1.shape}") # (1, 4)
# Broadcasting explanation
sample_mult = X @ W1
print(f"\nBroadcasting bias:")
print(f"(X @ W1) shape: {sample_mult.shape}") # (4, 4)
print(f"b1 shape: {b1.shape}") # (1, 4)
print(f"Result shape: {(sample_mult + b1).shape}") # (4, 4)
Forward Pass Implementation
def sigmoid(x):
    """Sigmoid activation function (explored in detail in the optional section below)"""
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))  # Clip to prevent overflow

def forward_pass(X, W1, b1, W2, b2):
    """Complete forward pass through the network"""
    # Hidden layer
    z1 = X @ W1 + b1   # Linear transformation
    a1 = sigmoid(z1)   # Activation
    # Output layer
    z2 = a1 @ W2 + b2  # Linear transformation
    a2 = sigmoid(z2)   # Activation
    return z1, a1, z2, a2
# Test forward pass
z1, a1, z2, predictions = forward_pass(X, W1, b1, W2, b2)
print(f"Predictions shape: {predictions.shape}")
print(f"Predictions:\n{predictions.flatten()}")
print(f"Actual labels:\n{y.flatten()}")
OPTIONAL: Understanding the Sigmoid Function
def sigmoid(x):
    """Sigmoid activation function"""
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))  # Clip to prevent overflow
# Test sigmoid on different inputs
test_values = np.array([-10, -1, 0, 1, 10])
sigmoid_values = sigmoid(test_values)
print("Sigmoid function behavior:")
for i, val in enumerate(test_values):
    print(f"  sigmoid({val:3.0f}) = {sigmoid_values[i]:.3f}")
# Visualize sigmoid
x_range = np.linspace(-10, 10, 100)
y_sigmoid = sigmoid(x_range)
plt.figure(figsize=(8, 4))
plt.plot(x_range, y_sigmoid, 'b-', linewidth=2)
plt.title('Sigmoid Activation Function')
plt.xlabel('x')
plt.ylabel('sigmoid(x)')
plt.grid(True, alpha=0.3)
plt.show()
OPTIONAL: Breaking Down the Sigmoid Formula
# Let's understand sigmoid step by step: 1 / (1 + e^(-x))
x = 2.0
print(f"Input: x = {x}")
print(f"Step 1: -x = {-x}")
print(f"Step 2: e^(-x) = np.exp(-x) = {np.exp(-x):.3f}")
print(f"Step 3: 1 + e^(-x) = {1 + np.exp(-x):.3f}")
print(f"Step 4: 1 / (1 + e^(-x)) = {1 / (1 + np.exp(-x)):.3f}")
print(f"Sigmoid result: {sigmoid(x):.3f}")
# Why clipping?
print(f"\nWhy we clip extreme values:")
print(f"Without clipping: sigmoid(-1000) would compute np.exp(1000), which overflows to inf")
print(f"With clipping: the exponent is capped at 500, and np.exp(500) = {np.exp(500):.2e} still fits in a float64")
print(f"(For large positive x, np.exp(-x) simply underflows to 0, which is harmless)")
Key insight: The forward pass transforms input through weighted connections and activations to produce predictions.
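To make that concrete, here is a small optional walkthrough (reusing X, W1, b1, W2, b2 and sigmoid from above) that pushes a single sample through the network one matrix operation at a time.
x_sample = X[0:1]                # keep the 2D shape (1, 2) so the math matches the batched case

z1_manual = x_sample @ W1 + b1   # (1, 2) @ (2, 4) + (1, 4) -> (1, 4) hidden pre-activations
a1_manual = sigmoid(z1_manual)   # squash each hidden pre-activation into (0, 1)
z2_manual = a1_manual @ W2 + b2  # (1, 4) @ (4, 1) + (1, 1) -> (1, 1) output pre-activation
a2_manual = sigmoid(z2_manual)   # final prediction for this sample

print(f"Hidden pre-activations: {z1_manual.round(3)}")
print(f"Hidden activations:     {a1_manual.round(3)}")
print(f"Prediction for {X[0]}:  {a2_manual[0, 0]:.3f}")  # matches row 0 of the batched forward_pass output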
Part 3: Computing Loss and Gradients
Loss Function
def compute_loss(predictions, targets):
    """Mean squared error loss"""
    return np.mean((predictions - targets) ** 2)  # Square the difference to penalize large errors
OPTIONAL: Why We Square the Difference
# Let's see why we use (predictions - targets) ** 2
pred = np.array([0.8, 0.2, 0.9, 0.1])
target = np.array([1.0, 0.0, 1.0, 0.0])
differences = pred - target
squared_differences = differences ** 2
print("Understanding squared error:")
print(f"Predictions: {pred}")
print(f"Targets: {target}")
print(f"Differences: {differences}")
print(f"Squared: {squared_differences}")
print(f"Mean squared error: {np.mean(squared_differences):.4f}")
# Why not just absolute difference?
abs_differences = np.abs(differences)
print(f"\nComparison:")
print(f"Absolute differences: {abs_differences}")
print(f"Squared differences: {squared_differences}")
print("Squared errors penalize large mistakes more heavily!")
Calculate initial loss
initial_loss = compute_loss(predictions, y)
print(f"Initial loss: {initial_loss:.4f}")
OPTIONAL: Understanding Derivative of Sigmoid
def sigmoid_derivative(x):
    """Derivative of sigmoid function"""
    s = sigmoid(x)
    return s * (1 - s)  # Derivative formula: sigmoid(x) * (1 - sigmoid(x))
# Test derivative
test_vals = np.array([-2, -1, 0, 1, 2])
sigmoid_vals = sigmoid(test_vals)
derivative_vals = sigmoid_derivative(test_vals)
print("Sigmoid and its derivative:")
for i, val in enumerate(test_vals):
    print(f"  x={val:2.0f}: sigmoid={sigmoid_vals[i]:.3f}, derivative={derivative_vals[i]:.3f}")
# Visualize both functions
x_range = np.linspace(-6, 6, 100)
y_sigmoid = sigmoid(x_range)
y_derivative = sigmoid_derivative(x_range)
plt.figure(figsize=(10, 4))
plt.plot(x_range, y_sigmoid, 'b-', label='sigmoid(x)', linewidth=2)
plt.plot(x_range, y_derivative, 'r--', label="sigmoid'(x)", linewidth=2)
plt.title('Sigmoid Function and Its Derivative')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
OPTIONAL: Chain Rule in Backpropagation
# The chain rule: if y = f(g(x)), then dy/dx = f'(g(x)) * g'(x)
#
# For our network: Loss = MSE(sigmoid(sigmoid(X @ W1 + b1) @ W2 + b2), y)
# We need: dLoss/dW2, dLoss/db2, dLoss/dW1, dLoss/db1
print("Chain rule breakdown:")
print("dLoss/dW2 = dLoss/da2 * da2/dz2 * dz2/dW2")
print(" where:")
print(" dLoss/da2 = 2 * (predictions - targets) # MSE derivative")
print(" da2/dz2 = sigmoid'(z2) # sigmoid derivative")
print(" dz2/dW2 = a1 # linear layer derivative")
Backpropagation Implementation
def backward_pass(X, y, z1, a1, z2, a2, W1, b1, W2, b2):
    """Compute gradients using backpropagation"""
    m = X.shape[0]  # Number of samples
    # Output layer gradients
    dz2 = 2 * (a2 - y) * sigmoid_derivative(z2)  # (4, 1)
    dW2 = a1.T @ dz2 / m                         # (4, 1)
    db2 = np.mean(dz2, axis=0, keepdims=True)    # (1, 1)
    # Hidden layer gradients
    dz1 = (dz2 @ W2.T) * sigmoid_derivative(z1)  # (4, 4)
    dW1 = X.T @ dz1 / m                          # (2, 4)
    db1 = np.mean(dz1, axis=0, keepdims=True)    # (1, 4)
    return dW1, db1, dW2, db2
# Test backpropagation
dW1, db1, dW2, db2 = backward_pass(X, y, z1, a1, z2, predictions, W1, b1, W2, b2)
print(f"Gradient shapes:")
print(f" dW1: {dW1.shape}, dW2: {dW2.shape}")
print(f" db1: {db1.shape}, db2: {db2.shape}")
OPTIONAL: Understanding Output Layer Gradients
# Let's break down the output layer gradient calculation
print("Output layer gradient breakdown:")
print("dz2 = 2 * (a2 - y) * sigmoid_derivative(z2)")
# Step by step
error = a2 - y # How far off our predictions are
print(f"Error (a2 - y) shape: {error.shape}")
print(f"Error values:\n{error.flatten()}")
mse_gradient = 2 * error # Derivative of MSE
print(f"\nMSE gradient (2 * error) shape: {mse_gradient.shape}")
sigmoid_grad = sigmoid_derivative(z2) # Derivative of sigmoid
print(f"Sigmoid gradient shape: {sigmoid_grad.shape}")
dz2_step = mse_gradient * sigmoid_grad # Chain rule
print(f"Combined gradient (dz2) shape: {dz2_step.shape}")
OPTIONAL: Understanding Hidden Layer Gradients
# Hidden layer gradients are more complex due to chain rule
print("Hidden layer gradient breakdown:")
print("dz1 = (dz2 @ W2.T) * sigmoid_derivative(z1)")
# Step by step
error_propagated = dz2 @ W2.T # Propagate error backwards
print(f"Error propagated shape: {error_propagated.shape}")
print(f"This spreads output error to each hidden neuron")
hidden_sigmoid_grad = sigmoid_derivative(z1) # Local gradient
print(f"Hidden sigmoid gradient shape: {hidden_sigmoid_grad.shape}")
dz1_step = error_propagated * hidden_sigmoid_grad # Final gradient
print(f"Combined hidden gradient (dz1) shape: {dz1_step.shape}")
OPTIONAL: Understanding Weight Gradients
# Weight gradients show how to adjust connections
print("Weight gradient calculation:")
print("dW2 = a1.T @ dz2 / m")
print(f"a1.T shape: {a1.T.shape}") # Transposed hidden activations
print(f"dz2 shape: {dz2.shape}") # Output gradients
print(f"dW2 shape: {(a1.T @ dz2).shape}") # Weight gradients
# This gives us the gradient for each weight connection
print(f"\nWeight gradients tell us:")
print(f"- Positive gradient: decrease this weight")
print(f"- Negative gradient: increase this weight")
print(f"- Large gradient: this weight has big impact on error")
Critical concept: Backpropagation uses the chain rule to compute how much each weight contributes to the total error.
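A good way to convince yourself the hand-derived gradients are correct is a numerical gradient check: nudge one weight by a tiny amount, measure how the loss changes, and compare with the analytic gradient. This is a verification sketch of my own, reusing forward_pass, compute_loss, backward_pass and the (still untrained) weights from above.
# Numerical vs analytic gradient for one entry of W2
eps = 1e-5
row, col = 0, 0  # which weight of W2 to test

W2_plus = W2.copy()
W2_plus[row, col] += eps
W2_minus = W2.copy()
W2_minus[row, col] -= eps

loss_plus = compute_loss(forward_pass(X, W1, b1, W2_plus, b2)[-1], y)
loss_minus = compute_loss(forward_pass(X, W1, b1, W2_minus, b2)[-1], y)
numerical_grad = (loss_plus - loss_minus) / (2 * eps)  # central finite difference

z1_c, a1_c, z2_c, a2_c = forward_pass(X, W1, b1, W2, b2)
analytic_grad = backward_pass(X, y, z1_c, a1_c, z2_c, a2_c, W1, b1, W2, b2)[2][row, col]  # dW2[row, col]

print(f"Numerical gradient: {numerical_grad:.8f}")
print(f"Analytic gradient:  {analytic_grad:.8f}")  # the two should agree to several decimal places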
Part 4: Training the Network
Training Loop
def train_network(X, y, epochs=1000, learning_rate=1.0):
    """Train the neural network"""
    # Initialize weights
    np.random.seed(42)
    W1 = np.random.randn(2, 4) * 0.5
    b1 = np.zeros((1, 4))
    W2 = np.random.randn(4, 1) * 0.5
    b2 = np.zeros((1, 1))
    losses = []
    for epoch in range(epochs):
        # Forward pass
        z1, a1, z2, predictions = forward_pass(X, W1, b1, W2, b2)
        # Compute loss
        loss = compute_loss(predictions, y)
        losses.append(loss)
        # Backward pass
        dW1, db1, dW2, db2 = backward_pass(X, y, z1, a1, z2, predictions, W1, b1, W2, b2)
        # Update weights
        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1
        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2
        # Print progress
        if epoch % 100 == 0:
            print(f"Epoch {epoch:4d}: Loss = {loss:.6f}")
    return W1, b1, W2, b2, losses
# Train the network
W1_trained, b1_trained, W2_trained, b2_trained, loss_history = train_network(X, y)
OPTIONAL: Understanding Learning Rate
# Learning rate controls how big steps we take during optimization
# Too small: slow convergence, too large: might overshoot minimum
learning_rates = [0.1, 1.0, 10.0]
plt.figure(figsize=(12, 4))
for i, lr in enumerate(learning_rates):
    plt.subplot(1, 3, i + 1)
    # Train with this learning rate
    _, _, _, _, losses = train_network(X, y, epochs=500, learning_rate=lr)
    plt.plot(losses)
    plt.title(f'Learning Rate = {lr}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')
    plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
OPTIONAL: Visualizing Training Progress
# Plot loss curve
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(loss_history)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.yscale('log')
plt.grid(True, alpha=0.3)
plt.subplot(1, 2, 2)
plt.plot(loss_history[-100:]) # Last 100 epochs
plt.title('Training Loss (Last 100 Epochs)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Final loss: {loss_history[-1]:.6f}")
Part 5: Testing the Trained Network
Final Predictions
# Test the trained network
z1_final, a1_final, z2_final, final_predictions = forward_pass(X, W1_trained, b1_trained, W2_trained, b2_trained)
print("Final Results:")
print("Input -> Target | Prediction | Rounded")
print("-" * 40)
for i in range(len(X)):
    pred = final_predictions[i, 0]
    rounded = round(pred)
    target = y[i, 0]
    print(f"{X[i]} -> {target} | {pred:.4f} | {rounded}")
# Calculate accuracy
rounded_predictions = np.round(final_predictions)
accuracy = np.mean(rounded_predictions == y)
print(f"\nAccuracy: {accuracy:.1%}")
OPTIONAL: Visualizing Decision Boundary
# Create a grid of points to visualize the decision boundary
def plot_decision_boundary(W1, b1, W2, b2):
    """Plot the decision boundary learned by the network"""
    # Create a grid
    xx, yy = np.meshgrid(np.linspace(-0.5, 1.5, 100),
                         np.linspace(-0.5, 1.5, 100))
    # Flatten the grid for prediction
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    # Make predictions on the grid
    _, _, _, grid_predictions = forward_pass(grid_points, W1, b1, W2, b2)
    grid_predictions = grid_predictions.reshape(xx.shape)
    # Plot
    plt.figure(figsize=(8, 6))
    plt.contourf(xx, yy, grid_predictions, levels=50, alpha=0.8, cmap='RdYlBu')
    plt.colorbar(label='Network Output')
    # Plot data points
    colors = ['red' if label == 0 else 'blue' for label in y.flatten()]
    plt.scatter(X[:, 0], X[:, 1], c=colors, s=100, edgecolors='black', linewidth=2)
    # Add labels
    for x, y_val in zip(X, y.flatten()):
        plt.annotate(f'({x[0]},{x[1]})→{y_val}',
                     (x[0], x[1]), xytext=(5, 5), textcoords='offset points')
    plt.title('Neural Network Decision Boundary')
    plt.xlabel('Input 1')
    plt.ylabel('Input 2')
    plt.grid(True, alpha=0.3)
    plt.show()
# Visualize the decision boundary
plot_decision_boundary(W1_trained, b1_trained, W2_trained, b2_trained)
Part 6: Understanding What We Built
Complete Neural Network Class
class SimpleNeuralNetwork:
    """A simple 2-layer neural network implementation"""

    def __init__(self, input_size=2, hidden_size=4, output_size=1):
        # Initialize weights
        self.W1 = np.random.randn(input_size, hidden_size) * 0.5
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.5
        self.b2 = np.zeros((1, output_size))

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def forward(self, X):
        self.z1 = X @ self.W1 + self.b1
        self.a1 = self.sigmoid(self.z1)
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

    def backward(self, X, y):
        m = X.shape[0]
        # Output layer gradients
        dz2 = 2 * (self.a2 - y) * self.sigmoid(self.z2) * (1 - self.sigmoid(self.z2))
        dW2 = self.a1.T @ dz2 / m
        db2 = np.mean(dz2, axis=0, keepdims=True)
        # Hidden layer gradients
        dz1 = (dz2 @ self.W2.T) * self.sigmoid(self.z1) * (1 - self.sigmoid(self.z1))
        dW1 = X.T @ dz1 / m
        db1 = np.mean(dz1, axis=0, keepdims=True)
        return dW1, db1, dW2, db2

    def train(self, X, y, epochs=1000, learning_rate=1.0):
        losses = []
        for epoch in range(epochs):
            # Forward pass
            predictions = self.forward(X)
            # Compute loss
            loss = np.mean((predictions - y) ** 2)
            losses.append(loss)
            # Backward pass
            dW1, db1, dW2, db2 = self.backward(X, y)
            # Update weights
            self.W1 -= learning_rate * dW1
            self.b1 -= learning_rate * db1
            self.W2 -= learning_rate * dW2
            self.b2 -= learning_rate * db2
            if epoch % 100 == 0:
                print(f"Epoch {epoch:4d}: Loss = {loss:.6f}")
        return losses

    def predict(self, X):
        return self.forward(X)
# Test the class
nn = SimpleNeuralNetwork()
losses = nn.train(X, y, epochs=1000, learning_rate=1.0)
predictions = nn.predict(X)
print("\nClass-based Neural Network Results:")
for i in range(len(X)):
    pred = predictions[i, 0]
    target = y[i, 0]
    print(f"{X[i]} -> {target} | Prediction: {pred:.4f} | Rounded: {round(pred)}")
OPTIONAL: Comparing with Different Architectures
# Test different hidden layer sizes
hidden_sizes = [2, 4, 8, 16]
results = {}
for hidden_size in hidden_sizes:
    print(f"\nTesting hidden size: {hidden_size}")
    nn = SimpleNeuralNetwork(input_size=2, hidden_size=hidden_size, output_size=1)
    losses = nn.train(X, y, epochs=1000, learning_rate=1.0)
    predictions = nn.predict(X)
    # Calculate accuracy
    accuracy = np.mean(np.round(predictions) == y)
    results[hidden_size] = {
        'final_loss': losses[-1],
        'accuracy': accuracy,
        'predictions': predictions
    }
    print(f"  Final loss: {losses[-1]:.6f}")
    print(f"  Accuracy: {accuracy:.1%}")
# Plot comparison
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
hidden_sizes_list = list(results.keys())
final_losses = [results[hs]['final_loss'] for hs in hidden_sizes_list]
plt.bar(hidden_sizes_list, final_losses)
plt.title('Final Loss vs Hidden Size')
plt.xlabel('Hidden Layer Size')
plt.ylabel('Final Loss')
plt.yscale('log')
plt.subplot(1, 2, 2)
accuracies = [results[hs]['accuracy'] for hs in hidden_sizes_list]
plt.bar(hidden_sizes_list, accuracies)
plt.title('Accuracy vs Hidden Size')
plt.xlabel('Hidden Layer Size')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.tight_layout()
plt.show()
Key takeaway: You've built a complete neural network from scratch! The network learns to solve the XOR problem by discovering the right weights and biases through gradient descent.
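If you want to peek inside the trained network, here is a short optional sketch (reusing forward_pass and the trained weights from Part 4) that prints the hidden-layer activations for each XOR input. It shows how the hidden layer re-represents the inputs so that the single output neuron can separate them with one weighted sum.
# Inspect the learned hidden representation
_, hidden_acts, _, outputs = forward_pass(X, W1_trained, b1_trained, W2_trained, b2_trained)

print("Input  | Hidden activations       | Output")
for inp, h, out in zip(X, hidden_acts, outputs):
    print(f"{inp} | {np.round(h, 2)} | {out[0]:.3f}")
# In this transformed space the two classes can be told apart by a single weighted sum,
# which is exactly the job of the output layer.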
Summary
You've successfully implemented:
- Forward propagation: Computing predictions from inputs
- Loss computation: Measuring how wrong the predictions are
- Backpropagation: Computing gradients using the chain rule
- Weight updates: Using gradient descent to improve the network
- Complete training loop: Putting it all together
The neural network learns by repeatedly adjusting its weights based on the errors it makes, eventually discovering the complex decision boundary needed to solve the XOR problem.