Part of my Zero to AI Researcher / Engineer Course
Part 1: Getting Started - Your First Neural Network
import numpy as np
import matplotlib.pyplot as plt
# Create simple dataset - XOR problem
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])
print(f"Input shape: {X.shape}") # (4, 2)
print(f"Output shape: {y.shape}") # (4, 1)
print(f"Dataset:")
for i in range(len(X)):
    print(f"  {X[i]} -> {y[i][0]}")  # Print each input pair and its expected output
What happened: We created the XOR dataset - a classic problem that no single linear decision boundary can separate, which is exactly why it needs a network with a hidden layer.
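To see why a straight line is not enough, here is a minimal sketch of my own (not part of the course code) that trains a single sigmoid unit - no hidden layer - on the same four points. Don't worry about the update rule yet; it is exactly the gradient descent step we derive in Parts 3 and 4.
# A single sigmoid unit: prediction = sigmoid(X @ w + b).
# With no hidden layer the decision boundary is a straight line, so XOR cannot be learned.
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

rng = np.random.default_rng(0)
w = rng.normal(scale=0.5, size=(2, 1))
b = np.zeros((1, 1))

for epoch in range(5000):
    pred = sigmoid(X @ w + b)                    # forward pass of the linear model
    grad = 2 * (pred - y) * pred * (1 - pred)    # MSE + sigmoid chain rule
    w -= 1.0 * (X.T @ grad) / len(X)             # gradient descent step on the weights
    b -= 1.0 * grad.mean(axis=0, keepdims=True)  # and on the bias

final_pred = sigmoid(X @ w + b)
print("Linear model predictions:", final_pred.round(2).flatten())
print("Linear model MSE:", np.mean((final_pred - y) ** 2).round(3))  # stays far from 0 - no line separates XOR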
Part 2: Understanding the Math - Forward Pass
Basic Network Architecture
# Network architecture: 2 -> 4 -> 1 (input -> hidden -> output)
input_size = 2 # Number of input features (x and y coordinates)
hidden_size = 4 # Number of neurons in hidden layer
output_size = 1 # Number of output values (0 or 1)
# Initialize weights and biases
np.random.seed(42)
W1 = np.random.randn(input_size, hidden_size) * 0.5 # Weights: connection strengths between layers
b1 = np.zeros((1, hidden_size)) # Biases: adjustable offsets for each neuron
W2 = np.random.randn(hidden_size, output_size) * 0.5 # Weights for output layer
b2 = np.zeros((1, output_size)) # Bias for output layer
print(f"W1 shape: {W1.shape}") # (2, 4)
print(f"b1 shape: {b1.shape}") # (1, 4)
print(f"W2 shape: {W2.shape}") # (4, 1)
print(f"b2 shape: {b2.shape}") # (1, 1)
OPTIONAL: Understanding Matrix Shapes in Neural Networks
# Let's trace through the shapes step by step
print("Shape flow through network:")
print(f"Input X: {X.shape}") # (4, 2)
print(f"Weights W1: {W1.shape}") # (2, 4)
print(f"X @ W1: {(X @ W1).shape}") # (4, 4)
print(f"Bias b1: {b1.shape}") # (1, 4)
# Broadcasting explanation
sample_mult = X @ W1
print(f"\nBroadcasting bias:")
print(f"(X @ W1) shape: {sample_mult.shape}") # (4, 4)
print(f"b1 shape: {b1.shape}") # (1, 4)
print(f"Result shape: {(sample_mult + b1).shape}") # (4, 4)
Forward Pass Implementation
def sigmoid(x):
    """Sigmoid activation function (explored in detail in the optional section below)"""
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))  # Clip to prevent overflow

def forward_pass(X, W1, b1, W2, b2):
    """Complete forward pass through the network"""
    # Hidden layer
    z1 = X @ W1 + b1   # Linear transformation
    a1 = sigmoid(z1)   # Activation
    # Output layer
    z2 = a1 @ W2 + b2  # Linear transformation
    a2 = sigmoid(z2)   # Activation
    return z1, a1, z2, a2
# Test forward pass
z1, a1, z2, predictions = forward_pass(X, W1, b1, W2, b2)
print(f"Predictions shape: {predictions.shape}")
print(f"Predictions:\n{predictions.flatten()}")
print(f"Actual labels:\n{y.flatten()}")
OPTIONAL: Understanding the Sigmoid Function
def sigmoid(x):
    """Sigmoid activation function"""
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))  # Clip to prevent overflow
# Test sigmoid on different inputs
test_values = np.array([-10, -1, 0, 1, 10])
sigmoid_values = sigmoid(test_values)
print("Sigmoid function behavior:")
for i, val in enumerate(test_values):
    print(f"  sigmoid({val:3.0f}) = {sigmoid_values[i]:.3f}")
# Visualize sigmoid
x_range = np.linspace(-10, 10, 100)
y_sigmoid = sigmoid(x_range)
plt.figure(figsize=(8, 4))
plt.plot(x_range, y_sigmoid, 'b-', linewidth=2)
plt.title('Sigmoid Activation Function')
plt.xlabel('x')
plt.ylabel('sigmoid(x)')
plt.grid(True, alpha=0.3)
plt.show()
OPTIONAL: Breaking Down the Sigmoid Formula
# Let's understand sigmoid step by step: 1 / (1 + e^(-x))
x = 2.0
print(f"Input: x = {x}")
print(f"Step 1: -x = {-x}")
print(f"Step 2: e^(-x) = np.exp(-x) = {np.exp(-x):.3f}")
print(f"Step 3: 1 + e^(-x) = {1 + np.exp(-x):.3f}")
print(f"Step 4: 1 / (1 + e^(-x)) = {1 / (1 + np.exp(-x)):.3f}")
print(f"Sigmoid result: {sigmoid(x):.3f}")
# Why clipping?
print(f"\nWhy we clip extreme values:")
print(f"Without clipping: sigmoid(-1000) would compute np.exp(1000), which overflows to inf")
print(f"With clipping: the exponent is capped at 500, and np.exp(500) = {np.exp(500):.2e} still fits in a float64")
print(f"(For large positive x, np.exp(-x) simply underflows to 0, which is harmless)")
Key insight: The forward pass transforms input through weighted connections and activations to produce predictions.
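To make that concrete, here is a small optional walkthrough (reusing X, W1, b1, W2, b2 and sigmoid from above) that pushes a single sample through the network one matrix operation at a time.
x_sample = X[0:1]                # keep the 2D shape (1, 2) so the math matches the batched case

z1_manual = x_sample @ W1 + b1   # (1, 2) @ (2, 4) + (1, 4) -> (1, 4) hidden pre-activations
a1_manual = sigmoid(z1_manual)   # squash each hidden pre-activation into (0, 1)
z2_manual = a1_manual @ W2 + b2  # (1, 4) @ (4, 1) + (1, 1) -> (1, 1) output pre-activation
a2_manual = sigmoid(z2_manual)   # final prediction for this sample

print(f"Hidden pre-activations: {z1_manual.round(3)}")
print(f"Hidden activations:     {a1_manual.round(3)}")
print(f"Prediction for {X[0]}:  {a2_manual[0, 0]:.3f}")  # matches row 0 of the batched forward_pass output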
Part 3: Computing Loss and Gradients
Loss Function
def compute_loss(predictions, targets):
    """Mean squared error loss"""
    return np.mean((predictions - targets) ** 2)  # Square the difference to penalize large errors
OPTIONAL: Why We Square the Difference
# Let's see why we use (predictions - targets) ** 2
pred = np.array([0.8, 0.2, 0.9, 0.1])
target = np.array([1.0, 0.0, 1.0, 0.0])
differences = pred - target
squared_differences = differences ** 2
print("Understanding squared error:")
print(f"Predictions: {pred}")
print(f"Targets: {target}")
print(f"Differences: {differences}")
print(f"Squared: {squared_differences}")
print(f"Mean squared error: {np.mean(squared_differences):.4f}")
# Why not just absolute difference?
abs_differences = np.abs(differences)
print(f"\nComparison:")
print(f"Absolute differences: {abs_differences}")
print(f"Squared differences: {squared_differences}")
print("Squared errors penalize large mistakes more heavily!")
Calculate initial loss
initial_loss = compute_loss(predictions, y)
print(f"Initial loss: {initial_loss:.4f}")
OPTIONAL: Understanding Derivative of Sigmoid
def sigmoid_derivative(x):
    """Derivative of sigmoid function"""
    s = sigmoid(x)
    return s * (1 - s)  # Derivative formula: sigmoid(x) * (1 - sigmoid(x))
# Test derivative
test_vals = np.array([-2, -1, 0, 1, 2])
sigmoid_vals = sigmoid(test_vals)
derivative_vals = sigmoid_derivative(test_vals)
print("Sigmoid and its derivative:")
for i, val in enumerate(test_vals):
    print(f"  x={val:2.0f}: sigmoid={sigmoid_vals[i]:.3f}, derivative={derivative_vals[i]:.3f}")
# Visualize both functions
x_range = np.linspace(-6, 6, 100)
y_sigmoid = sigmoid(x_range)
y_derivative = sigmoid_derivative(x_range)
plt.figure(figsize=(10, 4))
plt.plot(x_range, y_sigmoid, 'b-', label='sigmoid(x)', linewidth=2)
plt.plot(x_range, y_derivative, 'r--', label="sigmoid'(x)", linewidth=2)
plt.title('Sigmoid Function and Its Derivative')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
OPTIONAL: Chain Rule in Backpropagation
# The chain rule: if y = f(g(x)), then dy/dx = f'(g(x)) * g'(x)
#
# For our network: Loss = MSE(sigmoid(sigmoid(X @ W1 + b1) @ W2 + b2), y)
# We need: dLoss/dW2, dLoss/db2, dLoss/dW1, dLoss/db1
print("Chain rule breakdown:")
print("dLoss/dW2 = dLoss/da2 * da2/dz2 * dz2/dW2")
print(" where:")
print(" dLoss/da2 = 2 * (predictions - targets) # MSE derivative")
print(" da2/dz2 = sigmoid'(z2) # sigmoid derivative")
print(" dz2/dW2 = a1 # linear layer derivative")
Backpropagation Implementation
def backward_pass(X, y, z1, a1, z2, a2, W1, b1, W2, b2):
    """Compute gradients using backpropagation"""
    m = X.shape[0]  # Number of samples
    # Output layer gradients
    dz2 = 2 * (a2 - y) * sigmoid_derivative(z2)  # (4, 1)
    dW2 = a1.T @ dz2 / m                         # (4, 1)
    db2 = np.mean(dz2, axis=0, keepdims=True)    # (1, 1)
    # Hidden layer gradients
    dz1 = (dz2 @ W2.T) * sigmoid_derivative(z1)  # (4, 4)
    dW1 = X.T @ dz1 / m                          # (2, 4)
    db1 = np.mean(dz1, axis=0, keepdims=True)    # (1, 4)
    return dW1, db1, dW2, db2
# Test backpropagation
dW1, db1, dW2, db2 = backward_pass(X, y, z1, a1, z2, predictions, W1, b1, W2, b2)
print(f"Gradient shapes:")
print(f" dW1: {dW1.shape}, dW2: {dW2.shape}")
print(f" db1: {db1.shape}, db2: {db2.shape}")
OPTIONAL: Understanding Output Layer Gradients
# Let's break down the output layer gradient calculation
print("Output layer gradient breakdown:")
print("dz2 = 2 * (a2 - y) * sigmoid_derivative(z2)")
# Step by step
error = a2 - y # How far off our predictions are
print(f"Error (a2 - y) shape: {error.shape}")
print(f"Error values:\n{error.flatten()}")
mse_gradient = 2 * error # Derivative of MSE
print(f"\nMSE gradient (2 * error) shape: {mse_gradient.shape}")
sigmoid_grad = sigmoid_derivative(z2) # Derivative of sigmoid
print(f"Sigmoid gradient shape: {sigmoid_grad.shape}")
dz2_step = mse_gradient * sigmoid_grad # Chain rule
print(f"Combined gradient (dz2) shape: {dz2_step.shape}")
OPTIONAL: Understanding Hidden Layer Gradients
# Hidden layer gradients are more complex due to chain rule
print("Hidden layer gradient breakdown:")
print("dz1 = (dz2 @ W2.T) * sigmoid_derivative(z1)")
# Step by step
error_propagated = dz2 @ W2.T # Propagate error backwards
print(f"Error propagated shape: {error_propagated.shape}")
print(f"This spreads output error to each hidden neuron")
hidden_sigmoid_grad = sigmoid_derivative(z1) # Local gradient
print(f"Hidden sigmoid gradient shape: {hidden_sigmoid_grad.shape}")
dz1_step = error_propagated * hidden_sigmoid_grad # Final gradient
print(f"Combined hidden gradient (dz1) shape: {dz1_step.shape}")
OPTIONAL: Understanding Weight Gradients
# Weight gradients show how to adjust connections
print("Weight gradient calculation:")
print("dW2 = a1.T @ dz2 / m")
print(f"a1.T shape: {a1.T.shape}") # Transposed hidden activations
print(f"dz2 shape: {dz2.shape}") # Output gradients
print(f"dW2 shape: {(a1.T @ dz2).shape}") # Weight gradients
# This gives us the gradient for each weight connection
print(f"\nWeight gradients tell us:")
print(f"- Positive gradient: decrease this weight")
print(f"- Negative gradient: increase this weight")
print(f"- Large gradient: this weight has big impact on error")
Critical concept: Backpropagation uses the chain rule to compute how much each weight contributes to the total error.
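A good way to convince yourself the hand-derived gradients are correct is a numerical gradient check: nudge one weight by a tiny amount, measure how the loss changes, and compare with the analytic gradient. This is a verification sketch of my own, reusing forward_pass, compute_loss, backward_pass and the (still untrained) weights from above.
# Numerical vs analytic gradient for one entry of W2
eps = 1e-5
row, col = 0, 0  # which weight of W2 to test

W2_plus = W2.copy()
W2_plus[row, col] += eps
W2_minus = W2.copy()
W2_minus[row, col] -= eps

loss_plus = compute_loss(forward_pass(X, W1, b1, W2_plus, b2)[-1], y)
loss_minus = compute_loss(forward_pass(X, W1, b1, W2_minus, b2)[-1], y)
numerical_grad = (loss_plus - loss_minus) / (2 * eps)  # central finite difference

z1_c, a1_c, z2_c, a2_c = forward_pass(X, W1, b1, W2, b2)
analytic_grad = backward_pass(X, y, z1_c, a1_c, z2_c, a2_c, W1, b1, W2, b2)[2][row, col]  # dW2[row, col]

print(f"Numerical gradient: {numerical_grad:.8f}")
print(f"Analytic gradient:  {analytic_grad:.8f}")  # the two should agree to several decimal places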
Part 4: Training the Network
Training Loop
def train_network(X, y, epochs=1000, learning_rate=1.0):
    """Train the neural network"""
    # Initialize weights
    np.random.seed(42)
    W1 = np.random.randn(2, 4) * 0.5
    b1 = np.zeros((1, 4))
    W2 = np.random.randn(4, 1) * 0.5
    b2 = np.zeros((1, 1))
    losses = []
    for epoch in range(epochs):
        # Forward pass
        z1, a1, z2, predictions = forward_pass(X, W1, b1, W2, b2)
        # Compute loss
        loss = compute_loss(predictions, y)
        losses.append(loss)
        # Backward pass
        dW1, db1, dW2, db2 = backward_pass(X, y, z1, a1, z2, predictions, W1, b1, W2, b2)
        # Update weights
        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1
        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2
        # Print progress
        if epoch % 100 == 0:
            print(f"Epoch {epoch:4d}: Loss = {loss:.6f}")
    return W1, b1, W2, b2, losses
# Train the network
W1_trained, b1_trained, W2_trained, b2_trained, loss_history = train_network(X, y)
OPTIONAL: Understanding Learning Rate
# Learning rate controls how big steps we take during optimization
# Too small: slow convergence, too large: might overshoot minimum
learning_rates = [0.1, 1.0, 10.0]
plt.figure(figsize=(12, 4))
for i, lr in enumerate(learning_rates):
    plt.subplot(1, 3, i + 1)
    # Train with this learning rate
    _, _, _, _, losses = train_network(X, y, epochs=500, learning_rate=lr)
    plt.plot(losses)
    plt.title(f'Learning Rate = {lr}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')
    plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
OPTIONAL: Visualizing Training Progress
# Plot loss curve
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(loss_history)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.yscale('log')
plt.grid(True, alpha=0.3)
plt.subplot(1, 2, 2)
plt.plot(loss_history[-100:]) # Last 100 epochs
plt.title('Training Loss (Last 100 Epochs)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Final loss: {loss_history[-1]:.6f}")
Part 5: Testing the Trained Network
Final Predictions
# Test the trained network
z1_final, a1_final, z2_final, final_predictions = forward_pass(X, W1_trained, b1_trained, W2_trained, b2_trained)
print("Final Results:")
print("Input -> Target | Prediction | Rounded")
print("-" * 40)
for i in range(len(X)):
    pred = final_predictions[i, 0]
    rounded = round(pred)
    target = y[i, 0]
    print(f"{X[i]} -> {target} | {pred:.4f} | {rounded}")
# Calculate accuracy
rounded_predictions = np.round(final_predictions)
accuracy = np.mean(rounded_predictions == y)
print(f"\nAccuracy: {accuracy:.1%}")
OPTIONAL: Visualizing Decision Boundary
# Create a grid of points to visualize the decision boundary
def plot_decision_boundary(W1, b1, W2, b2):
    """Plot the decision boundary learned by the network"""
    # Create a grid
    xx, yy = np.meshgrid(np.linspace(-0.5, 1.5, 100),
                         np.linspace(-0.5, 1.5, 100))
    # Flatten the grid for prediction
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    # Make predictions on the grid
    _, _, _, grid_predictions = forward_pass(grid_points, W1, b1, W2, b2)
    grid_predictions = grid_predictions.reshape(xx.shape)
    # Plot
    plt.figure(figsize=(8, 6))
    plt.contourf(xx, yy, grid_predictions, levels=50, alpha=0.8, cmap='RdYlBu')
    plt.colorbar(label='Network Output')
    # Plot data points
    colors = ['red' if label == 0 else 'blue' for label in y.flatten()]
    plt.scatter(X[:, 0], X[:, 1], c=colors, s=100, edgecolors='black', linewidth=2)
    # Add labels
    for x, y_val in zip(X, y.flatten()):
        plt.annotate(f'({x[0]},{x[1]})→{y_val}',
                     (x[0], x[1]), xytext=(5, 5), textcoords='offset points')
    plt.title('Neural Network Decision Boundary')
    plt.xlabel('Input 1')
    plt.ylabel('Input 2')
    plt.grid(True, alpha=0.3)
    plt.show()
# Visualize the decision boundary
plot_decision_boundary(W1_trained, b1_trained, W2_trained, b2_trained)
Part 6: Understanding What We Built
Complete Neural Network Class
class SimpleNeuralNetwork:
    """A simple 2-layer neural network implementation"""

    def __init__(self, input_size=2, hidden_size=4, output_size=1):
        # Initialize weights
        self.W1 = np.random.randn(input_size, hidden_size) * 0.5
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.5
        self.b2 = np.zeros((1, output_size))

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def forward(self, X):
        self.z1 = X @ self.W1 + self.b1
        self.a1 = self.sigmoid(self.z1)
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

    def backward(self, X, y):
        m = X.shape[0]
        # Output layer gradients
        dz2 = 2 * (self.a2 - y) * self.sigmoid(self.z2) * (1 - self.sigmoid(self.z2))
        dW2 = self.a1.T @ dz2 / m
        db2 = np.mean(dz2, axis=0, keepdims=True)
        # Hidden layer gradients
        dz1 = (dz2 @ self.W2.T) * self.sigmoid(self.z1) * (1 - self.sigmoid(self.z1))
        dW1 = X.T @ dz1 / m
        db1 = np.mean(dz1, axis=0, keepdims=True)
        return dW1, db1, dW2, db2

    def train(self, X, y, epochs=1000, learning_rate=1.0):
        losses = []
        for epoch in range(epochs):
            # Forward pass
            predictions = self.forward(X)
            # Compute loss
            loss = np.mean((predictions - y) ** 2)
            losses.append(loss)
            # Backward pass
            dW1, db1, dW2, db2 = self.backward(X, y)
            # Update weights
            self.W1 -= learning_rate * dW1
            self.b1 -= learning_rate * db1
            self.W2 -= learning_rate * dW2
            self.b2 -= learning_rate * db2
            if epoch % 100 == 0:
                print(f"Epoch {epoch:4d}: Loss = {loss:.6f}")
        return losses

    def predict(self, X):
        return self.forward(X)
# Test the class
nn = SimpleNeuralNetwork()
losses = nn.train(X, y, epochs=1000, learning_rate=1.0)
predictions = nn.predict(X)
print("\nClass-based Neural Network Results:")
for i in range(len(X)):
    pred = predictions[i, 0]
    target = y[i, 0]
    print(f"{X[i]} -> {target} | Prediction: {pred:.4f} | Rounded: {round(pred)}")
OPTIONAL: Comparing with Different Architectures
# Test different hidden layer sizes
hidden_sizes = [2, 4, 8, 16]
results = {}
for hidden_size in hidden_sizes:
    print(f"\nTesting hidden size: {hidden_size}")
    nn = SimpleNeuralNetwork(input_size=2, hidden_size=hidden_size, output_size=1)
    losses = nn.train(X, y, epochs=1000, learning_rate=1.0)
    predictions = nn.predict(X)
    # Calculate accuracy
    accuracy = np.mean(np.round(predictions) == y)
    results[hidden_size] = {
        'final_loss': losses[-1],
        'accuracy': accuracy,
        'predictions': predictions
    }
    print(f"  Final loss: {losses[-1]:.6f}")
    print(f"  Accuracy: {accuracy:.1%}")
# Plot comparison
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
hidden_sizes_list = list(results.keys())
final_losses = [results[hs]['final_loss'] for hs in hidden_sizes_list]
plt.bar(hidden_sizes_list, final_losses)
plt.title('Final Loss vs Hidden Size')
plt.xlabel('Hidden Layer Size')
plt.ylabel('Final Loss')
plt.yscale('log')
plt.subplot(1, 2, 2)
accuracies = [results[hs]['accuracy'] for hs in hidden_sizes_list]
plt.bar(hidden_sizes_list, accuracies)
plt.title('Accuracy vs Hidden Size')
plt.xlabel('Hidden Layer Size')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.tight_layout()
plt.show()
Key takeaway: You've built a complete neural network from scratch! The network learns to solve the XOR problem by discovering the right weights and biases through gradient descent.
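If you want to peek inside the trained network, here is a short optional sketch (reusing forward_pass and the trained weights from Part 4) that prints the hidden-layer activations for each XOR input. It shows how the hidden layer re-represents the inputs so that the single output neuron can separate them with one weighted sum.
# Inspect the learned hidden representation
_, hidden_acts, _, outputs = forward_pass(X, W1_trained, b1_trained, W2_trained, b2_trained)

print("Input  | Hidden activations       | Output")
for inp, h, out in zip(X, hidden_acts, outputs):
    print(f"{inp} | {np.round(h, 2)} | {out[0]:.3f}")
# In this transformed space the two classes can be told apart by a single weighted sum,
# which is exactly the job of the output layer.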
Summary
You've successfully implemented:
- Forward propagation: Computing predictions from inputs
- Loss computation: Measuring how wrong the predictions are
- Backpropagation: Computing gradients using the chain rule
- Weight updates: Using gradient descent to improve the network
- Complete training loop: Putting it all together
The neural network learns by repeatedly adjusting its weights based on the errors it makes, eventually discovering the complex decision boundary needed to solve the XOR problem.