# Introduction to Neural Networks - SOLUTIONS
## Lesson 4: Building Your First Neural Network

**Evolve AI Institute - Teacher Edition**

This notebook contains complete solutions for the neural network lesson.

In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns

# Fix NumPy's global random seed so every weight initialisation below is repeatable.
SEED = 42
np.random.seed(SEED)
print("Libraries imported successfully!")

## Solution to Exercise 1: Network Architecture

Here are recommended values with explanations:

In [None]:
# Load the Iris feature matrix and its integer class labels
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing; stratify=y keeps the class
# proportions the same in both splits, and random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One-hot encode the labels as dense arrays. The encoder expects a 2-D
# column, hence the added trailing axis.
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(y_train[:, None])
y_train_encoded = encoder.transform(y_train[:, None])
y_test_encoded = encoder.transform(y_test[:, None])

# SOLUTION: Network architecture
# Layer widths: 4 iris features -> 10 hidden neurons (a good starting point,
# can try 5-50) -> 3 flower classes.
input_size, hidden_size, output_size = 4, 10, 3

# Training parameters
learning_rate = 0.1  # 0.1 works well for this problem
epochs = 1000  # 1000 epochs for good convergence

# Emit the whole report with one print call; sep="\n" puts each entry on its own line.
print(
    f"Network Architecture:",
    f"Input Layer: {input_size} neurons (4 features)",
    f"Hidden Layer: {hidden_size} neurons",
    f"Output Layer: {output_size} neurons (3 classes)",
    f"\nTraining Parameters:",
    f"Learning Rate: {learning_rate}",
    f"Epochs: {epochs}",
    f"\nRationale:",
    f"- Input size must match number of features (4)",
    f"- Output size must match number of classes (3)",
    f"- Hidden layer: 10 neurons is 2-3x the input size",
    f"- Learning rate 0.1: fast learning without overshooting",
    f"- 1000 epochs: allows network to converge fully",
    sep="\n",
)

In [None]:
# Activation functions (complete implementation)
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied element-wise.

    The input is clipped to [-500, 500] before exponentiation so that
    np.exp stays within floating-point range for extreme values.
    """
    clipped = np.clip(x, -500, 500)
    denominator = 1 + np.exp(-clipped)
    return 1 / denominator

def sigmoid_derivative(x):
    """Return x * (1 - x), element-wise.

    This is the sigmoid's derivative only when ``x`` is already the sigmoid
    *output* s (since s' = s * (1 - s)); it is not meant to receive the
    pre-activation value.
    """
    complement = 1 - x
    return x * complement

def relu(x):
    """Rectified linear unit: the element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

def relu_derivative(x):
    """Return a float array that is 1.0 where ``x`` > 0 and 0.0 elsewhere."""
    positive_mask = x > 0
    return positive_mask.astype(float)

def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    Each row's maximum is subtracted before exponentiating, which leaves the
    result unchanged mathematically but keeps np.exp from overflowing.
    Every returned row sums to 1.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exp_shifted = np.exp(x - row_max)
    row_totals = np.sum(exp_shifted, axis=1, keepdims=True)
    return exp_shifted / row_totals

print("Activation functions defined.")

In [None]:
# Initialize parameters: weights are standard-normal draws scaled by 0.01,
# biases start at zero. W1 is drawn before W2, so the sequence of random
# numbers consumed is the same as drawing them layer by layer.
W1 = 0.01 * np.random.randn(input_size, hidden_size)
W2 = 0.01 * np.random.randn(hidden_size, output_size)
b1 = np.zeros(shape=(1, hidden_size))
b2 = np.zeros(shape=(1, output_size))

print("Parameters initialized.")

In [None]:
# Complete forward propagation
def forward_propagation(X, W1, b1, W2, b2):
    """Run one forward pass through the two-layer network.

    The hidden layer applies ReLU to ``X @ W1 + b1``; the output layer applies
    softmax to ``a1 @ W2 + b2``.

    Returns:
        dict with the intermediate values needed for backpropagation:
        'z1' / 'a1' (hidden pre-activation / activation) and
        'z2' / 'a2' (output pre-activation / softmax probabilities).
    """
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = relu(hidden_pre)

    output_pre = np.dot(hidden_act, W2) + b2
    output_prob = softmax(output_pre)

    return {
        'z1': hidden_pre,
        'a1': hidden_act,
        'z2': output_pre,
        'a2': output_prob,
    }

def cross_entropy_loss(predictions, targets):
    """Mean cross-entropy between predicted probabilities and one-hot targets.

    A small constant (1e-8) is added to the predictions before taking the
    log so that a predicted probability of exactly 0 does not produce -inf.
    The summed loss is divided by the number of rows in ``targets``.
    """
    n_samples = targets.shape[0]
    log_probs = np.log(predictions + 1e-8)
    total = np.sum(targets * log_probs)
    return -total / n_samples

def backward_propagation(X, y_true, cache, W1, W2):
    """Compute parameter gradients for the two-layer network.

    Args:
        X: input batch used for the forward pass.
        y_true: one-hot targets, same shape as ``cache['a2']``.
        cache: dict returned by the forward pass ('z1', 'a1', 'a2' are read).
        W1: accepted for a symmetric signature; not used in the computation.
        W2: output-layer weights, needed to push the error back to the hidden layer.

    Returns:
        dict with keys 'dW1', 'db1', 'dW2', 'db2', each averaged over the batch.
    """
    n_samples = X.shape[0]
    probs, hidden_act = cache['a2'], cache['a1']

    # Output layer: for softmax + cross-entropy the error term is (probabilities - targets).
    delta_out = probs - y_true
    grad_W2 = np.dot(hidden_act.T, delta_out) / n_samples
    grad_b2 = np.sum(delta_out, axis=0, keepdims=True) / n_samples

    # Hidden layer: propagate the error through W2, then gate it by the ReLU slope.
    upstream = np.dot(delta_out, W2.T)
    delta_hidden = upstream * relu_derivative(cache['z1'])
    grad_W1 = np.dot(X.T, delta_hidden) / n_samples
    grad_b1 = np.sum(delta_hidden, axis=0, keepdims=True) / n_samples

    return {'dW1': grad_W1, 'db1': grad_b1, 'dW2': grad_W2, 'db2': grad_b2}

print("Training functions defined.")

In [None]:
# Training loop
def train_network(X_train, y_train, X_test, y_test,
                  W1, b1, W2, b2, learning_rate, epochs):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        X_train, y_train: training inputs and one-hot training targets.
        X_test, y_test: held-out inputs and one-hot targets, used for monitoring only.
        W1, b1, W2, b2: initial parameters. They are copied, so the arrays
            passed in are left untouched.
        learning_rate: step size applied to every gradient.
        epochs: number of full-batch update steps to perform.

    Returns:
        (W1, b1, W2, b2, history): the trained parameters and a dict with the
        per-epoch lists 'train_losses', 'test_losses', 'train_accuracies'
        and 'test_accuracies'.

    Note:
        Training metrics for an epoch come from the forward pass made before
        that epoch's update; test metrics are measured after the update.
    """
    # Work on private copies: the in-place `-=` updates below would otherwise
    # overwrite the caller's initial arrays, so re-running a training cell
    # would silently resume from already-trained weights.
    W1, b1, W2, b2 = (np.array(param, copy=True) for param in (W1, b1, W2, b2))

    # The integer class labels never change between epochs, so decode the
    # one-hot targets once instead of on every iteration.
    train_true = np.argmax(y_train, axis=1)
    test_true = np.argmax(y_test, axis=1)

    train_losses = []
    test_losses = []
    train_accuracies = []
    test_accuracies = []

    for epoch in range(epochs):
        # Forward pass and metrics on the training set
        train_cache = forward_propagation(X_train, W1, b1, W2, b2)
        train_loss = cross_entropy_loss(train_cache['a2'], y_train)
        train_losses.append(train_loss)

        train_predictions = np.argmax(train_cache['a2'], axis=1)
        train_accuracy = accuracy_score(train_true, train_predictions)
        train_accuracies.append(train_accuracy)

        # Backward pass and gradient-descent update
        gradients = backward_propagation(X_train, y_train, train_cache, W1, W2)

        W1 -= learning_rate * gradients['dW1']
        b1 -= learning_rate * gradients['db1']
        W2 -= learning_rate * gradients['dW2']
        b2 -= learning_rate * gradients['db2']

        # Metrics on the test set, using the freshly updated parameters
        test_cache = forward_propagation(X_test, W1, b1, W2, b2)
        test_loss = cross_entropy_loss(test_cache['a2'], y_test)
        test_losses.append(test_loss)

        test_predictions = np.argmax(test_cache['a2'], axis=1)
        test_accuracy = accuracy_score(test_true, test_predictions)
        test_accuracies.append(test_accuracy)

        # Report on the first epoch and then every 100th
        if (epoch + 1) % 100 == 0 or epoch == 0:
            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_accuracy:.4f}")
            print(f"  Test Loss: {test_loss:.4f} | Test Acc: {test_accuracy:.4f}\n")

    history = {
        'train_losses': train_losses,
        'test_losses': test_losses,
        'train_accuracies': train_accuracies,
        'test_accuracies': test_accuracies
    }

    return W1, b1, W2, b2, history

# Train the network on the scaled Iris data, starting from the initial parameters
print("Training network...\n")
W1_trained, b1_trained, W2_trained, b2_trained, history = train_network(
    X_train=X_train_scaled,
    y_train=y_train_encoded,
    X_test=X_test_scaled,
    y_test=y_test_encoded,
    W1=W1,
    b1=b1,
    W2=W2,
    b2=b2,
    learning_rate=learning_rate,
    epochs=epochs,
)
print("Training complete!")

In [None]:
# Visualize results: loss curves in the left panel, accuracy curves in the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel:
# (train key, train legend, test key, test legend, y-axis label, title)
panel_specs = [
    ('train_losses', 'Training Loss', 'test_losses', 'Testing Loss',
     'Loss', 'Training and Testing Loss'),
    ('train_accuracies', 'Training Accuracy', 'test_accuracies', 'Testing Accuracy',
     'Accuracy', 'Training and Testing Accuracy'),
]

for ax, (train_key, train_label, test_key, test_label, y_label, title) in zip(axes, panel_specs):
    ax.plot(history[train_key], label=train_label, linewidth=2)
    ax.plot(history[test_key], label=test_label, linewidth=2)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Last recorded value of each accuracy curve
print(f"Final Training Accuracy: {history['train_accuracies'][-1]:.4f}")
print(f"Final Testing Accuracy: {history['test_accuracies'][-1]:.4f}")

In [None]:
# Confusion matrix for the trained network on the held-out test split
test_cache = forward_propagation(
    X_test_scaled, W1_trained, b1_trained, W2_trained, b2_trained
)
# Predicted class = most probable output; true class = position of the 1 in the one-hot row
test_predictions = test_cache['a2'].argmax(axis=1)
test_true = y_test_encoded.argmax(axis=1)

cm = confusion_matrix(test_true, test_predictions)

# Draw the heatmap on an explicitly created axes, labelled with the species names
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=iris.target_names,
    yticklabels=iris.target_names,
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_ylabel('True Label')
cm_ax.set_xlabel('Predicted Label')
plt.show()

print("Classification Report:")
print(classification_report(test_true, test_predictions,
                            target_names=iris.target_names))

## Solution to Exercise 2: Hyperparameter Experiments

### Expected Observations:

**Number of Hidden Neurons:**
- 5 neurons: May struggle with complex patterns, ~93% accuracy
- 10 neurons: Good balance, ~97-100% accuracy
- 20 neurons: Excellent performance, ~97-100% accuracy
- 50+ neurons: Possible overfitting, no significant improvement

**Learning Rate:**
- 0.001: Very slow learning, needs more epochs
- 0.01: Steady learning, good for beginners
- 0.1: Fast convergence, works well
- 1.0: May overshoot, unstable training

**Number of Epochs:**
- 100: May not fully converge
- 500: Usually sufficient
- 1000: Full convergence
- 2000+: Diminishing returns

In [None]:
# Comparison experiment: vary the hidden-layer width while keeping the
# learning rate and epoch count fixed.
configs = [
    {'hidden': 5, 'lr': 0.1, 'epochs': 500},
    {'hidden': 10, 'lr': 0.1, 'epochs': 500},
    {'hidden': 20, 'lr': 0.1, 'epochs': 500},
    {'hidden': 50, 'lr': 0.1, 'epochs': 500},
]

results = []

for config in configs:
    print(f"\nTesting configuration: {config}")

    n_hidden = config['hidden']

    # Use experiment-specific names for the fresh parameters. Rebinding the
    # notebook-level W1/b1/W2/b2 here would leave them with this loop's last
    # shape (50 hidden units), so re-running the main training cell afterwards
    # would train a different architecture than intended.
    W1_exp = np.random.randn(input_size, n_hidden) * 0.01
    b1_exp = np.zeros((1, n_hidden))
    W2_exp = np.random.randn(n_hidden, output_size) * 0.01
    b2_exp = np.zeros((1, output_size))

    W1_t, b1_t, W2_t, b2_t, hist = train_network(
        X_train_scaled, y_train_encoded,
        X_test_scaled, y_test_encoded,
        W1_exp, b1_exp, W2_exp, b2_exp,
        config['lr'], config['epochs']
    )

    # Keep only the last-epoch accuracies for the summary
    results.append({
        'config': config,
        'final_train_acc': hist['train_accuracies'][-1],
        'final_test_acc': hist['test_accuracies'][-1]
    })

# Summary: the train-minus-test accuracy gap is a rough indicator of overfitting
print("\n=== Experiment Results ===\n")
for r in results:
    print(f"Hidden neurons: {r['config']['hidden']}")
    print(f"  Train Acc: {r['final_train_acc']:.4f}")
    print(f"  Test Acc: {r['final_test_acc']:.4f}")
    print(f"  Overfit gap: {r['final_train_acc'] - r['final_test_acc']:.4f}\n")

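## MNIST Challenge Solution

> **Note:** This solution uses scikit-learn's built-in `load_digits` dataset (8×8 grayscale images of handwritten digits), a small MNIST-style stand-in, rather than the full 28×28 MNIST dataset.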

In [None]:
# Load the handwritten-digits data. Note: load_digits() is scikit-learn's
# bundled set of 8x8 digit images (a small MNIST-style dataset), not the
# full 28x28 MNIST.
digits = load_digits()
X_digits = digits.data
y_digits = digits.target

# Prepare data. The split is stratified, as for the Iris data, so every
# digit keeps the same share of samples in the training and test sets.
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_digits, y_digits, test_size=0.2, random_state=42, stratify=y_digits
)

# Standardise using statistics from the training split only
scaler_d = StandardScaler()
X_train_d_scaled = scaler_d.fit_transform(X_train_d)
X_test_d_scaled = scaler_d.transform(X_test_d)

# One-hot encode the digit labels as dense arrays
encoder_d = OneHotEncoder(sparse_output=False)
y_train_d_encoded = encoder_d.fit_transform(y_train_d.reshape(-1, 1))
y_test_d_encoded = encoder_d.transform(y_test_d.reshape(-1, 1))

# Network for the digits task. Input and output widths are read from the
# prepared data, so they cannot drift out of sync with it.
input_size_d = X_train_d_scaled.shape[1]  # 64 features (8x8 pixels)
hidden_size_d = 50  # More neurons for more complex task
output_size_d = y_train_d_encoded.shape[1]  # 10 digits

# Training hyperparameters for this task
learning_rate_d = 0.1
epochs_d = 1000

W1_d = np.random.randn(input_size_d, hidden_size_d) * 0.01
b1_d = np.zeros((1, hidden_size_d))
W2_d = np.random.randn(hidden_size_d, output_size_d) * 0.01
b2_d = np.zeros((1, output_size_d))

print("Training on MNIST digits...\n")
W1_d_t, b1_d_t, W2_d_t, b2_d_t, history_d = train_network(
    X_train_d_scaled, y_train_d_encoded,
    X_test_d_scaled, y_test_d_encoded,
    W1_d, b1_d, W2_d, b2_d,
    learning_rate_d, epochs_d
)

print(f"\nFinal MNIST Test Accuracy: {history_d['test_accuracies'][-1]:.4f}")
print("Expected: ~95-97% accuracy")

## Sample Reflection Answers

### 1. Architecture
More hidden neurons generally improve performance up to a point. With too few neurons (e.g., 5), the network can't capture complex patterns. With too many (e.g., 100+ for Iris), we risk overfitting and increased computation time without significant accuracy gains.

### 2. Learning Rate
- Too high (>0.5): Training becomes unstable, loss oscillates
- Too low (≤0.001): Very slow convergence, needs many epochs
- Just right (0.01-0.1): Steady improvement, good convergence

### 3. Training Process
One epoch consists of:
1. **Forward Propagation**: Input passes through layers, computing weighted sums and activations
2. **Loss Calculation**: Comparing predictions to true labels using cross-entropy
3. **Backward Propagation**: Computing gradients of loss with respect to weights
4. **Gradient Descent**: Updating weights to minimize loss

### 4. Overfitting
**Signs:** Training accuracy is much higher than test accuracy (e.g., 100% vs 90%). For Iris with a simple architecture like ours, overfitting is rare. **Solutions:** use dropout, regularization, or more training data.

### 5. Real World Applications
- Medical diagnosis from images
- Spam detection
- Voice assistants
- Autonomous vehicles
- Recommendation systems

### 6. Limitations
Our simple network works for small datasets but would struggle with:
- Large images (needs CNNs)
- Sequential data (needs RNNs)
- Complex relationships (needs deeper networks)
- Real-time processing (needs optimization)

### 7. Ethics
Concerns include:
- Bias in training data leading to discriminatory outcomes
- Privacy issues with facial recognition
- Lack of transparency ("black box" decisions)
- Job displacement
- Need for human oversight in critical decisions