| | |
| | """ |
| | ============================================================================= |
| | COMPREHENSIVE ACTIVATION FUNCTION TUTORIAL |
| | ============================================================================= |
| | |
| | This script provides both THEORETICAL explanations and EMPIRICAL experiments |
| | to understand how different activation functions affect: |
| | |
| | 1. GRADIENT FLOW: Do gradients vanish or explode? |
| | 2. SPARSITY & DEAD NEURONS: How easily do units turn on/off? |
| | 3. STABILITY: How robust is training under big learning rates / deep stacks? |
| | 4. REPRESENTATIONAL CAPACITY: How well can the model represent functions? |
| | |
| | Activation Functions Studied: |
| | - Linear (Identity) |
| | - Sigmoid |
| | - Tanh |
| | - ReLU |
| | - Leaky ReLU |
| | - ELU |
| | - GELU |
| | - Swish/SiLU |
| | |
| | Author: Orchestra Research Assistant |
| | Date: 2024 |
| | ============================================================================= |
| | """ |
| |
|
| | import torch |
| | import torch.nn as nn |
| | import torch.nn.functional as F |
| | import numpy as np |
| | import matplotlib.pyplot as plt |
| | import matplotlib.gridspec as gridspec |
| | from collections import defaultdict |
| | import json |
| | import os |
| | import warnings |
| | warnings.filterwarnings('ignore') |
| |
|
| | |
# Fix RNG seeds for both torch and numpy so every experiment below is reproducible.
torch.manual_seed(42)
np.random.seed(42)


# All generated figures and JSON artifacts are written into this directory.
os.makedirs('activation_functions', exist_ok=True)
| |
|
| | |
| | |
| | |
| |
|
# Module-level theory primer, printed once at import time so the tutorial's
# theory section precedes the experimental output in the console log.
# (Mojibake math symbols repaired: ∂, ×, σ, ∈, →, ∞, α.)
THEORETICAL_BACKGROUND = """
=============================================================================
THEORETICAL BACKGROUND: ACTIVATION FUNCTIONS
=============================================================================

1. WHY DO WE NEED ACTIVATION FUNCTIONS?
---------------------------------------
Without non-linear activations, a neural network of any depth is equivalent
to a single linear transformation:

    f(x) = W_n @ W_{n-1} @ ... @ W_1 @ x = W_combined @ x

Non-linear activations allow networks to approximate any continuous function
(Universal Approximation Theorem).


2. GRADIENT FLOW THEORY
-----------------------
During backpropagation, gradients flow through the chain rule:

    ∂L/∂W_i = ∂L/∂a_n × ∂a_n/∂a_{n-1} × ... × ∂a_{i+1}/∂a_i × ∂a_i/∂W_i

Each layer contributes a factor of σ'(z) × W, where σ' is the activation derivative.

VANISHING GRADIENTS occur when |σ'(z)| < 1 repeatedly:
- Sigmoid: σ'(z) ∈ (0, 0.25], maximum at z=0
- Tanh: σ'(z) ∈ (0, 1], maximum at z=0
- For deep networks: gradient ∝ (0.25)^n → 0 as n → ∞

EXPLODING GRADIENTS occur when |σ'(z) × W| > 1 repeatedly:
- More common with ReLU (gradient = 1 for z > 0)
- Mitigated by proper initialization and gradient clipping


3. ACTIVATION FUNCTION PROPERTIES
---------------------------------

| Function    | Range       | σ'(z) Range | Zero-Centered | Saturates |
|-------------|-------------|-------------|---------------|-----------|
| Linear      | (-∞, ∞)     | 1           | Yes           | No        |
| Sigmoid     | (0, 1)      | (0, 0.25]   | No            | Yes       |
| Tanh        | (-1, 1)     | (0, 1]      | Yes           | Yes       |
| ReLU        | [0, ∞)      | {0, 1}      | No            | Half      |
| Leaky ReLU  | (-∞, ∞)     | {α, 1}      | No            | No        |
| ELU         | (-α, ∞)     | (0, 1]      | ~Yes          | Half      |
| GELU        | (-0.17, ∞)  | smooth      | No            | Soft      |
| Swish       | (-0.28, ∞)  | smooth      | No            | Soft      |


4. DEAD NEURON PROBLEM
----------------------
ReLU neurons can "die" when they always output 0:
- If z < 0 for all inputs, gradient = 0, weights never update
- Caused by: large learning rates, bad initialization, unlucky gradients
- Solutions: Leaky ReLU, ELU, careful initialization


5. REPRESENTATIONAL CAPACITY
----------------------------
Different activations have different "expressiveness":
- Smooth activations (GELU, Swish) → smoother decision boundaries
- Piecewise linear (ReLU) → piecewise linear boundaries
- Bounded activations (Sigmoid, Tanh) → can struggle with unbounded targets
"""

print(THEORETICAL_BACKGROUND)
| |
|
| |
|
| | |
| | |
| | |
| |
|
class ActivationFunctions:
    """Registry of activation functions, their analytic derivatives, and modules."""

    @staticmethod
    def get_all():
        """Return dict of activation name -> (function, derivative, nn.Module)"""

        def sigmoid_grad(z):
            s = torch.sigmoid(z)
            return s * (1 - s)

        def tanh_grad(z):
            return 1 - torch.tanh(z) ** 2

        def relu_grad(z):
            return (z > 0).float()

        def leaky_relu_grad(z):
            ones = torch.ones_like(z)
            return torch.where(z > 0, ones, 0.01 * ones)

        def elu_grad(z):
            # d/dz ELU(z) is 1 for z > 0, and ELU(z) + 1 (= alpha * e^z) otherwise.
            return torch.where(z > 0, torch.ones_like(z), F.elu(z) + 1)

        def swish_grad(z):
            s = torch.sigmoid(z)
            return s + z * s * (1 - s)

        return {
            'Linear': (lambda z: z, lambda z: torch.ones_like(z), nn.Identity()),
            'Sigmoid': (torch.sigmoid, sigmoid_grad, nn.Sigmoid()),
            'Tanh': (torch.tanh, tanh_grad, nn.Tanh()),
            'ReLU': (F.relu, relu_grad, nn.ReLU()),
            'LeakyReLU': (lambda z: F.leaky_relu(z, 0.01), leaky_relu_grad, nn.LeakyReLU(0.01)),
            'ELU': (F.elu, elu_grad, nn.ELU()),
            'GELU': (F.gelu, lambda z: _gelu_derivative(z), nn.GELU()),
            'Swish': (F.silu, swish_grad, nn.SiLU()),
        }
| |
|
| | def _gelu_derivative(x): |
| | """Approximate GELU derivative.""" |
| | cdf = 0.5 * (1 + torch.erf(x / np.sqrt(2))) |
| | pdf = torch.exp(-0.5 * x**2) / np.sqrt(2 * np.pi) |
| | return cdf + x * pdf |
| |
|
| |
|
| | |
| | |
| | |
| |
|
def experiment_1_gradient_flow():
    """
    EXPERIMENT 1: How do gradients flow through deep networks?

    Theory:
    - Sigmoid/Tanh: σ'(z) ≤ 0.25/1.0, gradients shrink exponentially
    - ReLU: σ'(z) ∈ {0, 1}, gradients preserved but can die
    - Modern activations: designed to maintain gradient flow

    We measure:
    - Gradient magnitude at each layer during forward/backward pass
    - How gradients change with network depth

    Returns:
        dict: activation name -> depth -> {'grad_magnitudes', 'grad_ratio',
        'min_grad', 'max_grad'}; also saved as PNG + JSON under
        activation_functions/.
    """
    print("\n" + "="*80)
    print("EXPERIMENT 1: GRADIENT FLOW ANALYSIS")
    print("="*80)

    activations = ActivationFunctions.get_all()
    depths = [5, 10, 20, 50]   # network depths to stress-test
    width = 64                 # hidden width for every layer

    results = {name: {} for name in activations}

    for depth in depths:
        print(f"\n--- Testing depth = {depth} ---")

        for name, (func, deriv, module) in activations.items():
            # Build a deep MLP: 1 -> width -> ... -> width -> 1.
            layers = []
            for i in range(depth):
                layers.append(nn.Linear(width if i > 0 else 1, width))
                # Fresh activation instance per layer; nn.Identity is stateless
                # so the shared registry instance can be reused safely.
                layers.append(module if isinstance(module, nn.Identity) else type(module)())
            layers.append(nn.Linear(width, 1))

            model = nn.Sequential(*layers)

            # Xavier init + zero bias for a fair comparison across activations.
            for m in model.modules():
                if isinstance(m, nn.Linear):
                    nn.init.xavier_uniform_(m.weight)
                    nn.init.zeros_(m.bias)

            # Single forward/backward pass on a random batch.
            # (requires_grad on the input is not needed for weight grads.)
            x = torch.randn(32, 1, requires_grad=True)
            y = model(x)
            loss = y.mean()
            loss.backward()

            # Mean |grad| per Linear layer, in module order (input -> output).
            grad_mags = []
            for m in model.modules():
                if isinstance(m, nn.Linear) and m.weight.grad is not None:
                    grad_mags.append(m.weight.grad.abs().mean().item())

            results[name][depth] = {
                'grad_magnitudes': grad_mags,
                # Ratio of output-layer grad to input-layer grad; << 1 means
                # vanishing, >> 1 exploding. Epsilon guards the division.
                'grad_ratio': grad_mags[-1] / (grad_mags[0] + 1e-10) if grad_mags[0] > 1e-10 else float('inf'),
                'min_grad': min(grad_mags),
                'max_grad': max(grad_mags),
            }

            print(f" {name:12s}: grad_ratio={results[name][depth]['grad_ratio']:.2e}, "
            f"min={results[name][depth]['min_grad']:.2e}, max={results[name][depth]['max_grad']:.2e}")

    # One panel per depth: gradient magnitude vs layer index (log y-axis).
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    colors = plt.cm.tab10(np.linspace(0, 1, len(activations)))

    for idx, depth in enumerate(depths):
        ax = axes[idx // 2, idx % 2]
        for (name, data), color in zip(results.items(), colors):
            grads = data[depth]['grad_magnitudes']
            ax.semilogy(range(1, len(grads)+1), grads, 'o-', label=name, color=color, markersize=4)

        ax.set_xlabel('Layer (from input to output)')
        ax.set_ylabel('Gradient Magnitude (log scale)')
        ax.set_title(f'Gradient Flow: Depth = {depth}')
        ax.legend(loc='best', fontsize=8)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('activation_functions/exp1_gradient_flow.png', dpi=150, bbox_inches='tight')
    plt.close()

    print("\nβ Saved: exp1_gradient_flow.png")

    # Persist raw numbers; JSON object keys must be strings, hence str(depth).
    with open('activation_functions/exp1_gradient_flow.json', 'w') as f:
        json.dump({k: {str(d): v for d, v in data.items()} for k, data in results.items()}, f, indent=2)

    return results
| |
|
| |
|
| | |
| | |
| | |
| |
|
def experiment_2_sparsity_dead_neurons():
    """
    EXPERIMENT 2: How do activation functions affect sparsity and dead neurons?

    Theory:
    - ReLU creates sparse activations (many zeros) - good for efficiency
    - But neurons can "die" (always output 0) - bad for learning
    - Leaky ReLU/ELU prevent dead neurons with small negative slope
    - Sigmoid/Tanh rarely have exactly zero activations

    We measure:
    - Activation sparsity (% of zeros or near-zeros)
    - Dead neuron rate (neurons that never activate across dataset)
    - Activation distribution statistics

    Returns:
        dict: activation name -> {'avg_sparsity', 'layer_sparsity',
        'avg_dead_neurons', 'layer_dead_neurons'}; figures saved under
        activation_functions/.
    """
    print("\n" + "="*80)
    print("EXPERIMENT 2: SPARSITY AND DEAD NEURONS")
    print("="*80)

    activations = ActivationFunctions.get_all()

    # Fixed architecture for every activation, so statistics are comparable.
    depth = 10
    width = 128
    n_samples = 1000

    # Synthetic regression task: y = sin(sum(x)) + Gaussian noise.
    x_data = torch.randn(n_samples, 10)
    y_data = torch.sin(x_data.sum(dim=1, keepdim=True)) + 0.1 * torch.randn(n_samples, 1)

    results = {}
    activation_distributions = {}

    for name, (func, deriv, module) in activations.items():
        print(f"\n--- Testing {name} ---")

        # Small network that records every hidden layer's post-activation
        # output on each forward pass (closure over `module`, `depth`, `width`).
        class NetworkWithHooks(nn.Module):
            def __init__(self):
                super().__init__()
                self.layers = nn.ModuleList()
                self.activations_list = nn.ModuleList()
                # `depth` hidden Linear layers, each followed by an activation,
                # plus one activation-free output Linear.
                for i in range(depth):
                    self.layers.append(nn.Linear(width if i > 0 else 10, width))
                    self.activations_list.append(type(module)() if not isinstance(module, nn.Identity) else nn.Identity())
                self.layers.append(nn.Linear(width, 1))

                self.activation_values = []  # refreshed on every forward()

            def forward(self, x):
                self.activation_values = []
                # zip pairs hidden Linears with their activations;
                # self.layers[-1] is the output head and is applied last.
                for i, (layer, act) in enumerate(zip(self.layers[:-1], self.activations_list)):
                    x = act(layer(x))
                    self.activation_values.append(x.detach().clone())
                return self.layers[-1](x)

        model = NetworkWithHooks()

        # Xavier init + zero bias.
        for m in model.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)

        # Brief full-batch SGD so the measured statistics reflect a trained net.
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

        for epoch in range(100):
            optimizer.zero_grad()
            pred = model(x_data)
            loss = F.mse_loss(pred, y_data)
            loss.backward()
            optimizer.step()

        # Collect activations for the whole dataset without gradients.
        model.eval()
        with torch.no_grad():
            _ = model(x_data)

        layer_sparsity = []
        layer_dead_neurons = []
        all_activations = []

        for layer_idx, acts in enumerate(model.activation_values):
            # Fraction of activation values that are (near-)zero.
            sparsity = (acts.abs() < 1e-6).float().mean().item()
            layer_sparsity.append(sparsity)

            # A neuron is "dead" if it never exceeds the threshold on ANY sample.
            neuron_activity = (acts.abs() > 1e-6).float().sum(dim=0)
            dead_neurons = (neuron_activity == 0).float().mean().item()
            layer_dead_neurons.append(dead_neurons)

            all_activations.extend(acts.flatten().numpy())

        results[name] = {
            'avg_sparsity': np.mean(layer_sparsity),
            'layer_sparsity': layer_sparsity,
            'avg_dead_neurons': np.mean(layer_dead_neurons),
            'layer_dead_neurons': layer_dead_neurons,
        }

        activation_distributions[name] = np.array(all_activations)

        print(f" Avg Sparsity: {results[name]['avg_sparsity']*100:.1f}%")
        print(f" Avg Dead Neurons: {results[name]['avg_dead_neurons']*100:.1f}%")

    # Bar charts: sparsity and dead-neuron rate per activation.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    names = list(results.keys())
    sparsities = [results[n]['avg_sparsity'] * 100 for n in names]
    dead_rates = [results[n]['avg_dead_neurons'] * 100 for n in names]

    colors = plt.cm.Set2(np.linspace(0, 1, len(names)))

    ax1 = axes[0]
    bars1 = ax1.bar(names, sparsities, color=colors)
    ax1.set_ylabel('Sparsity (%)')
    ax1.set_title('Activation Sparsity (% of near-zero activations)')
    # NOTE(review): set_xticklabels without set_xticks can warn on newer
    # matplotlib; warnings are globally suppressed at the top of this file.
    ax1.set_xticklabels(names, rotation=45, ha='right')
    for bar, val in zip(bars1, sparsities):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, f'{val:.1f}%',
        ha='center', va='bottom', fontsize=9)

    ax2 = axes[1]
    bars2 = ax2.bar(names, dead_rates, color=colors)
    ax2.set_ylabel('Dead Neuron Rate (%)')
    ax2.set_title('Dead Neurons (% never activating)')
    ax2.set_xticklabels(names, rotation=45, ha='right')
    for bar, val in zip(bars2, dead_rates):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, f'{val:.1f}%',
        ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.savefig('activation_functions/exp2_sparsity_dead_neurons.png', dpi=150, bbox_inches='tight')
    plt.close()

    # Histogram of post-training activation values, one panel per activation.
    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    axes = axes.flatten()

    for idx, (name, acts) in enumerate(activation_distributions.items()):
        ax = axes[idx]
        # Drop NaN/inf before plotting; clip tails so the histogram is readable.
        acts_clean = acts[np.isfinite(acts)]
        if len(acts_clean) == 0:
            acts_clean = np.array([0.0])
        acts_clipped = np.clip(acts_clean, -5, 5)
        ax.hist(acts_clipped, bins=100, density=True, alpha=0.7, color=colors[idx])
        ax.set_title(f'{name}')
        ax.set_xlabel('Activation Value')
        ax.set_ylabel('Density')
        ax.axvline(x=0, color='red', linestyle='--', alpha=0.5)

        # Annotate with unclipped mean/std.
        ax.text(0.95, 0.95, f'mean={np.nanmean(acts_clean):.2f}\nstd={np.nanstd(acts_clean):.2f}',
        transform=ax.transAxes, ha='right', va='top', fontsize=8,
        bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    plt.suptitle('Activation Value Distributions (after training)', fontsize=14)
    plt.tight_layout()
    plt.savefig('activation_functions/exp2_activation_distributions.png', dpi=150, bbox_inches='tight')
    plt.close()

    print("\nβ Saved: exp2_sparsity_dead_neurons.png")
    print("β Saved: exp2_activation_distributions.png")

    return results
| |
|
| |
|
| | |
| | |
| | |
| |
|
def experiment_3_stability():
    """
    EXPERIMENT 3: How stable is training under stress conditions?

    Theory:
    - Large learning rates can cause gradient explosion
    - Deep networks amplify instability
    - Bounded activations (Sigmoid, Tanh) are more stable but learn slower
    - Unbounded activations (ReLU, GELU) can diverge but learn faster

    We test:
    - Training with increasingly large learning rates
    - Training with increasing depth
    - Measuring loss divergence and gradient explosion

    Returns:
        dict with keys 'lr_results' and 'depth_results'; each maps
        activation name -> swept value -> {'diverged', 'final_loss', ...}.
    """
    print("\n" + "="*80)
    print("EXPERIMENT 3: STABILITY UNDER STRESS")
    print("="*80)

    activations = ActivationFunctions.get_all()

    # --- Test 3a: sweep learning rate at fixed depth (SGD) ---
    print("\n--- Test 3a: Learning Rate Stress ---")
    learning_rates = [0.001, 0.01, 0.1, 0.5, 1.0]
    depth = 10
    width = 64

    # 1-D regression target: sine wave over [-2, 2].
    x_data = torch.linspace(-2, 2, 200).unsqueeze(1)
    y_data = torch.sin(x_data * np.pi)

    lr_results = {name: {} for name in activations}

    for name, (func, deriv, module) in activations.items():
        print(f"\n {name}:")

        for lr in learning_rates:
            # Fresh model for every (activation, lr) trial.
            layers = []
            for i in range(depth):
                layers.append(nn.Linear(width if i > 0 else 1, width))
                layers.append(type(module)() if not isinstance(module, nn.Identity) else nn.Identity())
            layers.append(nn.Linear(width, 1))
            model = nn.Sequential(*layers)

            # Xavier init + zero bias.
            for m in model.modules():
                if isinstance(m, nn.Linear):
                    nn.init.xavier_uniform_(m.weight)
                    nn.init.zeros_(m.bias)

            optimizer = torch.optim.SGD(model.parameters(), lr=lr)

            # Train until finished or divergence is detected.
            losses = []
            diverged = False

            for epoch in range(100):
                optimizer.zero_grad()
                pred = model(x_data)
                loss = F.mse_loss(pred, y_data)

                # NaN/inf or runaway loss counts as divergence.
                if torch.isnan(loss) or torch.isinf(loss) or loss.item() > 1e6:
                    diverged = True
                    break

                losses.append(loss.item())
                loss.backward()

                # Also stop on gradient explosion before applying the step.
                max_grad = max(p.grad.abs().max().item() for p in model.parameters() if p.grad is not None)
                if max_grad > 1e6:
                    diverged = True
                    break

                optimizer.step()

            lr_results[name][lr] = {
                'diverged': diverged,
                # inf when divergence hit on the very first epoch (no loss logged)
                'final_loss': losses[-1] if losses else float('inf'),
                'epochs_completed': len(losses),
            }

            status = "DIVERGED" if diverged else f"loss={losses[-1]:.4f}"
            print(f" lr={lr}: {status}")

    # --- Test 3b: sweep depth at fixed learning rate (Adam) ---
    print("\n--- Test 3b: Depth Stress ---")
    depths = [5, 10, 20, 50, 100]
    lr = 0.01

    depth_results = {name: {} for name in activations}

    for name, (func, deriv, module) in activations.items():
        print(f"\n {name}:")

        for depth in depths:
            # Fresh model per (activation, depth) trial; `width` reused from 3a.
            layers = []
            for i in range(depth):
                layers.append(nn.Linear(width if i > 0 else 1, width))
                layers.append(type(module)() if not isinstance(module, nn.Identity) else nn.Identity())
            layers.append(nn.Linear(width, 1))
            model = nn.Sequential(*layers)

            # Xavier init + zero bias.
            for m in model.modules():
                if isinstance(m, nn.Linear):
                    nn.init.xavier_uniform_(m.weight)
                    nn.init.zeros_(m.bias)

            optimizer = torch.optim.Adam(model.parameters(), lr=lr)

            losses = []
            diverged = False

            for epoch in range(200):
                optimizer.zero_grad()
                pred = model(x_data)
                loss = F.mse_loss(pred, y_data)

                if torch.isnan(loss) or torch.isinf(loss) or loss.item() > 1e6:
                    diverged = True
                    break

                losses.append(loss.item())
                loss.backward()
                optimizer.step()

            depth_results[name][depth] = {
                'diverged': diverged,
                'final_loss': losses[-1] if losses else float('inf'),
                'loss_history': losses,
            }

            status = "DIVERGED" if diverged else f"loss={losses[-1]:.4f}"
            print(f" depth={depth}: {status}")

    # --- Plots: grouped bars (lr sweep) + semilog line plot (depth sweep) ---
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: final loss per (activation, lr), diverged runs capped at 10.
    ax1 = axes[0]
    names = list(lr_results.keys())
    x_pos = np.arange(len(learning_rates))
    width_bar = 0.1

    for idx, name in enumerate(names):
        final_losses = []
        for lr in learning_rates:
            data = lr_results[name][lr]
            if data['diverged']:
                final_losses.append(10)
            else:
                final_losses.append(min(data['final_loss'], 10))

        ax1.bar(x_pos + idx * width_bar, final_losses, width_bar, label=name)

    ax1.set_xlabel('Learning Rate')
    ax1.set_ylabel('Final Loss (capped at 10)')
    ax1.set_title('Stability vs Learning Rate (depth=10)')
    ax1.set_xticks(x_pos + width_bar * len(names) / 2)
    ax1.set_xticklabels([str(lr) for lr in learning_rates])
    ax1.legend(loc='upper left', fontsize=7)
    ax1.set_yscale('log')
    ax1.axhline(y=10, color='red', linestyle='--', label='Diverged')

    # Right panel: final loss vs depth, same capping convention.
    ax2 = axes[1]
    colors = plt.cm.tab10(np.linspace(0, 1, len(names)))

    for idx, name in enumerate(names):
        final_losses = []
        for depth in depths:
            data = depth_results[name][depth]
            if data['diverged']:
                final_losses.append(10)
            else:
                final_losses.append(min(data['final_loss'], 10))

        ax2.semilogy(depths, final_losses, 'o-', label=name, color=colors[idx])

    ax2.set_xlabel('Network Depth')
    ax2.set_ylabel('Final Loss (log scale)')
    ax2.set_title('Stability vs Network Depth (lr=0.01)')
    ax2.legend(loc='upper left', fontsize=7)
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('activation_functions/exp3_stability.png', dpi=150, bbox_inches='tight')
    plt.close()

    print("\nβ Saved: exp3_stability.png")

    return {'lr_results': lr_results, 'depth_results': depth_results}
| |
|
| |
|
| | |
| | |
| | |
| |
|
def experiment_4_representational_capacity():
    """
    EXPERIMENT 4: How well can networks represent different functions?

    Theory:
    - Universal Approximation: Any continuous function can be approximated
      with enough neurons, but activation choice affects efficiency
    - Smooth activations → smoother approximations
    - Piecewise linear (ReLU) → piecewise linear approximations
    - Some functions are easier/harder for certain activations

    We test approximation of:
    - Smooth function: sin(x)
    - Sharp function: |x|
    - Discontinuous-like: step function (smoothed)
    - High-frequency: sin(10x)
    - Polynomial: x^3

    Returns:
        dict: activation name -> target-function label -> held-out MSE;
        heatmap and prediction-overlay figures saved under activation_functions/.
    """
    print("\n" + "="*80)
    print("EXPERIMENT 4: REPRESENTATIONAL CAPACITY")
    print("="*80)

    activations = ActivationFunctions.get_all()

    # Target functions to regress; keys double as plot/heatmap labels.
    # (Label fixed from mojibake 'xΒ³' to 'x³'.)
    target_functions = {
        'sin(x)': lambda x: torch.sin(x),
        '|x|': lambda x: torch.abs(x),
        'step': lambda x: torch.sigmoid(10 * x),
        'sin(10x)': lambda x: torch.sin(10 * x),
        'x³': lambda x: x ** 3,
    }

    depth = 5
    width = 64
    epochs = 500

    results = {name: {} for name in activations}
    predictions = {name: {} for name in activations}

    # Train on a coarse grid, evaluate on a denser grid over the same range.
    x_train = torch.linspace(-2, 2, 200).unsqueeze(1)
    x_test = torch.linspace(-2, 2, 500).unsqueeze(1)

    for func_name, func in target_functions.items():
        print(f"\n--- Target: {func_name} ---")

        y_train = func(x_train)
        y_test = func(x_test)

        for name, (_, _, module) in activations.items():
            # Fresh MLP per (activation, target) pair.
            layers = []
            for i in range(depth):
                layers.append(nn.Linear(width if i > 0 else 1, width))
                layers.append(type(module)() if not isinstance(module, nn.Identity) else nn.Identity())
            layers.append(nn.Linear(width, 1))
            model = nn.Sequential(*layers)

            # Xavier init + zero bias.
            for m in model.modules():
                if isinstance(m, nn.Linear):
                    nn.init.xavier_uniform_(m.weight)
                    nn.init.zeros_(m.bias)

            optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

            # Full-batch Adam regression.
            for epoch in range(epochs):
                optimizer.zero_grad()
                pred = model(x_train)
                loss = F.mse_loss(pred, y_train)
                loss.backward()
                optimizer.step()

            # Held-out MSE on the dense grid.
            model.eval()
            with torch.no_grad():
                pred_test = model(x_test)
                test_loss = F.mse_loss(pred_test, y_test).item()

            results[name][func_name] = test_loss
            predictions[name][func_name] = pred_test.numpy()

            print(f" {name:12s}: MSE = {test_loss:.6f}")

    # Heatmap: log10(MSE) per activation x target function.
    fig, ax = plt.subplots(figsize=(10, 8))

    act_names = list(results.keys())
    func_names = list(target_functions.keys())

    data = np.array([[results[act][func] for func in func_names] for act in act_names])

    # Log scale for color; epsilon guards against log10(0).
    data_log = np.log10(data + 1e-10)

    im = ax.imshow(data_log, cmap='RdYlGn_r', aspect='auto')

    ax.set_xticks(range(len(func_names)))
    ax.set_xticklabels(func_names, rotation=45, ha='right')
    ax.set_yticks(range(len(act_names)))
    ax.set_yticklabels(act_names)

    # Annotate every cell with the raw MSE; white text on dark (high-MSE) cells.
    for i in range(len(act_names)):
        for j in range(len(func_names)):
            text = f'{data[i, j]:.4f}'
            ax.text(j, i, text, ha='center', va='center', fontsize=8,
                    color='white' if data_log[i, j] > -2 else 'black')

    # (Title fixed from mojibake 'Γ' to '×'.)
    ax.set_title('Representational Capacity: MSE by Activation × Target Function\n(lower is better)')
    plt.colorbar(im, label='log10(MSE)')

    plt.tight_layout()
    plt.savefig('activation_functions/exp4_representational_heatmap.png', dpi=150, bbox_inches='tight')
    plt.close()

    # Prediction overlays: one row per target, all activations vs ground truth.
    fig, axes = plt.subplots(len(target_functions), 1, figsize=(12, 3*len(target_functions)))

    colors = plt.cm.tab10(np.linspace(0, 1, len(activations)))
    x_np = x_test.numpy().flatten()

    for idx, (func_name, func) in enumerate(target_functions.items()):
        ax = axes[idx]
        y_true = func(x_test).numpy().flatten()

        ax.plot(x_np, y_true, 'k-', linewidth=3, label='Ground Truth', alpha=0.7)

        for act_idx, name in enumerate(activations.keys()):
            pred = predictions[name][func_name].flatten()
            ax.plot(x_np, pred, '--', color=colors[act_idx], label=name, alpha=0.7, linewidth=1.5)

        ax.set_title(f'Target: {func_name}')
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        ax.legend(loc='best', fontsize=7, ncol=3)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('activation_functions/exp4_predictions.png', dpi=150, bbox_inches='tight')
    plt.close()

    print("\nβ Saved: exp4_representational_heatmap.png")
    print("β Saved: exp4_predictions.png")

    return results
| |
|
| |
|
| | |
| | |
| | |
| |
|
def main():
    """Run all experiments and generate comprehensive report."""

    print("\n" + "="*80)
    print("ACTIVATION FUNCTION COMPREHENSIVE TUTORIAL")
    print("="*80)

    # Run the four experiments in order; each saves its own figures/JSON
    # under activation_functions/ and returns its results dict.
    exp1_results = experiment_1_gradient_flow()
    exp2_results = experiment_2_sparsity_dead_neurons()
    exp3_results = experiment_3_stability()
    exp4_results = experiment_4_representational_capacity()

    # Cross-experiment summary figure.
    generate_summary_figure(exp1_results, exp2_results, exp3_results, exp4_results)

    # Markdown tutorial combining theory with the measured results.
    generate_tutorial_report(exp1_results, exp2_results, exp3_results, exp4_results)

    print("\n" + "="*80)
    print("ALL EXPERIMENTS COMPLETE!")
    print("="*80)
    print("\nGenerated files:")
    print(" - exp1_gradient_flow.png")
    print(" - exp2_sparsity_dead_neurons.png")
    print(" - exp2_activation_distributions.png")
    print(" - exp3_stability.png")
    print(" - exp4_representational_heatmap.png")
    print(" - exp4_predictions.png")
    print(" - summary_figure.png")
    print(" - activation_tutorial.md")
| |
|
| |
|
def generate_summary_figure(exp1, exp2, exp3, exp4):
    """Generate a comprehensive summary figure.

    Args:
        exp1: experiment_1 results (name -> depth -> gradient stats).
        exp2: experiment_2 results (name -> sparsity/dead-neuron stats).
        exp3: experiment_3 results ({'lr_results': ..., 'depth_results': ...}).
        exp4: experiment_4 results (name -> target label -> MSE).
    """
    # 3x3 grid: six result panels on top, full-width text panel at the bottom.
    fig = plt.figure(figsize=(20, 16))
    gs = gridspec.GridSpec(3, 3, figure=fig, hspace=0.3, wspace=0.3)

    activations = list(exp1.keys())
    colors = plt.cm.tab10(np.linspace(0, 1, len(activations)))

    # Panel 1: gradient flow at depth 20 (skipped for any name missing it).
    ax1 = fig.add_subplot(gs[0, 0])
    for (name, data), color in zip(exp1.items(), colors):
        if 20 in data:
            grads = data[20]['grad_magnitudes']
            ax1.semilogy(range(1, len(grads)+1), grads, 'o-', label=name, color=color, markersize=3)
    ax1.set_xlabel('Layer')
    ax1.set_ylabel('Gradient Magnitude')
    ax1.set_title('1. Gradient Flow (depth=20)')
    ax1.legend(fontsize=7)
    ax1.grid(True, alpha=0.3)

    # Panel 2: average activation sparsity per activation.
    ax2 = fig.add_subplot(gs[0, 1])
    sparsities = [exp2[n]['avg_sparsity'] * 100 for n in activations]
    bars = ax2.bar(range(len(activations)), sparsities, color=colors)
    ax2.set_xticks(range(len(activations)))
    ax2.set_xticklabels(activations, rotation=45, ha='right', fontsize=8)
    ax2.set_ylabel('Sparsity (%)')
    ax2.set_title('2. Activation Sparsity')

    # Panel 3: average dead-neuron rate per activation.
    ax3 = fig.add_subplot(gs[0, 2])
    dead_rates = [exp2[n]['avg_dead_neurons'] * 100 for n in activations]
    bars = ax3.bar(range(len(activations)), dead_rates, color=colors)
    ax3.set_xticks(range(len(activations)))
    ax3.set_xticklabels(activations, rotation=45, ha='right', fontsize=8)
    ax3.set_ylabel('Dead Neuron Rate (%)')
    ax3.set_title('3. Dead Neurons')

    # Panel 4: stability vs learning rate.
    # NOTE(review): this list must match experiment_3_stability's sweep.
    ax4 = fig.add_subplot(gs[1, 0])
    learning_rates = [0.001, 0.01, 0.1, 0.5, 1.0]
    for idx, name in enumerate(activations):
        final_losses = []
        for lr in learning_rates:
            data = exp3['lr_results'][name][lr]
            if data['diverged']:
                final_losses.append(10)  # diverged runs capped at 10
            else:
                final_losses.append(min(data['final_loss'], 10))
        ax4.semilogy(learning_rates, final_losses, 'o-', label=name, color=colors[idx], markersize=4)
    ax4.set_xlabel('Learning Rate')
    ax4.set_ylabel('Final Loss')
    ax4.set_title('4. Stability vs Learning Rate')
    ax4.legend(fontsize=6)
    ax4.grid(True, alpha=0.3)

    # Panel 5: stability vs depth (same capping convention).
    # NOTE(review): this list must match experiment_3_stability's depth sweep.
    ax5 = fig.add_subplot(gs[1, 1])
    depths = [5, 10, 20, 50, 100]
    for idx, name in enumerate(activations):
        final_losses = []
        for depth in depths:
            data = exp3['depth_results'][name][depth]
            if data['diverged']:
                final_losses.append(10)
            else:
                final_losses.append(min(data['final_loss'], 10))
        ax5.semilogy(depths, final_losses, 'o-', label=name, color=colors[idx], markersize=4)
    ax5.set_xlabel('Network Depth')
    ax5.set_ylabel('Final Loss')
    ax5.set_title('5. Stability vs Depth')
    ax5.legend(fontsize=6)
    ax5.grid(True, alpha=0.3)

    # Panel 6: representational-capacity heatmap (log10 MSE).
    ax6 = fig.add_subplot(gs[1, 2])
    func_names = list(exp4[activations[0]].keys())
    data = np.array([[exp4[act][func] for func in func_names] for act in activations])
    data_log = np.log10(data + 1e-10)  # epsilon guards against log10(0)
    im = ax6.imshow(data_log, cmap='RdYlGn_r', aspect='auto')
    ax6.set_xticks(range(len(func_names)))
    ax6.set_xticklabels(func_names, rotation=45, ha='right', fontsize=8)
    ax6.set_yticks(range(len(activations)))
    ax6.set_yticklabels(activations, fontsize=8)
    ax6.set_title('6. Representational Capacity (log MSE)')
    plt.colorbar(im, ax=ax6, shrink=0.8)

    # Bottom row: free-text summary of the findings, rendered in monospace.
    ax7 = fig.add_subplot(gs[2, :])
    ax7.axis('off')

    insights_text = """
KEY INSIGHTS FROM EXPERIMENTS
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ

1. GRADIENT FLOW:
β’ Sigmoid/Tanh suffer severe vanishing gradients in deep networks (gradients shrink exponentially)
β’ ReLU maintains gradient magnitude but can have zero gradients (dead neurons)
β’ GELU/Swish provide smooth, well-behaved gradient flow

2. SPARSITY & DEAD NEURONS:
β’ ReLU creates highly sparse activations (~50% zeros) - good for efficiency, bad if neurons die
β’ Leaky ReLU/ELU prevent dead neurons while maintaining some sparsity
β’ Sigmoid/Tanh rarely have exact zeros but can saturate

3. STABILITY:
β’ Bounded activations (Sigmoid, Tanh) are more stable but learn slower
β’ ReLU can diverge with large learning rates or deep networks
β’ Modern activations (GELU, Swish) offer good stability-performance tradeoff

4. REPRESENTATIONAL CAPACITY:
β’ All activations can approximate smooth functions well (Universal Approximation)
β’ ReLU excels at sharp/piecewise functions (|x|)
β’ Smooth activations (GELU, Swish) better for smooth targets
β’ High-frequency functions are challenging for all activations

RECOMMENDATIONS:
β’ Default choice: ReLU or LeakyReLU (simple, fast, effective)
β’ For transformers/attention: GELU (standard in BERT, GPT)
β’ For very deep networks: LeakyReLU, ELU, or use residual connections
β’ Avoid: Sigmoid/Tanh in hidden layers of deep networks
"""

    ax7.text(0.5, 0.5, insights_text, transform=ax7.transAxes, fontsize=10,
    verticalalignment='center', horizontalalignment='center',
    fontfamily='monospace',
    bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))

    plt.suptitle('Comprehensive Activation Function Analysis', fontsize=16, fontweight='bold')
    plt.savefig('activation_functions/summary_figure.png', dpi=150, bbox_inches='tight')
    plt.close()

    print("\nβ Saved: summary_figure.png")
| |
|
| |
|
def generate_tutorial_report(exp1, exp2, exp3, exp4):
    """Render the four experiments' results into one markdown tutorial file.

    Parameters
    ----------
    exp1 : dict
        Gradient-flow results keyed by activation name:
        ``{name: {depth: {'grad_ratio': float, ...}}}``.
        Only depth 20 is tabulated here (guarded by ``if 20 in exp1[name]``).
    exp2 : dict
        Sparsity results: ``{name: {'avg_sparsity': float,
        'avg_dead_neurons': float}}`` — presumably fractions in [0, 1],
        since they are scaled by 100 for display (TODO confirm at caller).
    exp3 : dict
        Stability results. Not read by this function; the stability section
        only embeds the pre-rendered ``exp3_stability.png`` figure.
        Kept in the signature for call-site compatibility.
    exp4 : dict
        Representational-capacity results:
        ``{name: {target_function_name: test_mse}}``. All activations are
        assumed to share the same target-function keys as the first one.

    Writes ``activation_functions/activation_tutorial.md`` (the directory
    is created at module import time) and returns ``None``.
    """
    # NOTE(review): the long literals below contain mojibake (e.g. "β’"
    # where a bullet was intended, "βββ" runs where box-drawing lines were).
    # The original glyphs were lost in an earlier encoding round-trip, so
    # they are preserved verbatim; restoring them should be a separate pass.

    # The activation list drives every table; exp2/exp4 must share its keys.
    activations = list(exp1.keys())

    # --- Static front matter: TOC, introduction, theory, experiment 1 ---
    report = """# Comprehensive Tutorial: Activation Functions in Deep Learning

## Table of Contents
1. [Introduction](#introduction)
2. [Theoretical Background](#theoretical-background)
3. [Experiment 1: Gradient Flow](#experiment-1-gradient-flow)
4. [Experiment 2: Sparsity and Dead Neurons](#experiment-2-sparsity-and-dead-neurons)
5. [Experiment 3: Training Stability](#experiment-3-training-stability)
6. [Experiment 4: Representational Capacity](#experiment-4-representational-capacity)
7. [Summary and Recommendations](#summary-and-recommendations)

---

## Introduction

Activation functions are a critical component of neural networks that introduce non-linearity, enabling networks to learn complex patterns. This tutorial provides both **theoretical explanations** and **empirical experiments** to understand how different activation functions affect:

1. **Gradient Flow**: Do gradients vanish or explode during backpropagation?
2. **Sparsity & Dead Neurons**: How easily do units turn on/off?
3. **Stability**: How robust is training under stress (large learning rates, deep networks)?
4. **Representational Capacity**: How well can the network approximate different functions?

### Activation Functions Studied

| Function | Formula | Range | Key Property |
|----------|---------|-------|--------------|
| Linear | f(x) = x | (-β, β) | No non-linearity |
| Sigmoid | f(x) = 1/(1+eβ»Λ£) | (0, 1) | Bounded, saturates |
| Tanh | f(x) = (eΛ£-eβ»Λ£)/(eΛ£+eβ»Λ£) | (-1, 1) | Zero-centered, saturates |
| ReLU | f(x) = max(0, x) | [0, β) | Sparse, can die |
| Leaky ReLU | f(x) = max(Ξ±x, x) | (-β, β) | Prevents dead neurons |
| ELU | f(x) = x if x>0, Ξ±(eΛ£-1) otherwise | (-Ξ±, β) | Smooth negative region |
| GELU | f(x) = xΒ·Ξ¦(x) | β(-0.17, β) | Smooth, probabilistic |
| Swish | f(x) = xΒ·Ο(x) | β(-0.28, β) | Self-gated |

---

## Theoretical Background

### Why Non-linearity Matters

Without activation functions, a neural network of any depth is equivalent to a single linear transformation:

```
f(x) = Wβ Γ Wβββ Γ ... Γ Wβ Γ x = W_combined Γ x
```

Non-linear activations allow networks to approximate **any continuous function** (Universal Approximation Theorem).

### The Gradient Flow Problem

During backpropagation, gradients flow through the chain rule:

```
βL/βWα΅’ = βL/βaβ Γ βaβ/βaβββ Γ ... Γ βaα΅’ββ/βaα΅’ Γ βaα΅’/βWα΅’
```

Each layer contributes a factor of **Ο'(z) Γ W**, where Ο' is the activation derivative.

**Vanishing Gradients**: When |Ο'(z)| < 1 repeatedly
- Sigmoid: Ο'(z) β (0, 0.25], maximum at z=0
- For n layers: gradient β (0.25)βΏ β 0 as n β β

**Exploding Gradients**: When |Ο'(z) Γ W| > 1 repeatedly
- More common with unbounded activations
- Mitigated by gradient clipping, proper initialization

---

## Experiment 1: Gradient Flow

### Question
How do gradients propagate through deep networks with different activations?

### Method
- Built networks with depths [5, 10, 20, 50]
- Measured gradient magnitude at each layer during backpropagation
- Used Xavier initialization for fair comparison

### Results

![Gradient Flow](exp1_gradient_flow.png)

"""

    # --- Dynamic table: gradient decay at depth 20.  Ratio is
    # layer-10 / layer-1 gradient magnitude, so a LARGE ratio means the
    # early-layer gradient is comparatively tiny (vanishing).
    report += "#### Gradient Ratio (Layer 10 / Layer 1) at Depth=20\n\n"
    report += "| Activation | Gradient Ratio | Interpretation |\n"
    report += "|------------|----------------|----------------|\n"

    for name in activations:
        if 20 in exp1[name]:
            ratio = exp1[name][20]['grad_ratio']
            # Thresholds map the ratio onto a human-readable verdict.
            if ratio > 1e6:
                interp = "Severe vanishing gradients"
            elif ratio > 100:
                interp = "Significant gradient decay"
            elif ratio > 10:
                interp = "Moderate gradient decay"
            elif ratio > 0.1:
                interp = "Stable gradient flow"
            else:
                # ratio << 1: layer-1 gradient exceeds layer-10 gradient.
                interp = "Gradient amplification"
            report += f"| {name} | {ratio:.2e} | {interp} |\n"

    report += """
### Theoretical Explanation

**Sigmoid** shows the most severe gradient decay because:
- Maximum derivative is only 0.25 (at z=0)
- In deep networks: 0.25Β²β° β 10β»ΒΉΒ² (effectively zero!)

**ReLU** maintains gradients better because:
- Derivative is exactly 1 for positive inputs
- But can be exactly 0 for negative inputs (dead neurons)

**GELU/Swish** provide smooth gradient flow:
- Derivatives are bounded but not as severely as Sigmoid
- Smooth transitions prevent sudden gradient changes

---

## Experiment 2: Sparsity and Dead Neurons

### Question
How do activations affect the sparsity of representations and the "death" of neurons?

### Method
- Trained 10-layer networks with high learning rate (0.1) to stress-test
- Measured activation sparsity (% of near-zero activations)
- Measured dead neuron rate (neurons that never activate)

### Results

![Sparsity](exp2_sparsity_dead_neurons.png)

"""

    # --- Dynamic table: sparsity / dead-neuron rates (fractions -> %) ---
    report += "| Activation | Sparsity (%) | Dead Neurons (%) |\n"
    report += "|------------|--------------|------------------|\n"

    for name in activations:
        sparsity = exp2[name]['avg_sparsity'] * 100
        dead = exp2[name]['avg_dead_neurons'] * 100
        report += f"| {name} | {sparsity:.1f}% | {dead:.1f}% |\n"

    report += """
### Theoretical Explanation

**ReLU creates sparse representations**:
- Any negative input β output is exactly 0
- ~50% sparsity is typical with zero-mean inputs
- Sparsity can be beneficial (efficiency, regularization)

**Dead Neuron Problem**:
- If a ReLU neuron's input is always negative, it outputs 0 forever
- Gradient is 0, so weights never update
- Caused by: bad initialization, large learning rates, unlucky gradients

**Solutions**:
- **Leaky ReLU**: Small gradient (0.01) for negative inputs
- **ELU**: Smooth negative region with non-zero gradient
- **Proper initialization**: Keep activations in a good range

---

## Experiment 3: Training Stability

### Question
How stable is training under stress conditions (large learning rates, deep networks)?

### Method
- Tested learning rates: [0.001, 0.01, 0.1, 0.5, 1.0]
- Tested depths: [5, 10, 20, 50, 100]
- Measured whether training diverged (loss β β)

### Results

![Stability](exp3_stability.png)

### Key Observations

**Learning Rate Stability**:
- Sigmoid/Tanh: Most stable (bounded outputs prevent explosion)
- ReLU: Can diverge at high learning rates
- GELU/Swish: Good balance of stability and performance

**Depth Stability**:
- All activations struggle with depth > 50 without special techniques
- Sigmoid fails earliest due to vanishing gradients
- ReLU/LeakyReLU maintain trainability longer

### Theoretical Explanation

**Why bounded activations are more stable**:
- Sigmoid outputs β (0, 1), so activations can't explode
- But gradients can vanish, making learning very slow

**Why ReLU can be unstable**:
- Unbounded outputs: large inputs β large outputs β larger gradients
- Positive feedback loop can cause explosion

**Modern solutions**:
- Batch Normalization: Keeps activations in good range
- Residual Connections: Allow gradients to bypass layers
- Gradient Clipping: Prevents explosion

---

## Experiment 4: Representational Capacity

### Question
How well can networks with different activations approximate various functions?

### Method
- Target functions: sin(x), |x|, step, sin(10x), xΒ³
- 5-layer networks, 500 epochs training
- Measured test MSE

### Results

![Heatmap](exp4_representational_heatmap.png)

![Predictions](exp4_predictions.png)

"""

    # --- Dynamic table: test MSE per (activation, target function).
    # Column set is taken from the first activation's result dict.
    report += "#### Test MSE by Activation Γ Target Function\n\n"
    func_names = list(exp4[activations[0]].keys())

    report += "| Activation | " + " | ".join(func_names) + " |\n"
    report += "|------------|" + "|".join(["------" for _ in func_names]) + "|\n"

    for name in activations:
        values = [f"{exp4[name][f]:.4f}" for f in func_names]
        report += f"| {name} | " + " | ".join(values) + " |\n"

    # --- Static back matter: exp-4 theory, summary, decision guide,
    # generated-file index, references, footer ---
    report += """
### Theoretical Explanation

**Universal Approximation Theorem**:
- Any continuous function can be approximated with enough neurons
- But different activations have different "inductive biases"

**ReLU excels at piecewise functions** (like |x|):
- ReLU networks compute piecewise linear functions
- Perfect match for |x| which is piecewise linear

**Smooth activations for smooth functions**:
- GELU, Swish produce smoother decision boundaries
- Better for smooth targets like sin(x)

**High-frequency functions are hard**:
- sin(10x) has 10 oscillations in [-2, 2]
- Requires many neurons to capture all oscillations
- All activations struggle without sufficient width

---

## Summary and Recommendations

### Comparison Table

| Property | Best Activations | Worst Activations |
|----------|------------------|-------------------|
| Gradient Flow | LeakyReLU, GELU | Sigmoid, Tanh |
| Avoids Dead Neurons | LeakyReLU, ELU, GELU | ReLU |
| Training Stability | Sigmoid, Tanh, GELU | ReLU (high lr) |
| Smooth Functions | GELU, Swish, Tanh | ReLU |
| Sharp Functions | ReLU, LeakyReLU | Sigmoid |
| Computational Speed | ReLU, LeakyReLU | GELU, Swish |

### Practical Recommendations

1. **Default Choice**: **ReLU** or **LeakyReLU**
   - Simple, fast, effective for most tasks
   - Use LeakyReLU if dead neurons are a concern

2. **For Transformers/Attention**: **GELU**
   - Standard in BERT, GPT, modern transformers
   - Smooth gradients help with optimization

3. **For Very Deep Networks**: **LeakyReLU** or **ELU**
   - Or use residual connections + batch normalization
   - Avoid Sigmoid/Tanh in hidden layers

4. **For Regression with Bounded Outputs**: **Sigmoid** (output layer only)
   - Use for probabilities or [0, 1] outputs
   - Never in hidden layers of deep networks

5. **For RNNs/LSTMs**: **Tanh** (traditional choice)
   - Zero-centered helps with recurrent dynamics
   - Modern alternative: use Transformers instead

### The Big Picture

```
ACTIVATION FUNCTION SELECTION GUIDE

βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β Is it a hidden layer? β
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
β
βββββββββββββββββ΄ββββββββββββββββ
βΌ βΌ
YES NO (output layer)
β β
βΌ βΌ
βββββββββββββββββββ βββββββββββββββββββββββ
β Is it a β β What's the task? β
β Transformer? β β β
βββββββββββββββββββ β Binary class β Sigmoid
β β Multi-class β Softmax
βββββββββ΄ββββββββ β Regression β Linear β
βΌ βΌ βββββββββββββββββββββββ
YES NO
β β
βΌ βΌ
GELU βββββββββββββββββββ
β Worried about β
β dead neurons? β
βββββββββββββββββββ
β
βββββββββ΄ββββββββ
βΌ βΌ
YES NO
β β
βΌ βΌ
LeakyReLU ReLU
or ELU
```

---

## Files Generated

| File | Description |
|------|-------------|
| exp1_gradient_flow.png | Gradient magnitude across layers |
| exp2_sparsity_dead_neurons.png | Sparsity and dead neuron rates |
| exp2_activation_distributions.png | Activation value distributions |
| exp3_stability.png | Stability vs learning rate and depth |
| exp4_representational_heatmap.png | MSE heatmap for different targets |
| exp4_predictions.png | Actual predictions vs ground truth |
| summary_figure.png | Comprehensive summary visualization |

---

## References

1. Glorot, X., & Bengio, Y. (2010). Understanding the difficulty of training deep feedforward neural networks.
2. He, K., et al. (2015). Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification.
3. Hendrycks, D., & Gimpel, K. (2016). Gaussian Error Linear Units (GELUs).
4. Ramachandran, P., et al. (2017). Searching for Activation Functions.
5. Nwankpa, C., et al. (2018). Activation Functions: Comparison of trends in Practice and Research for Deep Learning.

---

*Tutorial generated by Orchestra Research Assistant*
*All experiments are reproducible with the provided code*
"""

    # FIX: explicit encoding. The report contains non-ASCII characters, and
    # the locale-default encoding (e.g. cp1252 on Windows) would raise
    # UnicodeEncodeError on write; UTF-8 always succeeds.
    with open('activation_functions/activation_tutorial.md', 'w', encoding='utf-8') as f:
        f.write(report)

    print("\nβ Saved: activation_tutorial.md")
| |
|
| |
|
if __name__ == "__main__":
    # Script entry point: execute main() only when run directly, not on
    # import (main() is defined elsewhere in this file).
    main()
| |
|