ディープラーニング逆伝播（バックプロパゲーション）完全ガイド – 仕組みから実装まで徹底解説

「逆伝播って聞くけど、何をしているの？」「数式が難しくて理解できない…」「どうやって実装すればいいの？」

逆伝播（Backpropagation）は、ディープラーニングの心臓部とも言える重要なアルゴリズムです。これがなければ、ニューラルネットワークは学習することができません。

でも、その仕組みは意外とシンプル。「間違いを遡って、各パラメータの責任を明らかにする」という考え方です。

この記事では、逆伝播について、直感的な理解から数学的な詳細、そして実装まで、段階的に分かりやすく解説していきます。

逆伝播とは？直感的な理解

レストランの例で理解する

逆伝播を理解するために、レストランの例を考えてみましょう。

状況： 料理の味が悪く、お客様から「塩辛い」というクレームが来た

原因究明の流れ（逆伝播）：

最終的な料理が塩辛い（出力の誤差）
シェフの味付けを確認（最後の層）
下ごしらえ担当の塩加減を確認（中間層）
仕入れた材料の塩分を確認（入力層）

各担当者の「責任の度合い」を計算して、次回は各自がどれだけ調整すべきかを決める。これが逆伝播の考え方です。

ニューラルネットワークでの逆伝播

import numpy as np
import matplotlib.pyplot as plt

class SimpleNeuralNetwork:
    """Tiny 2-3-1 sigmoid network for illustrating backpropagation.

    Layout: 2 inputs -> 3 hidden units -> 1 output, sigmoid after each
    affine step. The forward pass caches every intermediate value so the
    backward pass can reuse it.
    """

    def __init__(self):
        # Weights are small random values; biases start at zero.
        # (Draw order w1 then w2 matters for seeded reproducibility.)
        self.w1 = 0.5 * np.random.randn(2, 3)  # input -> hidden weights
        self.b1 = np.zeros((1, 3))             # hidden bias
        self.w2 = 0.5 * np.random.randn(3, 1)  # hidden -> output weights
        self.b2 = np.zeros((1, 1))             # output bias

        # Forward-pass cache (pre-activations z*, activations a*).
        self.z1 = self.a1 = None
        self.z2 = self.a2 = None

    def forward(self, X):
        """Run the forward pass on ``X`` and return the output activation.

        Stores ``z1``, ``a1``, ``z2`` and ``a2`` on the instance for use
        by :meth:`backward`.
        """
        hidden_pre = np.dot(X, self.w1) + self.b1
        hidden_act = self.sigmoid(hidden_pre)
        self.z1, self.a1 = hidden_pre, hidden_act

        output_pre = np.dot(hidden_act, self.w2) + self.b2
        output_act = self.sigmoid(output_pre)
        self.z2, self.a2 = output_pre, output_act

        return output_act

    def backward(self, X, y, learning_rate=0.1):
        """Backpropagate the error and take one gradient-descent step.

        Uses the values cached by the latest :meth:`forward` call, so
        ``forward(X)`` must have been run first.

        Returns:
            The tuple ``(dw1, db1, dw2, db2)`` of batch-averaged gradients
            that were applied to the parameters.
        """
        inv_m = 1 / X.shape[0]  # 1 / number of samples

        # Output-layer error signal: prediction minus target.
        delta_out = self.a2 - y
        # Push the error back through w2 and the hidden sigmoid.
        # Must use w2 *before* it is updated below.
        delta_hidden = np.dot(delta_out, self.w2.T) * self.sigmoid_derivative(self.z1)

        dw2 = inv_m * np.dot(self.a1.T, delta_out)
        db2 = inv_m * np.sum(delta_out, axis=0, keepdims=True)
        dw1 = inv_m * np.dot(X.T, delta_hidden)
        db1 = inv_m * np.sum(delta_hidden, axis=0, keepdims=True)

        # In-place gradient-descent update of every parameter.
        self.w2 -= learning_rate * dw2
        self.b2 -= learning_rate * db2
        self.w1 -= learning_rate * dw1
        self.b1 -= learning_rate * db1

        return dw1, db1, dw2, db2

    def sigmoid(self, x):
        """Logistic function 1 / (1 + e^-x), applied element-wise."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        """Derivative of the sigmoid at pre-activation ``x``: s * (1 - s)."""
        s = self.sigmoid(x)
        return s * (1 - s)

連鎖律：逆伝播の数学的基礎

連鎖律とは

逆伝播の核心は「連鎖律（Chain Rule）」という微分の法則です。

基本的な連鎖律：

もし z = f(g(x)) なら
dz/dx = (dz/dg) × (dg/dx)

つまり、「最終的な結果への影響」は「各段階の影響の掛け算」で表されます。

具体例で理解する連鎖律

def chain_rule_example():
    """Print a worked chain-rule example for z = (x + 2)^2 at x = 3.

    The function is treated as the composition z = u^2 with u = x + 2.
    The forward pass evaluates u and z; the backward pass multiplies the
    two local derivatives to obtain dz/dx.
    """
    x = 3.0

    # Forward pass through both stages of the composition.
    inner = x + 2        # u = x + 2  -> 5
    z = inner ** 2       # z = u^2    -> 25

    # Backward pass: (dz/du, du/dx) = (2u, 1) = (10, 1).
    local_derivatives = (2 * inner, 1)
    dz_dx = local_derivatives[0] * local_derivatives[1]

    for line in (f"x = {x}", f"z = {z}", f"dz/dx = {dz_dx}"):
        print(line)

    # Sanity check by direct differentiation:
    # expanding gives z = x^2 + 4x + 4, so dz/dx = 2x + 4 = 2*3 + 4 = 10.


chain_rule_example()

ニューラルネットワークでの連鎖律

class ChainRuleVisualization:
    """Trace the chain rule through a scalar two-layer network.

    The network evaluated by :meth:`simple_network` is::

        x -> z1 = w1*x + b1 -> h = ReLU(z1) -> z2 = w2*h + b2 -> y = z2
    """

    def __init__(self):
        # Named values of the most recent forward pass. Kept as a dict from
        # the start so its type matches how every method uses it; it stays
        # empty until simple_network() has been called.
        self.history = {}

    def simple_network(self, x, w1, w2, b1, b2):
        """Run the forward pass on scalars and record every intermediate.

        Note the parameter order: both weights come before both biases.

        Returns:
            The network output ``y`` (equal to ``z2``; no output activation).
        """
        # Forward pass
        z1 = w1 * x + b1
        h = max(0, z1)  # ReLU
        z2 = w2 * h + b2
        y = z2

        # Remember every stage so compute_gradients() can reuse it.
        self.history = {
            'x': x, 'w1': w1, 'b1': b1,
            'z1': z1, 'h': h,
            'w2': w2, 'b2': b2,
            'z2': z2, 'y': y
        }

        return y

    def compute_gradients(self, target):
        """Apply the chain rule to the recorded forward pass.

        The loss is ``L = (y - target)^2 / 2``.

        Returns:
            Dict with keys ``dL_dw1``, ``dL_db1``, ``dL_dw2``, ``dL_db2``.

        Raises:
            RuntimeError: if :meth:`simple_network` has not been called yet.
        """
        if not self.history:
            raise RuntimeError(
                "compute_gradients() requires a prior call to simple_network()"
            )

        y = self.history['y']
        h = self.history['h']
        z1 = self.history['z1']
        x = self.history['x']
        w2 = self.history['w2']

        # Output side: dL/dy, then through the identity output y = z2.
        dL_dy = y - target                    # dL/dy
        dy_dz2 = 1                            # dy/dz2
        dL_dz2 = dL_dy * dy_dz2               # dL/dz2

        # Second-layer parameters.
        dz2_dw2 = h                           # dz2/dw2
        dL_dw2 = dL_dz2 * dz2_dw2             # dL/dw2

        dz2_db2 = 1                           # dz2/db2
        dL_db2 = dL_dz2 * dz2_db2             # dL/db2

        # Propagate to the hidden activation and through the ReLU.
        dz2_dh = w2                           # dz2/dh
        dL_dh = dL_dz2 * dz2_dh               # dL/dh

        dh_dz1 = 1 if z1 > 0 else 0           # ReLU derivative (0 at z1 <= 0)
        dL_dz1 = dL_dh * dh_dz1               # dL/dz1

        # First-layer parameters.
        dz1_dw1 = x                           # dz1/dw1
        dL_dw1 = dL_dz1 * dz1_dw1             # dL/dw1

        dz1_db1 = 1                           # dz1/db1
        dL_db1 = dL_dz1 * dz1_db1             # dL/db1

        return {
            'dL_dw1': dL_dw1, 'dL_db1': dL_db1,
            'dL_dw2': dL_dw2, 'dL_db2': dL_db2
        }

    def visualize_chain_rule(self):
        """Print the forward values and the gradients for a fixed example."""
        # Small fixed example
        x = 2.0
        w1, b1 = 1.5, 0.5
        w2, b2 = -0.8, 0.3
        target = 1.0

        y = self.simple_network(x, w1, w2, b1, b2)
        grads = self.compute_gradients(target)

        print("順伝播の流れ:")
        print(f"  x={x:.2f} → z1={self.history['z1']:.2f} → h={self.history['h']:.2f}")
        print(f"  → z2={self.history['z2']:.2f} → y={y:.2f}")
        print(f"\n目標値: {target:.2f}")
        print(f"誤差: {y - target:.2f}")
        print("\n逆伝播で計算された勾配:")
        for key, value in grads.items():
            print(f"  {key}: {value:.4f}")


# Run the demonstration
viz = ChainRuleVisualization()
viz.visualize_chain_rule()

実装：ステップバイステップ

全結合層の逆伝播

class FullyConnectedLayer:
    """Fully connected (affine) layer: ``output = X @ W + b``.

    ``forward`` caches its input so that ``backward`` can compute the
    parameter gradients; ``update_params`` applies one gradient-descent step.
    """

    def __init__(self, input_size, output_size):
        # He initialisation (std = sqrt(2 / fan_in)), the usual choice for
        # ReLU networks. Xavier/Glorot would scale by 1 / fan_in or
        # 2 / (fan_in + fan_out) instead.
        self.W = np.random.randn(input_size, output_size) * np.sqrt(2.0 / input_size)
        self.b = np.zeros((1, output_size))

        # Gradients from the most recent backward() call
        self.dW = None
        self.db = None

        # Cached forward input/output (the input is needed by backward)
        self.input = None
        self.output = None

    def forward(self, X):
        """Compute ``X @ W + b`` and cache ``X`` for the backward pass."""
        self.input = X
        self.output = np.dot(X, self.W) + self.b
        return self.output

    def backward(self, grad_output):
        """Backpropagate ``grad_output`` (dL/d output) through the layer.

        Stores dL/dW in ``self.dW`` and dL/db in ``self.db``.

        ``grad_output`` is taken as-is: a loss that is a batch mean is
        expected to have folded its ``1 / batch_size`` factor into the
        gradient it returns (the loss functions in this file do). Dividing
        by the batch size again here would shrink the gradients by an extra
        factor of ``batch_size`` and make them disagree with a numerical
        gradient check.

        Returns:
            dL/d input, to be passed to the previous layer.
        """
        # Parameter gradients: plain chain rule, no extra normalisation.
        self.dW = np.dot(self.input.T, grad_output)
        self.db = np.sum(grad_output, axis=0, keepdims=True)

        # Gradient w.r.t. the input (handed to the previous layer)
        grad_input = np.dot(grad_output, self.W.T)

        return grad_input

    def update_params(self, learning_rate):
        """Apply one in-place gradient-descent step using ``dW`` and ``db``."""
        self.W -= learning_rate * self.dW
        self.b -= learning_rate * self.db

活性化関数の逆伝播

class ActivationLayers:
    """Namespace of activation layers, each with ``forward`` and ``backward``.

    Every layer caches what it needs during ``forward`` (a mask or its
    output) and uses it in ``backward`` to turn dL/d output into dL/d input.
    """

    class ReLU:
        """Rectified linear unit: ``max(0, x)`` element-wise."""

        def __init__(self):
            self.mask = None

        def forward(self, x):
            # Boolean mask of the positions that pass through unchanged.
            self.mask = (x > 0)
            return x * self.mask

        def backward(self, grad_output):
            # Derivative is 1 where x > 0 and 0 elsewhere.
            return grad_output * self.mask

    class Sigmoid:
        """Logistic sigmoid ``1 / (1 + e^-x)``."""

        def __init__(self):
            self.output = None

        def forward(self, x):
            # Clip the argument so np.exp cannot overflow.
            self.output = 1 / (1 + np.exp(-np.clip(x, -500, 500)))
            return self.output

        def backward(self, grad_output):
            # d sigmoid / dx = s * (1 - s), expressed via the cached output.
            return grad_output * self.output * (1 - self.output)

    class Tanh:
        """Hyperbolic tangent."""

        def __init__(self):
            self.output = None

        def forward(self, x):
            self.output = np.tanh(x)
            return self.output

        def backward(self, grad_output):
            # d tanh / dx = 1 - tanh(x)^2
            return grad_output * (1 - self.output ** 2)

    class Softmax:
        """Row-wise softmax over axis 1."""

        def __init__(self):
            self.output = None

        def forward(self, x):
            # Subtract the row maximum before exponentiating to avoid overflow.
            x_exp = np.exp(x - np.max(x, axis=1, keepdims=True))
            self.output = x_exp / np.sum(x_exp, axis=1, keepdims=True)
            return self.output

        def backward(self, grad_output):
            """Multiply ``grad_output`` by the softmax Jacobian, row by row.

            For one row with output ``s`` the Jacobian is
            ``diag(s) - s s^T``, so ``J @ g = s * (g - sum(s * g))``.
            Evaluating that closed form for all rows at once avoids building
            one K x K matrix per sample in a Python loop. The result takes
            its dtype from the floating-point softmax output, so an
            integer-valued ``grad_output`` is no longer truncated.
            """
            s = self.output
            weighted_sum = np.sum(grad_output * s, axis=1, keepdims=True)
            return s * (grad_output - weighted_sum)

損失関数の逆伝播

class LossFunctions:
    """Loss functions that return ``(loss, grad)`` pairs.

    The loss is always a mean, and the returned gradient already contains
    the matching averaging factor.
    """

    @staticmethod
    def mse_loss(y_pred, y_true):
        """Mean squared error and its gradient with respect to ``y_pred``.

        The loss averages over every element, so the gradient is divided by
        the total element count. Dividing by the number of rows only would
        be too large by a factor equal to the number of columns for
        multi-column targets.
        """
        diff = y_pred - y_true
        loss = np.mean(diff ** 2)
        grad = 2 * diff / diff.size
        return loss, grad

    @staticmethod
    def cross_entropy_loss(y_pred, y_true):
        """Cross-entropy for softmax outputs.

        ``y_true`` is either one-hot (2-D) or a 1-D array of integer class
        indices.

        Returns:
            ``(loss, grad)`` where ``grad = (y_pred - y_true) / batch_size``.
            That is the gradient with respect to the *pre-softmax logits*
            (softmax and cross-entropy combined), not with respect to
            ``y_pred``, so callers must not backpropagate it through the
            softmax again.
        """
        batch_size = y_true.shape[0]

        # Clip to keep log() finite.
        y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)

        if len(y_true.shape) == 2:
            # One-hot targets
            loss = -np.sum(y_true * np.log(y_pred)) / batch_size
            grad = (y_pred - y_true) / batch_size
        else:
            # Integer class indices
            rows = np.arange(batch_size)
            loss = -np.mean(np.log(y_pred[rows, y_true]))
            grad = y_pred.copy()
            grad[rows, y_true] -= 1
            grad /= batch_size

        return loss, grad

    @staticmethod
    def binary_cross_entropy(y_pred, y_true):
        """Binary cross-entropy and its gradient with respect to ``y_pred``.

        As with ``mse_loss``, the loss is a mean over every element, so the
        gradient is divided by the total element count.
        """
        y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)

        loss = -np.mean(
            y_true * np.log(y_pred) +
            (1 - y_true) * np.log(1 - y_pred)
        )

        diff = y_pred - y_true
        grad = diff / (y_pred * (1 - y_pred)) / diff.size

        return loss, grad

完全なニューラルネットワークの実装

逆伝播を含む完全なネットワーク

class NeuralNetwork:
    """Multi-layer perceptron with a softmax output, trained by backprop.

    The network is a stack of ``FullyConnectedLayer`` objects with the
    chosen activation between them and a softmax after the last layer.
    """

    def __init__(self, layer_sizes, activation='relu'):
        """Build the layer stack.

        Args:
            layer_sizes: sizes ``[input, hidden1, hidden2, ..., output]``.
            activation: hidden activation, one of ``'relu'``, ``'sigmoid'``
                or ``'tanh'``.

        Raises:
            ValueError: if hidden layers exist and ``activation`` is not a
                supported name. Without this check no activation objects
                would be created and ``forward`` would later fail with an
                ``IndexError``.
        """
        activation_classes = {
            'relu': ActivationLayers.ReLU,
            'sigmoid': ActivationLayers.Sigmoid,
            'tanh': ActivationLayers.Tanh,
        }
        n_affine = len(layer_sizes) - 1
        if n_affine > 1 and activation not in activation_classes:
            raise ValueError(
                f"Unknown activation {activation!r}; "
                f"expected one of {sorted(activation_classes)}"
            )

        self.layers = []
        self.activations = []

        for i in range(n_affine):
            self.layers.append(
                FullyConnectedLayer(layer_sizes[i], layer_sizes[i + 1])
            )
            # Every affine layer except the last is followed by an activation.
            if i < n_affine - 1:
                self.activations.append(activation_classes[activation]())

        # Output activation (classification)
        self.output_activation = ActivationLayers.Softmax()

    def forward(self, X):
        """Run the forward pass and return the softmax class probabilities.

        Intermediate outputs are recorded in ``self.layer_outputs``.
        """
        self.layer_outputs = []
        output = X
        last = len(self.layers) - 1

        for i, layer in enumerate(self.layers):
            output = layer.forward(output)
            self.layer_outputs.append(output)

            # Hidden layers only: apply the activation.
            if i < last:
                output = self.activations[i].forward(output)
                self.layer_outputs.append(output)

        return self.output_activation.forward(output)

    def backward(self, X, y_true, y_pred, learning_rate=0.01):
        """Backpropagate the cross-entropy loss and update every layer.

        ``X`` is unused (each layer caches its own input during
        ``forward``); it is kept so existing callers keep working.
        """
        # cross_entropy_loss returns (y_pred - y_true) / batch_size, which
        # is already the gradient with respect to the pre-softmax logits.
        # Passing it through the softmax Jacobian as well would apply the
        # softmax derivative twice, so it goes straight to the last layer.
        _, grad = LossFunctions.cross_entropy_loss(y_pred, y_true)

        last = len(self.layers) - 1
        for i in range(last, -1, -1):
            # Undo the hidden activation that followed layer i.
            if i < last:
                grad = self.activations[i].backward(grad)

            # layer.backward uses the pre-update weights to compute the
            # gradient for the previous layer, so updating right after
            # is safe.
            grad = self.layers[i].backward(grad)
            self.layers[i].update_params(learning_rate)

    def train_step(self, X_batch, y_batch, learning_rate=0.01):
        """Run one forward/backward/update step and return the batch loss."""
        y_pred = self.forward(X_batch)
        loss, _ = LossFunctions.cross_entropy_loss(y_pred, y_batch)
        self.backward(X_batch, y_batch, y_pred, learning_rate)
        return loss

    def train(self, X_train, y_train, epochs=100, batch_size=32, learning_rate=0.01):
        """Mini-batch training loop.

        Batches are taken in order, without shuffling.

        Returns:
            List with the mean batch loss of every epoch.

        Raises:
            ValueError: if ``X_train`` contains no samples (there would be
                no batches to average over).
        """
        n_samples = X_train.shape[0]
        if n_samples == 0:
            raise ValueError("X_train must contain at least one sample")

        losses = []

        for epoch in range(epochs):
            batch_losses = [
                self.train_step(
                    X_train[start:start + batch_size],
                    y_train[start:start + batch_size],
                    learning_rate,
                )
                for start in range(0, n_samples, batch_size)
            ]

            avg_loss = sum(batch_losses) / len(batch_losses)
            losses.append(avg_loss)

            if (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

        return losses

勾配消失・爆発問題と対策

勾配消失・爆発の可視化

def visualize_gradient_flow():
    """Plot how gradient magnitude evolves with depth, and a clipping demo.

    Left panel: a unit gradient multiplied once per layer by a fixed
    per-layer factor for sigmoid, ReLU and tanh (log scale).
    Right panel: random gradients whose scale grows with depth, after
    clipping to a fixed threshold.
    """
    n_layers = 10
    depths = range(1, n_layers + 1)

    def decay_curve(per_layer_factor):
        # Gradient magnitude after each layer, starting from 1.0.
        curve, magnitude = [], 1.0
        for _ in range(n_layers):
            magnitude *= per_layer_factor
            curve.append(magnitude)
        return curve

    # Per-layer factors:
    #   sigmoid - 0.25 is the maximum of its derivative (vanishes quickly)
    #   ReLU    - derivative is 0 or 1; 1.0 models an active unit
    #   tanh    - derivative is at most 1; 0.5 is used as a typical value
    gradients_sigmoid = decay_curve(0.25)
    gradients_relu = decay_curve(1.0)
    gradients_tanh = decay_curve(0.5)

    plt.figure(figsize=(12, 6))

    # Left panel: gradient flow per activation
    plt.subplot(1, 2, 1)
    for curve, marker, label in (
        (gradients_sigmoid, 'o-', 'Sigmoid'),
        (gradients_relu, 's-', 'ReLU'),
        (gradients_tanh, '^-', 'Tanh'),
    ):
        plt.plot(depths, curve, marker, label=label)
    plt.xlabel('Layer Depth')
    plt.ylabel('Gradient Magnitude')
    plt.title('Gradient Flow in Deep Networks')
    plt.legend()
    plt.yscale('log')
    plt.grid(True)

    # Right panel: effect of gradient clipping
    plt.subplot(1, 2, 2)
    clip_value = 1.0
    # One random gradient per layer, scaled by depth, then clipped.
    gradients_clipped = [
        abs(np.clip(np.random.randn() * (i + 1), -clip_value, clip_value))
        for i in range(n_layers)
    ]

    plt.plot(depths, gradients_clipped, 'o-', label='With Clipping')
    plt.axhline(y=clip_value, color='r', linestyle='--', label='Clip Threshold')
    plt.xlabel('Layer Depth')
    plt.ylabel('Gradient Magnitude')
    plt.title('Effect of Gradient Clipping')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()


visualize_gradient_flow()

対策の実装

class GradientStabilization:
    """Techniques for keeping gradients well-behaved."""

    @staticmethod
    def gradient_clipping(gradients, max_norm=5.0):
        """Rescale ``gradients`` so their global L2 norm is at most ``max_norm``.

        Args:
            gradients: iterable of NumPy arrays. When clipping is needed
                the arrays are scaled **in place**.
            max_norm: maximum allowed global norm.

        Returns:
            The same ``gradients`` object that was passed in.
        """
        # Global norm over all gradient arrays together.
        total_norm = np.sqrt(sum(np.sum(grad ** 2) for grad in gradients))

        # The small constant avoids division by zero for all-zero gradients.
        clip_ratio = max_norm / (total_norm + 1e-8)
        if clip_ratio < 1:
            for grad in gradients:
                grad *= clip_ratio

        return gradients

    @staticmethod
    def batch_normalization(x, gamma, beta, running_mean, running_var,
                           training=True, momentum=0.9, eps=1e-5):
        """Batch-normalisation forward pass.

        In training mode the batch statistics (over axis 0) normalise ``x``
        and the running statistics are updated with ``momentum``. In
        inference mode the running statistics are used and returned
        unchanged.

        Returns:
            ``(out, cache, running_mean, running_var)``. ``cache`` is
            ``(x, x_norm, mean, var, gamma, beta, eps)`` where ``mean`` and
            ``var`` are the statistics actually used for normalisation.
            Building the cache from batch statistics in inference mode, as
            before, raised ``NameError`` because they are never computed
            there.
        """
        if training:
            # Batch statistics
            mean = np.mean(x, axis=0)
            var = np.var(x, axis=0)

            # Exponential moving averages for use at inference time
            running_mean = momentum * running_mean + (1 - momentum) * mean
            running_var = momentum * running_var + (1 - momentum) * var
        else:
            # Inference: normalise with the running statistics
            mean, var = running_mean, running_var

        x_norm = (x - mean) / np.sqrt(var + eps)

        # Scale and shift
        out = gamma * x_norm + beta

        # Saved for the backward pass
        cache = (x, x_norm, mean, var, gamma, beta, eps)

        return out, cache, running_mean, running_var

    @staticmethod
    def batch_norm_backward(dout, cache):
        """Batch-normalisation backward pass.

        The formulas differentiate through the mean and variance, i.e. they
        assume ``cache`` came from a training-mode forward pass where both
        are the batch statistics of ``x``.

        Returns:
            ``(dx, dgamma, dbeta)``.
        """
        x, x_norm, mean, var, gamma, beta, eps = cache
        N = x.shape[0]

        # Gradients of the affine scale/shift parameters
        dbeta = np.sum(dout, axis=0)
        dgamma = np.sum(dout * x_norm, axis=0)

        # Gradient w.r.t. the normalised input, then through var and mean
        dx_norm = dout * gamma
        dvar = np.sum(dx_norm * (x - mean) * -0.5 * (var + eps) ** (-1.5), axis=0)
        dmean = np.sum(dx_norm * -1 / np.sqrt(var + eps), axis=0) + \
                dvar * np.mean(-2 * (x - mean), axis=0)

        # Total gradient: direct path + path through var + path through mean
        dx = dx_norm / np.sqrt(var + eps) + \
             dvar * 2 * (x - mean) / N + \
             dmean / N

        return dx, dgamma, dbeta

高度な逆伝播技術

自動微分の実装

class AutoGrad:
    """Minimal reverse-mode automatic differentiation.

    ``Variable`` wraps a value, ``Function`` records an operation, and
    ``Add`` / ``Mul`` are the two supported operations.

    The nested classes refer to each other through ``AutoGrad.<Name>``.
    Names defined in a class body are not visible from inside the methods
    of nested classes, so bare ``Variable`` / ``Add`` / ``Mul`` raise
    ``NameError`` at call time.
    """

    class Variable:
        """A value that records the operation that produced it."""

        def __init__(self, data, requires_grad=True):
            self.data = data
            self.grad = None
            self.requires_grad = requires_grad
            self.grad_fn = None      # Function that produced this variable
            self.is_leaf = True      # False for results of operations

        def backward(self, grad=None):
            """Accumulate ``grad`` into ``self.grad`` and propagate it.

            With no argument the seed gradient is an array of ones shaped
            like ``data``. Only the incoming ``grad`` (not the accumulated
            total) is propagated, so a variable reached along several paths
            receives the sum of the per-path contributions.
            """
            if not self.requires_grad:
                return

            if grad is None:
                grad = np.ones_like(self.data)

            # Copy on first assignment and add out of place: the incoming
            # array may be shared with other variables (Add hands the same
            # array to both inputs), so in-place accumulation would corrupt
            # their gradients.
            if self.grad is None:
                self.grad = np.array(grad, copy=True)
            else:
                self.grad = self.grad + grad

            if self.grad_fn is not None:
                self.grad_fn.backward(grad)

        def __add__(self, other):
            return AutoGrad.Add.apply(self, other)

        def __mul__(self, other):
            return AutoGrad.Mul.apply(self, other)

    class Function:
        """Base class for differentiable operations on Variables."""

        @classmethod
        def apply(cls, *args):
            """Run ``forward`` on the inputs' data and wrap the result."""
            instance = cls()
            output = AutoGrad.Variable(instance.forward(*[a.data for a in args]))
            output.grad_fn = instance
            output.is_leaf = False
            # Keep the input Variables so backward() knows where to send
            # the gradients.
            instance.saved_tensors = args
            return output

        def backward(self, grad_output):
            """Send the gradients from ``backward_impl`` to each input."""
            grad_inputs = self.backward_impl(grad_output)
            if not isinstance(grad_inputs, tuple):
                grad_inputs = (grad_inputs,)

            for var, grad in zip(self.saved_tensors, grad_inputs):
                if var.requires_grad:
                    var.backward(grad)

    class Add(Function):
        """Element-wise addition."""

        def forward(self, x, y):
            return x + y

        def backward_impl(self, grad_output):
            # d(x + y)/dx = d(x + y)/dy = 1
            return grad_output, grad_output

    class Mul(Function):
        """Element-wise multiplication."""

        def forward(self, x, y):
            # Keep the raw operand values for the backward pass.
            self.x = x
            self.y = y
            return x * y

        def backward_impl(self, grad_output):
            # d(x * y)/dx = y, d(x * y)/dy = x
            return self.y * grad_output, self.x * grad_output


# Usage example
x = AutoGrad.Variable(np.array([2.0]), requires_grad=True)
y = AutoGrad.Variable(np.array([3.0]), requires_grad=True)

z = x * y + x  # z = 2*3 + 2 = 8
z.backward()

print(f"x.grad: {x.grad}")  # dz/dx = y + 1 = 4
print(f"y.grad: {y.grad}")  # dz/dy = x = 2

勾配チェック

def gradient_check(network, X, y, epsilon=1e-7):
    """Compare backprop weight gradients with central finite differences.

    For every layer in ``network.layers`` the analytical gradient
    ``layer.dW`` (set by ``network.backward``) is compared element-wise
    with ``(L(w + eps) - L(w - eps)) / (2 * eps)`` where ``L`` is the
    cross-entropy loss. Only weights are checked, not biases.

    Each weight is saved before it is perturbed and written back
    afterwards. Restoring it by arithmetic (``+eps``, ``-2*eps``, ``+eps``)
    does not return the exact value in floating point and would slightly
    alter the model under test.

    Returns:
        List with the maximum relative error of every layer, in layer order.
    """
    # Analytical gradients via backprop. learning_rate=0 leaves the
    # parameters unchanged while still filling in layer.dW.
    y_pred = network.forward(X)
    network.backward(X, y, y_pred, learning_rate=0)

    max_errors = []

    for layer_idx, layer in enumerate(network.layers):
        analytical_grad = layer.dW.copy()
        numerical_grad = np.zeros_like(layer.W)

        for i, j in np.ndindex(*layer.W.shape):
            original_value = layer.W[i, j]

            # Loss at w + epsilon
            layer.W[i, j] = original_value + epsilon
            loss_plus, _ = LossFunctions.cross_entropy_loss(network.forward(X), y)

            # Loss at w - epsilon
            layer.W[i, j] = original_value - epsilon
            loss_minus, _ = LossFunctions.cross_entropy_loss(network.forward(X), y)

            # Central difference
            numerical_grad[i, j] = (loss_plus - loss_minus) / (2 * epsilon)

            # Restore the exact original weight
            layer.W[i, j] = original_value

        # Relative error; epsilon in the denominator guards against 0/0.
        relative_error = np.abs(analytical_grad - numerical_grad) / \
                        (np.abs(analytical_grad) + np.abs(numerical_grad) + epsilon)
        max_error = np.max(relative_error)
        max_errors.append(max_error)

        print(f"Layer {layer_idx} - Weight gradient check:")
        print(f"  Max relative error: {max_error:.2e}")
        print(f"  Mean relative error: {np.mean(relative_error):.2e}")

        if max_error > 1e-5:
            print("  WARNING: Gradient may be incorrect!")

    return max_errors

実践例：合成データでの逆伝播（MNISTなどの実データにも同じ手順で適用できます）

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

def train_on_synthetic_data():
    """Train a NeuralNetwork on a synthetic 3-class problem and plot results.

    Generates 1000 samples with 20 features, holds out 20% for testing,
    trains a 20-64-32-3 ReLU network for 50 epochs, then plots the training
    loss curve and a bar chart of the final metrics.

    Returns:
        The trained network.
    """
    n_classes = 3

    # Data generation
    X, y = make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        n_classes=n_classes,
        random_state=42
    )

    # One-hot encode the integer labels with NumPy. OneHotEncoder's
    # `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2
    # and removed in 1.4, so `OneHotEncoder(sparse=False)` fails on current
    # versions; indexing an identity matrix works on every version.
    y_onehot = np.eye(n_classes)[y]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_onehot, test_size=0.2, random_state=42
    )

    # Build the network
    network = NeuralNetwork(
        layer_sizes=[20, 64, 32, n_classes],
        activation='relu'
    )

    # Train
    print("Training neural network with backpropagation...")
    losses = network.train(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        learning_rate=0.01
    )

    # Plot the results
    plt.figure(figsize=(10, 5))

    plt.subplot(1, 2, 1)
    plt.plot(losses)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.grid(True)

    # Evaluate on the held-out data
    y_pred = network.forward(X_test)
    test_loss, _ = LossFunctions.cross_entropy_loss(y_pred, y_test)
    accuracy = np.mean(np.argmax(y_pred, axis=1) == np.argmax(y_test, axis=1))

    plt.subplot(1, 2, 2)
    plt.bar(['Train Loss', 'Test Loss', 'Test Accuracy'],
            [losses[-1], test_loss, accuracy])
    plt.ylabel('Value')
    plt.title('Final Performance')

    plt.tight_layout()
    plt.show()

    print(f"\nFinal Results:")
    print(f"  Training Loss: {losses[-1]:.4f}")
    print(f"  Test Loss: {test_loss:.4f}")
    print(f"  Test Accuracy: {accuracy:.2%}")

    return network


# Run
trained_network = train_on_synthetic_data()

逆伝播の最適化とTips

学習率スケジューリング

class LearningRateSchedulers:
    """Stateless learning-rate schedules.

    Every method maps an initial learning rate and an epoch index to the
    learning rate to use for that epoch.
    """

    @staticmethod
    def step_decay(initial_lr, epoch, drop_rate=0.5, epochs_drop=10):
        """Multiply the rate by ``drop_rate`` once every ``epochs_drop`` epochs."""
        drops_so_far = epoch // epochs_drop
        return initial_lr * (drop_rate ** drops_so_far)

    @staticmethod
    def exponential_decay(initial_lr, epoch, decay_rate=0.95):
        """Multiply the rate by ``decay_rate`` every epoch."""
        shrink = decay_rate ** epoch
        return initial_lr * shrink

    @staticmethod
    def cosine_annealing(initial_lr, epoch, total_epochs):
        """Half-cosine from ``initial_lr`` (epoch 0) to 0 (``total_epochs``)."""
        phase = np.pi * epoch / total_epochs
        return initial_lr * (1 + np.cos(phase)) / 2

    @staticmethod
    def warmup(initial_lr, epoch, warmup_epochs=5):
        """Ramp linearly up to ``initial_lr`` over the first ``warmup_epochs``."""
        in_warmup = epoch < warmup_epochs
        if not in_warmup:
            return initial_lr
        return initial_lr * (epoch + 1) / warmup_epochs

# 可視化
def visualize_lr_schedules():
    """Plot four learning-rate schedules over 50 epochs on one figure.

    The 'Warmup + Cosine' curve feeds the cosine-annealed rate into
    ``warmup`` as its initial rate, so the first few epochs are scaled down.
    """
    total_epochs = 50
    epochs = np.arange(total_epochs)
    initial_lr = 0.1
    sched = LearningRateSchedulers

    plt.figure(figsize=(12, 8))

    # Evaluate every schedule epoch by epoch; insertion order fixes the
    # plotting and legend order.
    schedules = {}
    schedules['Step Decay'] = [sched.step_decay(initial_lr, e) for e in epochs]
    schedules['Exponential'] = [sched.exponential_decay(initial_lr, e) for e in epochs]
    schedules['Cosine'] = [sched.cosine_annealing(initial_lr, e, 50) for e in epochs]
    schedules['Warmup + Cosine'] = [
        sched.warmup(sched.cosine_annealing(initial_lr, e, 50), e)
        for e in epochs
    ]

    for name in schedules:
        plt.plot(epochs, schedules[name], label=name, linewidth=2)

    plt.xlabel('Epoch')
    plt.ylabel('Learning Rate')
    plt.title('Learning Rate Scheduling Strategies')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()


visualize_lr_schedules()