A friendly note:
This post introduces optimization, convex optimization, and gradient descent. It consists mostly of formula derivations and graphical illustrations, so it is fairly technical.
Optimization and Deep Learning
Optimization vs. Estimation
Although optimization methods can minimize the value of the loss function in deep learning, the goal they pursue is, in essence, not the same as the goal of deep learning.
Goal of optimization methods: the loss function value on the training set
Goal of deep learning: the loss function value on the test set (generalization)
A visual comparison:
```python
%matplotlib inline
import sys
sys.path.append('path to file storge d2lzh1981')
import d2lzh1981 as d2l
from mpl_toolkits import mplot3d
import numpy as np
```
```python
def f(x):
    return x * np.cos(np.pi * x)

def g(x):
    return f(x) + 0.2 * np.cos(5 * np.pi * x)

d2l.set_figsize((5, 3))
x = np.arange(0.5, 1.5, 0.01)
fig_f, = d2l.plt.plot(x, f(x), label="train error")
fig_g, = d2l.plt.plot(x, g(x), '--', c='purple', label="test error")
fig_f.axes.annotate('empirical risk', (1.0, -1.2), (0.5, -1.1), arrowprops=dict(arrowstyle='->'))
fig_g.axes.annotate('expected risk', (1.1, -1.05), (0.95, -0.5), arrowprops=dict(arrowstyle='->'))
d2l.plt.xlabel('x')
d2l.plt.ylabel('risk')
d2l.plt.legend(loc="upper right")
```
Challenges of Optimization in Deep Learning
Local minima
Saddle points
Vanishing gradients
Local Minima
$$ f(x) = x\cos \pi x $$
```python
def f(x):
    return x * np.cos(np.pi * x)

d2l.set_figsize((4.5, 2.5))
x = np.arange(-1.0, 2.0, 0.1)
fig, = d2l.plt.plot(x, f(x))
fig.axes.annotate('local minimum', xy=(-0.3, -0.25), xytext=(-0.77, -1.0),
                  arrowprops=dict(arrowstyle='->'))
fig.axes.annotate('global minimum', xy=(1.1, -0.95), xytext=(0.6, 0.8),
                  arrowprops=dict(arrowstyle='->'))
d2l.plt.xlabel('x')
d2l.plt.ylabel('f(x)');
```
Saddle Points
A saddle point is a stationary point (a point where the first-order derivatives are zero) at which the Hessian matrix of the function is indefinite.
```python
x = np.arange(-2.0, 2.0, 0.1)
fig, = d2l.plt.plot(x, x**3)
fig.axes.annotate('saddle point', xy=(0, -0.2), xytext=(-0.52, -5.0),
                  arrowprops=dict(arrowstyle='->'))
d2l.plt.xlabel('x')
d2l.plt.ylabel('f(x)');
```
The Hessian Matrix
$$ A=\left[\begin{array}{cccc}{\frac{\partial^{2} f}{\partial x_{1}^{2}}} & {\frac{\partial^{2} f}{\partial x_{1} \partial x_{2}}} & {\cdots} & {\frac{\partial^{2} f}{\partial x_{1} \partial x_{n}}} \\ {\frac{\partial^{2} f}{\partial x_{2} \partial x_{1}}} & {\frac{\partial^{2} f}{\partial x_{2}^{2}}} & {\cdots} & {\frac{\partial^{2} f}{\partial x_{2} \partial x_{n}}} \\ {\vdots} & {\vdots} & {\ddots} & {\vdots} \\ {\frac{\partial^{2} f}{\partial x_{n} \partial x_{1}}} & {\frac{\partial^{2} f}{\partial x_{n} \partial x_{2}}} & {\cdots} & {\frac{\partial^{2} f}{\partial x_{n}^{2}}}\end{array}\right] $$
The relationship between the eigenvalues of the Hessian and saddle points / local extrema:
at a point where all partial derivatives are zero,
if all eigenvalues are positive, the point is a local minimum;
if all eigenvalues are negative, the point is a local maximum;
if some eigenvalues are positive and some are negative, the point is a saddle point (see the numeric check after the plot below).
```python
x, y = np.mgrid[-1: 1: 31j, -1: 1: 31j]
z = x**2 - y**2

d2l.set_figsize((6, 4))
ax = d2l.plt.figure().add_subplot(111, projection='3d')
ax.plot_wireframe(x, y, z, **{'rstride': 2, 'cstride': 2})
ax.plot([0], [0], [0], 'ro', markersize=10)
ticks = [-1, 0, 1]
d2l.plt.xticks(ticks)
d2l.plt.yticks(ticks)
ax.set_zticks(ticks)
d2l.plt.xlabel('x')
d2l.plt.ylabel('y');
```
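As a quick numeric check of the eigenvalue rule above (a small sketch of my own using NumPy, not part of the original notes): the Hessian of the saddle surface $z = x^2 - y^2$ plotted here is constant, and its eigenvalues at the origin have mixed signs, while a bowl-shaped function has all-positive eigenvalues.

```python
# Hessian of z = x**2 - y**2 is constant: [[2, 0], [0, -2]]
H_saddle = np.array([[2.0, 0.0],
                     [0.0, -2.0]])
print(np.linalg.eigvalsh(H_saddle))  # [-2.  2.]: mixed signs -> saddle point

# for comparison, x**2 + 2 * y**2 has Hessian [[2, 0], [0, 4]]
H_bowl = np.array([[2.0, 0.0],
                   [0.0, 4.0]])
print(np.linalg.eigvalsh(H_bowl))    # [2. 4.]: all positive -> local minimum
```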
Vanishing Gradients
```python
x = np.arange(-2.0, 5.0, 0.01)
fig, = d2l.plt.plot(x, np.tanh(x))
d2l.plt.xlabel('x')
d2l.plt.ylabel('f(x)')
fig.axes.annotate('vanishing gradient', (4, 1), (2, 0.0), arrowprops=dict(arrowstyle='->'))
```
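The derivative of $\tanh$ is $1 - \tanh^2(x)$, so the gradient shrinks rapidly as $|x|$ grows. A small numeric check of my own at the annotated point $x = 4$:

```python
def tanh_grad(x):
    # d/dx tanh(x) = 1 - tanh(x)**2
    return 1 - np.tanh(x)**2

for x0 in [0.0, 2.0, 4.0]:
    print('x = %.1f, gradient = %.6f' % (x0, tanh_grad(x0)))
# at x = 4 the gradient is roughly 0.0013, so parameter updates that pass
# through this activation are almost zero: the gradient has "vanished"
```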
Gradient Descent
```python
%matplotlib inline
import numpy as np
import torch
import time
from torch import nn, optim
import math
import sys
sys.path.append('path to file storge d2lzh1981')
import d2lzh1981 as d2l
```
Gradient Descent in One Dimension
Claim: moving the variable in the direction opposite to the gradient decreases the function value.
Taylor expansion:
$$ f(x+\epsilon)=f(x)+\epsilon f^{\prime}(x)+\mathcal{O}\left(\epsilon^{2}\right) $$
Substituting a move of $-\eta f^{\prime}(x)$, i.e. a step of size $\eta f^{\prime}(x)$ against the gradient direction:
$$ f\left(x-\eta f^{\prime}(x)\right)=f(x)-\eta f^{\prime 2}(x)+\mathcal{O}\left(\eta^{2} f^{\prime 2}(x)\right) $$
For a sufficiently small learning rate $\eta > 0$, the first-order term dominates, so
$$ f\left(x-\eta f^{\prime}(x)\right) \lesssim f(x) $$
which motivates the update rule
$$ x \leftarrow x-\eta f^{\prime}(x) $$
e.g.
$$ f(x) = x^2 $$
```python
def f(x):
    return x**2

def gradf(x):
    return 2 * x

def gd(eta):
    x = 10
    results = [x]
    for i in range(20):
        x -= eta * gradf(x)
        results.append(x)
    print('epoch 20, x:', x)
    return results

res = gd(0.2)
```
The trajectory of gradient descent:
```python
def show_trace(res):
    n = max(abs(min(res)), abs(max(res)))
    f_line = np.arange(-n, n, 0.01)
    d2l.set_figsize((3.5, 2.5))
    d2l.plt.plot(f_line, [f(x) for x in f_line], '-')
    d2l.plt.plot(res, [f(x) for x in res], '-o')
    d2l.plt.xlabel('x')
    d2l.plt.ylabel('f(x)')

show_trace(res)
```
Learning Rate
Learning rate too small. Code: show_trace(gd(0.05))
Learning rate too large. Code: show_trace(gd(1.1))
Local Minima
$$ f(x) = x\cos(cx) $$
with $c = 0.15\pi$ in the example below:
```python
c = 0.15 * np.pi

def f(x):
    return x * np.cos(c * x)

def gradf(x):
    return np.cos(c * x) - c * x * np.sin(c * x)

show_trace(gd(2))
show_trace(gd(0.5))
```
Gradient Descent in Multiple Dimensions
$$ \nabla f(\mathbf{x})=\left[\frac{\partial f(\mathbf{x})}{\partial x_{1}}, \frac{\partial f(\mathbf{x})}{\partial x_{2}}, \dots, \frac{\partial f(\mathbf{x})}{\partial x_{d}}\right]^{\top} $$
$$ f(\mathbf{x}+\epsilon)=f(\mathbf{x})+\epsilon^{\top} \nabla f(\mathbf{x})+\mathcal{O}\left(|\epsilon|^{2}\right) $$
$$ \mathbf{x} \leftarrow \mathbf{x}-\eta \nabla f(\mathbf{x}) $$
```python
def train_2d(trainer, steps=20):
    x1, x2 = -5, -2
    results = [(x1, x2)]
    for i in range(steps):
        x1, x2 = trainer(x1, x2)
        results.append((x1, x2))
    print('epoch %d, x1 %f, x2 %f' % (i + 1, x1, x2))
    return results

def show_trace_2d(f, results):
    d2l.plt.plot(*zip(*results), '-o', color='#ff7f0e')
    x1, x2 = np.meshgrid(np.arange(-5.5, 1.0, 0.1), np.arange(-3.0, 1.0, 0.1))
    d2l.plt.contour(x1, x2, f(x1, x2), colors='#1f77b4')
    d2l.plt.xlabel('x1')
    d2l.plt.ylabel('x2')
```
e.g.
$$ f(\mathbf{x}) = x_1^2 + 2x_2^2 $$
```python
eta = 0.1

def f_2d(x1, x2):
    return x1 ** 2 + 2 * x2 ** 2

def gd_2d(x1, x2):
    return (x1 - eta * 2 * x1, x2 - eta * 4 * x2)

show_trace_2d(f_2d, train_2d(gd_2d))
```
Adaptive Methods
Newton's Method
Advantages:
Choosing the "step size" for gradient descent is difficult, whereas Newton's method effectively adjusts the step size through the Hessian matrix.
In Newton's method, the problem of local minima can also be alleviated by adjusting the learning rate.
Taylor expansion of $f(\mathbf{x} + \epsilon)$ around $\mathbf{x}$:
$$ f(\mathbf{x}+\epsilon)=f(\mathbf{x})+\epsilon^{\top} \nabla f(\mathbf{x})+\frac{1}{2} \epsilon^{\top} \nabla \nabla^{\top} f(\mathbf{x}) \epsilon+\mathcal{O}\left(|\epsilon|^{3}\right) $$
At a minimum we have $\nabla f(\mathbf{x})=0$, i.e. we want $\nabla f(\mathbf{x} + \epsilon)=0$. Taking the derivative of the expansion above with respect to $\epsilon$ and ignoring higher-order terms gives:
$$ \nabla f(\mathbf{x})+\boldsymbol{H}_{f} \boldsymbol{\epsilon}=0 \text { and hence } \boldsymbol{\epsilon}=-\boldsymbol{H}_{f}^{-1} \nabla f(\mathbf{x}) $$
Newton's method requires computing the inverse of the Hessian matrix, which is computationally expensive.
```python
c = 0.5

def f(x):
    return np.cosh(c * x)  # objective

def gradf(x):
    return c * np.sinh(c * x)  # first derivative

def hessf(x):
    return c**2 * np.cosh(c * x)  # second derivative

def newton(eta=1):
    x = 10
    results = [x]
    for i in range(10):
        x -= eta * gradf(x) / hessf(x)
        results.append(x)
    print('epoch 10, x:', x)
    return results

show_trace(newton())
```
```python
c = 0.15 * np.pi

def f(x):
    return x * np.cos(c * x)

def gradf(x):
    return np.cos(c * x) - c * x * np.sin(c * x)

def hessf(x):
    return - 2 * c * np.sin(c * x) - x * c**2 * np.cos(c * x)

show_trace(newton())
```
```python
show_trace(newton(0.5))
```
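The Newton demos above are one-dimensional. As an extra sketch of my own (not from the original notes) of the multivariate update $\boldsymbol{\epsilon}=-\boldsymbol{H}_{f}^{-1} \nabla f(\mathbf{x})$, here is one way to apply it to $f(x_1, x_2) = x_1^2 + 2x_2^2$, reusing the `train_2d` and `show_trace_2d` helpers defined earlier and solving a linear system instead of explicitly inverting the Hessian:

```python
def f_2d(x1, x2):
    return x1 ** 2 + 2 * x2 ** 2

def newton_2d(x1, x2):
    grad = np.array([2 * x1, 4 * x2])            # gradient of f_2d
    hess = np.array([[2.0, 0.0], [0.0, 4.0]])    # Hessian of f_2d (constant)
    # Newton step: x <- x - H^{-1} grad, computed via a linear solve
    step = np.linalg.solve(hess, grad)
    return (x1 - step[0], x2 - step[1])

# for a quadratic objective a single Newton step lands exactly on the minimum (0, 0)
show_trace_2d(f_2d, train_2d(newton_2d))
```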
Convergence Analysis
We only consider the convergence rate for a convex function with $f''(x^*) > 0$ at the minimum:
Let $x_k$ be the value of $x$ after the $k$-th iteration, and let $e_{k}:=x_{k}-x^{*}$ denote the distance from $x_k$ to the minimum $x^{*}$. Since $f'(x^{*}) = 0$:
$$ 0=f^{\prime}\left(x_{k}-e_{k}\right)=f^{\prime}\left(x_{k}\right)-e_{k} f^{\prime \prime}\left(x_{k}\right)+\frac{1}{2} e_{k}^{2} f^{\prime \prime \prime}\left(\xi_{k}\right) \quad \text{for some } \xi_{k} \in\left[x_{k}-e_{k}, x_{k}\right] $$
Dividing both sides by $f''(x_k)$:
$$ e_{k}-f^{\prime}\left(x_{k}\right) / f^{\prime \prime}\left(x_{k}\right)=\frac{1}{2} e_{k}^{2} f^{\prime \prime \prime}\left(\xi_{k}\right) / f^{\prime \prime}\left(x_{k}\right) $$
Substituting the update equation $x_{k+1} = x_{k} - f^{\prime}\left(x_{k}\right) / f^{\prime \prime}\left(x_{k}\right)$, we get:
$$ x_k - x^{*} - f^{\prime}\left(x_{k}\right) / f^{\prime \prime}\left(x_{k}\right) =\frac{1}{2} e_{k}^{2} f^{\prime \prime \prime}\left(\xi_{k}\right) / f^{\prime \prime}\left(x_{k}\right) $$
$$ x_{k+1} - x^{*} = e_{k+1} = \frac{1}{2} e_{k}^{2} f^{\prime \prime \prime}\left(\xi_{k}\right) / f^{\prime \prime}\left(x_{k}\right) $$
When $\frac{1}{2} f^{\prime \prime \prime}\left(\xi_{k}\right) / f^{\prime \prime}\left(x_{k}\right) \leq c$ for some constant $c$, we have:
$$ e_{k+1} \leq c e_{k}^{2} $$
That is, near the minimum the error shrinks quadratically with each Newton step.
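As a small numeric check of my own, reusing $f(x)=\cosh(cx)$ with $c = 0.5$ from above (whose minimum is $x^{*}=0$), printing the error after each Newton step shows it collapsing far faster than a fixed-step method would, consistent with the quadratic bound above:

```python
c = 0.5

def gradf(x):
    return c * np.sinh(c * x)     # f'(x) for f(x) = cosh(c * x)

def hessf(x):
    return c**2 * np.cosh(c * x)  # f''(x)

x = 1.0  # start reasonably close to the minimum x* = 0
for k in range(4):
    x -= gradf(x) / hessf(x)      # Newton update
    print('iteration %d, error |x - x*| = %.3e' % (k + 1, abs(x)))
```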
Preconditioning (gradient descent aided by the Hessian)
$$ \mathbf{x} \leftarrow \mathbf{x}-\eta \operatorname{diag}\left(H_{f}\right)^{-1} \nabla f(\mathbf{x}) $$
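The notes give no code for preconditioning, so here is a minimal sketch of my own of a diagonally preconditioned step on $f(x_1, x_2) = x_1^2 + 2x_2^2$, again reusing `train_2d` and `show_trace_2d`; dividing each gradient component by the corresponding diagonal entry of the Hessian makes both coordinates shrink at the same rate:

```python
eta = 0.5

def f_2d(x1, x2):
    return x1 ** 2 + 2 * x2 ** 2

def precond_gd_2d(x1, x2):
    g1, g2 = 2 * x1, 4 * x2   # gradient of f_2d
    h1, h2 = 2.0, 4.0         # diagonal of the Hessian of f_2d
    # x <- x - eta * diag(H_f)^{-1} * grad f(x)
    return (x1 - eta * g1 / h1, x2 - eta * g2 / h2)

# both coordinates are now halved at every step, so the trace heads
# straight for the minimum instead of bending as with plain gradient descent
show_trace_2d(f_2d, train_2d(precond_gd_2d))
```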
Gradient Descent with Line Search (Conjugate Gradient Method)
Stochastic Gradient Descent
Stochastic Gradient Descent Parameter Updates
For a training dataset with $n$ samples, let $f_i(\mathbf{x})$ be the loss function of the $i$-th sample. The objective function is:
$$ f(\mathbf{x})=\frac{1}{n} \sum_{i=1}^{n} f_{i}(\mathbf{x}) $$
Its gradient is:
$$ \nabla f(\mathbf{x})=\frac{1}{n} \sum_{i=1}^{n} \nabla f_{i}(\mathbf{x}) $$
The gradient of each individual sample is an unbiased estimate of the full gradient.
A single update using the full gradient has time complexity $\mathcal{O}(n)$.
The stochastic gradient descent update, with per-step cost $\mathcal{O}(1)$, is:
$$ \mathbf{x} \leftarrow \mathbf{x}-\eta \nabla f_{i}(\mathbf{x}) $$
and we have:
$$ \mathbb{E}_{i} \nabla f_{i}(\mathbf{x})=\frac{1}{n} \sum_{i=1}^{n} \nabla f_{i}(\mathbf{x})=\nabla f(\mathbf{x}) $$
e.g.
$$ f(x_1, x_2) = x_1^2 + 2 x_2^2 $$
```python
def f(x1, x2):
    return x1 ** 2 + 2 * x2 ** 2

def gradf(x1, x2):
    return (2 * x1, 4 * x2)

def sgd(x1, x2):
    global lr  # learning rate scheduler
    (g1, g2) = gradf(x1, x2)
    # simulate stochastic gradients by adding noise to the true gradient
    (g1, g2) = (g1 + np.random.normal(0.1), g2 + np.random.normal(0.1))
    eta_t = eta * lr()
    return (x1 - eta_t * g1, x2 - eta_t * g2)

eta = 0.1
lr = (lambda: 1)
show_trace_2d(f, train_2d(sgd, steps=50))
```
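As a quick check of my own of the unbiasedness claim $\mathbb{E}_{i} \nabla f_{i}(\mathbf{x})=\nabla f(\mathbf{x})$: for a toy least-squares objective, the average of the per-sample gradients matches the gradient of the average loss.

```python
np.random.seed(0)
A = np.random.normal(size=(100, 3))   # 100 samples with 3 features
b = np.random.normal(size=100)
x = np.zeros(3)

def grad_fi(i, x):
    # gradient of f_i(x) = 0.5 * (A[i] @ x - b[i]) ** 2
    return (A[i] @ x - b[i]) * A[i]

full_grad = A.T @ (A @ x - b) / len(b)                              # gradient of the average loss
mean_sample_grad = np.mean([grad_fi(i, x) for i in range(len(b))], axis=0)
print(np.allclose(full_grad, mean_sample_grad))                     # True
```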
Dynamic Learning Rates
Use a relatively large learning rate at the start to speed up convergence.
The learning rate can be scheduled to decay exponentially or polynomially.
After the optimization has run for a while, reduce the learning rate appropriately to avoid oscillation.
$$ \begin{array}{ll}{\eta(t)=\eta_{i} \text { if } t_{i} \leq t \leq t_{i+1}} & {\text { piecewise constant }} \\ {\eta(t)=\eta_{0} \cdot e^{-\lambda t}} & {\text { exponential }} \\ {\eta(t)=\eta_{0} \cdot(\beta t+1)^{-\alpha}} & {\text { polynomial }}\end{array} $$
```python
def exponential():
    global ctr
    ctr += 1
    return math.exp(-0.1 * ctr)

ctr = 1
lr = exponential
show_trace_2d(f, train_2d(sgd, steps=1000))
```
```python
def polynomial():
    global ctr
    ctr += 1
    return (1 + 0.1 * ctr)**(-0.5)

ctr = 1
lr = polynomial
show_trace_2d(f, train_2d(sgd, steps=50))
```
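The two schedules above cover the exponential and polynomial formulas; a piecewise-constant schedule (a sketch of my own in the same style, with arbitrarily chosen breakpoints) could look like this:

```python
def piecewise_constant():
    global ctr
    ctr += 1
    # hold the factor at 1.0 for the first 10 steps, then drop it in stages
    if ctr < 10:
        return 1.0
    elif ctr < 30:
        return 0.1
    else:
        return 0.01

ctr = 1
lr = piecewise_constant
show_trace_2d(f, train_2d(sgd, steps=50))
```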
Mini-batch Stochastic Gradient Descent
Reading the Data
```python
def get_data_ch7():
    data = np.genfromtxt('/home/kesci/input/airfoil4755/airfoil_self_noise.dat',
                         delimiter='\t')
    data = (data - data.mean(axis=0)) / data.std(axis=0)
    return torch.tensor(data[:1500, :-1], dtype=torch.float32), \
           torch.tensor(data[:1500, -1], dtype=torch.float32)

features, labels = get_data_ch7()
features.shape
```
Visualizing the data:
```python
import pandas as pd

df = pd.read_csv('path to airfoil_self_noise.dat', delimiter='\t', header=None)
df.head(10)
```
The stochastic gradient descent (SGD) function:
```python
def sgd(params, states, hyperparams):
    for p in params:
        p.data -= hyperparams['lr'] * p.grad.data
```
Training
```python
def train_ch7(optimizer_fn, states, hyperparams, features, labels,
              batch_size=10, num_epochs=2):
    # initialize the model
    net, loss = d2l.linreg, d2l.squared_loss

    w = torch.nn.Parameter(torch.tensor(np.random.normal(0, 0.01, size=(features.shape[1], 1)),
                                        dtype=torch.float32),
                           requires_grad=True)
    b = torch.nn.Parameter(torch.zeros(1, dtype=torch.float32), requires_grad=True)

    def eval_loss():
        return loss(net(features, w, b), labels).mean().item()

    ls = [eval_loss()]
    data_iter = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, labels), batch_size, shuffle=True)

    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            l = loss(net(X, w, b), y).mean()  # average loss over the mini-batch

            # reset gradients
            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()

            l.backward()
            optimizer_fn([w, b], states, hyperparams)  # update the parameters

            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())  # record the training loss every 100 examples
    # print the result and plot the loss curve
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    d2l.set_figsize()
    d2l.plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    d2l.plt.xlabel('epoch')
    d2l.plt.ylabel('loss')
```
Testing
```python
def train_sgd(lr, batch_size, num_epochs=2):
    train_ch7(sgd, None, {'lr': lr}, features, labels, batch_size, num_epochs)
```
Results:
```python
train_sgd(1, 1500, 6)    # batch size = all 1500 samples: full-batch gradient descent
train_sgd(0.005, 1)      # batch size = 1: stochastic gradient descent
train_sgd(0.05, 10)      # batch size = 10: mini-batch SGD
```
Simplified implementation with PyTorch
```python
def train_pytorch_ch7(optimizer_fn, optimizer_hyperparams, features, labels,
                      batch_size=10, num_epochs=2):
    # initialize the model with PyTorch's built-in layers and optimizer
    net = nn.Sequential(
        nn.Linear(features.shape[-1], 1)
    )
    loss = nn.MSELoss()
    optimizer = optimizer_fn(net.parameters(), **optimizer_hyperparams)

    def eval_loss():
        return loss(net(features).view(-1), labels).item() / 2

    ls = [eval_loss()]
    data_iter = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, labels), batch_size, shuffle=True)

    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            # divide by 2 so MSELoss matches the scale of squared_loss above
            l = loss(net(X).view(-1), y) / 2

            optimizer.zero_grad()
            l.backward()
            optimizer.step()

            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())
    # print the result and plot the loss curve
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    d2l.set_figsize()
    d2l.plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    d2l.plt.xlabel('epoch')
    d2l.plt.ylabel('loss')
```
```python
train_pytorch_ch7(optim.SGD, {"lr": 0.05}, features, labels, 10)
```