Note:

    This post covers optimization topics from statistics, namely convex optimization and gradient descent. It consists mostly of formula derivations and visualizations, so it is fairly technical.

Optimization and Deep Learning

Optimization and Estimation

Although optimization methods can be used to minimize the value of the loss function in deep learning, the goal of optimization is essentially different from the goal of deep learning.

  • Goal of an optimization method: minimize the loss on the training set

  • Goal of deep learning: minimize the loss on the test set (generalization)

  • The figure below compares the two intuitively

%matplotlib inline
import sys
sys.path.append('path to file storage d2lzh1981')
import d2lzh1981 as d2l
from mpl_toolkits import mplot3d  # for 3D plotting
import numpy as np
def f(x): return x * np.cos(np.pi * x)
def g(x): return f(x) + 0.2 * np.cos(5 * np.pi * x)

d2l.set_figsize((5, 3))
x = np.arange(0.5, 1.5, 0.01)
fig_f, = d2l.plt.plot(x, f(x), label='train error')
fig_g, = d2l.plt.plot(x, g(x), '--', c='purple', label='test error')
fig_f.axes.annotate('empirical risk', (1.0, -1.2), (0.5, -1.1), arrowprops=dict(arrowstyle='->'))
fig_g.axes.annotate('expected risk', (1.1, -1.05), (0.95, -0.5), arrowprops=dict(arrowstyle='->'))
d2l.plt.xlabel('x')
d2l.plt.ylabel('risk')
d2l.plt.legend(loc='upper right')

Challenges of Optimization in Deep Learning

  1. Local minima
  2. Saddle points
  3. Vanishing gradients

Local Minima

$$
f(x) = x\cos \pi x
$$

def f(x):
    return x * np.cos(np.pi * x)

d2l.set_figsize((4.5, 2.5))
x = np.arange(-1.0, 2.0, 0.1)
fig, = d2l.plt.plot(x, f(x))
fig.axes.annotate('local minimum', xy=(-0.3, -0.25), xytext=(-0.77, -1.0),
                  arrowprops=dict(arrowstyle='->'))
fig.axes.annotate('global minimum', xy=(1.1, -0.95), xytext=(0.6, 0.8),
                  arrowprops=dict(arrowstyle='->'))
d2l.plt.xlabel('x')
d2l.plt.ylabel('f(x)');
Saddle Points

A saddle point is a stationary point (a point where the first-order derivatives are zero) at which the Hessian matrix is indefinite.

x = np.arange(-2.0, 2.0, 0.1)
fig, = d2l.plt.plot(x, x**3)
fig.axes.annotate('saddle point', xy=(0, -0.2), xytext=(-0.52, -5.0),
                  arrowprops=dict(arrowstyle='->'))
d2l.plt.xlabel('x')
d2l.plt.ylabel('f(x)');

The Hessian Matrix

$$
A=\left[\begin{array}{cccc}{\frac{\partial^{2} f}{\partial x_{1}^{2}}} & {\frac{\partial^{2} f}{\partial x_{1} \partial x_{2}}} & {\cdots} & {\frac{\partial^{2} f}{\partial x_{1} \partial x_{n}}} \\ {\frac{\partial^{2} f}{\partial x_{2} \partial x_{1}}} & {\frac{\partial^{2} f}{\partial x_{2}^{2}}} & {\cdots} & {\frac{\partial^{2} f}{\partial x_{2} \partial x_{n}}} \\ {\vdots} & {\vdots} & {\ddots} & {\vdots} \\ {\frac{\partial^{2} f}{\partial x_{n} \partial x_{1}}} & {\frac{\partial^{2} f}{\partial x_{n} \partial x_{2}}} & {\cdots} & {\frac{\partial^{2} f}{\partial x_{n}^{2}}}\end{array}\right]
$$

Relationship between the eigenvalues of the Hessian and whether a stationary point is a local minimum, a local maximum, or a saddle point (a quick numerical check follows the list):

At a point where all partial derivatives are zero,

  • if all eigenvalues are positive, it is a local minimum;
  • if all eigenvalues are negative, it is a local maximum;
  • if the eigenvalues have mixed signs, it is a saddle point.
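
As a quick numerical check (my own sketch, not part of the original post; the names H and eigvals are mine), we can classify the stationary point of $f(x, y) = x^2 - y^2$ at the origin from the eigenvalues of its Hessian:

H = np.array([[2.0, 0.0],
              [0.0, -2.0]])        # Hessian of x**2 - y**2 (constant everywhere)
eigvals = np.linalg.eigvalsh(H)    # eigenvalues of the symmetric matrix H
print(eigvals)                     # [-2.  2.]: mixed signs, so (0, 0) is a saddle point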

x, y = np.mgrid[-1: 1: 31j, -1: 1: 31j]
z = x**2 - y**2

d2l.set_figsize((6, 4))
ax = d2l.plt.figure().add_subplot(111, projection='3d')
ax.plot_wireframe(x, y, z, **{'rstride': 2, 'cstride': 2})
ax.plot([0], [0], [0], 'ro', markersize=10)
ticks = [-1, 0, 1]
d2l.plt.xticks(ticks)
d2l.plt.yticks(ticks)
ax.set_zticks(ticks)
d2l.plt.xlabel('x')
d2l.plt.ylabel('y');
Vanishing Gradients
x = np.arange(-2.0, 5.0, 0.01)
fig, = d2l.plt.plot(x, np.tanh(x))
d2l.plt.xlabel('x')
d2l.plt.ylabel('f(x)')
fig.axes.annotate('vanishing gradient', (4, 1), (2, 0.0) ,arrowprops=dict(arrowstyle='->'))
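
To make this concrete (a small check of my own, not in the original post): the derivative of $\tanh(x)$ is $1 - \tanh^2(x)$, which is already tiny away from the origin, so a gradient update there barely moves the variable.

print(1 - np.tanh(4.0) ** 2)  # roughly 0.00134: the gradient has almost vanished at x = 4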

Gradient Descent

%matplotlib inline
import numpy as np
import torch
import time
from torch import nn, optim
import math
import sys
sys.path.append('path to file storage d2lzh1981')
import d2lzh1981 as d2l

One-Dimensional Gradient Descent


Proof that moving the variable in the direction opposite to the gradient decreases the function value.

Taylor expansion:

$$
f(x+\epsilon)=f(x)+\epsilon f^{\prime}(x)+\mathcal{O}\left(\epsilon^{2}\right)
$$

Substitute a displacement of $\epsilon = -\eta f^{\prime}(x)$, i.e. a step of size $\eta f^{\prime}(x)$ against the gradient direction:

$$
f\left(x-\eta f^{\prime}(x)\right)=f(x)-\eta f^{\prime 2}(x)+\mathcal{O}\left(\eta^{2} f^{\prime 2}(x)\right)
$$

For a sufficiently small learning rate $\eta > 0$, the higher-order term is negligible, so

$$
f\left(x-\eta f^{\prime}(x)\right) \lesssim f(x)
$$

This motivates the gradient descent update rule:
$$
x \leftarrow x-\eta f^{\prime}(x)
$$


e.g.

$$
f(x) = x^2
$$

def f(x):
    return x**2  # Objective function

def gradf(x):
    return 2 * x  # Its derivative

def gd(eta):
    x = 10
    results = [x]
    for i in range(20):
        # eta is the learning rate
        x -= eta * gradf(x)
        results.append(x)
    print('epoch 20, x:', x)
    return results

res = gd(0.2)

Gradient Descent Trajectory

def show_trace(res):
    n = max(abs(min(res)), abs(max(res)))
    f_line = np.arange(-n, n, 0.01)
    d2l.set_figsize((3.5, 2.5))
    d2l.plt.plot(f_line, [f(x) for x in f_line], '-')
    d2l.plt.plot(res, [f(x) for x in res], '-o')
    d2l.plt.xlabel('x')
    d2l.plt.ylabel('f(x)')


show_trace(res)
Learning Rate

Learning rate too small (convergence is slow). Code: show_trace(gd(0.05))

Learning rate too large (the iterates overshoot and oscillate or diverge). Code: show_trace(gd(1.1))

Local Minima

$$
f(x) = x\cos cx
$$

c = 0.15 * np.pi

def f(x):
    return x * np.cos(c * x)

def gradf(x):
    return np.cos(c * x) - c * x * np.sin(c * x)

# An unsuitable learning rate can easily lead the iterates into a poor local minimum
show_trace(gd(2))
show_trace(gd(0.5))

Multi-Dimensional Gradient Descent

$$
\nabla f(\mathbf{x})=\left[\frac{\partial f(\mathbf{x})}{\partial x_{1}}, \frac{\partial f(\mathbf{x})}{\partial x_{2}}, \dots, \frac{\partial f(\mathbf{x})}{\partial x_{d}}\right]^{\top}
$$

$$
f(\mathbf{x}+\epsilon)=f(\mathbf{x})+\epsilon^{\top} \nabla f(\mathbf{x})+\mathcal{O}\left(|\epsilon|^{2}\right)
$$

$$
\mathbf{x} \leftarrow \mathbf{x}-\eta \nabla f(\mathbf{x})
$$

# train_2d runs the trainer and records how (x1, x2) is updated at each step
def train_2d(trainer, steps=20):
    x1, x2 = -5, -2
    results = [(x1, x2)]
    for i in range(steps):
        x1, x2 = trainer(x1, x2)
        results.append((x1, x2))
    print('epoch %d, x1 %f, x2 %f' % (i + 1, x1, x2))
    return results

# show_trace_2d plots the trajectory over the contour lines of the objective
# (the negative gradient is perpendicular to the contours)
def show_trace_2d(f, results):
    d2l.plt.plot(*zip(*results), '-o', color='#ff7f0e')
    x1, x2 = np.meshgrid(np.arange(-5.5, 1.0, 0.1), np.arange(-3.0, 1.0, 0.1))
    d2l.plt.contour(x1, x2, f(x1, x2), colors='#1f77b4')
    d2l.plt.xlabel('x1')
    d2l.plt.ylabel('x2')

e.g.

$$
f(x) = x_1^2 + 2x_2^2
$$

eta = 0.1

def f_2d(x1, x2):  # objective function
    return x1 ** 2 + 2 * x2 ** 2

def gd_2d(x1, x2):
    return (x1 - eta * 2 * x1, x2 - eta * 4 * x2)

show_trace_2d(f_2d, train_2d(gd_2d))

Adaptive Methods

Newton's Method

Advantages:

In gradient descent, choosing a good step size is difficult.

Newton's method, in effect, uses the Hessian matrix to adjust the step size automatically.

In Newton's method, the problem of local minima can also be dealt with by adjusting the learning rate.

Taylor expansion at $\mathbf{x} + \epsilon$:

$$
f(\mathbf{x}+\epsilon)=f(\mathbf{x})+\epsilon^{\top} \nabla f(\mathbf{x})+\frac{1}{2} \epsilon^{\top} \nabla \nabla^{\top} f(\mathbf{x}) \epsilon+\mathcal{O}\left(|\epsilon|^{3}\right)
$$

At a minimum, $\nabla f(\mathbf{x})=0$, so we want $\nabla f(\mathbf{x} + \epsilon)=0$. Differentiating the expansion above with respect to $\epsilon$ and ignoring higher-order terms gives:

$$
\nabla f(\mathbf{x})+\boldsymbol{H}_{f} \boldsymbol{\epsilon}=0 \text { and hence } \epsilon=-\boldsymbol{H}_{f}^{-1} \nabla f(\mathbf{x})
$$

Newton's method requires computing the inverse of the Hessian matrix, which is computationally expensive.
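
As a small sketch of my own (not in the original post; the names x_k, H and grad are mine), a single multivariate Newton update on the earlier quadratic $f(x_1, x_2) = x_1^2 + 2x_2^2$, whose Hessian is the constant matrix $\operatorname{diag}(2, 4)$, lands exactly on the minimizer:

x_k = np.array([-5.0, -2.0])
H = np.array([[2.0, 0.0],
              [0.0, 4.0]])                 # Hessian of x1**2 + 2*x2**2
grad = np.array([2 * x_k[0], 4 * x_k[1]])  # gradient at x_k
x_k = x_k - np.linalg.solve(H, grad)       # epsilon = -H^{-1} grad
print(x_k)                                 # [0. 0.]: the minimizer, reached in one step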

c = 0.5

def f(x):
    return np.cosh(c * x)  # Objective

def gradf(x):
    return c * np.sinh(c * x)  # Derivative

def hessf(x):
    return c**2 * np.cosh(c * x)  # Hessian

# Hide learning rate for now
def newton(eta=1):
    x = 10
    results = [x]
    for i in range(10):
        x -= eta * gradf(x) / hessf(x)
        results.append(x)
    print('epoch 10, x:', x)
    return results

show_trace(newton())
# When there are local minima, Newton's method can behave
# just like gradient descent; the proper fix is still to lower the learning rate.
c = 0.15 * np.pi

def f(x):
    return x * np.cos(c * x)

def gradf(x):
    return np.cos(c * x) - c * x * np.sin(c * x)

def hessf(x):
    return - 2 * c * np.sin(c * x) - x * c**2 * np.cos(c * x)

show_trace(newton())

show_trace(newton(0.5))

Convergence Analysis

We only consider the rate of convergence when the function is convex and $f''(x^{*}) > 0$ at the minimizer:

Let $x_k$ be the value of $x$ after the $k$-th iteration, and let $e_{k}:=x_{k}-x^{*}$ denote the distance from $x_k$ to the minimizer $x^{*}$. Since $f'(x^{*}) = 0$:

$$
0=f^{\prime}\left(x_{k}-e_{k}\right)=f^{\prime}\left(x_{k}\right)-e_{k} f^{\prime \prime}\left(x_{k}\right)+\frac{1}{2} e_{k}^{2} f^{\prime \prime \prime}\left(\xi_{k}\right) \quad \text{ for some } \xi_{k} \in\left[x_{k}-e_{k}, x_{k}\right]
$$

Dividing both sides by $f''(x_k)$:

$$
e_{k}-f^{\prime}\left(x_{k}\right) / f^{\prime \prime}\left(x_{k}\right)=\frac{1}{2} e_{k}^{2} f^{\prime \prime \prime}\left(\xi_{k}\right) / f^{\prime \prime}\left(x_{k}\right)
$$

Substituting the update equation $x_{k+1} = x_{k} - f^{\prime}\left(x_{k}\right) / f^{\prime \prime}\left(x_{k}\right)$ gives:

$$
x_k - x^{*} - f^{\prime}\left(x_{k}\right) / f^{\prime \prime}\left(x_{k}\right) =\frac{1}{2} e_{k}^{2} f^{\prime \prime \prime}\left(\xi_{k}\right) / f^{\prime \prime}\left(x_{k}\right)
$$

$$
x_{k+1} - x^{*} = e_{k+1} = \frac{1}{2} e_{k}^{2} f^{\prime \prime \prime}\left(\xi_{k}\right) / f^{\prime \prime}\left(x_{k}\right)
$$

Whenever $\frac{1}{2} f^{\prime \prime \prime}\left(\xi_{k}\right) / f^{\prime \prime}\left(x_{k}\right) \leq c$ for some constant $c$, we have:

$$
e_{k+1} \leq c e_{k}^{2}
$$
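
In other words, the error roughly squares at every step (quadratic convergence). As a quick numerical check of my own (not in the original post; the name xk is mine), run Newton's method on the convex function $f(x) = \cosh(cx)$ from above, whose minimizer is $x^{*} = 0$:

c = 0.5
xk = 2.0
for k in range(5):
    xk -= (c * np.sinh(c * xk)) / (c**2 * np.cosh(c * xk))  # Newton step
    print('iteration %d, error %.3e' % (k + 1, abs(xk)))
# The number of correct digits roughly doubles per iteration, matching e_{k+1} <= c * e_k^2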

Preconditioning (gradient descent aided by the Hessian)

$$
\mathbf{x} \leftarrow \mathbf{x}-\eta \operatorname{diag}\left(H_{f}\right)^{-1} \nabla f(\mathbf{x})
$$
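
A minimal sketch of this diagonal preconditioning (my own, not in the original post; the helper name precond_gd_2d is mine), applied to the quadratic $f(x_1, x_2) = x_1^2 + 2x_2^2$ whose Hessian is $\operatorname{diag}(2, 4)$, reusing train_2d and show_trace_2d from above:

eta = 0.5

def precond_gd_2d(x1, x2):
    # Scale each partial derivative by the inverse of the corresponding
    # diagonal entry of the Hessian, diag(2, 4)
    return (x1 - eta * (2 * x1) / 2, x2 - eta * (4 * x2) / 4)

show_trace_2d(f_2d, train_2d(precond_gd_2d))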

Gradient Descent with Line Search (conjugate gradient method)
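
As a rough sketch of my own (covering only the line-search idea, not full conjugate gradients; the helper name line_search_gd_2d is mine): instead of a fixed learning rate, each step can use the step size that exactly minimizes the objective along the negative-gradient direction. For the quadratic f_2d above this step size has a closed form:

def line_search_gd_2d(x1, x2):
    # Exact line search for f(x1, x2) = x1**2 + 2*x2**2 (Hessian H = diag(2, 4)):
    # the minimizing step size along -g is t = (g.g) / (g.H.g)
    g1, g2 = 2 * x1, 4 * x2
    t = (g1 ** 2 + g2 ** 2) / (2 * g1 ** 2 + 4 * g2 ** 2 + 1e-12)
    return (x1 - t * g1, x2 - t * g2)

show_trace_2d(f_2d, train_2d(line_search_gd_2d))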

Stochastic Gradient Descent

Parameter Update in Stochastic Gradient Descent

For a training dataset with $n$ examples, let $f_i(\mathbf{x})$ be the loss function of the $i$-th example. The objective function is then:

$$
f(\mathbf{x})=\frac{1}{n} \sum_{i=1}^{n} f_{i}(\mathbf{x})
$$

Its gradient is:

$$
\nabla f(\mathbf{x})=\frac{1}{n} \sum_{i=1}^{n} \nabla f_{i}(\mathbf{x})
$$

The gradient of a single example is an unbiased estimate of the full gradient.

One update using the full gradient has time complexity $\mathcal{O}(n)$.

The stochastic gradient descent update, which costs only $\mathcal{O}(1)$ per step, samples an index $i$ uniformly at random and uses:

$$
\mathbf{x} \leftarrow \mathbf{x}-\eta \nabla f_{i}(\mathbf{x})
$$

and, since $i$ is drawn uniformly at random, we have:

$$
\mathbb{E}_{i} \nabla f_{i}(\mathbf{x})=\frac{1}{n} \sum_{i=1}^{n} \nabla f_{i}(\mathbf{x})=\nabla f(\mathbf{x})
$$
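
As a quick sanity check of my own (not in the original post; the toy data x_data, y_data and the scalar parameter w are mine), take the least-squares loss $f_i(w) = \frac{1}{2}(x_i w - y_i)^2$ and compare the full gradient with the average of uniformly sampled per-example gradients:

np.random.seed(0)
x_data, y_data = np.random.randn(1000), np.random.randn(1000)
w = 0.5
full_grad = np.mean((x_data * w - y_data) * x_data)   # gradient of f(w)
idx = np.random.randint(0, 1000, size=100000)         # uniformly sampled indices i
sample_grads = (x_data[idx] * w - y_data[idx]) * x_data[idx]
print(full_grad, sample_grads.mean())                 # the two values nearly coincide
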
e.g.

$$
f(x_1, x_2) = x_1^2 + 2 x_2^2
$$

def f(x1, x2):
    return x1 ** 2 + 2 * x2 ** 2  # Objective

def gradf(x1, x2):
    return (2 * x1, 4 * x2)  # Gradient

def sgd(x1, x2):  # Simulate noisy gradient
    global lr  # Learning rate scheduler
    (g1, g2) = gradf(x1, x2)  # Compute gradient
    (g1, g2) = (g1 + np.random.normal(0.1), g2 + np.random.normal(0.1))  # Add noise
    eta_t = eta * lr()  # Learning rate at time t
    return (x1 - eta_t * g1, x2 - eta_t * g2)  # Update variables

eta = 0.1
lr = (lambda: 1)  # Constant learning rate
show_trace_2d(f, train_2d(sgd, steps=50))

Dynamic Learning Rate

  • Use a relatively large learning rate at the start to speed up convergence
  • The learning rate can be designed to decay exponentially or polynomially
  • After optimization has run for a while, reduce the learning rate appropriately to avoid oscillation

$$
\begin{array}{ll}{\eta(t)=\eta_{i} \text { if } t_{i} \leq t \leq t_{i+1}} & {\text { piecewise constant }} \\ {\eta(t)=\eta_{0} \cdot e^{-\lambda t}} & {\text { exponential }} \\ {\eta(t)=\eta_{0} \cdot(\beta t+1)^{-\alpha}} & {\text { polynomial }}\end{array}
$$

def exponential():
    global ctr
    ctr += 1
    return math.exp(-0.1 * ctr)

ctr = 1
lr = exponential  # Set up learning rate
show_trace_2d(f, train_2d(sgd, steps=1000))
def polynomial():
    # ctr counts the iterations
    global ctr
    ctr += 1
    return (1 + 0.1 * ctr)**(-0.5)

ctr = 1
lr = polynomial  # Set up learning rate
show_trace_2d(f, train_2d(sgd, steps=50))
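
The piecewise-constant schedule from the table above can be plugged into the same lr hook; as a small sketch of my own (not in the original post; the helper name piecewise_constant is mine), keep the full rate for the first 25 steps and then cut it by a factor of 10:

def piecewise_constant():
    global ctr
    ctr += 1
    return 1.0 if ctr < 25 else 0.1

ctr = 1
lr = piecewise_constant  # Set up learning rate
show_trace_2d(f, train_2d(sgd, steps=50))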

Minibatch Stochastic Gradient Descent

Reading the Data

def get_data_ch7():
    data = np.genfromtxt('/home/kesci/input/airfoil4755/airfoil_self_noise.dat', delimiter='\t')
    data = (data - data.mean(axis=0)) / data.std(axis=0)  # standardize each column
    # first 1500 examples (each with 5 features) and their labels
    return torch.tensor(data[:1500, :-1], dtype=torch.float32), \
           torch.tensor(data[:1500, -1], dtype=torch.float32)

features, labels = get_data_ch7()
features.shape  # torch.Size([1500, 5]): 1500 examples, 5 features each

Visualizing the Data

import pandas as pd
df = pd.read_csv('path to airfoil_self_noise.dat', delimiter='\t', header=None)
df.head(10)

The Stochastic Gradient Descent (SGD) Function

def sgd(params, states, hyperparams):
    for p in params:
        p.data -= hyperparams['lr'] * p.grad.data

Training

def train_ch7(optimizer_fn, states, hyperparams, features, labels,
              batch_size=10, num_epochs=2):
    # Initialize the model
    net, loss = d2l.linreg, d2l.squared_loss

    w = torch.nn.Parameter(torch.tensor(np.random.normal(0, 0.01, size=(features.shape[1], 1)), dtype=torch.float32),
                           requires_grad=True)
    b = torch.nn.Parameter(torch.zeros(1, dtype=torch.float32), requires_grad=True)

    def eval_loss():
        return loss(net(features, w, b), labels).mean().item()

    ls = [eval_loss()]
    data_iter = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, labels), batch_size, shuffle=True)

    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            l = loss(net(X, w, b), y).mean()  # use the mean loss

            # Reset the gradients
            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()

            l.backward()
            optimizer_fn([w, b], states, hyperparams)  # update the model parameters
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())  # record the training error every 100 examples
    # Print the result and plot the curve
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    d2l.set_figsize()
    d2l.plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    d2l.plt.xlabel('epoch')
    d2l.plt.ylabel('loss')

Testing

def train_sgd(lr, batch_size, num_epochs=2):
    train_ch7(sgd, None, {'lr': lr}, features, labels, batch_size, num_epochs)

Result

  • train_sgd(1, 1500, 6): with batch_size equal to the full dataset (1500 examples), each update uses the exact gradient, i.e. batch gradient descent
  • train_sgd(0.005, 1): with batch_size 1, each update uses a single example, i.e. plain stochastic gradient descent
  • train_sgd(0.05, 10): with batch_size 10, minibatch stochastic gradient descent, a compromise between per-update cost and gradient noise

Simplified Implementation (using PyTorch built-ins)

def train_pytorch_ch7(optimizer_fn, optimizer_hyperparams, features, labels,
                      batch_size=10, num_epochs=2):
    # Initialize the model
    net = nn.Sequential(
        nn.Linear(features.shape[-1], 1)
    )
    loss = nn.MSELoss()
    optimizer = optimizer_fn(net.parameters(), **optimizer_hyperparams)

    def eval_loss():
        return loss(net(features).view(-1), labels).item() / 2

    ls = [eval_loss()]
    data_iter = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(features, labels), batch_size, shuffle=True)

    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            # Divide by 2 to stay consistent with train_ch7, whose squared_loss divides by 2
            l = loss(net(X).view(-1), y) / 2

            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())
    # Print the result and plot the curve
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    d2l.set_figsize()
    d2l.plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    d2l.plt.xlabel('epoch')
    d2l.plt.ylabel('loss')

train_pytorch_ch7(optim.SGD, {"lr": 0.05}, features, labels, 10)
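
The same wrapper works with any other optimizer from torch.optim; for example (a usage sketch of my own, not a run reported in the original post):

train_pytorch_ch7(optim.Adam, {"lr": 0.01}, features, labels, 10)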