就深层次 CNN 的结构进一步探讨归一化和残差网络。
处理后的任意一个特征在数据集中所有样本上的均值为0、标准差为1。 标准化处理输入数据使各个特征的分布相近
全连接: $$ \boldsymbol{x} = \boldsymbol{W}\boldsymbol{u} + \boldsymbol{b} \\ output = \phi(\boldsymbol{x}) $$
批量归一化: $$ output=\phi(\text{BN}(\boldsymbol{x}))$$
$$ \boldsymbol{y}^{(i)} = \text{BN}(\boldsymbol{x}^{(i)}) $$
$$ \boldsymbol{\mu}_\mathcal{B} \leftarrow \frac{1}{m}\sum_{i = 1}^{m} \boldsymbol{x}^{(i)}, $$
$$ \boldsymbol{\sigma}_\mathcal{B}^2 \leftarrow \frac{1}{m} \sum_{i=1}^{m}(\boldsymbol{x}^{(i)} - \boldsymbol{\mu}_\mathcal{B})^2, $$
$$ \hat{\boldsymbol{x}}^{(i)} \leftarrow \frac{\boldsymbol{x}^{(i)} - \boldsymbol{\mu}_\mathcal{B}}{\sqrt{\boldsymbol{\sigma}_\mathcal{B}^2 + \epsilon}}, $$
标准化处理:这里 $\epsilon > 0$ 是个很小的常数,保证分母大于 0。
$$ {\boldsymbol{y}}^{(i)} \leftarrow \boldsymbol{\gamma} \odot \hat{\boldsymbol{x}}^{(i)} + \boldsymbol{\beta}. $$
引入可学习参数:拉伸参数γ和偏移参数β。若$\boldsymbol{\gamma} = \sqrt{\boldsymbol{\sigma}_\mathcal{B}^2 + \epsilon}$和$\boldsymbol{\beta} = \boldsymbol{\mu}_\mathcal{B}$,批量归一化无效。
计算:对单通道,$batchsize = m$,卷积计算输出大小为 $p \times q$。
对该通道中 $m\times p\times q$ 个元素同时做批量归一化,使用相同的均值和方差。
训练:以 batch 为单位, 对每个 batch 计算均值和方差。
# Implementing the batch_norm function
import time
import torch
from torch import nn, optim
import torch.nn.functional as F
import torchvision
import sys
sys.path.append("path to file storge d2lzh1981")
import d2lzh1981 as d2l

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Batch-normalize ``X`` and return the updated running statistics.

    Args:
        is_training: When false, ``X`` is normalized with ``moving_mean`` and
            ``moving_var``. When true, it is normalized with the statistics
            of the current batch and the running statistics are updated.
        X: 2-D input (statistics taken over dim 0) or 4-D input (statistics
            taken over dims 0, 2 and 3, i.e. one value per index of dim 1).
        gamma: Scale applied to the normalized values.
        beta: Shift added after scaling.
        moving_mean: Running estimate of the mean.
        moving_var: Running estimate of the variance.
        eps: Constant added to the variance before the square root.
        momentum: Weight kept on the previous running estimate; the current
            batch statistic receives ``1 - momentum``.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)``.

    Raises:
        AssertionError: In training mode, if ``X`` is neither 2-D nor 4-D.
    """
    if not is_training:
        # Inference: normalize with the accumulated running statistics.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # keepdim=True leaves a (1, C, 1, 1) result that broadcasts over X.
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # The running statistics are bookkeeping, not part of the loss.
        # Computing them outside autograd keeps them from holding on to the
        # graph of every previous iteration.
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta
    return Y, moving_mean, moving_var
# The BatchNorm layer class
class BatchNorm(nn.Module):
    """Batch-normalization layer built on the ``batch_norm`` function.

    Args:
        num_features: Size of dim 1 of the expected input.
        num_dims: ``2`` selects parameters of shape ``(1, num_features)``;
            any other value selects ``(1, num_features, 1, 1)``.
    """

    def __init__(self, num_features, num_dims):
        super(BatchNorm, self).__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # Learnable scale and shift, starting as the identity transform.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Running statistics are plain tensors, not parameters.
        self.moving_mean = torch.zeros(shape)
        # Start the variance at one: a zero start would make evaluation
        # before any training step divide by sqrt(eps).
        self.moving_var = torch.ones(shape)

    def forward(self, X):
        # The running statistics are not moved by Module.to(), so follow
        # the input's device by hand.
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        Y, moving_mean, moving_var = batch_norm(
            self.training, X, self.gamma, self.beta,
            self.moving_mean, self.moving_var,
            eps=1e-5, momentum=0.9)
        # Store detached copies so the layer never retains autograd history
        # from one forward pass to the next.
        self.moving_mean = moving_mean.detach()
        self.moving_var = moving_var.detach()
        return Y
# Applying the custom BatchNorm layer to LeNet
net = nn.Sequential(
    nn.Conv2d(1, 6, 5),  # in_channels, out_channels, kernel_size
    BatchNorm(6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),  # kernel_size, stride
    nn.Conv2d(6, 16, 5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),
    d2l.FlattenLayer(),
    # NOTE(review): 16*4*4 presumably matches 28x28 inputs -- confirm
    # against the data pipeline.
    nn.Linear(16 * 4 * 4, 120),
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(84, 10)
)
print(net)
# Loading the data
batch_size = 16


def load_data_fashion_mnist(batch_size, resize=None, root='/home/kesci/input/FashionMNIST2065'):
    """Download the fashion mnist dataset and then load into memory.

    Args:
        batch_size: Batch size used by both data loaders.
        resize: Optional size passed to ``torchvision.transforms.Resize``;
            no resizing is applied when it is falsy.
        root: Directory handed to ``torchvision.datasets.FashionMNIST``.

    Returns:
        Tuple ``(train_iter, test_iter)`` of ``DataLoader`` objects. Only
        the training loader shuffles.
    """
    trans = []
    if resize:
        trans.append(torchvision.transforms.Resize(size=resize))
    trans.append(torchvision.transforms.ToTensor())

    transform = torchvision.transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(
        root=root, train=True, download=True, transform=transform)
    mnist_test = torchvision.datasets.FashionMNIST(
        root=root, train=False, download=True, transform=transform)

    train_iter = torch.utils.data.DataLoader(
        mnist_train, batch_size=batch_size, shuffle=True, num_workers=2)
    test_iter = torch.utils.data.DataLoader(
        mnist_test, batch_size=batch_size, shuffle=False, num_workers=2)
    return train_iter, test_iter


train_iter, test_iter = load_data_fashion_mnist(batch_size)
# Train and test
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
# NOTE(review): train_ch5 comes from the external d2l helper package; it
# presumably runs the train/eval loop on `device` -- confirm there.
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
# In your own applications there is no need to write the class and function
# by hand: the built-in nn.BatchNorm2d / nn.BatchNorm1d layers are used here.
net = nn.Sequential(
    nn.Conv2d(1, 6, 5),  # in_channels, out_channels, kernel_size
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),  # kernel_size, stride
    nn.Conv2d(6, 16, 5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),
    d2l.FlattenLayer(),
    nn.Linear(16 * 4 * 4, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10)
)

optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
残差网络(ResNet) 深层网络能够拟合出的映射就一定能够包含浅层网络拟合出的映射
但 CNN 模型在建立的时候并不是越深越好
残差块(Residual Block) 恒等映射:
左边:$f(x)=x$;右边:$f(x)-x=0$(易于捕捉恒等映射的细微波动;易于优化)
图:神经网络普通层(左)与残差网络(右)
在残差块中,输入可通过跨层的数据线路更快地向前传播。
# Residual block implementation
class Residual(nn.Module):
    """Residual block: two 3x3 conv + batch-norm layers plus a shortcut.

    Args:
        in_channels: Channels of the input.
        out_channels: Channels produced by both 3x3 convolutions.
        use_1x1conv: When true, the shortcut goes through a 1x1 convolution
            (with the same ``stride``) so its shape can match the main path.
        stride: Stride of the first 3x3 convolution and of the 1x1 shortcut.
    """

    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        # Explicit None check instead of relying on module truthiness.
        if self.conv3 is not None:
            X = self.conv3(X)
        # Add the shortcut before the final activation.
        return F.relu(Y + X)
# Whether a 1x1 convolution layer is needed
blk = Residual(3, 3)
X = torch.rand((4, 3, 6, 6))
blk(X).shape  # torch.Size([4, 3, 6, 6]): same shape as the input

blk = Residual(3, 6, use_1x1conv=True, stride=2)
blk(X).shape  # torch.Size([4, 6, 3, 3]): channels doubled, height/width halved
# ResNet model, simplified implementation: the stem
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# Residual stages
def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    """Build one stage made of ``num_residuals`` residual blocks.

    Args:
        in_channels: Channels entering the stage.
        out_channels: Channels produced by every block of the stage.
        num_residuals: Number of ``Residual`` blocks in the stage.
        first_block: When true, no block in the stage uses the stride-2
            1x1-conv shortcut, and ``in_channels`` must equal
            ``out_channels``.

    Returns:
        An ``nn.Sequential`` holding the blocks.

    Raises:
        AssertionError: If ``first_block`` is true and the channel counts
            differ.
    """
    if first_block:
        assert in_channels == out_channels
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            # Opening block of a later stage: changes the channel count and
            # uses stride 2.
            blk.append(Residual(in_channels, out_channels, use_1x1conv=True, stride=2))
        else:
            blk.append(Residual(out_channels, out_channels))
    return nn.Sequential(*blk)


net.add_module("resnet_block1", resnet_block(64, 64, 2, first_block=True))
net.add_module("resnet_block2", resnet_block(64, 128, 2))
net.add_module("resnet_block3", resnet_block(128, 256, 2))
net.add_module("resnet_block4", resnet_block(256, 512, 2))
# Global average pooling followed by the fully connected output layer
net.add_module("global_avg_pool", d2l.GlobalAvgPool2d())
net.add_module("fc", nn.Sequential(d2l.FlattenLayer(), nn.Linear(512, 10)))
# Train and test
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
ResNet 的引申设计
特征:concat 连接
稠密块(dense block): 定义了输入和输出是如何连结的。
过渡层(transition layer):用来控制通道数,使之不过大。稠密块
def conv_block(in_channels, out_channels):
    """Return a BatchNorm -> ReLU -> 3x3 Conv (padding 1) sequence."""
    blk = nn.Sequential(nn.BatchNorm2d(in_channels),
                        nn.ReLU(),
                        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
    return blk


class DenseBlock(nn.Module):
    """Dense block: each conv block's output is concatenated onto its input.

    Args:
        num_convs: Number of conv blocks.
        in_channels: Channels of the block's input.
        out_channels: Channels added by each conv block.

    Attributes:
        out_channels: Total channels of the block's output,
            ``in_channels + num_convs * out_channels``.
    """

    def __init__(self, num_convs, in_channels, out_channels):
        super(DenseBlock, self).__init__()
        net = []
        for i in range(num_convs):
            # Block i sees the original input plus the outputs of all
            # previous blocks.
            in_c = in_channels + i * out_channels
            net.append(conv_block(in_c, out_channels))
        self.net = nn.ModuleList(net)
        self.out_channels = in_channels + num_convs * out_channels

    def forward(self, X):
        for blk in self.net:
            Y = blk(X)
            # Concatenate input and output along the channel dimension.
            X = torch.cat((X, Y), dim=1)
        return X


blk = DenseBlock(2, 3, 10)
X = torch.rand(4, 3, 8, 8)
Y = blk(X)
Y.shape  # torch.Size([4, 23, 8, 8]): 3 + 2 * 10 channels


def transition_block(in_channels, out_channels):
    """Return a BatchNorm -> ReLU -> 1x1 Conv -> 2x2 AvgPool (stride 2) sequence.

    The 1x1 convolution sets the channel count to ``out_channels`` and the
    average pooling halves height and width.
    """
    blk = nn.Sequential(
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels, out_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2))
    return blk


blk = transition_block(23, 10)
blk(Y).shape  # torch.Size([4, 10, 4, 4])
# DenseNet model: the stem
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# num_channels tracks the current channel count; growth_rate is the number
# of channels each conv block adds inside a dense block.
num_channels, growth_rate = 64, 32
num_convs_in_dense_blocks = [4, 4, 4, 4]

for i, num_convs in enumerate(num_convs_in_dense_blocks):
    DB = DenseBlock(num_convs, num_channels, growth_rate)
    net.add_module("DenseBlosk_%d" % i, DB)
    # Output channels of the dense block just added.
    num_channels = DB.out_channels
    # Between dense blocks (but not after the last one), add a transition
    # layer that halves the channel count.
    if i != len(num_convs_in_dense_blocks) - 1:
        net.add_module("transition_block_%d" % i, transition_block(num_channels, num_channels // 2))
        num_channels = num_channels // 2
net.add_module("BN", nn.BatchNorm2d(num_channels))
net.add_module("relu", nn.ReLU())
net.add_module("global_avg_pool", d2l.GlobalAvgPool2d())
net.add_module("fc", nn.Sequential(d2l.FlattenLayer(), nn.Linear(num_channels, 10)))

# Push a dummy single-channel 96x96 input through the network one child at
# a time and print the output shape after each.
X = torch.rand((1, 1, 96, 96))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)
batch_size = 16
# Reload the data resized to 96x96, the input size used for the shape check
# of this network.
train_iter, test_iter = load_data_fashion_mnist(batch_size, resize=96)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
Copyright (c) 2019 CC-BY-NC-4.0 LICENSE
How to thought, there is what kind of life.