import os
import json
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import torchvision
from torchvision import models
from torch.utils.data import Dataset
from torchvision import transforms
from torch.utils.data import DataLoader
import visdom
# from tensorboardX import SummaryWriter
from torch.utils.tensorboard import SummaryWriter
Fully connected layer
nn.Linear(in_features, out_features, bias=True)
>>> linear = nn.Linear(784, 10)
>>> input = torch.randn(4, 784)
>>> output = linear(input)
>>> output.shape
torch.Size([4, 10])
Convolutional layer
nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0,
dilation=1, groups=1, bias=True, padding_mode='zeros')
- dilation: dilated (atrous) convolution; a value greater than 1 enlarges the receptive field while keeping the feature-map size (see the short sketch after this list)
- groups: grouped convolution; instead of connecting every output channel to every input channel, the input channels are split into groups, and this sparse connectivity reduces the amount of computation
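A minimal sketch of both parameters (shapes only; the layer sizes are just for illustration): with kernel_size=3 and dilation=2 the effective receptive field becomes 5×5, so padding=2 keeps the spatial size; with groups=4, each output channel connects to only in_channels/groups input channels, which shows up directly in the weight shape.
>>> dilated = nn.Conv2d(1, 1, 3, stride=1, padding=2, dilation=2)
>>> dilated(torch.randn(1, 1, 16, 16)).shape   # spatial size unchanged
torch.Size([1, 1, 16, 16])
>>> grouped = nn.Conv2d(8, 8, 3, padding=1, groups=4)
>>> grouped.weight.shape                       # (out_channels, in_channels/groups, kH, kW)
torch.Size([8, 2, 3, 3])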
The kernel weights and bias of a convolution can be inspected via .weight and .bias:
>>> conv = nn.Conv2d(1, 1, 3, 1, 1)
>>> conv.weight.shape
torch.Size([1, 1, 3, 3])
>>> conv.bias.shape
torch.Size([1])
The input feature map must be given in the form (N, C, H, W):
>>> input = torch.randn(1, 1, 5, 5)
>>> output = conv(input)
>>> output.shape
torch.Size([1, 1, 5, 5])
Pooling layers
Max pooling layer
nn.MaxPool2d(kernel_size, stride=None, padding=0,
dilation=1, return_indices=False, ceil_mode=False)
- return_indices – if True, will return the max indices along with the outputs
- ceil_mode – when True, will use ceil instead of floor to compute the output shape
- stride – note: the default value of stride is kernel_size, not 1
>>> max_pooling = nn.MaxPool2d(2, stride=2)
>>> input = torch.randn(1, 1, 4, 4)
>>> max_pooling(input)
tensor([[[[0.9636, 0.7075],
          [1.0641, 1.1749]]]])
>>> max_pooling(input).shape
torch.Size([1, 1, 2, 2])
Average pooling layer
nn.AvgPool2d(kernel_size, stride=None, padding=0,
ceil_mode=False, count_include_pad=True, divisor_override=None)
If padding is non-zero, then the input is implicitly zero-padded on both sides by padding number of points.
- ceil_mode – when True, will use ceil instead of floor to compute the output shape
- count_include_pad – when True, will include the zero-padding in the averaging calculation (see the short example below)
- divisor_override – if specified, it will be used as the divisor; otherwise kernel_size will be used
The parameters kernel_size, stride and padding can each be either:
- a single int – in which case the same value is used for the height and width dimensions
- a tuple of two ints – in which case the first int is used for the height dimension, and the second int for the width dimension
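A small sketch of count_include_pad (the all-ones input is made up so the effect is easy to see): with padding=1, each corner window contains one real element and three zero-padded ones, so the two settings give different averages.
>>> input = torch.ones(1, 1, 2, 2)
>>> nn.AvgPool2d(2, stride=2, padding=1, count_include_pad=True)(input)
tensor([[[[0.2500, 0.2500],
          [0.2500, 0.2500]]]])
>>> nn.AvgPool2d(2, stride=2, padding=1, count_include_pad=False)(input)
tensor([[[[1., 1.],
          [1., 1.]]]])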
Global average pooling layer
nn.Sequential(
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten()
)
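A quick shape check (illustrative sizes): this head turns an (N, C, H, W) feature map into (N, C) regardless of the spatial size, which is why it is commonly placed before the final classifier.
>>> gap = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten())
>>> gap(torch.randn(4, 512, 7, 7)).shape
torch.Size([4, 512])
>>> gap(torch.randn(4, 512, 13, 9)).shape   # any spatial size works
torch.Size([4, 512])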
Activation function layers
Of course, the layers below can also be replaced by the corresponding functions in torch.nn.functional.
Sigmoid layer
nn.Sigmoid()
>>> sigmoid = nn.Sigmoid()
>>> sigmoid(torch.Tensor([1, 1, 2, 2]))
tensor([0.7311, 0.7311, 0.8808, 0.8808])
ReLU layer
nn.ReLU(inplace=False)
>>> relu = nn.ReLU(inplace=True)
>>> input = torch.randn(2, 2)
>>> input
tensor([[-0.4853, 2.3864],
[ 0.7122, -0.6493]])
>>> relu(input)
tensor([[0.0000, 2.3864],
[0.7122, 0.0000]])
>>> input
tensor([[0.0000, 2.3864],
[0.7122, 0.0000]])
Softmax layer
nn.Softmax(dim=None)
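dim selects the dimension along which the exponentials are normalized; for an (N, C) score tensor this is normally dim=1, so that each row becomes a probability distribution. A small example:
>>> softmax = nn.Softmax(dim=1)
>>> softmax(torch.tensor([[1.0, 2.0, 3.0]]))   # each row sums to 1
tensor([[0.0900, 0.2447, 0.6652]])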
LogSoftmax layer
nn.LogSoftmax(dim=None)
Followed by an nn.NLLLoss layer, it is equivalent to a CrossEntropyLoss layer (a quick numerical check follows).
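A quick check of this equivalence (random scores, so only the equality itself matters):
>>> scores = torch.randn(3, 5)
>>> target = torch.tensor([0, 2, 4])
>>> log_softmax, nll, ce = nn.LogSoftmax(dim=1), nn.NLLLoss(), nn.CrossEntropyLoss()
>>> torch.allclose(nll(log_softmax(scores), target), ce(scores, target))
True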
Dropout layer
nn.Dropout(p=0.5, inplace=False)
>>> dropout = nn.Dropout(0.5, inplace=False)
>>> input = torch.randn(1, 20)
>>> output = dropout(input)
>>> output
tensor([[-2.9413, 0.0000, 1.8461, 1.9605, 0.2774, -0.0000, -2.5381, -2.0313,
-0.1914, 0.0000, 0.5346, -0.0000, 0.0000, 4.4960, -3.8345, -1.0938,
4.3297, 2.1258, -4.1431, 0.0000]])
>>> input
tensor([[-1.4707, 0.5105, 0.9231, 0.9802, 0.1387, -0.4195, -1.2690, -1.0156,
-0.0957, 0.8108, 0.2673, -2.0898, 0.6666, 2.2480, -1.9173, -0.5469,
2.1648, 1.0629, -2.0716, 0.9974]])
BN (Batch Normalization) layer
torch.nn.BatchNorm2d(num_features, eps=1e-05, momentum=0.1,
affine=True, track_running_stats=True)
- num_features – C from an expected input of size (N, C, H, W)
- eps – a value added to the denominator for numerical stability. Default: 1e-5
- momentum – the value used for the running_mean and running_var computation. Can be set to None for cumulative moving average (i.e. simple average). Default: 0.1
- affine – a boolean value that when set to True, this module has learnable affine parameters. Default: True
- track_running_stats – a boolean value that when set to True, this module tracks the running mean and variance, and when set to False, this module does not track such statistics and always uses batch statistics in both training and eval modes. Default: True
Because the Batch Normalization is done over the C dimension, computing statistics on (N, H, W) slices, it is common terminology to call this Spatial Batch Normalization.
The mean and standard deviation are calculated per-dimension over the mini-batches, and γ and β are learnable parameter vectors of size C (where C is the input size). By default, the elements of γ are set to 1 and the elements of β are set to 0.
>>> bn = nn.BatchNorm2d(64)
>>> input = torch.randn(4, 64, 28, 28)
>>> output = bn(input)
>>> output.shape
torch.Size([4, 64, 28, 28])
LSTM layer
nn.LSTM
Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence. In a multilayer LSTM, the input $x^{(l)}_t$ of the $l$-th layer ($l \geq 2$) is the hidden state $h^{(l-1)}_t$ of the previous layer multiplied by dropout $\delta^{(l-1)}_t$, where each $\delta^{(l-1)}_t$ is a Bernoulli random variable which is 0 with probability dropout.
- input_size – The number of expected features in the input x
- hidden_size – The number of features in the hidden state h
- num_layers – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two LSTMs together. Default: 1
- bias – If False, then the layer does not use bias weights b_ih and b_hh. Default: True
- batch_first – If True, then the input and output tensors are provided as (batch, seq, feature) instead of (seq, batch, feature). Note that this does not apply to hidden or cell states. Default: False
- dropout – If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout. Default: 0
- bidirectional – If True, becomes a bidirectional LSTM. Default: False
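A minimal shape sketch with the default batch_first=False, so tensors are laid out as (seq, batch, feature); the sizes here are arbitrary:
>>> lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2)
>>> input = torch.randn(5, 3, 10)    # (seq_len, batch, input_size)
>>> h0 = torch.zeros(2, 3, 20)       # (num_layers, batch, hidden_size)
>>> c0 = torch.zeros(2, 3, 20)
>>> output, (hn, cn) = lstm(input, (h0, c0))
>>> output.shape                     # (seq_len, batch, hidden_size)
torch.Size([5, 3, 20])
>>> hn.shape
torch.Size([2, 3, 20])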
Loss function layers
NLLLoss
nn.NLLLoss(weight=None, size_average=None,
ignore_index=-100, reduce=None, reduction='mean')
- It is useful to train a classification problem with C classes (C = number of classes).
- The input given through a forward call is expected to contain log-probabilities of each class.
- The input has to be a Tensor of size either (N, C) or (N, C, d_1, d_2, ..., d_K) with K ≥ 1 for the K-dimensional case (in the case of images, it computes the NLL loss per-pixel). (N is the mini-batch size.)
- The target that this loss expects should be a class index in the range [0, C-1] where C = number of classes; if ignore_index is specified, this loss also accepts this class index (this index may not necessarily be in the class range).
- Shape of the target: (N), where each value satisfies 0 ≤ target[i] ≤ C-1, or (N, d_1, d_2, ..., d_K) with K ≥ 1 in the case of K-dimensional loss.
- Output: scalar. If reduction is 'none', then the same size as the target: (N), or (N, d_1, d_2, ..., d_K) with K ≥ 1 in the case of K-dimensional loss.
- The unreduced (i.e. with reduction set to 'none') loss can be described as

  $l_n = -w_{y_n} \, x_{n, y_n}, \qquad w_c = \text{weight}[c] \cdot \mathbb{1}\{c \neq \text{ignore\_index}\}$

  where x is the input, y is the target label, weight is the per-class weight applied when computing the loss, and $x_{n, y_n}$ is the log-probability (log score) of the correct class of the n-th sample.
- If reduction is 'mean' (the default), then

  $\ell(x, y) = \sum_{n=1}^{N} \frac{l_n}{\sum_{m=1}^{N} w_{y_m}}$

- If reduction is 'sum', then

  $\ell(x, y) = \sum_{n=1}^{N} l_n$
Parameters
- weight (Tensor, optional) – a manual rescaling weight given to each class. If given, it has to be a Tensor of size C, assigning a weight to each of the classes; this is particularly useful when you have an unbalanced training set. Otherwise, it is treated as if having all ones.
- size_average (bool, optional) – Deprecated
- ignore_index (int, optional) – Specifies a target value that is ignored and does not contribute to the input gradient.
- reduce (bool, optional) – Deprecated
- reduction (string, optional) – Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'
m = nn.LogSoftmax(dim=1)
loss = nn.NLLLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.tensor([1, 0, 4])
output = loss(m(input), target)
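To tie this example back to the formula: with the default reduction='mean' and no class weights, the result is just the mean of the negative log-probabilities picked at the target indices, which can be checked by hand:
>>> log_prob = m(input)                       # (3, 5) log-probabilities
>>> manual = -log_prob[torch.arange(3), target].mean()
>>> torch.allclose(output, manual)
True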
N, C = 5, 4
loss = nn.NLLLoss()
# input is of size N x C x height x width
data = torch.randn(N, C, 8, 8)
m = nn.LogSoftmax(dim=1)
# each element in target has to have 0 <= value < C
target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C)
output = loss(m(data), target)
CrossEntropyLoss
nn.CrossEntropyLoss(weight=None, size_average=None,
ignore_index=-100, reduce=None, reduction='mean')
- This criterion combines nn.LogSoftmax() and nn.NLLLoss() in one single class.
- In effect it is Softmax followed by the cross-entropy loss; when the two are combined, the gradient during backpropagation works out to the neat y − t.
- The parameters have the same meaning as for nn.NLLLoss above, so they are not repeated here.
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)
output.backward()
Optimizers
SGD (including Momentum and Nesterov Momentum)
optim.SGD(params, lr=<required parameter>, momentum=0, dampening=0, weight_decay=0, nesterov=False)
- dampening (float, optional) – dampening for momentum (default: 0)
Question: what exactly does dampening do? To be answered when reading the source code.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# gradients must be cleared before each optimization step
optimizer.zero_grad()
loss.backward()
optimizer.step()
Adagrad
optim.Adagrad(params, lr=0.01, lr_decay=0, weight_decay=0,
initial_accumulator_value=0, eps=1e-10)
- lr (float, optional) – learning rate (default: 1e-2)
- lr_decay (float, optional) – learning rate decay (default: 0)
RMSProp
optim.RMSprop(params, lr=0.01, alpha=0.99, eps=1e-08,
weight_decay=0, momentum=0, centered=False)
- alpha (float, optional) – smoothing constant (default: 0.99)
- momentum (float, optional) – momentum factor (default: 0)
- centered (bool, optional) – if True, compute the centered RMSProp, where the gradient is normalized by an estimate of its variance
alpha should be the RMSProp decay factor that forgets past gradients, so what is this momentum? Again, this can only be answered after reading the source code.
Adadelta
optim.Adadelta(params, lr=1.0, rho=0.9, eps=1e-06, weight_decay=0)
- lr (float, optional) – coefficient that scales delta before it is applied to the parameters (default: 1.0). According to the original Adadelta formulation there should be no lr at all, yet an lr parameter appears here; this also needs a look at the source code to answer.
- rho (float, optional) – coefficient used for computing a running average of squared gradients (default: 0.9)
Adam
optim.Adam(params, lr=0.001, betas=(0.9, 0.999),
eps=1e-08, weight_decay=0, amsgrad=False)
- amsgrad (boolean, optional) – whether to use the AMSGrad variant of this algorithm from the paper "On the Convergence of Adam and Beyond" (default: False)
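A usage sketch following the same pattern as the SGD example above; the tiny linear model and data here are made up purely to make the snippet runnable, and weight_decay/amsgrad are set only to illustrate the parameters.
net = nn.Linear(10, 2)
optimizer = optim.Adam(net.parameters(), lr=1e-3, betas=(0.9, 0.999),
                       weight_decay=1e-4, amsgrad=True)

input = torch.randn(4, 10)
target = torch.tensor([0, 1, 1, 0])
loss = nn.CrossEntropyLoss()(net(input), target)

optimizer.zero_grad()   # clear gradients before each step
loss.backward()
optimizer.step()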