1 MobileNetV2簡介
MobileNetV2是一個輕量型卷積神經網絡,使用深度可分離卷積。
如下圖表示其中一個block的結構,主要包括Expansion layer,Depthwise Convolution,Projection layer。
Expansion layer表示擴展層,使用1x1卷積,目的是將低維空間映射到高維空間。
Projection layer表示投影層,使用1x1卷積,目的是把高維特征映射到低維空間去。
Depthwise Convolution表示深度可分離卷積,完成卷積功能,降低計算量、參數量。
宏觀上看,結構是短連接,內部結構是CBR+CBR+CB,最后一個沒有Relu了,論文中所謂使用了線性激活函數,也就是恒等函數(f(x)=x)的意思。[注釋:CBR表示Conv+BN+Relu]
這種Inverted residuals是一種中間胖,兩頭窄的結構,像一個紡錘形,常規Residual Block結構,是兩頭胖,中間窄的結構。
那Inverted residuals從瘦到胖,胖多少呢?再從胖到瘦,又瘦多少呢?這就涉及到新名詞Expansion factor(擴展系數),它控制著網絡維度,為了保證短連接的形成,一個block中的“胖瘦”系數相同,這個系數通常是6,可改動。如下圖所示。
2 介紹一種常規MobileNetv2結構
如下表所示,t 表示bottleneck中“胖瘦”系數,通道數變為幾倍;c 表示輸出通道數,n 表示這個模塊重複幾次,s 表示stride,步長,控制特征圖尺寸大小,1的話尺寸不變,2的話,尺寸變為原來的一半。
3 MobilenetV2代碼
直接看代碼,可運行,獲取網絡計算量與參數量。
import torch
from torch import nn
# from torchvision.models.utils import load_state_dict_from_url # 低版本pytorch用這個(gè)
from torch.hub import load_state_dict_from_url # 從鏈接中下載模型預(yù)訓(xùn)練權(quán)重
# Download locations of the pretrained checkpoints, keyed by architecture name.
model_urls = dict(
    mobilenet_v2='https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
)
# ------------------------------------------------------#
# 這個(gè)函數(shù)的目的是確保Channel個(gè)數(shù)能被8整除。
# ------------------------------------------------------#
def _make_divisible(v, divisor, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v
# ------------------------------------------------------#
#   Conv + BN + ReLU6 bundled together; it is used so
#   often that it is wrapped into a single module.
#   Argument order: input channels, output channels, ...
# ------------------------------------------------------#
class ConvBNReLU(nn.Sequential):
    """Conv2d -> BatchNorm2d -> ReLU6 as one ``nn.Sequential``.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        kernel_size: Convolution kernel size (default 3).
        stride: Convolution stride (default 1).
        groups: Convolution groups; pass the channel count for a depthwise conv.
    """

    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
        # For odd kernel sizes this padding keeps the spatial size when stride == 1.
        padding = (kernel_size - 1) // 2
        super(ConvBNReLU, self).__init__(
            # No conv bias: the following BatchNorm has its own shift term.
            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
            nn.BatchNorm2d(out_planes),
            # Despite the class name, the activation is ReLU6.
            nn.ReLU6(inplace=True),
        )
# ------------------------------------------------------#
#   InvertedResidual: widen the channels first, then
#   narrow them again.
#   Argument order: input channels, output channels,
#   stride, expansion ratio.
# ------------------------------------------------------#
class InvertedResidual(nn.Module):
    """MobileNetV2 inverted-residual block.

    Layout: optional 1x1 expansion (Conv+BN+ReLU6) -> 3x3 depthwise
    (Conv+BN+ReLU6) -> 1x1 linear projection (Conv+BN, no activation).

    Args:
        inp: Number of input channels.
        oup: Number of output channels.
        stride: Stride of the depthwise convolution; must be 1 or 2.
        expand_ratio: Factor by which ``inp`` is widened inside the block.
    """

    def __init__(self, inp, oup, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2], "stride should be 1 or 2"

        # The "hidden" width is simply input channels * expansion ratio.
        hidden_dim = int(round(inp * expand_ratio))
        # The shortcut is only possible when input and output tensors have the same shape.
        self.use_res_connect = self.stride == 1 and inp == oup

        layers = []
        if expand_ratio != 1:
            # pointwise expansion; skipped when there is nothing to expand
            layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
        layers.extend([
            # depthwise: one group per channel
            ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
            # pointwise-linear projection (no activation afterwards)
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        ])
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the block, adding the input back when the shortcut is enabled."""
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)
class MobileNetV2(nn.Module):
    def __init__(self, num_classes=1000, width_mult=1.0, inverted_residual_setting=None, round_nearest=8):
        """
        MobileNet V2 main class

        Args:
            num_classes (int): Number of classes
            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
            inverted_residual_setting: Network structure, a list of ``[t, c, n, s]`` rows
                (expansion ratio, output channels, number of repeats, stride of the first repeat)
            round_nearest (int): Round the number of channels in each layer to be a multiple of this number
                Set to 1 to turn off rounding

        Raises:
            ValueError: If ``inverted_residual_setting`` is empty or any row does not have 4 elements.
        """
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        input_channel = 32
        last_channel = 1280

        if inverted_residual_setting is None:
            # Shape comments are (H, W, C) for a 416x416 input image.
            inverted_residual_setting = [
                # t, c, n, s
                # 208,208,32 -> 208,208,16
                [1, 16, 1, 1],
                # 208,208,16 -> 104,104,24
                [6, 24, 2, 2],
                # 104,104,24 -> 52,52,32
                [6, 32, 3, 2],
                # 52,52,32 -> 26,26,64
                [6, 64, 4, 2],
                # 26,26,64 -> 26,26,96
                [6, 96, 3, 1],
                # 26,26,96 -> 13,13,160
                [6, 160, 3, 2],
                # 13,13,160 -> 13,13,320
                [6, 320, 1, 1],
            ]

        # Every row must provide exactly t, c, n, s; check them all up front so a
        # malformed row fails here instead of in the unpacking loop below.
        if len(inverted_residual_setting) == 0 or any(len(row) != 4 for row in inverted_residual_setting):
            raise ValueError("inverted_residual_setting should be non-empty "
                             "or a 4-element list, got {}".format(inverted_residual_setting))

        # building first layer
        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
        # The final width is never scaled below 1280 (max(1.0, width_mult)).
        self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
        # 416,416,3 -> 208,208,32
        features = [ConvBNReLU(3, input_channel, stride=2)]
        # building inverted residual blocks
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * width_mult, round_nearest)
            for i in range(n):
                # Only the first repeat of a stage uses the stage stride.
                stride = s if i == 0 else 1
                # `block` is the InvertedResidual class
                features.append(block(input_channel, output_channel, stride, expand_ratio=t))
                input_channel = output_channel
        # building last several layers
        features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
        # make it nn.Sequential
        self.features = nn.Sequential(*features)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, num_classes),
        )

        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """Run the feature extractor, pool, and classify.

        Returns the output of the final linear layer (``num_classes`` values per sample).
        """
        x = self.features(x)
        # Averaging over dims 2 and 3 (the two spatial dims of the conv output)
        # is global average pooling: it leaves one value per channel for the
        # linear classifier.
        x = x.mean([2, 3])
        x = self.classifier(x)
        return x
def mobilenet_v2(pretrained=False, progress=True):
    """Build a MobileNetV2 with the default configuration.

    Args:
        pretrained: If true, fetch the checkpoint listed in ``model_urls``
            (stored under the ``model_data`` directory) and load it into the model.
        progress: Forwarded to ``load_state_dict_from_url`` as its ``progress`` flag.

    Returns:
        The constructed ``MobileNetV2`` instance.
    """
    model = MobileNetV2()
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls['mobilenet_v2'], model_dir="model_data",
                                              progress=progress)
        model.load_state_dict(state_dict)
    return model
if __name__ == "__main__":
    model = mobilenet_v2()
    print(model)

    # ------------------------------------#
    #   Method 1: get the computation cost
    #   and parameter count (torchsummaryX)
    # ------------------------------------#
    from torchsummaryX import summary
    summary(model, torch.zeros(1, 3, 416, 416))

    # ------------------------------------#
    #   Method 2: get the computation cost
    #   and parameter count (thop)
    # ------------------------------------#
    from thop import profile
    # One 3-channel 416x416 image as input; named so it does not shadow the builtin `input`.
    dummy_input = torch.randn(1, 3, 416, 416)
    flops, params = profile(model, (dummy_input,))
    print(flops, params)
4 YOLOv3網絡模型-----backbone可選MobileNetv2和darknet53
可結合【YOLOv3 net】網絡結構及代碼詳解進行閱讀
from collections import OrderedDict
import torch
import torch.nn as nn
from nets.darknet import darknet53 # darknet53的分析可見http://www.lxweimin.com/p/6b4675a9f378
from nets.mobilenet_v2 import mobilenet_v2 # 可見上面的代碼
# --------------------------------------------------#
#   The YOLOv3 FPN detection head needs outputs from
#   three places in the backbone. model.features acts
#   like a list of blocks, so slicing by index yields
#   out3, out4 and out5.
# --------------------------------------------------#
class MobileNetV2(nn.Module):
    """Backbone wrapper that exposes three intermediate MobileNetV2 feature maps.

    Args:
        pretrained: Forwarded to ``mobilenet_v2`` when building the wrapped model.
    """

    def __init__(self, pretrained = False):
        super(MobileNetV2, self).__init__()
        self.model = mobilenet_v2(pretrained=pretrained)

    def forward(self, x):
        """Return ``(out3, out4, out5)``, the outputs after feature blocks 0-6, 7-13 and 14-17.

        Each slice continues from the previous one; entries of ``features``
        from index 18 onward are not used here.
        """
        out3 = self.model.features[:7](x)
        out4 = self.model.features[7:14](out3)
        out5 = self.model.features[14:18](out4)
        return out3, out4, out5
# --------------------------------------------------#
#   Another Conv + BN + activation bundle, this time
#   with LeakyReLU, used by the YOLO head.
# --------------------------------------------------#
def conv2d(filter_in, filter_out, kernel_size):
    """Build a stride-1 Conv2d -> BatchNorm2d -> LeakyReLU(0.1) block.

    Args:
        filter_in: Number of input channels.
        filter_out: Number of output channels.
        kernel_size: Convolution kernel size.

    Returns:
        ``nn.Sequential`` with sub-modules named ``conv``, ``bn`` and ``relu``.
    """
    # For odd kernel sizes this padding keeps the spatial size at stride 1.
    pad = (kernel_size - 1) // 2 if kernel_size else 0
    return nn.Sequential(OrderedDict([
        ("conv", nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=1, padding=pad, bias=False)),
        ("bn", nn.BatchNorm2d(filter_out)),
        ("relu", nn.LeakyReLU(0.1)),
    ]))
# ------------------------------------------------------------------------#
#   make_last_layers contains seven convolutions in total. The first five
#   extract features; the last two produce the YOLO predictions and are
#   called the "yolo head".
# ------------------------------------------------------------------------#
def make_last_layers(filters_list, in_filters, out_filter):
    """Build one detection branch as a 7-element ``nn.Sequential``.

    Args:
        filters_list: Two channel counts ``[narrow, wide]``; the 1x1 convs
            output ``narrow`` channels and the 3x3 convs output ``wide``.
        in_filters: Number of channels of the branch input.
        out_filter: Number of channels of the final prediction conv.

    Returns:
        ``nn.Sequential`` whose indices 0-4 are the feature convs and indices
        5-6 are the head (3x3 conv block, then a plain 1x1 ``nn.Conv2d`` with bias).
    """
    m = nn.Sequential(
        conv2d(in_filters, filters_list[0], 1),  # last argument is kernel_size
        conv2d(filters_list[0], filters_list[1], 3),
        conv2d(filters_list[1], filters_list[0], 1),
        conv2d(filters_list[0], filters_list[1], 3),
        conv2d(filters_list[1], filters_list[0], 1),
        conv2d(filters_list[0], filters_list[1], 3),
        # Raw prediction layer: no BN or activation, bias enabled.
        nn.Conv2d(filters_list[1], out_filter, kernel_size=1, stride=1, padding=0, bias=True)
    )
    return m
# ---------------------------------------------------#
#   Load the class names
# ---------------------------------------------------#
def get_classes(classes_path):
    """Read class names from a text file with one name per line.

    Surrounding whitespace is stripped from each name. Blank lines are
    skipped so that a stray empty line (e.g. at the end of the file) does
    not become an empty class name and inflate the class count.

    Args:
        classes_path: Path of the UTF-8 encoded classes file.

    Returns:
        tuple: ``(class_names, num_classes)`` where ``class_names`` is a list
        of strings and ``num_classes`` is its length.
    """
    with open(classes_path, encoding='utf-8') as f:
        class_names = [line.strip() for line in f]
    class_names = [name for name in class_names if name]
    return class_names, len(class_names)
class YoloBody(nn.Module):
    """YOLOv3 network: a backbone plus three FPN-style detection branches.

    Args:
        anchors_mask: Three groups of anchor indices, one per output scale;
            only the length of each group is used here (anchors per grid cell).
        num_classes: Number of object classes.
        backbone: ``"mobilenetv2"`` or ``"darknet53"``.

    Raises:
        ValueError: If ``backbone`` is not one of the supported names.
    """

    def __init__(self, anchors_mask, num_classes, backbone="mobilenetv2"):
        super(YoloBody, self).__init__()
        #---------------------------------------------------#
        #   Build the backbone. It yields three feature maps;
        #   `in_filters` lists their channel counts from the
        #   highest-resolution map to the lowest.
        #   For a 416x416 input with darknet53 the shapes are
        #   52,52,256
        #   26,26,512
        #   13,13,1024
        #---------------------------------------------------#
        if backbone == "darknet53":
            self.backbone = darknet53()
            in_filters = [256, 512, 1024]
        elif backbone == "mobilenetv2":
            #---------------------------------------------------#
            #   52,52,32; 26,26,96; 13,13,320
            #---------------------------------------------------#
            self.backbone = MobileNetV2(pretrained=False)
            in_filters = [32, 96, 320]
        else:
            raise ValueError('Unsupported backbone - `{}`, Use darknet53, mobilenetv2.'.format(backbone))

        #---------------------------------------------------#
        #   The channel counts of the three backbone outputs
        #   are what make_last_layers needs for the fusion.
        #---------------------------------------------------#
        out_filters = in_filters

        #------------------------------------------------------------------------#
        #   Output channels of each yolo head:
        #       len(anchors_mask[i]) * (num_classes + 5)
        #   e.g. for VOC: 3 * (20 + 5) = 75, for COCO: 3 * (80 + 5) = 255
        #       3  - number of prior boxes per grid cell
        #       20 - number of VOC classes (COCO has 80)
        #       5  - 4 box adjustment parameters + 1 objectness score
        #   anchors_mask selects which of the (usually 9) prior-box sizes each
        #   scale uses; it is normally left unchanged.
        #------------------------------------------------------------------------#
        self.last_layer0 = make_last_layers([512, 1024], out_filters[-1], len(anchors_mask[0]) * (num_classes + 5))

        self.last_layer1_conv = conv2d(512, 256, 1)  # 1x1 conv that reduces the channel count
        # Upsampling keeps the channel count and doubles width and height.
        self.last_layer1_upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.last_layer1 = make_last_layers([256, 512], out_filters[-2] + 256, len(anchors_mask[1]) * (num_classes + 5))

        self.last_layer2_conv = conv2d(256, 128, 1)
        self.last_layer2_upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.last_layer2 = make_last_layers([128, 256], out_filters[-3] + 128, len(anchors_mask[2]) * (num_classes + 5))

    def forward(self, x):
        """Return the three prediction maps ``(out0, out1, out2)``, coarsest scale first.

        Shape comments below are (H, W, C) for a 416x416 input.
        """
        #---------------------------------------------------#
        #   Three backbone feature maps:
        #   darknet53:   52,52,256; 26,26,512; 13,13,1024
        #   mobilenetv2: 52,52,32;  26,26,96;  13,13,320
        #---------------------------------------------------#
        x2, x1, x0 = self.backbone(x)  # backbone returns out3, out4, out5

        #---------------------------------------------------#
        #   First feature layer
        #   out0 = (batch_size, anchors * (num_classes + 5), 13, 13)
        #---------------------------------------------------#
        # 13,13,C -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
        # Each branch holds seven convs (one nn.Sequential). The first five
        # extract features, and their output also goes through conv + upsample
        # to be fused with the next, finer backbone map (the FPN part).
        # Hence the [:5] / [5:] split.
        out0_branch = self.last_layer0[:5](x0)
        out0 = self.last_layer0[5:](out0_branch)

        # 13,13,512 -> 13,13,256 -> 26,26,256
        x1_in = self.last_layer1_conv(out0_branch)
        x1_in = self.last_layer1_upsample(x1_in)

        # Fusion is a concatenation along dim 1 (channels):
        # 26,26,256 + 26,26,C1 (C1 = 512 for darknet53, 96 for mobilenetv2)
        x1_in = torch.cat([x1_in, x1], 1)
        #---------------------------------------------------#
        #   Second feature layer
        #   out1 = (batch_size, anchors * (num_classes + 5), 26, 26)
        #---------------------------------------------------#
        # 26,26,256+C1 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
        out1_branch = self.last_layer1[:5](x1_in)
        out1 = self.last_layer1[5:](out1_branch)

        # 26,26,256 -> 26,26,128 -> 52,52,128
        x2_in = self.last_layer2_conv(out1_branch)
        x2_in = self.last_layer2_upsample(x2_in)

        # 52,52,128 + 52,52,C2 (C2 = 256 for darknet53, 32 for mobilenetv2)
        x2_in = torch.cat([x2_in, x2], 1)
        #---------------------------------------------------#
        #   Third feature layer
        #   out2 = (batch_size, anchors * (num_classes + 5), 52, 52)
        #---------------------------------------------------#
        # 52,52,128+C2 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128
        out2 = self.last_layer2(x2_in)
        return out0, out1, out2
if __name__ == '__main__':
    classes_path = '../model_data/voc_classes.txt'  # file contents are listed below
    class_names, num_classes = get_classes(classes_path)
    # Indices into the 9 prior boxes:
    # 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
    anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]

    model = YoloBody(anchors_mask, num_classes, backbone="mobilenetv2")  # backbone="mobilenetv2" or darknet53
    print(model)

    from torchsummaryX import summary
    summary(model, torch.zeros(1, 3, 416, 416))

    from thop import profile
    # One 3-channel 416x416 image as input; named so it does not shadow the builtin `input`.
    dummy_input = torch.randn(1, 3, 416, 416)
    flops, params = profile(model, (dummy_input,))
    print(flops, params)
voc_classes.txt 的內容:
aeroplane
bicycle
bird
boat
bottle
bus
car
cat
chair
cow
diningtable
dog
horse
motorbike
person
pottedplant
sheep
sofa
train
tvmonitor
參考鏈接
https://zhuanlan.zhihu.com/p/98874284
https://github.com/bubbliiiing/mobilenet-yolov4-pytorch