Pyramid Scene Parsing Network
Structurally, the paper contributes a pyramid pooling module that fuses features at different levels, combining semantics with detail.
PSPNet modifies the base ResNet architecture with dilated convolutions: after the initial pooling, features are processed at the same resolution throughout the encoder network (1/8 of the original image) until they reach the spatial pyramid pooling module.
An auxiliary loss is introduced at an intermediate layer of ResNet to aid the overall optimization.
Spatial pyramid pooling on top of the modified ResNet encoder aggregates global context.
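The paper weights this auxiliary branch at 0.4 relative to the main loss. A minimal Keras sketch of such deep supervision; the tiny convolutions here are hypothetical stand-ins, not the actual ResNet stages:

from keras.models import Model
from keras.layers import Input, Conv2D

# Toy stand-ins: `trunk` plays the intermediate ResNet stage that receives
# the auxiliary classifier; `head` plays the final PSP head.
inp = Input((96, 96, 3))
trunk = Conv2D(16, 3, padding='same', activation='relu')(inp)
head = Conv2D(16, 3, padding='same', activation='relu')(trunk)
main_out = Conv2D(21, 1, activation='softmax', name='main')(head)
aux_out = Conv2D(21, 1, activation='softmax', name='aux')(trunk)

model = Model(inp, [main_out, aux_out])
model.compile(optimizer='sgd', loss='categorical_crossentropy',
              loss_weights={'main': 1.0, 'aux': 0.4})  # 0.4 as in the paper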
A side note: it might be worth adding an SE module to the pyramid structure.
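A minimal Keras sketch of such an SE (squeeze-and-excitation) block, in case one wanted to gate each pyramid branch's channels; this is speculation on top of PSPNet, not part of the paper:

import keras.backend as K
from keras.layers import GlobalAveragePooling2D, Dense, Reshape, Multiply

def se_block(x, reduction=16):
    # squeeze: one descriptor per channel; excite: per-channel gates in (0, 1)
    ch = K.int_shape(x)[-1]
    s = GlobalAveragePooling2D()(x)
    s = Dense(ch // reduction, activation='relu')(s)
    s = Dense(ch, activation='sigmoid')(s)
    s = Reshape((1, 1, ch))(s)
    return Multiply()([x, s])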
Contributions:
- Proposes the pyramid scene parsing network, embedding difficult scenery context features into an FCN-based pixel prediction framework.
- Proposes an effective optimization strategy for ResNet based on a deeply supervised loss.
- Builds a practical system for scene parsing.
- The backbone runs a pretrained model (ResNet) with the dilated convolution strategy to extract a feature map; the extracted feature map is 1/8 the size of the input.
- The feature map passes through the Pyramid Pooling Module to obtain fused features carrying global information, which are upsampled and concatenated with the pre-pooling feature map.
- A final convolution layer produces the output.
The module fuses features at four pyramid scales. The coarsest level (the red row in the paper's figure) is global pooling, which yields a single-bin output; the following three rows are pooled features at different scales. To preserve the weight of the global feature, with N pyramid levels a 1×1 convolution after each level reduces that level's channels to 1/N of the original. Each level is then restored to the pre-pooling size via bilinear interpolation, and all levels are concatenated together.
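As a rough shape check, using the setting reported in the paper (473×473 crops, a ResNet101 backbone with 2048 final channels; the 60×60 feature size is approximate):

# input crop 473x473 -> dilated ResNet feature map ~ (60, 60, 2048), i.e. 1/8 resolution
# N = 4 levels with bin sizes 1, 2, 3, 6; for each level:
#   adaptive average pool -> (bin, bin, 2048)
#   1x1 conv              -> (bin, bin, 512)    # 2048 / N
#   bilinear upsample     -> (60, 60, 512)
# concatenation: 2048 + 4 * 512 = 4096 channels feed the final classifier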
Paper structure:
- Abstract
  The proposed pyramid pooling module aggregates contextual information from different regions, improving the network's ability to capture global information.
- Introduction
  Scene parsing is scene understanding: it involves predicting the label, location, and shape of every element. Existing methods do not make good use of the global scene category; the paper proposes capturing global image features via spatial pyramid pooling and contributes an optimization strategy.
- Related Work
  Reviews scene parsing and semantic segmentation. High-level features carry more semantic information but less positional information, and global context information should be exploited through different-region-based context aggregation. Semantic segmentation models follow two lines of work: one fuses multi-scale features (high-level features have strong semantics, low-level features retain more detail); the other uses structured prediction, e.g. a CRF (conditional random field) as a post-processing step to refine segmentation results. To fully exploit the global hierarchical prior for understanding different scenes, the proposed PSP module aggregates context from different regions.
- Pyramid Scene Parsing Network
- Deep Supervision for ResNet-Based FCN
- Experiments
  The devil is always in the details: base learning rate, momentum and weight decay, data augmentation, and the auxiliary loss all matter. Evaluation metrics: mean IoU and pixel accuracy. (A sketch of the learning-rate schedule follows this list.)
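PSPNet uses the "poly" learning-rate policy; a minimal sketch with the hyperparameters reported in the paper (base learning rate 0.01, power 0.9; max_iter is dataset-dependent, and the 90000 here is only a placeholder):

# poly decay: lr = base_lr * (1 - iter / max_iter) ** power
base_lr, power, max_iter = 0.01, 0.9, 90000

def poly_lr(iteration):
    return base_lr * (1.0 - iteration / float(max_iter)) ** power

print(poly_lr(0), poly_lr(45000))  # 0.01 at the start, ~0.0054 halfway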
Code
import numpy as np
import tensorflow as tf
from keras.models import Model
from keras.layers import (Input, Conv2D, DepthwiseConv2D, ZeroPadding2D,
                          BatchNormalization, Activation, AveragePooling2D,
                          Concatenate, Lambda, Reshape, Softmax)
import keras.backend as K

IMAGE_ORDERING = 'channels_last'

def relu6(x):
    # ReLU capped at 6, as used in MobileNet
    return K.relu(x, max_value=6)

def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
    # conv -> BN -> ReLU6 stem block; `alpha` is the MobileNet width multiplier
    channel_axis = 1 if IMAGE_ORDERING == 'channels_first' else -1
    filters = int(filters * alpha)
    x = ZeroPadding2D(padding=(1, 1), name='conv1_pad', data_format=IMAGE_ORDERING)(inputs)
    x = Conv2D(filters, kernel, data_format=IMAGE_ORDERING,
               padding='valid',
               use_bias=False,
               strides=strides,
               name='conv1')(x)
    x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
    return Activation(relu6, name='conv1_relu')(x)
def _depthwise_conv_block(inputs, pointwise_conv_filters, alpha,
                          depth_multiplier=1, strides=(1, 1), block_id=1):
    # MobileNet unit: 3x3 depthwise conv then 1x1 pointwise conv, each with BN + ReLU6
    channel_axis = 1 if IMAGE_ORDERING == 'channels_first' else -1
    pointwise_conv_filters = int(pointwise_conv_filters * alpha)
    x = ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING, name='conv_pad_%d' % block_id)(inputs)
    x = DepthwiseConv2D((3, 3), data_format=IMAGE_ORDERING,
                        padding='valid',
                        depth_multiplier=depth_multiplier,
                        strides=strides,
                        use_bias=False,
                        name='conv_dw_%d' % block_id)(x)
    x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
    x = Activation(relu6, name='conv_dw_%d_relu' % block_id)(x)
    x = Conv2D(pointwise_conv_filters, (1, 1), data_format=IMAGE_ORDERING,
               padding='same',
               use_bias=False,
               strides=(1, 1),
               name='conv_pw_%d' % block_id)(x)
    x = BatchNormalization(axis=channel_axis, name='conv_pw_%d_bn' % block_id)(x)
    return Activation(relu6, name='conv_pw_%d_relu' % block_id)(x)
def get_mobilenet_encoder(input_height=224, input_width=224, pretrained='imagenet'):
    # MobileNet-v1 encoder; returns the input tensor and five feature maps at
    # strides 2, 4, 8, 16 and 32. (`pretrained` is accepted for API
    # compatibility, but weights are not loaded in this snippet.)
    alpha = 1.0
    depth_multiplier = 1

    img_input = Input(shape=(input_height, input_width, 3))

    x = _conv_block(img_input, 32, alpha, strides=(2, 2))
    x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, block_id=1)
    f1 = x  # stride 2

    x = _depthwise_conv_block(x, 128, alpha, depth_multiplier,
                              strides=(2, 2), block_id=2)
    x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, block_id=3)
    f2 = x  # stride 4

    x = _depthwise_conv_block(x, 256, alpha, depth_multiplier,
                              strides=(2, 2), block_id=4)
    x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, block_id=5)
    f3 = x  # stride 8

    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier,
                              strides=(2, 2), block_id=6)
    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=7)
    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=8)
    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=9)
    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=10)
    x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=11)
    f4 = x  # stride 16

    x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier,
                              strides=(2, 2), block_id=12)
    x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, block_id=13)
    f5 = x  # stride 32

    return img_input, [f1, f2, f3, f4, f5]
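As a quick sanity check of the returned feature pyramid (assuming a 224×224 input; the shapes follow from the strided blocks above):

img_input, feats = get_mobilenet_encoder(224, 224)
for name, f in zip(['f1', 'f2', 'f3', 'f4', 'f5'], feats):
    print(name, K.int_shape(f))
# f1: (None, 112, 112, 64)   stride 2
# f2: (None, 56, 56, 128)    stride 4
# f3: (None, 28, 28, 256)    stride 8
# f4: (None, 14, 14, 512)    stride 16
# f5: (None, 7, 7, 1024)     stride 32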
MERGE_AXIS = -1

def resize_image(inp, s, data_format):
    # Upsample `inp` by the integer factors s = (sh, sw). `data_format` is kept
    # for API symmetry but unused; tf.image.resize_images is the TF1 API.
    return Lambda(
        lambda x: tf.image.resize_images(
            x, (K.int_shape(x)[1] * s[0], K.int_shape(x)[2] * s[1]))
    )(inp)
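If running under TF2, where resize_images was removed, a drop-in variant (an assumption about the environment, not part of the original code):

def resize_image_tf2(inp, s, data_format):
    # same behavior with the TF2 API (bilinear by default)
    return Lambda(
        lambda x: tf.image.resize(
            x, (K.int_shape(x)[1] * s[0], K.int_shape(x)[2] * s[1]))
    )(inp)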
def pool_block(feats, pool_factor):
    # One pyramid level: average-pool `feats` into a pool_factor x pool_factor
    # grid, shrink channels with a 1x1 conv, then upsample back to the input size.
    if IMAGE_ORDERING == 'channels_first':
        h = K.int_shape(feats)[2]
        w = K.int_shape(feats)[3]
    elif IMAGE_ORDERING == 'channels_last':
        h = K.int_shape(feats)[1]
        w = K.int_shape(feats)[2]

    # e.g. for an 18x18 feature map, pool factors 1, 2, 3, 6 give
    # pool sizes (= strides) of 18, 9, 6 and 3
    pool_size = strides = [int(np.round(float(h) / pool_factor)),
                           int(np.round(float(w) / pool_factor))]

    # average pooling at this pyramid scale
    x = AveragePooling2D(pool_size, data_format=IMAGE_ORDERING,
                         strides=strides, padding='same')(feats)
    # 1x1 conv to reduce the channel dimension
    x = Conv2D(512, (1, 1), data_format=IMAGE_ORDERING,
               padding='same', use_bias=False)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    # upsample back to the pre-pooling resolution
    x = resize_image(x, strides, data_format=IMAGE_ORDERING)
    return x
def _pspnet(n_classes, encoder, input_height=384, input_width=576):
    # input sizes must be multiples of 192 so the resize factors line up
    assert input_height % 192 == 0
    assert input_width % 192 == 0

    img_input, levels = encoder(input_height=input_height, input_width=input_width)
    [f1, f2, f3, f4, f5] = levels

    o = f5
    # pool f5 at several scales (the pyramid pooling module)
    pool_factors = [1, 2, 3, 6]
    pool_outs = [o]
    for p in pool_factors:
        pooled = pool_block(o, p)
        pool_outs.append(pooled)

    # concatenate the original feature map with all pyramid levels
    o = Concatenate(axis=MERGE_AXIS)(pool_outs)

    # fuse with a 1x1 conv
    o = Conv2D(512, (1, 1), data_format=IMAGE_ORDERING, use_bias=False)(o)
    o = BatchNormalization()(o)
    o = Activation('relu')(o)

    # per-pixel class scores at the f5 resolution (1/32 of the input), then
    # 8x upsampling, giving predictions at 1/4 of the input size
    o = Conv2D(n_classes, (3, 3), data_format=IMAGE_ORDERING, padding='same')(o)
    o = resize_image(o, (8, 8), data_format=IMAGE_ORDERING)

    # flatten the spatial grid and apply softmax over classes
    o = Reshape((-1, n_classes))(o)
    o = Softmax()(o)

    model = Model(img_input, o)
    return model
def mobilenet_pspnet(n_classes, input_height=192, input_width=192):
    # defaults changed from 224 to 192: _pspnet asserts multiples of 192
    model = _pspnet(n_classes, get_mobilenet_encoder,
                    input_height=input_height, input_width=input_width)
    model.model_name = "mobilenet_pspnet"
    return model
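A quick usage check (illustrative; the class count of 21 is arbitrary):

model = mobilenet_pspnet(n_classes=21)
model.compile(optimizer='sgd', loss='categorical_crossentropy')
print(model.output_shape)  # (None, 2304, 21): a flattened 48x48 grid, since 192/32 * 8 = 48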