#PAN: Towards Fast Action Recognition via Learning Persistence of Appearance

# Implementation details of the two novel modules (PA and VAP):

import torch
from torch import nn
import math

class PA(nn.Module):
    """Persistence-of-Appearance (PA) module.

    Every RGB frame goes through one shallow 7x7 convolution (3 -> 8
    channels, stride 1, padding 3, so height and width are preserved).
    Frames are then grouped ``n_length`` at a time, and for each pair of
    adjacent frames in a group the per-pixel Euclidean distance between
    their 8-channel feature vectors becomes one output channel.

    Args:
        n_length: Number of consecutive frames per group. ``forward``
            requires ``n_length >= 2`` (one output channel per adjacent
            pair).
    """

    # Added to the feature difference before the norm. Same value as the
    # default ``eps`` of ``nn.PairwiseDistance``, which this module used
    # previously, so numerical results are unchanged.
    _EPS = 1e-6

    def __init__(self, n_length):
        super(PA, self).__init__()
        self.shallow_conv = nn.Conv2d(3, 8, 7, 1, 3)
        self.n_length = n_length
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight.data, 0, 0.001)
                nn.init.constant_(m.bias.data, 0)

    def forward(self, x):
        """Compute the PA maps for a stack of frames.

        Args:
            x: Tensor whose last two dims are (h, w) and whose remaining
                dims fold into a whole number of 3-channel images, e.g.
                ``(num_frames, 3, h, w)``.  The resulting number of images
                must be a multiple of ``n_length``.

        Returns:
            Tensor of shape ``(num_frames // n_length, n_length - 1, h, w)``.

        Raises:
            ValueError: If ``n_length`` is smaller than 2.
        """
        if self.n_length < 2:
            raise ValueError(
                "PA needs n_length >= 2 to form adjacent frame pairs, "
                "got {}".format(self.n_length)
            )

        h, w = x.size(-2), x.size(-1)
        # Fold stacked frames (e.g. (B, 3 * k, h, w)) into the batch axis so
        # that each item is a single RGB image.
        x = x.reshape((-1, 3) + tuple(x.size()[-2:]))
        x = self.shallow_conv(x)  # (num_frames, 8, h, w)
        x = x.view(-1, self.n_length, x.size(-3), h, w)

        # Distance between neighbouring frames, reduced over the channel
        # axis.  The reduction axis is spelled out explicitly because
        # nn.PairwiseDistance reduces over a different axis for 3-D inputs
        # depending on the PyTorch version (dim 1 in old releases, the
        # innermost dim in current ones).
        diff = x[:, :-1] - x[:, 1:] + self._EPS
        return torch.norm(diff, p=2, dim=2)

class VAP(nn.Module):
    """Various-timescale Aggregation Pooling (VAP) classification head.

    Per-segment feature vectors are max-pooled along the segment axis at
    timescales 1, 2, 4, ... (dilated ``MaxPool1d``).  The pooled vectors are
    then re-weighted by a small learned gating MLP (``TES``) followed by a
    softmax, summed, passed through dropout and classified by a linear layer.

    Args:
        n_segment: Number of temporal segments per video (e.g. 8).
        feature_dim: Size of each per-segment feature vector (e.g. 2048
            for a ResNet-50 backbone).
        num_class: Number of output classes.
        dropout_ratio: Dropout probability applied before the classifier.
    """

    def __init__(self, n_segment, feature_dim, num_class, dropout_ratio):
        super(VAP, self).__init__()
        num_levels = int(math.log(n_segment, 2))  # 3 levels for 8 segments
        print("=> Using {}-level VAP".format(num_levels))
        self.n_segment = n_segment
        self.VAP_level = num_levels

        # One dilated max-pool per timescale 2**level.  With kernel size
        # n_segment // scale, stride 1 and dilation = scale, the pool at
        # timescale `scale` yields `scale` outputs when scale divides
        # n_segment (1 + 2 + 4 = 7 outputs for n_segment = 8).
        # TODO: try average pooling instead of max pooling.
        timescales = [2 ** level for level in range(num_levels)]
        for scale in timescales:
            pool = nn.MaxPool1d(n_segment // scale, 1, 0, scale)
            setattr(self, "VAP_{}".format(scale), pool)
        n_slots = sum(timescales)

        self.GAP = nn.AdaptiveAvgPool1d(1)
        # Learnable gating network that scores each pooled slot.
        self.TES = nn.Sequential(
            nn.Linear(n_slots, n_slots * 4, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(n_slots * 4, n_slots, bias=False),
        )
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(p=dropout_ratio)
        self.pred = nn.Linear(feature_dim, num_class)

        # Re-initialise every fully connected layer: small-variance normal
        # weights, zero bias (only where the layer has a bias).
        for layer in self.modules():
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.normal_(layer.weight.data, 0, 0.001)
            if layer.bias is not None:
                nn.init.constant_(layer.bias.data, 0)

    def forward(self, x):
        """Aggregate per-segment features and return class scores.

        Args:
            x: 2-D tensor ``(batch_size * n_segment, feature_dim)``, i.e. the
                backbone output right before its fully connected layer.

        Returns:
            Tensor of shape ``(batch_size, num_class)``.
        """
        _, feat_dim = x.size()

        # (batch, feature_dim, n_segment): pooling runs over the last axis.
        # TODO: a 15-dimensional variant could be tried here.
        per_video = x.view(-1, self.n_segment, feat_dim).permute(0, 2, 1)

        pooled = []
        for level in range(self.VAP_level):
            pool = getattr(self, "VAP_{}".format(2 ** level))
            pooled.append(pool(per_video))
        # Concatenate all timescales along the pooled axis, then move that
        # axis forward: (batch, n_slots, feature_dim).
        slots = torch.cat(pooled, 2).permute(0, 2, 1)

        # Average each slot over its feature vector -> (batch, n_slots),
        # then turn the averages into softmax weights via the learnable TES.
        gate = self.GAP(slots).squeeze(2)
        gate = self.softmax(self.TES(gate))

        # Weighted sum over the slots -> (batch, feature_dim).
        fused = (slots * gate.unsqueeze(2)).sum(dim=1)
        return self.pred(self.dropout(fused))