PAN代码实现细节

# PAN: Towards Fast Action Recognition via Learning Persistence of Appearance

两个创新模块的代码实现细节：
import torch
from torch import nn
import math

class PA(nn.Module):
	"""Persistence-of-Appearance (PA) module.

	Applies one shared 7x7 convolution (3 -> 8 channels) to every RGB frame
	and returns, for each pair of neighbouring frames inside a clip, the
	per-pixel Euclidean distance between their 8-channel feature vectors.

	Args:
		n_length: number of consecutive frames per clip; must be >= 2.

	Raises:
		ValueError: if ``n_length`` is smaller than 2 (no frame pair exists).
	"""

	# Same epsilon nn.PairwiseDistance adds to the difference by default.
	_EPS = 1e-6

	def __init__(self, n_length):
		super(PA, self).__init__()
		if n_length < 2:
			raise ValueError(
				"PA needs at least 2 frames per clip, got n_length={}".format(n_length))
		self.shallow_conv = nn.Conv2d(3, 8, 7, 1, 3)
		self.n_length = n_length
		# Small-variance Gaussian weights and zero bias for the only conv layer.
		nn.init.normal_(self.shallow_conv.weight.data, 0, 0.001)
		nn.init.constant_(self.shallow_conv.bias.data, 0)

	def forward(self, x):
		"""Compute the PA maps for a batch of clips.

		Args:
			x: tensor whose last two dims are (h, w) and whose remaining
				elements can be regrouped into 3-channel frames, e.g.
				(clips * n_length, 3, h, w).

		Returns:
			Tensor of shape (clips, n_length - 1, h, w); channel ``i`` holds
			the distance map between frame ``i`` and frame ``i + 1``.
		"""
		h, w = x.size(-2), x.size(-1)
		# Regroup into individual RGB frames so stacked-frame inputs work too.
		frames = x.reshape(-1, 3, h, w)
		# Kernel 7 / stride 1 / padding 3 keeps the spatial size (h, w).
		feats = self.shallow_conv(frames)
		feats = feats.view(-1, self.n_length, feats.size(1), h * w)
		# Reduce explicitly over the channel axis (dim 2). nn.PairwiseDistance
		# is avoided on purpose: for 3-D inputs its reduction axis is not the
		# same across PyTorch releases (newer ones reduce over the last axis),
		# which makes the final reshape below fail.
		diff = feats[:, :-1] - feats[:, 1:] + self._EPS
		dist = diff.norm(p=2, dim=2)  # (clips, n_length - 1, h * w)
		return dist.view(-1, self.n_length - 1, h, w)

class VAP(nn.Module):
	"""VAP classification head: multi-timescale temporal max pooling.

	Per-segment feature vectors are max-pooled along time at dilation rates
	1, 2, 4, ... (one ``nn.MaxPool1d`` per level, registered as ``VAP_1``,
	``VAP_2``, ...). The pooled vectors are then fused with softmax weights
	produced by the small bias-free MLP ``TES`` and classified by ``pred``.

	Level ``t`` emits ``t + n_segment % t`` temporal positions, so
	``n_segment`` should be divisible by every timescale used (e.g. be a power
	of two); otherwise the input width of ``TES`` will not match in forward.

	Args:
		n_segment: number of temporal segments per video; must be >= 2.
		feature_dim: size of each per-segment feature vector.
		num_class: number of output classes.
		dropout_ratio: dropout probability applied before the classifier.

	Raises:
		ValueError: if ``n_segment`` is smaller than 2 (zero pooling levels).
	"""

	def __init__(self, n_segment, feature_dim, num_class, dropout_ratio):
		super(VAP, self).__init__()
		if n_segment < 2:
			raise ValueError(
				"VAP needs at least 2 segments, got n_segment={}".format(n_segment))
		# math.log2 is exact for powers of two, unlike the float quotient
		# computed by math.log(n_segment, 2).
		VAP_level = int(math.log2(n_segment))
		print("=> Using {}-level VAP".format(VAP_level))
		self.n_segment = n_segment
		self.VAP_level = VAP_level
		total_timescale = 0
		for i in range(VAP_level):
			timescale = 2 ** i
			total_timescale += timescale
			# kernel = n_segment // timescale, stride 1, no padding,
			# dilation = timescale.  TODO: try average pooling instead.
			setattr(self, "VAP_{}".format(timescale),
					nn.MaxPool1d(n_segment // timescale, 1, 0, timescale))
		self.GAP = nn.AdaptiveAvgPool1d(1)
		self.TES = nn.Sequential(
			nn.Linear(total_timescale, total_timescale * 4, bias=False),
			nn.ReLU(inplace=True),
			nn.Linear(total_timescale * 4, total_timescale, bias=False)
		)
		self.softmax = nn.Softmax(dim=1)
		self.dropout = nn.Dropout(p=dropout_ratio)
		self.pred = nn.Linear(feature_dim, num_class)

		# Fully connected init: small Gaussian weights, zero bias where present.
		for m in self.modules():
			if isinstance(m, nn.Linear):
				nn.init.normal_(m.weight.data, 0, 0.001)
				if m.bias is not None:
					nn.init.constant_(m.bias.data, 0)

	def forward(self, x):
		"""Aggregate per-segment features and classify.

		Args:
			x: 2-D tensor of shape (batch * n_segment, d), where ``d`` must
				equal the ``feature_dim`` given at construction.

		Returns:
			Tensor of shape (batch, num_class) with the class scores.
		"""
		_, d = x.size()
		# (batch * n_segment, d) -> (batch, d, n_segment): time becomes the
		# axis that MaxPool1d slides over.  TODO: could try 15 dims here.
		feats = x.view(-1, self.n_segment, d).permute(0, 2, 1)
		pooled = [
			getattr(self, "VAP_{}".format(2 ** i))(feats)
			for i in range(self.VAP_level)
		]
		# Concatenate all levels along time, then move time back to dim 1:
		# (batch, total_timescale, d).
		feats = torch.cat(pooled, 2).permute(0, 2, 1)
		# Mean over the feature axis gives one scalar per pooled vector.
		weights = self.GAP(feats).squeeze(2)
		# Learnable re-weighting, normalised across the pooled vectors.
		weights = self.softmax(self.TES(weights))
		fused = (feats * weights.unsqueeze(2)).sum(dim=1)
		fused = self.dropout(fused)
		return self.pred(fused.view(-1, d))
Contents

# PAN: Towards Fast Action Recognition via Learning Persistence of Appearance