cb29b6d7 by jiangwenqiang

first commit

1 parent 78b00ada
import os
import csv
import pickle
import numpy as np
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn releases
def start_filter(config):
cls_audio_path = config['MODEL']['CLS_AUDIO']
feature_save_dir = config['VIDEO']['IS10_FEATURE_NP_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['AUDIO']['RESULT_FILE']
feature_name = config['AUDIO']['DATA_NAME']
svm_clf = joblib.load(cls_audio_path)
result_file_path = os.path.join(frame_list_dir, result_file_name)
result_file = open(result_file_path, 'w')
feature_path = os.path.join(feature_save_dir, feature_name)
val_annotation_pairs = np.load(feature_path, allow_pickle=True, encoding='latin1')
for pair in val_annotation_pairs:
v = pair[0]
n = pair[2]
feature_np = np.reshape(v, (1, -1))
res = svm_clf.predict_proba(feature_np)
proba = np.squeeze(res)
# class_pre = svm_clf.predict(feature_np)
result_file.write(str(n)[:-4] + ' ')  # drop the last four characters (the file extension) from the sample name
result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')
result_file.close()
def start_filter_xgboost(config):
cls_class_path = config['MODEL']['CLS_AUDIO']
feature_save_dir = config['VIDEO']['IS10_FEATURE_NP_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['AUDIO']['RESULT_FILE']
feature_name = config['AUDIO']['DATA_NAME']
xgboost_model = pickle.load(open(cls_class_path, "rb"))
result_file_path = os.path.join(frame_list_dir, result_file_name)
result_file = open(result_file_path, 'w')
feature_path = os.path.join(feature_save_dir, feature_name)
val_annotation_pairs = np.load(feature_path, allow_pickle=True, encoding='latin1')
X_val = []
Y_names = []
for pair in val_annotation_pairs:
(n, v), = pair.items()  # each entry is assumed to be a single-item dict {name: feature}
X_val.append(v)
Y_names.append(n)
X_val = np.array(X_val)
y_pred = xgboost_model.predict_proba(X_val)
for i, Y_name in enumerate(Y_names):
result_file.write(Y_name + ' ')
result_file.write(str(y_pred[i][0]) + ',' + str(y_pred[i][1]) + ',' + str(y_pred[i][2]) + '\n')
result_file.close()
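# Note (hedged): both filters above assume the features in the configured .npy
# file were saved with numpy and allow_pickle=True. start_filter indexes each
# entry as feature at position 0 and file name at position 2, while
# start_filter_xgboost expects single-item dicts {name: feature}. Each output
# line has the form
#   <sample_name> <p_class0>,<p_class1>,<p_class2>
# which the accuracy script later parses by splitting on the first space.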
import os
import cv2
import numpy as np
import pickle
def start_filter(config):
cls_class_path = config['MODEL']['CLS_BG']
feature_save_dir = config['VIDEO']['FACE_FEATURE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['BG']['RESULT_FILE']
feature_name = config['BG']['DATA_NAME']
xgboost_model = pickle.load(open(cls_class_path, "rb"))
result_file_path = os.path.join(frame_list_dir, result_file_name)
result_file = open(result_file_path, 'w')
feature_path = os.path.join(feature_save_dir, feature_name)
val_annotation_pairs = np.load(feature_path, allow_pickle=True)
X_val = []
Y_val = []
Y_names = []
for j in range(len(val_annotation_pairs)):
pair = val_annotation_pairs[j]
X_val.append(np.squeeze(pair[0]))
Y_val.append(pair[1])
Y_names.append(pair[2])
X_val = np.array(X_val)
y_pred = xgboost_model.predict_proba(X_val)
for i, Y_name in enumerate(Y_names):
result_file.write(Y_name + ' ')
result_file.write(str(y_pred[i][0]) + ',' + str(y_pred[i][1]) + ',' + str(y_pred[i][2]) + '\n')
result_file.close()
import os
import pickle
import numpy as np
def start_filter(config):
cls_class_path = config['MODEL']['CLS_CLASS']
feature_save_dir = config['VIDEO']['CLASS_FEATURE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['CLASS']['RESULT_FILE']
feature_name = config['CLASS']['DATA_NAME']
xgboost_model = pickle.load(open(cls_class_path, "rb"))
result_file_path = os.path.join(frame_list_dir, result_file_name)
result_file = open(result_file_path, 'w')
feature_path = os.path.join(feature_save_dir, feature_name)
val_annotation_pairs = np.load(feature_path, allow_pickle=True)
X_val = []
Y_val = []
Y_names = []
for j in range(len(val_annotation_pairs)):
pair = val_annotation_pairs[j]
X_val.append(pair[0])
Y_val.append(pair[1])
Y_names.append(pair[2])
X_val = np.array(X_val)
y_pred = xgboost_model.predict(X_val)
for i, Y_name in enumerate(Y_names):
result_file.write(Y_name + ' ')
result_file.write(str(y_pred[i]) + '\n')
result_file.close()
MODEL:
CLS_FIGHTING_2: '/home/jwq/models/cls_fighting_2/cls_fighting_2_v0.0.1.pth'
CLS_EMOTION: '/home/jwq/models/cls_emotion/v0.1.0.m'
FEATURE_EMOTION: '/home/jwq/models/feature_emotion/FerPlus3.h5'
CLS_AUDIO: '/home/jwq/models/cls_audio/v0.0.1.m'
CLS_CLASS: '/home/jwq/models/cls_class/v_0.0.1_xgb.pkl'
CLS_VIDEO: '/home/jwq/models/cls_video/v0.4.1.pth'
CLS_POSE: '/home/jwq/models/cls_pose/v0.0.1.pth'
CLS_FLOW: '/home/jwq/models/cls_flow/v0.1.1.pth'
CLS_BG: '/home/jwq/models/cls_bg/v0.1.1.pkl'
CLS_PERSON: '/home/jwq/models/cls_person/v0.1.1.pkl'
THRESHOLD:
FACES_THRESHOLD: 0.6
FILTER:
VIDEO:
VIDEO_DIR: '/home/jwq/Desktop/VGAF_EmotiW/Val'
LABEL_PATH: '/home/jwq/Desktop/VGAF_EmotiW/Val_labels.txt'
VIDEO_SAVE_DIR: '/home/jwq/Desktop/tmp/video'
AUDIO_SAVE_DIR: '/home/jwq/npys/'
FRAME_SAVE_DIR: '/home/jwq/Desktop/tmp/frame'
# FRAME_SAVE_DIR: '/home/jwq/Desktop/VGAF_EmotiW_class/train_frame'
FLOW_SAVE_DIR: '/home/jwq/Desktop/tmp/flow'
POSE_FRAME_SAVE_DIR: '/home/jwq/Desktop/tmp/pose_frame'
FRAME_LIST_DIR: '/home/jwq/Desktop/tmp/file_list'
IS10_FEATURE_NP_DIR: '/home/jwq/npys'
IS10_FEATURE_CSV_DIR: '/home/jwq/Desktop/tmp/is10'
# FACE_FEATURE_DIR: '/home/jwq/Desktop/tmp/face_feature_retina'
# FACE_FEATURE_DIR: '/data2/retinaface/random_face_frame_features/'
FACE_FEATURE_DIR: '/data1/segment/'
# FACE_FEATURE_DIR: '/home/jwq/npys/'
FACE_IMAGE_DIR: '/data2/retinaface/train/'
CLASS_FEATURE_DIR: '/home/jwq/Desktop/tmp/class'
PREFIX: 'img_{:05d}.jpg'
FLOW_PREFIX: 'flow_{}_{:05d}.jpg'
THREAD_NUM: 10
FPS: 5
VIDEO_FILTER:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet50'
RESULT_FILE: 'video_filter.txt'
VIDEO_1_FILTER:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet34'
RESULT_FILE: 'video_1_filter.txt'
EMOTION:
INTERVAL: 1
INPUT_SIZE: 224
RESULT_FILE: 'emotion_filter.txt'
EMOTION_1:
RESULT_FILE: 'emotion_1_filter.txt'
DATA_NAME: 'val.npy'
ARGUE:
DIMENSION: 1582
RESULT_FILE: 'argue_filter.txt'
FIGHTING:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet50'
RESULT_FILE: 'fighting_filter.txt'
FIGHTING_2:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet50'
RESULT_FILE: 'fighting_2_filter.txt'
MEETING:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet50'
RESULT_FILE: 'meeting_filter.txt'
TROOPS:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet50'
RESULT_FILE: 'troops_filter.txt'
FLOW:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'Flow'
ARCH: 'resnet50'
RESULT_FILE: 'flow_filter.txt'
FINAL:
RESULT_FILE: 'final.txt'
ERROR_FILE: 'error.txt'
SIM_FILE: 'image_sim.txt'
AUDIO:
RESULT_FILE: 'audio.txt'
OPENSMILE_DIR: '/home/jwq/Downloads/opensmile-2.3.0'
DATA_NAME: 'val.npy'
CLASS:
RESULT_FILE: 'class.txt'
DATA_NAME: 'val_reannotation.npy'
POSE:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet50'
RESULT_FILE: 'pose_filter.txt'
BG:
RESULT_FILE: 'bg_filter.txt'
DATA_NAME: 'bg_val_feature.npy'
PERSON:
RESULT_FILE: 'person_filter.txt'
DATA_NAME: 'person_val_feature.npy'
import os
import cv2
import numpy as np
from keras.models import Model
from keras.models import load_model
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn releases
from tensorflow.keras.preprocessing.image import img_to_array
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
class FeatureExtractor(object):
def __init__(self, input_size=224, out_put_layer='avg_pool', model_path='FerPlus3.h5'):
self.model = load_model(model_path)
self.input_size = input_size
self.model_inter = Model(inputs=self.model.input, outputs=self.model.get_layer(out_put_layer).output)
def inference(self, image):
image = cv2.resize(image, (self.input_size, self.input_size))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = image.astype("float") / 255.0
image = img_to_array(image)
image = np.expand_dims(image, axis=0)
feature = self.model_inter.predict(image)[0]
return feature
def features2feature(pics_features):
pics_features = np.array(pics_features)
fea_mean = pics_features.mean(axis=0)
fea_max = np.amax(pics_features, axis=0)
fea_min = np.amin(pics_features, axis=0)
fea_std = pics_features.std(axis=0)
return np.concatenate((fea_mean, fea_max, fea_min, fea_std), axis=1).reshape(1, -1)
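# Usage sketch (assumption: each per-face feature stored on disk has shape
# (1, d), so the mean/max/min/std statistics keep shape (1, d) and the axis=1
# concatenation yields one (1, 4*d) clip-level descriptor; the file name below
# is illustrative):
#   pics_features = np.load('some_video_faces.npy', allow_pickle=True)
#   clip_feature = features2feature(pics_features)   # -> shape (1, 4*d)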
def start_filter(config):
cls_emotion_path = config['MODEL']['CLS_EMOTION']
face_feature_dir = config['VIDEO']['FACE_FEATURE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['EMOTION']['RESULT_FILE']
svm_clf = joblib.load(cls_emotion_path)
result_file_path = os.path.join(frame_list_dir, result_file_name)
result_file = open(result_file_path, 'w')
face_feature_names = os.listdir(face_feature_dir)
for face_feature in face_feature_names:
face_feature_path = os.path.join(face_feature_dir, face_feature)
features_np = np.load(face_feature_path, allow_pickle=True)
feature = features2feature(features_np)
res = svm_clf.predict_proba(feature)
proba = np.squeeze(res)
# class_pre = svm_clf.predict(feature)
result_file.write(face_feature[:-4] + ' ')
result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')
result_file.close()
import os
import torch.optim
import numpy as np
import torch.nn.parallel
from ops.models import TSN
from ops.transforms import *
from ops.dataset import TSNDataSet
from torch.nn import functional as F
def gen_file_list(frame_save_dir, frame_list_dir):
val_path = os.path.join(frame_list_dir, 'val.txt')
video_names = os.listdir(frame_save_dir)
ucf101_rgb_val_file = open(val_path, 'w')
for video_name in video_names:
images_dir = os.path.join(frame_save_dir, video_name)
ucf101_rgb_val_file.write(video_name)
ucf101_rgb_val_file.write(' ')
ucf101_rgb_val_file.write(str(len(os.listdir(images_dir))))
ucf101_rgb_val_file.write('\n')
ucf101_rgb_val_file.close()
return val_path
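# gen_file_list writes one line per extracted-frame directory, e.g.
#   video_0001 87
# i.e. "<video_name> <frame_count>" (the name is illustrative). This is the
# list format TSNDataSet._parse_list expects: path in column 0, frame count in
# column 1.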
def start_filter(config):
arch = config['FIGHTING_2']['ARCH']
prefix = config['VIDEO']['PREFIX']
modality = config['FIGHTING_2']['MODALITY']
test_crop = config['FIGHTING_2']['TEST_CROP']
batch_size = config['FIGHTING_2']['BATCH_SIZE']
weights_path = config['MODEL']['CLS_FIGHTING_2']
test_segment = config['FIGHTING_2']['TEST_SEGMENT']
frame_save_dir = config['VIDEO']['FRAME_SAVE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['FIGHTING_2']['RESULT_FILE']
workers = 8
num_class = 2
shift_div = 8
img_feature_dim = 256
softmax = False
is_shift = True
full_res = False
non_local = False
dense_sample = False
twice_sample = False
val_list = gen_file_list(frame_save_dir, frame_list_dir)
result_file_path = os.path.join(frame_list_dir, result_file_name)
pretrain = 'imagenet'
shift_place = 'blockres'
crop_fusion_type = 'avg'
net = TSN(num_class, test_segment if is_shift else 1, modality,
base_model=arch,
consensus_type=crop_fusion_type,
img_feature_dim=img_feature_dim,
pretrain=pretrain,
is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
non_local=non_local,
)
checkpoint = torch.load(weights_path)
checkpoint = checkpoint['state_dict']
base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
'base_model.classifier.bias': 'new_fc.bias',
}
for k, v in replace_dict.items():
if k in base_dict:
base_dict[v] = base_dict.pop(k)
net.load_state_dict(base_dict)
input_size = net.scale_size if full_res else net.input_size
if test_crop == 1:
cropping = torchvision.transforms.Compose([
GroupScale(net.scale_size),
GroupCenterCrop(input_size),
])
elif test_crop == 3:  # full-resolution sampling without flip (3 crops)
cropping = torchvision.transforms.Compose([
GroupFullResSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 5: # do not flip, so only 5 crops
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 10:
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size)
])
else:
raise ValueError("Only 1, 5, 10 crops are supported while we got {}".format(test_crop))
data_loader = torch.utils.data.DataLoader(
TSNDataSet(frame_save_dir, val_list, num_segments=test_segment,
new_length=1 if modality == "RGB" else 5,
modality=modality,
image_tmpl=prefix,
test_mode=True,
remove_missing=False,
transform=torchvision.transforms.Compose([
cropping,
Stack(roll=(arch in ['BNInception', 'InceptionV3'])),
ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])),
GroupNormalize(net.input_mean, net.input_std),
]), dense_sample=dense_sample, twice_sample=twice_sample),
batch_size=batch_size, shuffle=False,
num_workers=workers, pin_memory=True,
)
net = torch.nn.DataParallel(net.cuda())
net.eval()
data_gen = enumerate(data_loader)
max_num = len(data_loader.dataset)
result_file = open(result_file_path, 'w')
for i, data_pair in data_gen:
directory, data = data_pair
with torch.no_grad():
if i >= max_num:
break
num_crop = test_crop
if dense_sample:
num_crop *= 10 # 10 clips for testing when using dense sample
if twice_sample:
num_crop *= 2
if modality == 'RGB':
length = 3
elif modality == 'Flow':
length = 10
elif modality == 'RGBDiff':
length = 18
else:
raise ValueError("Unknown modality " + modality)
data_in = data.view(-1, length, data.size(2), data.size(3))
if is_shift:
data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3))
rst, feature = net(data_in)
rst = rst.reshape(batch_size, num_crop, -1).mean(1)
if softmax:
# take the softmax to normalize the output to probability
rst = F.softmax(rst, dim=1)
rst = rst.data.cpu().numpy().copy()
if net.module.is_shift:
rst = rst.reshape(batch_size, num_class)
else:
rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class))
proba = np.squeeze(rst)
print(proba)
proba = np.exp(proba)/sum(np.exp(proba))
result_file.write(str(directory[0]) + ' ')
result_file.write(str(proba[0]) + ',' + str(proba[1]) + '\n')
result_file.close()
print('fighting_2 filter end')
import os
import torch.optim
import numpy as np
import torch.nn.parallel
from ops.models import TSN
from ops.transforms import *
from ops.dataset import TSNDataSet
from torch.nn import functional as F
def gen_file_list(frame_save_dir, frame_list_dir):
val_path = os.path.join(frame_list_dir, 'flow_val.txt')
video_names = os.listdir(frame_save_dir)
ucf101_rgb_val_file = open(val_path, 'w')
for video_name in video_names:
images_dir = os.path.join(frame_save_dir, video_name)
ucf101_rgb_val_file.write(video_name)
ucf101_rgb_val_file.write(' ')
ori_list = os.listdir(images_dir)
select_list = [element for element in ori_list if 'x' in element]
ucf101_rgb_val_file.write(str(len(select_list)))
ucf101_rgb_val_file.write('\n')
ucf101_rgb_val_file.close()
return val_path
def start_filter(config):
arch = config['FLOW']['ARCH']
prefix = config['VIDEO']['FLOW_PREFIX']
modality = config['FLOW']['MODALITY']
test_crop = config['FLOW']['TEST_CROP']
batch_size = config['FLOW']['BATCH_SIZE']
weights_path = config['MODEL']['CLS_FLOW']
test_segment = config['FLOW']['TEST_SEGMENT']
frame_save_dir = config['VIDEO']['FLOW_SAVE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['FLOW']['RESULT_FILE']
workers = 8
num_class = 3
shift_div = 8
img_feature_dim = 256
softmax = False
is_shift = True
full_res = False
non_local = False
dense_sample = False
twice_sample = False
val_list = gen_file_list(frame_save_dir, frame_list_dir)
result_file_path = os.path.join(frame_list_dir, result_file_name)
pretrain = 'imagenet'
shift_place = 'blockres'
crop_fusion_type = 'avg'
net = TSN(num_class, test_segment if is_shift else 1, modality,
base_model=arch,
consensus_type=crop_fusion_type,
img_feature_dim=img_feature_dim,
pretrain=pretrain,
is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
non_local=non_local,
)
checkpoint = torch.load(weights_path)
checkpoint = checkpoint['state_dict']
base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
'base_model.classifier.bias': 'new_fc.bias',
}
for k, v in replace_dict.items():
if k in base_dict:
base_dict[v] = base_dict.pop(k)
net.load_state_dict(base_dict)
input_size = net.scale_size if full_res else net.input_size
if test_crop == 1:
cropping = torchvision.transforms.Compose([
GroupScale(net.scale_size),
GroupCenterCrop(input_size),
])
elif test_crop == 3:  # full-resolution sampling without flip (3 crops)
cropping = torchvision.transforms.Compose([
GroupFullResSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 5: # do not flip, so only 5 crops
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 10:
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size)
])
else:
raise ValueError("Only 1, 5, 10 crops are supported while we got {}".format(test_crop))
data_loader = torch.utils.data.DataLoader(
TSNDataSet(frame_save_dir, val_list, num_segments=test_segment,
new_length=1 if modality == "RGB" else 5,
modality=modality,
image_tmpl=prefix,
test_mode=True,
remove_missing=False,
transform=torchvision.transforms.Compose([
cropping,
Stack(roll=(arch in ['BNInception', 'InceptionV3'])),
ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])),
GroupNormalize(net.input_mean, net.input_std),
]), dense_sample=dense_sample, twice_sample=twice_sample),
batch_size=batch_size, shuffle=False,
num_workers=workers, pin_memory=True,
)
net = torch.nn.DataParallel(net.cuda())
net.eval()
data_gen = enumerate(data_loader)
max_num = len(data_loader.dataset)
result_file = open(result_file_path, 'w')
for i, data_pair in data_gen:
directory, data = data_pair
with torch.no_grad():
if i >= max_num:
break
num_crop = test_crop
if dense_sample:
num_crop *= 10 # 10 clips for testing when using dense sample
if twice_sample:
num_crop *= 2
if modality == 'RGB':
length = 3
elif modality == 'Flow':
length = 10
elif modality == 'RGBDiff':
length = 18
else:
raise ValueError("Unknown modality " + modality)
data_in = data.view(-1, length, data.size(2), data.size(3))
if is_shift:
data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3))
rst, feature = net(data_in)
rst = rst.reshape(batch_size, num_crop, -1).mean(1)
if softmax:
# take the softmax to normalize the output to probability
rst = F.softmax(rst, dim=1)
rst = rst.data.cpu().numpy().copy()
if net.module.is_shift:
rst = rst.reshape(batch_size, num_class)
else:
rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class))
proba = np.squeeze(rst)
proba = np.exp(proba)/sum(np.exp(proba))
result_file.write(str(directory[0]) + ' ')
result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')
result_file.close()
print('flow filter end')
import os
import cv2
import yaml
import tensorflow as tf
def load_config(config_path):
with open(config_path, 'r') as cf:
config_obj = yaml.load(cf, Loader=yaml.FullLoader)
print(config_obj)
return config_obj
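# Minimal usage sketch (the path is illustrative):
#   config = load_config('config.yaml')
#   frame_dir = config['VIDEO']['FRAME_SAVE_DIR']
# yaml.load with Loader=yaml.FullLoader avoids the unsafe default loader while
# still handling the plain scalars and nested mappings used in config.yaml.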
def load_argue_model(config):
cls_argue_path = config['MODEL']['CLS_ARGUE']
with tf.Graph().as_default():
if os.path.isfile(cls_argue_path):
print('Model filename: %s' % cls_argue_path)
with tf.gfile.GFile(cls_argue_path, 'rb') as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
tf.import_graph_def(graph_def, name='')
x = tf.get_default_graph().get_tensor_by_name("x_batch:0")
output = tf.get_default_graph().get_tensor_by_name("output/BiasAdd:0")
tf_config = tf.ConfigProto()  # renamed to avoid shadowing the YAML config argument
tf_config.gpu_options.allow_growth = False
sess = tf.Session(config=tf_config)
return x, output, sess
from ops.basic_ops import *
import torch
class Identity(torch.nn.Module):
def forward(self, input):
return input
class SegmentConsensus(torch.nn.Module):
def __init__(self, consensus_type, dim=1):
super(SegmentConsensus, self).__init__()
self.consensus_type = consensus_type
self.dim = dim
self.shape = None
def forward(self, input_tensor):
self.shape = input_tensor.size()
if self.consensus_type == 'avg':
output = input_tensor.mean(dim=self.dim, keepdim=True)
elif self.consensus_type == 'identity':
output = input_tensor
else:
output = None
return output
class ConsensusModule(torch.nn.Module):
def __init__(self, consensus_type, dim=1):
super(ConsensusModule, self).__init__()
self.consensus_type = consensus_type if consensus_type != 'rnn' else 'identity'
self.dim = dim
def forward(self, input):
return SegmentConsensus(self.consensus_type, self.dim)(input)
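# Example (hedged sketch): with consensus_type='avg' and dim=1, per-segment
# scores of shape (batch, num_segments, num_class) are reduced to
# (batch, 1, num_class) by averaging over the segment dimension:
#   scores = torch.randn(4, 8, 3)
#   fused = ConsensusModule('avg')(scores)   # -> torch.Size([4, 1, 3])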
# Code for "TSM: Temporal Shift Module for Efficient Video Understanding"
# arXiv:1811.08383
# Ji Lin*, Chuang Gan, Song Han
# {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu
import torch.utils.data as data
from PIL import Image
import os
import numpy as np
from numpy.random import randint
class VideoRecord(object):
def __init__(self, row):
self._data = row
@property
def path(self):
return self._data[0]
@property
def num_frames(self):
return int(self._data[1])
class TSNDataSet(data.Dataset):
def __init__(self, root_path, list_file,
num_segments=3, new_length=1, modality='RGB',
image_tmpl='img_{:05d}.jpg', transform=None,
random_shift=True, test_mode=False,
remove_missing=False, dense_sample=False, twice_sample=False):
self.root_path = root_path
self.list_file = list_file
self.num_segments = num_segments
self.new_length = new_length
self.modality = modality
self.image_tmpl = image_tmpl
self.transform = transform
self.random_shift = random_shift
self.test_mode = test_mode
self.remove_missing = remove_missing
self.dense_sample = dense_sample # using dense sample as I3D
self.twice_sample = twice_sample # twice sample for more validation
if self.dense_sample:
print('=> Using dense sample for the dataset...')
if self.twice_sample:
print('=> Using twice sample for the dataset...')
if self.modality == 'RGBDiff':
self.new_length += 1 # Diff needs one more image to calculate diff
self._parse_list()
def _load_image(self, directory, idx):
if self.modality == 'RGB' or self.modality == 'RGBDiff':
try:
return [Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(idx))).convert('RGB')]
except Exception:
print('error loading image:', os.path.join(self.root_path, directory, self.image_tmpl.format(idx)))
return [Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(1))).convert('RGB')]
elif self.modality == 'Flow':
if self.image_tmpl == 'flow_{}_{:05d}.jpg': # ucf
x_img = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format('x', idx))).convert(
'L')
y_img = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format('y', idx))).convert(
'L')
elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg': # something v1 flow
x_img = Image.open(os.path.join(self.root_path, '{:06d}'.format(int(directory)), self.image_tmpl.
format(int(directory), 'x', idx))).convert('L')
y_img = Image.open(os.path.join(self.root_path, '{:06d}'.format(int(directory)), self.image_tmpl.
format(int(directory), 'y', idx))).convert('L')
else:
try:
# idx_skip = 1 + (idx-1)*5
flow = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(idx))).convert(
'RGB')
except Exception:
print('error loading flow file:',
os.path.join(self.root_path, directory, self.image_tmpl.format(idx)))
flow = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(1))).convert('RGB')
# the input flow file is RGB image with (flow_x, flow_y, blank) for each channel
flow_x, flow_y, _ = flow.split()
x_img = flow_x.convert('L')
y_img = flow_y.convert('L')
return [x_img, y_img]
def _parse_list(self):
# check the frame number is large >3:
tmp = [x.strip().split(' ') for x in open(self.list_file)]
if not self.test_mode or self.remove_missing:
tmp = [item for item in tmp if int(item[1]) >= 3]
self.video_list = [VideoRecord(item) for item in tmp]
if self.image_tmpl == '{:06d}-{}_{:05d}.jpg':
for v in self.video_list:
v._data[1] = int(v._data[1]) / 2
print('video number:%d' % (len(self.video_list)))
def _sample_indices(self, record):
"""
:param record: VideoRecord
:return: list
"""
if self.dense_sample: # i3d dense sample
sample_pos = max(1, 1 + record.num_frames - 64)
t_stride = 64 // self.num_segments
start_idx = 0 if sample_pos == 1 else np.random.randint(0, sample_pos - 1)
offsets = [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)]
return np.array(offsets) + 1
else: # normal sample
average_duration = (record.num_frames - self.new_length + 1) // self.num_segments
if average_duration > 0:
offsets = np.multiply(list(range(self.num_segments)), average_duration) + randint(average_duration,
size=self.num_segments)
elif record.num_frames > self.num_segments:
offsets = np.sort(randint(record.num_frames - self.new_length + 1, size=self.num_segments))
else:
offsets = np.zeros((self.num_segments,))
return offsets + 1
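# Worked example for the "normal sample" branch (numbers illustrative): with
# num_frames=40, new_length=1 and num_segments=8, average_duration is 5, so the
# base offsets are [0, 5, 10, ..., 35] plus a random jitter in [0, 5) per
# segment, and the returned indices are 1-based frame numbers.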
def _get_val_indices(self, record):
if self.dense_sample: # i3d dense sample
sample_pos = max(1, 1 + record.num_frames - 64)
t_stride = 64 // self.num_segments
start_idx = 0 if sample_pos == 1 else np.random.randint(0, sample_pos - 1)
offsets = [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)]
return np.array(offsets) + 1
else:
if record.num_frames > self.num_segments + self.new_length - 1:
tick = (record.num_frames - self.new_length + 1) / float(self.num_segments)
offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)])
else:
offsets = np.zeros((self.num_segments,))
return offsets + 1
def _get_test_indices(self, record):
if self.dense_sample:
sample_pos = max(1, 1 + record.num_frames - 64)
t_stride = 64 // self.num_segments
start_list = np.linspace(0, sample_pos - 1, num=10, dtype=int)
offsets = []
for start_idx in start_list.tolist():
offsets += [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)]
return np.array(offsets) + 1
elif self.twice_sample:
tick = (record.num_frames - self.new_length + 1) / float(self.num_segments)
offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)] +
[int(tick * x) for x in range(self.num_segments)])
return offsets + 1
else:
tick = (record.num_frames - self.new_length + 1) / float(self.num_segments)
offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)])
return offsets + 1
def __getitem__(self, index):
record = self.video_list[index]
# check this is a legit video folder
if self.image_tmpl == 'flow_{}_{:05d}.jpg':
file_name = self.image_tmpl.format('x', 1)
full_path = os.path.join(self.root_path, record.path, file_name)
elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg':
file_name = self.image_tmpl.format(int(record.path), 'x', 1)
full_path = os.path.join(self.root_path, '{:06d}'.format(int(record.path)), file_name)
else:
file_name = self.image_tmpl.format(1)
full_path = os.path.join(self.root_path, record.path, file_name)
while not os.path.exists(full_path):
print('################## Not Found:', os.path.join(self.root_path, record.path, file_name))
index = np.random.randint(len(self.video_list))
record = self.video_list[index]
if self.image_tmpl == 'flow_{}_{:05d}.jpg':
file_name = self.image_tmpl.format('x', 1)
full_path = os.path.join(self.root_path, record.path, file_name)
elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg':
file_name = self.image_tmpl.format(int(record.path), 'x', 1)
full_path = os.path.join(self.root_path, '{:06d}'.format(int(record.path)), file_name)
else:
file_name = self.image_tmpl.format(1)
full_path = os.path.join(self.root_path, record.path, file_name)
if not self.test_mode:
segment_indices = self._sample_indices(record) if self.random_shift else self._get_val_indices(record)
else:
segment_indices = self._get_test_indices(record)
return self.get(record, segment_indices)
def get(self, record, indices):
images = list()
for seg_ind in indices:
p = int(seg_ind)
for i in range(self.new_length):
seg_imgs = self._load_image(record.path, p)
images.extend(seg_imgs)
if p < record.num_frames:
p += 1
process_data = self.transform(images)
return record.path, process_data
def __len__(self):
return len(self.video_list)
# Code for "TSM: Temporal Shift Module for Efficient Video Understanding"
# arXiv:1811.08383
# Ji Lin*, Chuang Gan, Song Han
# {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu
import os
ROOT_DATASET = '/data1/action_1_images/' # '/data/jilin/'
def return_ucf101(modality):
filename_categories = 'labels/classInd.txt'
if modality == 'RGB':
root_data = ROOT_DATASET + 'images'
filename_imglist_train = 'file_list/ucf101_rgb_train_split_1.txt'
filename_imglist_val = 'file_list/ucf101_rgb_val_split_1.txt'
prefix = 'img_{:05d}.jpg'
elif modality == 'Flow':
root_data = ROOT_DATASET + 'UCF101/jpg'
filename_imglist_train = 'UCF101/file_list/ucf101_flow_train_split_1.txt'
filename_imglist_val = 'UCF101/file_list/ucf101_flow_val_split_1.txt'
prefix = 'flow_{}_{:05d}.jpg'
else:
raise NotImplementedError('no such modality:' + modality)
return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
def return_hmdb51(modality):
filename_categories = 51
if modality == 'RGB':
root_data = ROOT_DATASET + 'HMDB51/images'
filename_imglist_train = 'HMDB51/splits/hmdb51_rgb_train_split_1.txt'
filename_imglist_val = 'HMDB51/splits/hmdb51_rgb_val_split_1.txt'
prefix = 'img_{:05d}.jpg'
elif modality == 'Flow':
root_data = ROOT_DATASET + 'HMDB51/images'
filename_imglist_train = 'HMDB51/splits/hmdb51_flow_train_split_1.txt'
filename_imglist_val = 'HMDB51/splits/hmdb51_flow_val_split_1.txt'
prefix = 'flow_{}_{:05d}.jpg'
else:
raise NotImplementedError('no such modality:' + modality)
return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
def return_something(modality):
filename_categories = 'something/v1/category.txt'
if modality == 'RGB':
root_data = ROOT_DATASET + 'something/v1/20bn-something-something-v1'
filename_imglist_train = 'something/v1/train_videofolder.txt'
filename_imglist_val = 'something/v1/val_videofolder.txt'
prefix = '{:05d}.jpg'
elif modality == 'Flow':
root_data = ROOT_DATASET + 'something/v1/20bn-something-something-v1-flow'
filename_imglist_train = 'something/v1/train_videofolder_flow.txt'
filename_imglist_val = 'something/v1/val_videofolder_flow.txt'
prefix = '{:06d}-{}_{:05d}.jpg'
else:
print('no such modality:'+modality)
raise NotImplementedError
return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
def return_somethingv2(modality):
filename_categories = 'something/v2/category.txt'
if modality == 'RGB':
root_data = ROOT_DATASET + 'something/v2/20bn-something-something-v2-frames'
filename_imglist_train = 'something/v2/train_videofolder.txt'
filename_imglist_val = 'something/v2/val_videofolder.txt'
prefix = '{:06d}.jpg'
elif modality == 'Flow':
root_data = ROOT_DATASET + 'something/v2/20bn-something-something-v2-flow'
filename_imglist_train = 'something/v2/train_videofolder_flow.txt'
filename_imglist_val = 'something/v2/val_videofolder_flow.txt'
prefix = '{:06d}.jpg'
else:
raise NotImplementedError('no such modality:'+modality)
return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
def return_jester(modality):
filename_categories = 'jester/category.txt'
if modality == 'RGB':
prefix = '{:05d}.jpg'
root_data = ROOT_DATASET + 'jester/20bn-jester-v1'
filename_imglist_train = 'jester/train_videofolder.txt'
filename_imglist_val = 'jester/val_videofolder.txt'
else:
raise NotImplementedError('no such modality:'+modality)
return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
def return_kinetics(modality):
filename_categories = 400
if modality == 'RGB':
root_data = ROOT_DATASET + 'kinetics/images'
filename_imglist_train = 'kinetics/labels/train_videofolder.txt'
filename_imglist_val = 'kinetics/labels/val_videofolder.txt'
prefix = 'img_{:05d}.jpg'
else:
raise NotImplementedError('no such modality:' + modality)
return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
def return_dataset(dataset, modality):
dict_single = {'jester': return_jester, 'something': return_something, 'somethingv2': return_somethingv2,
'ucf101': return_ucf101, 'hmdb51': return_hmdb51,
'kinetics': return_kinetics}
if dataset in dict_single:
file_categories, file_imglist_train, file_imglist_val, root_data, prefix = dict_single[dataset](modality)
else:
raise ValueError('Unknown dataset '+dataset)
file_imglist_train = os.path.join(ROOT_DATASET, file_imglist_train)
file_imglist_val = os.path.join(ROOT_DATASET, file_imglist_val)
if isinstance(file_categories, str):
file_categories = os.path.join(ROOT_DATASET, file_categories)
with open(file_categories) as f:
lines = f.readlines()
categories = [item.rstrip() for item in lines]
else: # number of categories
categories = [None] * file_categories
n_class = len(categories)
print('{}: {} classes'.format(dataset, n_class))
return n_class, file_imglist_train, file_imglist_val, root_data, prefix
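# Usage sketch (dataset and modality must be combinations handled above):
#   n_class, train_list, val_list, root_data, prefix = return_dataset('ucf101', 'RGB')
# When the category entry is a file path the class names are read from disk
# under ROOT_DATASET; when it is an integer (hmdb51, kinetics) only the class
# count is kept.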
# Non-local block using embedded gaussian
# Code from
# https://github.com/AlexHex7/Non-local_pytorch/blob/master/Non-Local_pytorch_0.3.1/lib/non_local_embedded_gaussian.py
import torch
from torch import nn
from torch.nn import functional as F
class _NonLocalBlockND(nn.Module):
def __init__(self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True):
super(_NonLocalBlockND, self).__init__()
assert dimension in [1, 2, 3]
self.dimension = dimension
self.sub_sample = sub_sample
self.in_channels = in_channels
self.inter_channels = inter_channels
if self.inter_channels is None:
self.inter_channels = in_channels // 2
if self.inter_channels == 0:
self.inter_channels = 1
if dimension == 3:
conv_nd = nn.Conv3d
max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
bn = nn.BatchNorm3d
elif dimension == 2:
conv_nd = nn.Conv2d
max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
bn = nn.BatchNorm2d
else:
conv_nd = nn.Conv1d
max_pool_layer = nn.MaxPool1d(kernel_size=(2))
bn = nn.BatchNorm1d
self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
kernel_size=1, stride=1, padding=0)
if bn_layer:
self.W = nn.Sequential(
conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
kernel_size=1, stride=1, padding=0),
bn(self.in_channels)
)
nn.init.constant_(self.W[1].weight, 0)
nn.init.constant_(self.W[1].bias, 0)
else:
self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
kernel_size=1, stride=1, padding=0)
nn.init.constant_(self.W.weight, 0)
nn.init.constant_(self.W.bias, 0)
self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
kernel_size=1, stride=1, padding=0)
self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
kernel_size=1, stride=1, padding=0)
if sub_sample:
self.g = nn.Sequential(self.g, max_pool_layer)
self.phi = nn.Sequential(self.phi, max_pool_layer)
def forward(self, x):
'''
:param x: (b, c, t, h, w)
:return:
'''
batch_size = x.size(0)
g_x = self.g(x).view(batch_size, self.inter_channels, -1)
g_x = g_x.permute(0, 2, 1)
theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
theta_x = theta_x.permute(0, 2, 1)
phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
f = torch.matmul(theta_x, phi_x)
f_div_C = F.softmax(f, dim=-1)
y = torch.matmul(f_div_C, g_x)
y = y.permute(0, 2, 1).contiguous()
y = y.view(batch_size, self.inter_channels, *x.size()[2:])
W_y = self.W(y)
z = W_y + x
return z
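# The forward pass above is the embedded-Gaussian non-local operation: theta
# and phi project the input to (b, thw, c') and (b, c', thw), f = theta @ phi
# is the pairwise affinity matrix, softmax over the last dimension normalizes
# it, the attended values from g are projected back by W, and the result is
# added residually (z = W(y) + x).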
class NONLocalBlock1D(_NonLocalBlockND):
def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
super(NONLocalBlock1D, self).__init__(in_channels,
inter_channels=inter_channels,
dimension=1, sub_sample=sub_sample,
bn_layer=bn_layer)
class NONLocalBlock2D(_NonLocalBlockND):
def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
super(NONLocalBlock2D, self).__init__(in_channels,
inter_channels=inter_channels,
dimension=2, sub_sample=sub_sample,
bn_layer=bn_layer)
class NONLocalBlock3D(_NonLocalBlockND):
def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
super(NONLocalBlock3D, self).__init__(in_channels,
inter_channels=inter_channels,
dimension=3, sub_sample=sub_sample,
bn_layer=bn_layer)
class NL3DWrapper(nn.Module):
def __init__(self, block, n_segment):
super(NL3DWrapper, self).__init__()
self.block = block
self.nl = NONLocalBlock3D(block.bn3.num_features)
self.n_segment = n_segment
def forward(self, x):
x = self.block(x)
nt, c, h, w = x.size()
x = x.view(nt // self.n_segment, self.n_segment, c, h, w).transpose(1, 2) # n, c, t, h, w
x = self.nl(x)
x = x.transpose(1, 2).contiguous().view(nt, c, h, w)
return x
def make_non_local(net, n_segment):
import torchvision
import archs
if isinstance(net, torchvision.models.ResNet):
net.layer2 = nn.Sequential(
NL3DWrapper(net.layer2[0], n_segment),
net.layer2[1],
NL3DWrapper(net.layer2[2], n_segment),
net.layer2[3],
)
net.layer3 = nn.Sequential(
NL3DWrapper(net.layer3[0], n_segment),
net.layer3[1],
NL3DWrapper(net.layer3[2], n_segment),
net.layer3[3],
NL3DWrapper(net.layer3[4], n_segment),
net.layer3[5],
)
else:
raise NotImplementedError
if __name__ == '__main__':
from torch.autograd import Variable
import torch
sub_sample = True
bn_layer = True
img = Variable(torch.zeros(2, 3, 20))
net = NONLocalBlock1D(3, sub_sample=sub_sample, bn_layer=bn_layer)
out = net(img)
print(out.size())
img = Variable(torch.zeros(2, 3, 20, 20))
net = NONLocalBlock2D(3, sub_sample=sub_sample, bn_layer=bn_layer)
out = net(img)
print(out.size())
img = Variable(torch.randn(2, 3, 10, 20, 20))
net = NONLocalBlock3D(3, sub_sample=sub_sample, bn_layer=bn_layer)
out = net(img)
print(out.size())
# Code for "TSM: Temporal Shift Module for Efficient Video Understanding"
# arXiv:1811.08383
# Ji Lin*, Chuang Gan, Song Han
# {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu
import torch
import torch.nn as nn
import torch.nn.functional as F
class TemporalShift(nn.Module):
def __init__(self, net, n_segment=3, n_div=8, inplace=False):
super(TemporalShift, self).__init__()
self.net = net
self.n_segment = n_segment
self.fold_div = n_div
self.inplace = inplace
if inplace:
print('=> Using in-place shift...')
print('=> Using fold div: {}'.format(self.fold_div))
def forward(self, x):
x = self.shift(x, self.n_segment, fold_div=self.fold_div, inplace=self.inplace)
return self.net(x)
@staticmethod
def shift(x, n_segment, fold_div=3, inplace=False):
nt, c, h, w = x.size()
n_batch = nt // n_segment
x = x.view(n_batch, n_segment, c, h, w)
fold = c // fold_div
if inplace:
# Due to some out of order error when performing parallel computing.
# May need to write a CUDA kernel.
raise NotImplementedError
# out = InplaceShift.apply(x, fold)
else:
out = torch.zeros_like(x)
out[:, :-1, :fold] = x[:, 1:, :fold] # shift left
out[:, 1:, fold: 2 * fold] = x[:, :-1, fold: 2 * fold] # shift right
out[:, :, 2 * fold:] = x[:, :, 2 * fold:] # not shift
return out.view(nt, c, h, w)
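# Concretely, with fold_div=8 the channel axis is split so that the first 1/8
# of channels at time t take their values from t+1 (shift left), the next 1/8
# take them from t-1 (shift right), and the remaining 6/8 are copied unchanged;
# boundary frames with no neighbour stay zero because `out` starts from
# torch.zeros_like(x).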
class InplaceShift(torch.autograd.Function):
# Special thanks to @raoyongming for the help to this function
@staticmethod
def forward(ctx, input, fold):
# not support higher order gradient
# input = input.detach_()
ctx.fold_ = fold
n, t, c, h, w = input.size()
buffer = input.data.new(n, t, fold, h, w).zero_()
buffer[:, :-1] = input.data[:, 1:, :fold]
input.data[:, :, :fold] = buffer
buffer.zero_()
buffer[:, 1:] = input.data[:, :-1, fold: 2 * fold]
input.data[:, :, fold: 2 * fold] = buffer
return input
@staticmethod
def backward(ctx, grad_output):
# grad_output = grad_output.detach_()
fold = ctx.fold_
n, t, c, h, w = grad_output.size()
buffer = grad_output.data.new(n, t, fold, h, w).zero_()
buffer[:, 1:] = grad_output.data[:, :-1, :fold]
grad_output.data[:, :, :fold] = buffer
buffer.zero_()
buffer[:, :-1] = grad_output.data[:, 1:, fold: 2 * fold]
grad_output.data[:, :, fold: 2 * fold] = buffer
return grad_output, None
class TemporalPool(nn.Module):
def __init__(self, net, n_segment):
super(TemporalPool, self).__init__()
self.net = net
self.n_segment = n_segment
def forward(self, x):
x = self.temporal_pool(x, n_segment=self.n_segment)
return self.net(x)
@staticmethod
def temporal_pool(x, n_segment):
nt, c, h, w = x.size()
n_batch = nt // n_segment
x = x.view(n_batch, n_segment, c, h, w).transpose(1, 2) # n, c, t, h, w
x = F.max_pool3d(x, kernel_size=(3, 1, 1), stride=(2, 1, 1), padding=(1, 0, 0))
x = x.transpose(1, 2).contiguous().view(nt // 2, c, h, w)
return x
def make_temporal_shift(net, n_segment, n_div=8, place='blockres', temporal_pool=False):
if temporal_pool:
n_segment_list = [n_segment, n_segment // 2, n_segment // 2, n_segment // 2]
else:
n_segment_list = [n_segment] * 4
assert n_segment_list[-1] > 0
print('=> n_segment per stage: {}'.format(n_segment_list))
import torchvision
if isinstance(net, torchvision.models.ResNet):
if place == 'block':
def make_block_temporal(stage, this_segment):
blocks = list(stage.children())
print('=> Processing stage with {} blocks'.format(len(blocks)))
for i, b in enumerate(blocks):
blocks[i] = TemporalShift(b, n_segment=this_segment, n_div=n_div)
return nn.Sequential(*(blocks))
net.layer1 = make_block_temporal(net.layer1, n_segment_list[0])
net.layer2 = make_block_temporal(net.layer2, n_segment_list[1])
net.layer3 = make_block_temporal(net.layer3, n_segment_list[2])
net.layer4 = make_block_temporal(net.layer4, n_segment_list[3])
elif 'blockres' in place:
n_round = 1
if len(list(net.layer3.children())) >= 23:
n_round = 2
print('=> Using n_round {} to insert temporal shift'.format(n_round))
def make_block_temporal(stage, this_segment):
blocks = list(stage.children())
print('=> Processing stage with {} blocks residual'.format(len(blocks)))
for i, b in enumerate(blocks):
if i % n_round == 0:
blocks[i].conv1 = TemporalShift(b.conv1, n_segment=this_segment, n_div=n_div)
return nn.Sequential(*blocks)
net.layer1 = make_block_temporal(net.layer1, n_segment_list[0])
net.layer2 = make_block_temporal(net.layer2, n_segment_list[1])
net.layer3 = make_block_temporal(net.layer3, n_segment_list[2])
net.layer4 = make_block_temporal(net.layer4, n_segment_list[3])
else:
raise NotImplementedError(place)
def make_temporal_pool(net, n_segment):
import torchvision
if isinstance(net, torchvision.models.ResNet):
print('=> Injecting nonlocal pooling')
net.layer2 = TemporalPool(net.layer2, n_segment)
else:
raise NotImplementedError
if __name__ == '__main__':
# test inplace shift v.s. vanilla shift
tsm1 = TemporalShift(nn.Sequential(), n_segment=8, n_div=8, inplace=False)
tsm2 = TemporalShift(nn.Sequential(), n_segment=8, n_div=8, inplace=True)
print('=> Testing CPU...')
# test forward
with torch.no_grad():
for i in range(10):
x = torch.rand(2 * 8, 3, 224, 224)
y1 = tsm1(x)
y2 = tsm2(x)
assert torch.norm(y1 - y2).item() < 1e-5
# test backward
with torch.enable_grad():
for i in range(10):
x1 = torch.rand(2 * 8, 3, 224, 224)
x1.requires_grad_()
x2 = x1.clone()
y1 = tsm1(x1)
y2 = tsm2(x2)
grad1 = torch.autograd.grad((y1 ** 2).mean(), [x1])[0]
grad2 = torch.autograd.grad((y2 ** 2).mean(), [x2])[0]
assert torch.norm(grad1 - grad2).item() < 1e-5
print('=> Testing GPU...')
tsm1.cuda()
tsm2.cuda()
# test forward
with torch.no_grad():
for i in range(10):
x = torch.rand(2 * 8, 3, 224, 224).cuda()
y1 = tsm1(x)
y2 = tsm2(x)
assert torch.norm(y1 - y2).item() < 1e-5
# test backward
with torch.enable_grad():
for i in range(10):
x1 = torch.rand(2 * 8, 3, 224, 224).cuda()
x1.requires_grad_()
x2 = x1.clone()
y1 = tsm1(x1)
y2 = tsm2(x2)
grad1 = torch.autograd.grad((y1 ** 2).mean(), [x1])[0]
grad2 = torch.autograd.grad((y2 ** 2).mean(), [x2])[0]
assert torch.norm(grad1 - grad2).item() < 1e-5
print('Test passed.')
import numpy as np
def softmax(scores):
es = np.exp(scores - scores.max(axis=-1)[..., None])
return es / es.sum(axis=-1)[..., None]
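# Doctest-style sketch (values rounded):
#   >>> softmax(np.array([1.0, 2.0, 3.0]))
#   array([0.09003057, 0.24472847, 0.66524096])
# Subtracting the row-wise max before exponentiating keeps the computation
# numerically stable for large scores without changing the result.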
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0)
res.append(correct_k.mul_(100.0 / batch_size))
return res
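# Usage sketch: for logits of shape (batch, num_class) and integer targets,
#   prec1, prec5 = accuracy(output, target, topk=(1, 5))
# returns percentages (0-100) of samples whose true class appears in the
# top-1 / top-5 predictions.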
import os
import cv2
import numpy as np
import pickle
def start_filter(config):
cls_class_path = config['MODEL']['CLS_PERSON']
feature_save_dir = config['VIDEO']['FACE_FEATURE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['PERSON']['RESULT_FILE']
feature_name = config['PERSON']['DATA_NAME']
xgboost_model = pickle.load(open(cls_class_path, "rb"))
result_file_path = os.path.join(frame_list_dir, result_file_name)
result_file = open(result_file_path, 'w')
feature_path = os.path.join(feature_save_dir, feature_name)
val_annotation_pairs = np.load(feature_path, allow_pickle=True, encoding='latin1')
X_val = []
Y_val = []
Y_names = []
for j in range(len(val_annotation_pairs)):
pair = val_annotation_pairs[j]
X_val.append(np.squeeze(pair[0]))
Y_val.append(pair[1])
Y_names.append(pair[2])
X_val = np.array(X_val)
y_pred = xgboost_model.predict_proba(X_val)
for i, Y_name in enumerate(Y_names):
result_file.write(Y_name + ' ')
result_file.write(str(y_pred[i][0]) + ',' + str(y_pred[i][1]) + ',' + str(y_pred[i][2]) + '\n')
result_file.close()
import os
import torch.optim
import numpy as np
import torch.nn.parallel
from ops.models import TSN
from ops.transforms import *
from ops.dataset import TSNDataSet
from torch.nn import functional as F
def gen_file_list(frame_save_dir, frame_list_dir):
val_path = os.path.join(frame_list_dir, 'val.txt')
video_names = os.listdir(frame_save_dir)
ucf101_rgb_val_file = open(val_path, 'w')
for video_name in video_names:
images_dir = os.path.join(frame_save_dir, video_name)
ucf101_rgb_val_file.write(video_name)
ucf101_rgb_val_file.write(' ')
ucf101_rgb_val_file.write(str(len(os.listdir(images_dir))))
ucf101_rgb_val_file.write('\n')
ucf101_rgb_val_file.close()
return val_path
def start_filter(config):
arch = config['POSE']['ARCH']
prefix = config['VIDEO']['PREFIX']
modality = config['POSE']['MODALITY']
test_crop = config['POSE']['TEST_CROP']
batch_size = config['POSE']['BATCH_SIZE']
weights_path = config['MODEL']['CLS_POSE']
test_segment = config['POSE']['TEST_SEGMENT']
frame_save_dir = config['VIDEO']['POSE_FRAME_SAVE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['POSE']['RESULT_FILE']
workers = 8
num_class = 3
shift_div = 8
img_feature_dim = 256
softmax = False
is_shift = True
full_res = False
non_local = False
dense_sample = False
twice_sample = False
val_list = gen_file_list(frame_save_dir, frame_list_dir)
result_file_path = os.path.join(frame_list_dir, result_file_name)
pretrain = 'imagenet'
shift_place = 'blockres'
crop_fusion_type = 'avg'
net = TSN(num_class, test_segment if is_shift else 1, modality,
base_model=arch,
consensus_type=crop_fusion_type,
img_feature_dim=img_feature_dim,
pretrain=pretrain,
is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
non_local=non_local,
)
checkpoint = torch.load(weights_path)
checkpoint = checkpoint['state_dict']
base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
'base_model.classifier.bias': 'new_fc.bias',
}
for k, v in replace_dict.items():
if k in base_dict:
base_dict[v] = base_dict.pop(k)
net.load_state_dict(base_dict)
input_size = net.scale_size if full_res else net.input_size
if test_crop == 1:
cropping = torchvision.transforms.Compose([
GroupScale(net.scale_size),
GroupCenterCrop(input_size),
])
elif test_crop == 3:  # full-resolution sampling without flip (3 crops)
cropping = torchvision.transforms.Compose([
GroupFullResSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 5: # do not flip, so only 5 crops
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 10:
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size)
])
else:
raise ValueError("Only 1, 5, 10 crops are supported while we got {}".format(test_crop))
data_loader = torch.utils.data.DataLoader(
TSNDataSet(frame_save_dir, val_list, num_segments=test_segment,
new_length=1 if modality == "RGB" else 5,
modality=modality,
image_tmpl=prefix,
test_mode=True,
remove_missing=False,
transform=torchvision.transforms.Compose([
cropping,
Stack(roll=(arch in ['BNInception', 'InceptionV3'])),
ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])),
GroupNormalize(net.input_mean, net.input_std),
]), dense_sample=dense_sample, twice_sample=twice_sample),
batch_size=batch_size, shuffle=False,
num_workers=workers, pin_memory=True,
)
net = torch.nn.DataParallel(net.cuda())
net.eval()
data_gen = enumerate(data_loader)
max_num = len(data_loader.dataset)
result_file = open(result_file_path, 'w')
for i, data_pair in data_gen:
directory, data = data_pair
with torch.no_grad():
if i >= max_num:
break
num_crop = test_crop
if dense_sample:
num_crop *= 10 # 10 clips for testing when using dense sample
if twice_sample:
num_crop *= 2
if modality == 'RGB':
length = 3
elif modality == 'Flow':
length = 10
elif modality == 'RGBDiff':
length = 18
else:
raise ValueError("Unknown modality " + modality)
data_in = data.view(-1, length, data.size(2), data.size(3))
if is_shift:
data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3))
rst, feature = net(data_in)
rst = rst.reshape(batch_size, num_crop, -1).mean(1)
if softmax:
# take the softmax to normalize the output to probability
rst = F.softmax(rst, dim=1)
rst = rst.data.cpu().numpy().copy()
if net.module.is_shift:
rst = rst.reshape(batch_size, num_class)
else:
rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class))
proba = np.squeeze(rst)
proba = np.exp(proba)/sum(np.exp(proba))
result_file.write(str(directory[0]) + ' ')
result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')
result_file.close()
print('pose filter end')
import os
import cv2
import load_util
import media_util
import numpy as np
from sklearn.metrics import confusion_matrix
import fighting_filter, fighting_2_filter, emotion_filter, argue_filter, audio_filter, class_filter
import video_filter, pose_filter, flow_filter
def accuracy_cal(config):
label_file_path = config['VIDEO']['LABEL_PATH']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
final_file_name = config['AUDIO']['RESULT_FILE']
final_file_path = os.path.join(frame_list_dir, final_file_name)
final_file_lines = open(final_file_path).readlines()
label_file_lines = open(label_file_path).readlines()
final_pairs = {line.strip().split(' ')[0]: line.strip().split(' ')[1] for line in final_file_lines}
lines_num = len(label_file_lines) - 1
hit = 0
for i, label_line in enumerate(label_file_lines):
if i == 0:
continue
file, label = label_line.strip().split(' ')
final_pre = final_pairs[file]
final_pre_class = np.argmax(np.array(final_pre.split(','), dtype=np.float64)) + 1  # compare probabilities numerically, not as strings
print(final_pre_class, label)
if final_pre_class == int(label):
hit += 1
return hit/lines_num
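# accuracy_cal assumes the label file has a header line followed by
# "<file> <label>" rows with 1-based labels, and that the result file written
# by the filters has "<file> <p1,p2,p3>" rows; the predicted class is the
# argmax of the comma-separated probabilities plus one.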
def main():
config_path = r'config.yaml'
config = load_util.load_config(config_path)
media_util.extract_wav(config)
media_util.extract_frame(config)
media_util.extract_frame_pose(config)
media_util.extract_is10(config)
media_util.extract_random_face_feature(config)
media_util.extract_mirror(config)
fighting_2_filter.start_filter(config)
emotion_filter.start_filter(config)
audio_filter.start_filter(config)
class_filter.start_filter(config)
video_filter.start_filter(config)
pose_filter.start_filter(config)
flow_filter.start_filter(config)
acc = accuracy_cal(config)
print(acc)
if __name__ == '__main__':
main()
import os
import torch.optim
import numpy as np
import torch.nn.parallel
from ops.models import TSN
from ops.transforms import *
from ops.dataset import TSNDataSet
from torch.nn import functional as F
def gen_file_list(frame_save_dir, frame_list_dir):
val_path = os.path.join(frame_list_dir, 'val.txt')
video_names = os.listdir(frame_save_dir)
ucf101_rgb_val_file = open(val_path, 'w')
for video_name in video_names:
images_dir = os.path.join(frame_save_dir, video_name)
ucf101_rgb_val_file.write(video_name)
ucf101_rgb_val_file.write(' ')
ucf101_rgb_val_file.write(str(len(os.listdir(images_dir))))
ucf101_rgb_val_file.write('\n')
ucf101_rgb_val_file.close()
return val_path
def start_filter(config):
arch = config['VIDEO_FILTER']['ARCH']
prefix = config['VIDEO']['PREFIX']
modality = config['VIDEO_FILTER']['MODALITY']
test_crop = config['VIDEO_FILTER']['TEST_CROP']
batch_size = config['VIDEO_FILTER']['BATCH_SIZE']
weights_path = config['MODEL']['CLS_VIDEO']
test_segment = config['VIDEO_FILTER']['TEST_SEGMENT']
frame_save_dir = config['VIDEO']['FRAME_SAVE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['VIDEO_FILTER']['RESULT_FILE']
workers = 8
num_class = 3
shift_div = 8
img_feature_dim = 256
softmax = False
is_shift = True
full_res = False
non_local = False
dense_sample = False
twice_sample = False
val_list = gen_file_list(frame_save_dir, frame_list_dir)
result_file_path = os.path.join(frame_list_dir, result_file_name)
pretrain = 'imagenet'
shift_place = 'blockres'
crop_fusion_type = 'avg'
net = TSN(num_class, test_segment if is_shift else 1, modality,
base_model=arch,
consensus_type=crop_fusion_type,
img_feature_dim=img_feature_dim,
pretrain=pretrain,
is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
non_local=non_local,
)
checkpoint = torch.load(weights_path)
checkpoint = checkpoint['state_dict']
base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
'base_model.classifier.bias': 'new_fc.bias',
}
for k, v in replace_dict.items():
if k in base_dict:
base_dict[v] = base_dict.pop(k)
net.load_state_dict(base_dict)
input_size = net.scale_size if full_res else net.input_size
if test_crop == 1:
cropping = torchvision.transforms.Compose([
GroupScale(net.scale_size),
GroupCenterCrop(input_size),
])
elif test_crop == 3:  # full-resolution sampling without flip (3 crops)
cropping = torchvision.transforms.Compose([
GroupFullResSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 5: # do not flip, so only 5 crops
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 10:
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size)
])
else:
raise ValueError("Only 1, 5, 10 crops are supported while we got {}".format(test_crop))
data_loader = torch.utils.data.DataLoader(
TSNDataSet(frame_save_dir, val_list, num_segments=test_segment,
new_length=1 if modality == "RGB" else 5,
modality=modality,
image_tmpl=prefix,
test_mode=True,
remove_missing=False,
transform=torchvision.transforms.Compose([
cropping,
Stack(roll=(arch in ['BNInception', 'InceptionV3'])),
ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])),
GroupNormalize(net.input_mean, net.input_std),
]), dense_sample=dense_sample, twice_sample=twice_sample),
batch_size=batch_size, shuffle=False,
num_workers=workers, pin_memory=True,
)
net = torch.nn.DataParallel(net.cuda())
net.eval()
data_gen = enumerate(data_loader)
max_num = len(data_loader.dataset)
result_file = open(result_file_path, 'w')
for i, data_pair in data_gen:
directory, data = data_pair
with torch.no_grad():
if i >= max_num:
break
num_crop = test_crop
if dense_sample:
num_crop *= 10 # 10 clips for testing when using dense sample
if twice_sample:
num_crop *= 2
if modality == 'RGB':
length = 3
elif modality == 'Flow':
length = 10
elif modality == 'RGBDiff':
length = 18
else:
raise ValueError("Unknown modality " + modality)
data_in = data.view(-1, length, data.size(2), data.size(3))
if is_shift:
data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3))
rst, feature = net(data_in)
rst = rst.reshape(batch_size, num_crop, -1).mean(1)
if softmax:
# take the softmax to normalize the output to probability
rst = F.softmax(rst, dim=1)
rst = rst.data.cpu().numpy().copy()
if net.module.is_shift:
rst = rst.reshape(batch_size, num_class)
else:
rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class))
proba = np.squeeze(rst)
proba = np.exp(proba)/sum(np.exp(proba))
result_file.write(str(directory[0]) + ' ')
result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')
result_file.close()
print('video filter end')