cb29b6d7 by jiangwenqiang

first commit

1 parent 78b00ada
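# ---------------------------------------------------------------------------
# Audio filter (file name not shown in this view): scores per-video IS10 audio
# features with a pre-trained SVM (start_filter) or XGBoost model
# (start_filter_xgboost) and writes class probabilities to the AUDIO RESULT_FILE.
# ---------------------------------------------------------------------------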
import os
import pickle
import numpy as np
import joblib  # sklearn.externals.joblib was removed from recent scikit-learn; standalone joblib is the drop-in replacement
def start_filter(config):
cls_audio_path = config['MODEL']['CLS_AUDIO']
feature_save_dir = config['VIDEO']['IS10_FEATURE_NP_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['AUDIO']['RESULT_FILE']
feature_name = config['AUDIO']['DATA_NAME']
svm_clf = joblib.load(cls_audio_path)
result_file_path = os.path.join(frame_list_dir, result_file_name)
result_file = open(result_file_path, 'w')
feature_path = os.path.join(feature_save_dir, feature_name)
val_annotation_pairs = np.load(feature_path, allow_pickle=True, encoding='latin1')
for pair in val_annotation_pairs:
v = pair[0]
n = pair[2]
feature_np = np.reshape(v, (1, -1))
res = svm_clf.predict_proba(feature_np)
proba = np.squeeze(res)
# class_pre = svm_clf.predict(feature_np)
result_file.write(str(pair[2])[:-4] + ' ')
result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')
result_file.close()
def start_filter_xgboost(config):
cls_class_path = config['MODEL']['CLS_AUDIO']
feature_save_dir = config['VIDEO']['IS10_FEATURE_NP_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['AUDIO']['RESULT_FILE']
feature_name = config['AUDIO']['DATA_NAME']
xgboost_model = pickle.load(open(cls_class_path, "rb"))
result_file_path = os.path.join(frame_list_dir, result_file_name)
result_file = open(result_file_path, 'w')
feature_path = os.path.join(feature_save_dir, feature_name)
val_annotation_pairs = np.load(feature_path, allow_pickle=True, encoding='latin1')
X_val = []
Y_names = []
for pair in val_annotation_pairs:
n, v = next(iter(pair.items()))  # assumes each entry is a single {name: feature} dict; a bare `pair.items()` cannot be unpacked into name and feature
X_val.append(v)
Y_names.append(n)
X_val = np.array(X_val)
y_pred = xgboost_model.predict_proba(X_val)
for i, Y_name in enumerate(Y_names):
result_file.write(Y_name + ' ')
result_file.write(str(y_pred[i][0]) + ',' + str(y_pred[i][1]) + ',' + str(y_pred[i][2]) + '\n')
result_file.close()
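# ---------------------------------------------------------------------------
# Background (BG) filter (file name not shown): loads the pre-computed features
# in BG DATA_NAME and writes XGBoost class probabilities to BG RESULT_FILE.
# ---------------------------------------------------------------------------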
import os
import cv2
import numpy as np
import pickle
def start_filter(config):
cls_class_path = config['MODEL']['CLS_BG']
feature_save_dir = config['VIDEO']['FACE_FEATURE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['BG']['RESULT_FILE']
feature_name = config['BG']['DATA_NAME']
xgboost_model = pickle.load(open(cls_class_path, "rb"))
result_file_path = os.path.join(frame_list_dir, result_file_name)
result_file = open(result_file_path, 'w')
feature_path = os.path.join(feature_save_dir, feature_name)
val_annotation_pairs = np.load(feature_path, allow_pickle=True)
X_val = []
Y_val = []
Y_names = []
for j in range(len(val_annotation_pairs)):
pair = val_annotation_pairs[j]
X_val.append(np.squeeze(pair[0]))
Y_val.append(pair[1])
Y_names.append(pair[2])
X_val = np.array(X_val)
y_pred = xgboost_model.predict_proba(X_val)
for i, Y_name in enumerate(Y_names):
result_file.write(Y_name + ' ')
result_file.write(str(y_pred[i][0]) + ',' + str(y_pred[i][1]) + ',' + str(y_pred[i][2]) + '\n')
result_file.close()
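# ---------------------------------------------------------------------------
# Class filter (file name not shown): same pattern as the BG filter, but uses
# CLS_CLASS and writes hard predictions (predict) rather than probabilities.
# ---------------------------------------------------------------------------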
import os
import pickle
import numpy as np
def start_filter(config):
cls_class_path = config['MODEL']['CLS_CLASS']
feature_save_dir = config['VIDEO']['CLASS_FEATURE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['CLASS']['RESULT_FILE']
feature_name = config['CLASS']['DATA_NAME']
xgboost_model = pickle.load(open(cls_class_path, "rb"))
result_file_path = os.path.join(frame_list_dir, result_file_name)
result_file = open(result_file_path, 'w')
feature_path = os.path.join(feature_save_dir, feature_name)
val_annotation_pairs = np.load(feature_path, allow_pickle=True)
X_val = []
Y_val = []
Y_names = []
for j in range(len(val_annotation_pairs)):
pair = val_annotation_pairs[j]
X_val.append(pair[0])
Y_val.append(pair[1])
Y_names.append(pair[2])
X_val = np.array(X_val)
y_pred = xgboost_model.predict(X_val)
for i, Y_name in enumerate(Y_names):
result_file.write(Y_name + ' ')
result_file.write(str(y_pred[i]) + '\n')
result_file.close()
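# ---------------------------------------------------------------------------
# YAML configuration (file name not shown): model checkpoint paths, thresholds,
# and per-filter settings referenced by the scripts in this commit.
# ---------------------------------------------------------------------------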
MODEL:
CLS_FIGHTING_2: '/home/jwq/models/cls_fighting_2/cls_fighting_2_v0.0.1.pth'
CLS_EMOTION: '/home/jwq/models/cls_emotion/v0.1.0.m'
FEATURE_EMOTION: '/home/jwq/models/feature_emotion/FerPlus3.h5'
CLS_AUDIO: '/home/jwq/models/cls_audio/v0.0.1.m'
CLS_CLASS: '/home/jwq/models/cls_class/v_0.0.1_xgb.pkl'
CLS_VIDEO: '/home/jwq/models/cls_video/v0.4.1.pth'
CLS_POSE: '/home/jwq/models/cls_pose/v0.0.1.pth'
CLS_FLOW: '/home/jwq/models/cls_flow/v0.1.1.pth'
CLS_BG: '/home/jwq/models/cls_bg/v0.1.1.pkl'
CLS_PERSON: '/home/jwq/models/cls_person/v0.1.1.pkl'
THRESHOLD:
FACES_THRESHOLD: 0.6
FILTER:
VIDEO:
VIDEO_DIR: '/home/jwq/Desktop/VGAF_EmotiW/Val'
LABEL_PATH: '/home/jwq/Desktop/VGAF_EmotiW/Val_labels.txt'
VIDEO_SAVE_DIR: '/home/jwq/Desktop/tmp/video'
AUDIO_SAVE_DIR: '/home/jwq/npys/'
FRAME_SAVE_DIR: '/home/jwq/Desktop/tmp/frame'
# FRAME_SAVE_DIR: '/home/jwq/Desktop/VGAF_EmotiW_class/train_frame'
FLOW_SAVE_DIR: '/home/jwq/Desktop/tmp/flow'
POSE_FRAME_SAVE_DIR: '/home/jwq/Desktop/tmp/pose_frame'
FRAME_LIST_DIR: '/home/jwq/Desktop/tmp/file_list'
IS10_FEATURE_NP_DIR: '/home/jwq/npys'
IS10_FEATURE_CSV_DIR: '/home/jwq/Desktop/tmp/is10'
# FACE_FEATURE_DIR: '/home/jwq/Desktop/tmp/face_feature_retina'
# FACE_FEATURE_DIR: '/data2/retinaface/random_face_frame_features/'
FACE_FEATURE_DIR: '/data1/segment/'
# FACE_FEATURE_DIR: '/home/jwq/npys/'
FACE_IMAGE_DIR: '/data2/retinaface/train/'
CLASS_FEATURE_DIR: '/home/jwq/Desktop/tmp/class'
PREFIX: 'img_{:05d}.jpg'
FLOW_PREFIX: 'flow_{}_{:05d}.jpg'
THREAD_NUM: 10
FPS: 5
VIDEO_FILTER:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet50'
RESULT_FILE: 'video_filter.txt'
VIDEO_1_FILTER:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet34'
RESULT_FILE: 'video_1_filter.txt'
EMOTION:
INTERVAL: 1
INPUT_SIZE: 224
RESULT_FILE: 'emotion_filter.txt'
EMOTION_1:
RESULT_FILE: 'emotion_1_filter.txt'
DATA_NAME: 'val.npy'
ARGUE:
DIMENSION: 1582
RESULT_FILE: 'argue_filter.txt'
FIGHTING:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet50'
RESULT_FILE: 'fighting_filter.txt'
FIGHTING_2:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet50'
RESULT_FILE: 'fighting_2_filter.txt'
MEETING:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet50'
RESULT_FILE: 'meeting_filter.txt'
TROOPS:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet50'
RESULT_FILE: 'troops_filter.txt'
FLOW:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'Flow'
ARCH: 'resnet50'
RESULT_FILE: 'flow_filter.txt'
FINAL:
RESULT_FILE: 'final.txt'
ERROR_FILE: 'error.txt'
SIM_FILE: 'image_sim.txt'
AUDIO:
RESULT_FILE: 'audio.txt'
OPENSMILE_DIR: '/home/jwq/Downloads/opensmile-2.3.0'
DATA_NAME: 'val.npy'
CLASS:
RESULT_FILE: 'class.txt'
DATA_NAME: 'val _reannotation.npy'
POSE:
TEST_SEGMENT: 8
TEST_CROP: 1
BATCH_SIZE: 1
INPUT_SIZE: 224
MODALITY: 'RGB'
ARCH: 'resnet50'
RESULT_FILE: 'pose_filter.txt'
BG:
RESULT_FILE: 'bg_filter.txt'
DATA_NAME: 'bg_val_feature.npy'
PERSON:
RESULT_FILE: 'person_filter.txt'
DATA_NAME: 'person_val_feature.npy'
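# A minimal sketch (assumptions: the YAML above is saved as 'config.yaml' and
# PyYAML is installed) of how the filters in this commit consume the config;
# it mirrors the load_config() helper defined further down.
import os
import yaml

with open('config.yaml', 'r') as cf:
    config = yaml.load(cf, Loader=yaml.FullLoader)

cls_audio_path = config['MODEL']['CLS_AUDIO']               # SVM used by the audio filter
result_path = os.path.join(config['VIDEO']['FRAME_LIST_DIR'],
                           config['AUDIO']['RESULT_FILE'])  # where audio.txt is written

# ---------------------------------------------------------------------------
# Emotion filter (file name not shown): pools per-face CNN features per video
# and scores them with the CLS_EMOTION SVM.
# ---------------------------------------------------------------------------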
import os
import cv2
import numpy as np
from keras.models import Model
from keras.models import load_model
import joblib  # replaces the removed sklearn.externals.joblib
from tensorflow.keras.preprocessing.image import img_to_array
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
class FeatureExtractor(object):
def __init__(self, input_size=224, out_put_layer='avg_pool', model_path='FerPlus3.h5'):
self.model = load_model(model_path)
self.input_size = input_size
self.model_inter = Model(inputs=self.model.input, outputs=self.model.get_layer(out_put_layer).output)
def inference(self, image):
image = cv2.resize(image, (self.input_size, self.input_size))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = image.astype("float") / 255.0
image = img_to_array(image)
image = np.expand_dims(image, axis=0)
feature = self.model_inter.predict(image)[0]
return feature
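# Pools a stack of per-face features into one video-level descriptor by
# concatenating the per-dimension mean, max, min, and std statistics.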
def features2feature(pics_features):
pics_features = np.array(pics_features)
fea_mean = pics_features.mean(axis=0)
fea_max = np.amax(pics_features, axis=0)
fea_min = np.amin(pics_features, axis=0)
fea_std = pics_features.std(axis=0)
return np.concatenate((fea_mean, fea_max, fea_min, fea_std), axis=1).reshape(1, -1)
def start_filter(config):
cls_emotion_path = config['MODEL']['CLS_EMOTION']
face_feature_dir = config['VIDEO']['FACE_FEATURE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['EMOTION']['RESULT_FILE']
svm_clf = joblib.load(cls_emotion_path)
result_file_path = os.path.join(frame_list_dir, result_file_name)
result_file = open(result_file_path, 'w')
face_feature_names = os.listdir(face_feature_dir)
for face_feature in face_feature_names:
face_feature_path = os.path.join(face_feature_dir, face_feature)
features_np = np.load(face_feature_path, allow_pickle=True)
feature = features2feature(features_np)
res = svm_clf.predict_proba(feature)
proba = np.squeeze(res)
# class_pre = svm_clf.predict(feature)
result_file.write(face_feature[:-4] + ' ')
result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')
result_file.close()
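# ---------------------------------------------------------------------------
# Fighting-2 filter (file name not shown): runs a TSM/TSN action-recognition
# checkpoint (CLS_FIGHTING_2) over the extracted RGB frames and writes
# two-class softmax probabilities per video.
# ---------------------------------------------------------------------------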
import os
import numpy as np
import torch.optim
import torch.nn.parallel
from ops.models import TSN
from ops.transforms import *
from ops.dataset import TSNDataSet
from torch.nn import functional as F
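# Writes one "<video_name> <frame_count>" line per extracted-frame directory and
# returns the path of the generated list, which TSNDataSet consumes below.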
def gen_file_list(frame_save_dir, frame_list_dir):
val_path = os.path.join(frame_list_dir, 'val.txt')
video_names = os.listdir(frame_save_dir)
ucf101_rgb_val_file = open(val_path, 'w')
for video_name in video_names:
images_dir = os.path.join(frame_save_dir, video_name)
ucf101_rgb_val_file.write(video_name)
ucf101_rgb_val_file.write(' ')
ucf101_rgb_val_file.write(str(len(os.listdir(images_dir))))
ucf101_rgb_val_file.write('\n')
ucf101_rgb_val_file.close()
return val_path
def start_filter(config):
arch = config['FIGHTING_2']['ARCH']
prefix = config['VIDEO']['PREFIX']
modality = config['FIGHTING_2']['MODALITY']
test_crop = config['FIGHTING_2']['TEST_CROP']
batch_size = config['FIGHTING_2']['BATCH_SIZE']
weights_path = config['MODEL']['CLS_FIGHTING_2']
test_segment = config['FIGHTING_2']['TEST_SEGMENT']
frame_save_dir = config['VIDEO']['FRAME_SAVE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['FIGHTING_2']['RESULT_FILE']
workers = 8
num_class = 2
shift_div = 8
img_feature_dim = 256
softmax = False
is_shift = True
full_res = False
non_local = False
dense_sample = False
twice_sample = False
val_list = gen_file_list(frame_save_dir, frame_list_dir)
result_file_path = os.path.join(frame_list_dir, result_file_name)
pretrain = 'imagenet'
shift_place = 'blockres'
crop_fusion_type = 'avg'
net = TSN(num_class, test_segment if is_shift else 1, modality,
base_model=arch,
consensus_type=crop_fusion_type,
img_feature_dim=img_feature_dim,
pretrain=pretrain,
is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
non_local=non_local,
)
checkpoint = torch.load(weights_path)
checkpoint = checkpoint['state_dict']
base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
'base_model.classifier.bias': 'new_fc.bias',
}
for k, v in replace_dict.items():
if k in base_dict:
base_dict[v] = base_dict.pop(k)
net.load_state_dict(base_dict)
input_size = net.scale_size if full_res else net.input_size
if test_crop == 1:
cropping = torchvision.transforms.Compose([
GroupScale(net.scale_size),
GroupCenterCrop(input_size),
])
elif test_crop == 3:  # do not flip, so only 3 full-resolution crops
cropping = torchvision.transforms.Compose([
GroupFullResSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 5: # do not flip, so only 5 crops
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 10:
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size)
])
else:
raise ValueError("Only 1, 5, 10 crops are supported while we got {}".format(test_crop))
data_loader = torch.utils.data.DataLoader(
TSNDataSet(frame_save_dir, val_list, num_segments=test_segment,
new_length=1 if modality == "RGB" else 5,
modality=modality,
image_tmpl=prefix,
test_mode=True,
remove_missing=False,
transform=torchvision.transforms.Compose([
cropping,
Stack(roll=(arch in ['BNInception', 'InceptionV3'])),
ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])),
GroupNormalize(net.input_mean, net.input_std),
]), dense_sample=dense_sample, twice_sample=twice_sample),
batch_size=batch_size, shuffle=False,
num_workers=workers, pin_memory=True,
)
net = torch.nn.DataParallel(net.cuda())
net.eval()
data_gen = enumerate(data_loader)
max_num = len(data_loader.dataset)
result_file = open(result_file_path, 'w')
for i, data_pair in data_gen:
directory, data = data_pair
with torch.no_grad():
if i >= max_num:
break
num_crop = test_crop
if dense_sample:
num_crop *= 10 # 10 clips for testing when using dense sample
if twice_sample:
num_crop *= 2
if modality == 'RGB':
length = 3
elif modality == 'Flow':
length = 10
elif modality == 'RGBDiff':
length = 18
else:
raise ValueError("Unknown modality " + modality)
data_in = data.view(-1, length, data.size(2), data.size(3))
if is_shift:
data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3))
rst, feature = net(data_in)
rst = rst.reshape(batch_size, num_crop, -1).mean(1)
if softmax:
# take the softmax to normalize the output to probability
rst = F.softmax(rst, dim=1)
rst = rst.data.cpu().numpy().copy()
if net.module.is_shift:
rst = rst.reshape(batch_size, num_class)
else:
rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class))
proba = np.squeeze(rst)
print(proba)
proba = np.exp(proba)/sum(np.exp(proba))
result_file.write(str(directory[0]) + ' ')
result_file.write(str(proba[0]) + ',' + str(proba[1]) + '\n')
result_file.close()
print('fighting filter end')
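# ---------------------------------------------------------------------------
# Optical-flow filter (file name not shown): same TSM/TSN inference pipeline as
# above, but with Flow modality, three output classes, and the CLS_FLOW weights.
# ---------------------------------------------------------------------------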
import os
import numpy as np
import torch.optim
import torch.nn.parallel
from ops.models import TSN
from ops.transforms import *
from ops.dataset import TSNDataSet
from torch.nn import functional as F
def gen_file_list(frame_save_dir, frame_list_dir):
val_path = os.path.join(frame_list_dir, 'flow_val.txt')
video_names = os.listdir(frame_save_dir)
ucf101_rgb_val_file = open(val_path, 'w')
for video_name in video_names:
images_dir = os.path.join(frame_save_dir, video_name)
ucf101_rgb_val_file.write(video_name)
ucf101_rgb_val_file.write(' ')
ori_list = os.listdir(images_dir)
select_list = [element for element in ori_list if 'x' in element]
ucf101_rgb_val_file.write(str(len(select_list)))
ucf101_rgb_val_file.write('\n')
ucf101_rgb_val_file.close()
return val_path
def start_filter(config):
arch = config['FLOW']['ARCH']
prefix = config['VIDEO']['FLOW_PREFIX']
modality = config['FLOW']['MODALITY']
test_crop = config['FLOW']['TEST_CROP']
batch_size = config['FLOW']['BATCH_SIZE']
weights_path = config['MODEL']['CLS_FLOW']
test_segment = config['FLOW']['TEST_SEGMENT']
frame_save_dir = config['VIDEO']['FLOW_SAVE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['FLOW']['RESULT_FILE']
workers = 8
num_class = 3
shift_div = 8
img_feature_dim = 256
softmax = False
is_shift = True
full_res = False
non_local = False
dense_sample = False
twice_sample = False
val_list = gen_file_list(frame_save_dir, frame_list_dir)
result_file_path = os.path.join(frame_list_dir, result_file_name)
pretrain = 'imagenet'
shift_place = 'blockres'
crop_fusion_type = 'avg'
net = TSN(num_class, test_segment if is_shift else 1, modality,
base_model=arch,
consensus_type=crop_fusion_type,
img_feature_dim=img_feature_dim,
pretrain=pretrain,
is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
non_local=non_local,
)
checkpoint = torch.load(weights_path)
checkpoint = checkpoint['state_dict']
base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
'base_model.classifier.bias': 'new_fc.bias',
}
for k, v in replace_dict.items():
if k in base_dict:
base_dict[v] = base_dict.pop(k)
net.load_state_dict(base_dict)
input_size = net.scale_size if full_res else net.input_size
if test_crop == 1:
cropping = torchvision.transforms.Compose([
GroupScale(net.scale_size),
GroupCenterCrop(input_size),
])
elif test_crop == 3:  # do not flip, so only 3 full-resolution crops
cropping = torchvision.transforms.Compose([
GroupFullResSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 5: # do not flip, so only 5 crops
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 10:
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size)
])
else:
raise ValueError("Only 1, 3, 5, 10 crops are supported while we got {}".format(test_crop))
data_loader = torch.utils.data.DataLoader(
TSNDataSet(frame_save_dir, val_list, num_segments=test_segment,
new_length=1 if modality == "RGB" else 5,
modality=modality,
image_tmpl=prefix,
test_mode=True,
remove_missing=False,
transform=torchvision.transforms.Compose([
cropping,
Stack(roll=(arch in ['BNInception', 'InceptionV3'])),
ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])),
GroupNormalize(net.input_mean, net.input_std),
]), dense_sample=dense_sample, twice_sample=twice_sample),
batch_size=batch_size, shuffle=False,
num_workers=workers, pin_memory=True,
)
net = torch.nn.DataParallel(net.cuda())
net.eval()
data_gen = enumerate(data_loader)
max_num = len(data_loader.dataset)
result_file = open(result_file_path, 'w')
for i, data_pair in data_gen:
directory, data = data_pair
with torch.no_grad():
if i >= max_num:
break
num_crop = test_crop
if dense_sample:
num_crop *= 10 # 10 clips for testing when using dense sample
if twice_sample:
num_crop *= 2
if modality == 'RGB':
length = 3
elif modality == 'Flow':
length = 10
elif modality == 'RGBDiff':
length = 18
else:
raise ValueError("Unknown modality " + modality)
data_in = data.view(-1, length, data.size(2), data.size(3))
if is_shift:
data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3))
rst, feature = net(data_in)
rst = rst.reshape(batch_size, num_crop, -1).mean(1)
if softmax:
# take the softmax to normalize the output to probability
rst = F.softmax(rst, dim=1)
rst = rst.data.cpu().numpy().copy()
if net.module.is_shift:
rst = rst.reshape(batch_size, num_class)
else:
rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class))
proba = np.squeeze(rst)
proba = np.exp(proba)/sum(np.exp(proba))
result_file.write(str(directory[0]) + ' ')
result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')
result_file.close()
print('flow filter end')
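# ---------------------------------------------------------------------------
# Shared helpers (file name not shown): load_config() reads the YAML above, and
# load_argue_model() restores a frozen TensorFlow 1.x graph for the argue filter.
# ---------------------------------------------------------------------------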
import os
import cv2
import yaml
import tensorflow as tf
def load_config(config_path):
with open(config_path, 'r') as cf:
config_obj = yaml.load(cf, Loader=yaml.FullLoader)
print(config_obj)
return config_obj
def load_argue_model(config):
cls_argue_path = config['MODEL']['CLS_ARGUE']
with tf.Graph().as_default():
if os.path.isfile(cls_argue_path):
print('Model filename: %s' % cls_argue_path)
with tf.gfile.GFile(cls_argue_path, 'rb') as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
tf.import_graph_def(graph_def, name='')
x = tf.get_default_graph().get_tensor_by_name("x_batch:0")
output = tf.get_default_graph().get_tensor_by_name("output/BiasAdd:0")
tf_config = tf.ConfigProto()  # use a separate name so the YAML `config` argument is not shadowed
tf_config.gpu_options.allow_growth = False
sess = tf.Session(config=tf_config)
return x, output, sess
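# ---------------------------------------------------------------------------
# Pre-processing / feature extraction (file name not shown): extracts audio
# tracks, frames, openSMILE IS10 features, face crops and their CNN features,
# and TSN video features consumed by the filters above.
# ---------------------------------------------------------------------------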
import os
import cv2
import random
import shutil
import subprocess
import numpy as np
import torch.optim
from tqdm import tqdm
import torch.nn.parallel
from ops.models import TSN
from ops.transforms import *
from functools import partial
from mtcnn.mtcnn import MTCNN
from keras.models import Model
from multiprocessing import Pool
from keras.models import load_model
import joblib  # replaces the removed sklearn.externals.joblib
from tensorflow.keras.preprocessing.image import img_to_array
from ops.dataset import TSNDataSet
from torch.nn import functional as F
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
class FeatureExtractor(object):
def __init__(self, input_size=224, out_put_layer='global_average_pooling2d_1', model_path='InceptionResNetV2-final.h5'):
self.model = load_model(model_path)
self.input_size = input_size
self.model_inter = Model(inputs=self.model.input, outputs=self.model.get_layer(out_put_layer).output)
def inference(self, image):
image = cv2.resize(image, (self.input_size, self.input_size))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = image.astype("float") / 255.0
image = img_to_array(image)
image = np.expand_dims(image, axis=0)
feature = self.model_inter.predict(image)[0]
return feature
def extract_wav(config):
video_dir = config['VIDEO']['VIDEO_DIR']
video_save_dir = config['VIDEO']['VIDEO_SAVE_DIR']
audio_save_dir = config['VIDEO']['AUDIO_SAVE_DIR']
assert os.path.exists(video_dir)
video_names = os.listdir(video_dir)
for video_index, video_name in enumerate(video_names):
file_name = video_name.split('.')[0]
video_path = os.path.join(video_dir, video_name)
assert os.path.exists(audio_save_dir)
assert os.path.exists(video_save_dir)
audio_name = file_name + '.wav'
audio_save_path = os.path.join(audio_save_dir, audio_name)
video_save_path = os.path.join(video_save_dir, video_name)
command = 'ffmpeg -i {} -f wav -ar 16000 {}'.format(video_path, audio_save_path)
os.popen(command)
shutil.copyfile(video_path, video_save_path)
def video2frame(file_name, class_path, dst_class_path):
if '.mp4' not in file_name:
return
name, ext = os.path.splitext(file_name)
dst_directory_path = os.path.join(dst_class_path, name)
video_file_path = os.path.join(class_path, file_name)
try:
if os.path.exists(dst_directory_path):
if not os.path.exists(os.path.join(dst_directory_path, 'img_00001.jpg')):
subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True)
print('remove {}'.format(dst_directory_path))
os.mkdir(dst_directory_path)
else:
print('*** convert has been done: {}'.format(dst_directory_path))
return
else:
os.mkdir(dst_directory_path)
except:
print(dst_directory_path)
return
cmd = 'ffmpeg -i \"{}\" -threads 1 -vf scale=-1:331 -q:v 0 \"{}/img_%05d.jpg\"'.format(video_file_path,
dst_directory_path)
# print(cmd)
subprocess.call(cmd, shell=True,
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
def extract_frame(config):
video_save_dir = config['VIDEO']['VIDEO_SAVE_DIR']
frame_save_dir = config['VIDEO']['FRAME_SAVE_DIR']
n_thread = config['VIDEO']['THREAD_NUM']
assert os.path.exists(video_save_dir)
video_names = os.listdir(video_save_dir)
if not os.path.exists(frame_save_dir):
os.mkdir(frame_save_dir)
p = Pool(n_thread)
worker = partial(video2frame, class_path=video_save_dir, dst_class_path=frame_save_dir)
for _ in tqdm(p.imap_unordered(worker, video_names), total=len(video_names)):
pass
p.close()
p.join()
def extract_frame_pose(config):
video_save_dir = config['VIDEO']['VIDEO_SAVE_DIR']
frame_save_dir = config['VIDEO']['POSE_FRAME_SAVE_DIR']
n_thread = config['VIDEO']['THREAD_NUM']
assert os.path.exists(video_save_dir)
video_names = os.listdir(video_save_dir)
if not os.path.exists(frame_save_dir):
os.mkdir(frame_save_dir)
p = Pool(n_thread)
worker = partial(video2frame, class_path=video_save_dir, dst_class_path=frame_save_dir)
for _ in tqdm(p.imap_unordered(worker, video_names), total=len(video_names)):
pass
p.close()
p.join()
def extract_is10(config):
open_smile_dir = config['AUDIO']['OPENSMILE_DIR']
audio_save_dir = config['VIDEO']['AUDIO_SAVE_DIR']
is10_save_dir = config['VIDEO']['IS10_FEATURE_CSV_DIR']
assert os.path.exists(audio_save_dir)
audio_names = os.listdir(audio_save_dir)
if not os.path.exists(is10_save_dir):
os.mkdir(is10_save_dir)
for audio_name in audio_names:
audio_save_path = os.path.join(audio_save_dir, audio_name)
csv_name = audio_name[:-4] + '.csv'
csv_path = os.path.join(is10_save_dir, csv_name)
smile_config = '{}/config/IS10_paraling.conf'.format(open_smile_dir)  # separate name avoids shadowing the YAML `config` argument
command = '{}/SMILExtract -C {} -I {} -O {}'.format(open_smile_dir, smile_config, audio_save_path, csv_path)
os.popen(command)
def extract_face_feature(config):
feature_emotion_path = config['MODEL']['FEATURE_EMOTION']
frame_save_dir = config['VIDEO']['FRAME_SAVE_DIR']
face_feature_dir = config['VIDEO']['FACE_FEATURE_DIR']
interval = config['EMOTION']['INTERVAL']
input_size = config['EMOTION']['INPUT_SIZE']
prefix = config['VIDEO']['PREFIX']
feature_extractor = FeatureExtractor(
input_size=input_size, out_put_layer='global_average_pooling2d_1', model_path=feature_emotion_path)
mtcnn_detector = MTCNN()
video_names = os.listdir(frame_save_dir)
for video_index, video_name in enumerate(video_names):
print('{}/{}'.format(video_index, len(video_names)))
video_dir = os.path.join(frame_save_dir, video_name)
frame_names = os.listdir(video_dir)
end = 0
features = []
while end < len(frame_names):
if end % interval == 0:
frame_name = prefix.format(end + 1)
frame_path = os.path.join(video_dir, frame_name)
frame = cv2.imread(frame_path)
img_h, img_w, img_c = frame.shape
detect_faces = mtcnn_detector.detect_faces(frame)
for i, e in enumerate(detect_faces):
x1, y1, w, h = e['box']
x1 = x1 if x1 > 0 else 0
y1 = y1 if y1 > 0 else 0
x1 = x1 if x1 < img_w else img_w
y1 = y1 if y1 < img_h else img_h
face = frame[y1:y1 + h, x1:x1 + w, :]
if face.size == 0:  # `face is []` is always False; check for an empty crop instead
continue
features.append(feature_extractor.inference(face)[0])
# top_5 = {}
# for i, e in enumerate(detect_faces):
# x1, y1, w, h = e['box']
# x1 = x1 if x1 > 0 else 0
# y1 = y1 if y1 > 0 else 0
# x1 = x1 if x1 < img_w else img_w
# y1 = y1 if y1 < img_h else img_h
#
# top_5[w*h] = [x1, y1, w, h]
#
# top_5 = sorted(top_5.items(), key=lambda d:d[0], reverse=True)
# j = 0
# for v in top_5:
# if j > 5:
# break
# x1, y1, w, h = v[1]
# face = frame[y1:y1+h, x1:x1+w, :]
# if face is []:
# continue
# features.append(feature_extractor.inference(face)[0])
end += 1
if len(features) == 0:
continue
features_np = np.array(features)
face_feature_path = os.path.join(face_feature_dir, video_name + '.npy')
np.save(face_feature_path, features_np)
def extract_random_face_feature(config):
feature_emotion_path = config['MODEL']['FEATURE_EMOTION']
face_save_dir = config['VIDEO']['FACE_IMAGE_DIR']
face_feature_dir = config['VIDEO']['FACE_FEATURE_DIR']
input_size = config['EMOTION']['INPUT_SIZE']
feature_extractor = FeatureExtractor(
input_size=input_size, out_put_layer='avg_pool', model_path=feature_emotion_path)
video_dirs = []
class_names = os.listdir(face_save_dir)
for class_name in class_names:
class_dir = os.path.join(face_save_dir, class_name)
video_names = os.listdir(class_dir)
for video_name in video_names:
video_dir = os.path.join(class_dir, video_name)
video_dirs.append(video_dir)
for video_dir_index, video_dir in enumerate(video_dirs):
print('{}/{}'.format(video_dir_index, len(video_dirs)))
class_name, video_name = video_dir.split('/')[-2], video_dir.split('/')[-1]
video_file_name = video_name.split('.')[0]
save_class_dir = os.path.join(face_feature_dir, class_name)
face_feature_path = os.path.join(save_class_dir, video_file_name + '.npy')
if os.path.exists(face_feature_path):
print('file already exists')
continue
image_names = os.listdir(video_dir)
image_dirs = []
for image_name in image_names:
image_dir = os.path.join(video_dir, image_name)
image_dirs.append(image_dir)
features = []
for image_dir_index, image_dir in enumerate(image_dirs):
sub_face_names = os.listdir(image_dir)
sub_face_num = len(sub_face_names)
for face_index in range(sub_face_num):
face_path = os.path.join(image_dir, sub_face_names[face_index])
face_image = cv2.imread(face_path)
features.append(feature_extractor.inference(face_image)[0])
face_num = len(features)
random_1 = random.sample(range(face_num), int(0.8 * face_num))
features_random_1 = [features[c] for c in random_1]
random_2 = random.sample(range(face_num), int(0.6 * face_num))
features_random_2 = [features[d] for d in random_2]
random_3 = random.sample(range(face_num), int(0.4 * face_num))
features_random_3 = [features[e] for e in random_3]
if len(features) == 0:
continue
if not os.path.exists(save_class_dir):
os.mkdir(save_class_dir)
features_np = np.array(features)
face_feature_path = os.path.join(save_class_dir, video_file_name + '.npy')
np.save(face_feature_path, features_np)
features_np_random_1 = np.array(features_random_1)
face_feature_1_path = os.path.join(save_class_dir, video_file_name + '_1.npy')
np.save(face_feature_1_path, features_np_random_1)
features_np_random_2 = np.array(features_random_2)
face_feature_2_path = os.path.join(save_class_dir, video_file_name + '_2.npy')
np.save(face_feature_2_path, features_np_random_2)
features_np_random_3 = np.array(features_random_3)
face_feature_3_path = os.path.join(save_class_dir, video_file_name + '_3.npy')
np.save(face_feature_3_path, features_np_random_3)
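# Same mean/max/min/std pooling as features2feature above, but squeezed to a 1-D vector.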
def get_vid_fea(pics_features):
pics_features = np.array(pics_features)
fea_mean = pics_features.mean(axis=0)
fea_max = np.amax(pics_features, axis=0)
fea_min = np.amin(pics_features, axis=0)
fea_std = pics_features.std(axis=0)
feature_concate = np.concatenate((fea_mean, fea_max, fea_min, fea_std), axis=1)
return np.squeeze(feature_concate)
def extract_random_face_and_frame_feature_():
face_feature_dir = r'/data2/3_log-ResNet50/train_mirror/'
new_face_feature_dir = r'/data2/retinaface/random_face_frame_features_train_mirror/'
video_dirs = []
class_names = os.listdir(face_feature_dir)
for class_name in class_names:
class_dir = os.path.join(face_feature_dir, class_name)
video_names = os.listdir(class_dir)
for video_name in video_names:
video_dir = os.path.join(class_dir, video_name)
video_dirs.append(video_dir)
for video_dir in video_dirs:
video_name = video_dir.split('/')[-1]
frame_names = os.listdir(video_dir)
feature = []
for frame_name in frame_names:
feature_dir = os.path.join(video_dir, frame_name)
face_features_names = os.listdir(feature_dir)
for face_features_name in face_features_names:
face_features_path = os.path.join(feature_dir, face_features_name)
feature_np = np.load(face_features_path, allow_pickle=True)
feature.append(feature_np)
feature_num = len(feature)
if feature_num < 4:
continue
random_1 = random.sample(range(feature_num), int(0.9 * feature_num))
features_random_1 = [feature[c] for c in random_1]
random_2 = random.sample(range(feature_num), int(0.7 * feature_num))
features_random_2 = [feature[d] for d in random_2]
random_3 = random.sample(range(feature_num), int(0.5 * feature_num))
features_random_3 = [feature[e] for e in random_3]
video_file_name = video_name.split('.')[0]
features_np = get_vid_fea(feature)
face_feature_path = os.path.join(new_face_feature_dir, video_file_name + '.npy')
np.save(face_feature_path, features_np)
features_np_random_1 = get_vid_fea(features_random_1)
face_feature_1_path = os.path.join(new_face_feature_dir, video_file_name + '_1.npy')
np.save(face_feature_1_path, features_np_random_1)
features_np_random_2 = get_vid_fea(features_random_2)
face_feature_2_path = os.path.join(new_face_feature_dir, video_file_name + '_2.npy')
np.save(face_feature_2_path, features_np_random_2)
features_np_random_3 = get_vid_fea(features_random_3)
face_feature_3_path = os.path.join(new_face_feature_dir, video_file_name + '_3.npy')
np.save(face_feature_3_path, features_np_random_3)
def extract_random_face_and_frame_feature(config):
feature_emotion_path = config['MODEL']['FEATURE_EMOTION']
input_size = config['EMOTION']['INPUT_SIZE']
face_dir = r'/data2/retinaface/train/'
new_face_feature_dir = r'/data2/3_log-ResNet50/train_mirror/'
feature_extractor = FeatureExtractor(
input_size=input_size, out_put_layer='avg_pool', model_path=feature_emotion_path)
sub_face_paths = []
class_names = os.listdir(face_dir)
for class_name in class_names:
class_dir = os.path.join(face_dir, class_name)
video_names = os.listdir(class_dir)
for video_name in video_names:
video_dir = os.path.join(class_dir, video_name)
frame_names = os.listdir(video_dir)
for frame_name in frame_names:
frame_dir = os.path.join(video_dir, frame_name)
sub_face_names = os.listdir(frame_dir)
for sub_face_name in sub_face_names:
sub_face_path = os.path.join(frame_dir, sub_face_name)
sub_face_paths.append(sub_face_path)
for face_index, sub_face_path in enumerate(sub_face_paths):
print('{}/{}'.format(face_index+1, len(sub_face_paths)))
class_name, video_name, frame_name, sub_face_name = sub_face_path.split('/')[-4:]
class_dir = os.path.join(new_face_feature_dir, class_name)
video_dir = os.path.join(class_dir, video_name)
frame_dir = os.path.join(video_dir, frame_name)
sub_face_name = sub_face_name.split('.')[0] + '.npy'
face_feature_save_path = os.path.join(frame_dir, sub_face_name)
if os.path.exists(face_feature_save_path):
print('file exists')
continue
face_image = cv2.imread(sub_face_path)
mirror_face_image = cv2.flip(face_image, 0)  # note: flipCode=0 flips vertically; a horizontal mirror would use flipCode=1
feature = feature_extractor.inference(mirror_face_image)[0]
if not os.path.exists(class_dir):
os.mkdir(class_dir)
if not os.path.exists(video_dir):
os.mkdir(video_dir)
if not os.path.exists(frame_dir):
os.mkdir(frame_dir)
np.save(face_feature_save_path, feature)
def gen_file_list(frame_save_dir, frame_list_dir):
val_path = os.path.join(frame_list_dir, 'train.txt')
video_names = os.listdir(frame_save_dir)
ucf101_rgb_val_file = open(val_path, 'w')
for video_name in video_names:
images_dir = os.path.join(frame_save_dir, video_name)
ucf101_rgb_val_file.write(video_name)
ucf101_rgb_val_file.write(' ')
ucf101_rgb_val_file.write(str(len(os.listdir(images_dir))))
ucf101_rgb_val_file.write('\n')
ucf101_rgb_val_file.close()
return val_path
def extract_video_features(config):
arch = config['FIGHTING']['ARCH']
prefix = config['VIDEO']['PREFIX']
modality = config['VIDEO_FILTER']['MODALITY']
test_crop = config['VIDEO_FILTER']['TEST_CROP']
batch_size = config['VIDEO_FILTER']['BATCH_SIZE']
weights_path = config['MODEL']['CLS_VIDEO']
test_segment = config['VIDEO_FILTER']['TEST_SEGMENT']
frame_save_dir = config['VIDEO']['FRAME_SAVE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
feature_save_dir = r'/home/jwq/Desktop/tmp/video2np/train/'
workers = 8
num_class = 3
shift_div = 8
img_feature_dim = 256
softmax = False
is_shift = True
full_res = False
non_local = False
dense_sample = False
twice_sample = False
val_list = gen_file_list(frame_save_dir, frame_list_dir)
pretrain = 'imagenet'
shift_place = 'blockres'
crop_fusion_type = 'avg'
net = TSN(num_class, test_segment if is_shift else 1, modality,
base_model=arch,
consensus_type=crop_fusion_type,
img_feature_dim=img_feature_dim,
pretrain=pretrain,
is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
non_local=non_local,
)
checkpoint = torch.load(weights_path)
checkpoint = checkpoint['state_dict']
base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
'base_model.classifier.bias': 'new_fc.bias',
}
for k, v in replace_dict.items():
if k in base_dict:
base_dict[v] = base_dict.pop(k)
net.load_state_dict(base_dict)
input_size = net.scale_size if full_res else net.input_size
if test_crop == 1:
cropping = torchvision.transforms.Compose([
GroupScale(net.scale_size),
GroupCenterCrop(input_size),
])
elif test_crop == 3:  # do not flip, so only 3 full-resolution crops
cropping = torchvision.transforms.Compose([
GroupFullResSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 5: # do not flip, so only 5 crops
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 10:
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size)
])
else:
raise ValueError("Only 1, 3, 5, 10 crops are supported while we got {}".format(test_crop))
data_loader = torch.utils.data.DataLoader(
TSNDataSet(frame_save_dir, val_list, num_segments=test_segment,
new_length=1 if modality == "RGB" else 5,
modality=modality,
image_tmpl=prefix,
test_mode=True,
remove_missing=False,
transform=torchvision.transforms.Compose([
cropping,
Stack(roll=(arch in ['BNInception', 'InceptionV3'])),
ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])),
GroupNormalize(net.input_mean, net.input_std),
]), dense_sample=dense_sample, twice_sample=twice_sample),
batch_size=batch_size, shuffle=False,
num_workers=workers, pin_memory=True,
)
net = torch.nn.DataParallel(net.cuda())
net.eval()
data_gen = enumerate(data_loader)
max_num = len(data_loader.dataset)
for i, data_pair in data_gen:
directory, data = data_pair
with torch.no_grad():
if i >= max_num:
break
num_crop = test_crop
if dense_sample:
num_crop *= 10 # 10 clips for testing when using dense sample
if twice_sample:
num_crop *= 2
if modality == 'RGB':
length = 3
elif modality == 'Flow':
length = 10
elif modality == 'RGBDiff':
length = 18
else:
raise ValueError("Unknown modality " + modality)
data_in = data.view(-1, length, data.size(2), data.size(3))
if is_shift:
data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3))
rst, feature = net(data_in)
feature = np.squeeze(feature.cpu())
print(feature.shape)
feature_name = str(directory[0]) + '.npy'
feature_save_path = os.path.join(feature_save_dir, feature_name)
np.save(feature_save_path, feature)
if __name__ == '__main__':
extract_random_face_and_frame_feature_()
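# ---------------------------------------------------------------------------
# ops package __init__ (file name inferred): re-exports the consensus ops below.
# ---------------------------------------------------------------------------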
from ops.basic_ops import *
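# ---------------------------------------------------------------------------
# ops/basic_ops.py (file name inferred): identity and segment-consensus modules
# used by TSN to aggregate per-segment predictions.
# ---------------------------------------------------------------------------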
import torch
class Identity(torch.nn.Module):
def forward(self, input):
return input
class SegmentConsensus(torch.nn.Module):
def __init__(self, consensus_type, dim=1):
super(SegmentConsensus, self).__init__()
self.consensus_type = consensus_type
self.dim = dim
self.shape = None
def forward(self, input_tensor):
self.shape = input_tensor.size()
if self.consensus_type == 'avg':
output = input_tensor.mean(dim=self.dim, keepdim=True)
elif self.consensus_type == 'identity':
output = input_tensor
else:
output = None
return output
class ConsensusModule(torch.nn.Module):
def __init__(self, consensus_type, dim=1):
super(ConsensusModule, self).__init__()
self.consensus_type = consensus_type if consensus_type != 'rnn' else 'identity'
self.dim = dim
def forward(self, input):
return SegmentConsensus(self.consensus_type, self.dim)(input)
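# ---------------------------------------------------------------------------
# ops/dataset.py (file name inferred): TSN dataset that parses the frame lists
# and samples segment indices for RGB/Flow/RGBDiff inputs.
# ---------------------------------------------------------------------------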
# Code for "TSM: Temporal Shift Module for Efficient Video Understanding"
# arXiv:1811.08383
# Ji Lin*, Chuang Gan, Song Han
# {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu
import torch.utils.data as data
from PIL import Image
import os
import numpy as np
from numpy.random import randint
class VideoRecord(object):
def __init__(self, row):
self._data = row
@property
def path(self):
return self._data[0]
@property
def num_frames(self):
return int(self._data[1])
class TSNDataSet(data.Dataset):
def __init__(self, root_path, list_file,
num_segments=3, new_length=1, modality='RGB',
image_tmpl='img_{:05d}.jpg', transform=None,
random_shift=True, test_mode=False,
remove_missing=False, dense_sample=False, twice_sample=False):
self.root_path = root_path
self.list_file = list_file
self.num_segments = num_segments
self.new_length = new_length
self.modality = modality
self.image_tmpl = image_tmpl
self.transform = transform
self.random_shift = random_shift
self.test_mode = test_mode
self.remove_missing = remove_missing
self.dense_sample = dense_sample # using dense sample as I3D
self.twice_sample = twice_sample # twice sample for more validation
if self.dense_sample:
print('=> Using dense sample for the dataset...')
if self.twice_sample:
print('=> Using twice sample for the dataset...')
if self.modality == 'RGBDiff':
self.new_length += 1 # Diff needs one more image to calculate diff
self._parse_list()
def _load_image(self, directory, idx):
if self.modality == 'RGB' or self.modality == 'RGBDiff':
try:
return [Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(idx))).convert('RGB')]
except Exception:
print('error loading image:', os.path.join(self.root_path, directory, self.image_tmpl.format(idx)))
return [Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(1))).convert('RGB')]
elif self.modality == 'Flow':
if self.image_tmpl == 'flow_{}_{:05d}.jpg': # ucf
x_img = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format('x', idx))).convert(
'L')
y_img = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format('y', idx))).convert(
'L')
elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg': # something v1 flow
x_img = Image.open(os.path.join(self.root_path, '{:06d}'.format(int(directory)), self.image_tmpl.
format(int(directory), 'x', idx))).convert('L')
y_img = Image.open(os.path.join(self.root_path, '{:06d}'.format(int(directory)), self.image_tmpl.
format(int(directory), 'y', idx))).convert('L')
else:
try:
# idx_skip = 1 + (idx-1)*5
flow = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(idx))).convert(
'RGB')
except Exception:
print('error loading flow file:',
os.path.join(self.root_path, directory, self.image_tmpl.format(idx)))
flow = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(1))).convert('RGB')
# the input flow file is RGB image with (flow_x, flow_y, blank) for each channel
flow_x, flow_y, _ = flow.split()
x_img = flow_x.convert('L')
y_img = flow_y.convert('L')
return [x_img, y_img]
def _parse_list(self):
# keep only entries whose frame count is at least 3
tmp = [x.strip().split(' ') for x in open(self.list_file)]
if not self.test_mode or self.remove_missing:
tmp = [item for item in tmp if int(item[1]) >= 3]
self.video_list = [VideoRecord(item) for item in tmp]
if self.image_tmpl == '{:06d}-{}_{:05d}.jpg':
for v in self.video_list:
v._data[1] = int(v._data[1]) / 2
print('video number:%d' % (len(self.video_list)))
def _sample_indices(self, record):
"""
:param record: VideoRecord
:return: list
"""
if self.dense_sample: # i3d dense sample
sample_pos = max(1, 1 + record.num_frames - 64)
t_stride = 64 // self.num_segments
start_idx = 0 if sample_pos == 1 else np.random.randint(0, sample_pos - 1)
offsets = [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)]
return np.array(offsets) + 1
else: # normal sample
average_duration = (record.num_frames - self.new_length + 1) // self.num_segments
if average_duration > 0:
offsets = np.multiply(list(range(self.num_segments)), average_duration) + randint(average_duration,
size=self.num_segments)
elif record.num_frames > self.num_segments:
offsets = np.sort(randint(record.num_frames - self.new_length + 1, size=self.num_segments))
else:
offsets = np.zeros((self.num_segments,))
return offsets + 1
def _get_val_indices(self, record):
if self.dense_sample: # i3d dense sample
sample_pos = max(1, 1 + record.num_frames - 64)
t_stride = 64 // self.num_segments
start_idx = 0 if sample_pos == 1 else np.random.randint(0, sample_pos - 1)
offsets = [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)]
return np.array(offsets) + 1
else:
if record.num_frames > self.num_segments + self.new_length - 1:
tick = (record.num_frames - self.new_length + 1) / float(self.num_segments)
offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)])
else:
offsets = np.zeros((self.num_segments,))
return offsets + 1
def _get_test_indices(self, record):
if self.dense_sample:
sample_pos = max(1, 1 + record.num_frames - 64)
t_stride = 64 // self.num_segments
start_list = np.linspace(0, sample_pos - 1, num=10, dtype=int)
offsets = []
for start_idx in start_list.tolist():
offsets += [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)]
return np.array(offsets) + 1
elif self.twice_sample:
tick = (record.num_frames - self.new_length + 1) / float(self.num_segments)
offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)] +
[int(tick * x) for x in range(self.num_segments)])
return offsets + 1
else:
tick = (record.num_frames - self.new_length + 1) / float(self.num_segments)
offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)])
return offsets + 1
def __getitem__(self, index):
record = self.video_list[index]
# check this is a legit video folder
if self.image_tmpl == 'flow_{}_{:05d}.jpg':
file_name = self.image_tmpl.format('x', 1)
full_path = os.path.join(self.root_path, record.path, file_name)
elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg':
file_name = self.image_tmpl.format(int(record.path), 'x', 1)
full_path = os.path.join(self.root_path, '{:06d}'.format(int(record.path)), file_name)
else:
file_name = self.image_tmpl.format(1)
full_path = os.path.join(self.root_path, record.path, file_name)
while not os.path.exists(full_path):
print('################## Not Found:', os.path.join(self.root_path, record.path, file_name))
index = np.random.randint(len(self.video_list))
record = self.video_list[index]
if self.image_tmpl == 'flow_{}_{:05d}.jpg':
file_name = self.image_tmpl.format('x', 1)
full_path = os.path.join(self.root_path, record.path, file_name)
elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg':
file_name = self.image_tmpl.format(int(record.path), 'x', 1)
full_path = os.path.join(self.root_path, '{:06d}'.format(int(record.path)), file_name)
else:
file_name = self.image_tmpl.format(1)
full_path = os.path.join(self.root_path, record.path, file_name)
if not self.test_mode:
segment_indices = self._sample_indices(record) if self.random_shift else self._get_val_indices(record)
else:
segment_indices = self._get_test_indices(record)
return self.get(record, segment_indices)
def get(self, record, indices):
images = list()
for seg_ind in indices:
p = int(seg_ind)
for i in range(self.new_length):
seg_imgs = self._load_image(record.path, p)
images.extend(seg_imgs)
if p < record.num_frames:
p += 1
process_data = self.transform(images)
return record.path, process_data
def __len__(self):
return len(self.video_list)
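# ---------------------------------------------------------------------------
# ops/dataset_config.py (file name inferred): per-dataset frame roots, file
# lists, and filename templates for the TSM codebase.
# ---------------------------------------------------------------------------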
# Code for "TSM: Temporal Shift Module for Efficient Video Understanding"
# arXiv:1811.08383
# Ji Lin*, Chuang Gan, Song Han
# {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu
import os
ROOT_DATASET = '/data1/action_1_images/' # '/data/jilin/'
def return_ucf101(modality):
filename_categories = 'labels/classInd.txt'
if modality == 'RGB':
root_data = ROOT_DATASET + 'images'
filename_imglist_train = 'file_list/ucf101_rgb_train_split_1.txt'
filename_imglist_val = 'file_list/ucf101_rgb_val_split_1.txt'
prefix = 'img_{:05d}.jpg'
elif modality == 'Flow':
root_data = ROOT_DATASET + 'UCF101/jpg'
filename_imglist_train = 'UCF101/file_list/ucf101_flow_train_split_1.txt'
filename_imglist_val = 'UCF101/file_list/ucf101_flow_val_split_1.txt'
prefix = 'flow_{}_{:05d}.jpg'
else:
raise NotImplementedError('no such modality:' + modality)
return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
def return_hmdb51(modality):
filename_categories = 51
if modality == 'RGB':
root_data = ROOT_DATASET + 'HMDB51/images'
filename_imglist_train = 'HMDB51/splits/hmdb51_rgb_train_split_1.txt'
filename_imglist_val = 'HMDB51/splits/hmdb51_rgb_val_split_1.txt'
prefix = 'img_{:05d}.jpg'
elif modality == 'Flow':
root_data = ROOT_DATASET + 'HMDB51/images'
filename_imglist_train = 'HMDB51/splits/hmdb51_flow_train_split_1.txt'
filename_imglist_val = 'HMDB51/splits/hmdb51_flow_val_split_1.txt'
prefix = 'flow_{}_{:05d}.jpg'
else:
raise NotImplementedError('no such modality:' + modality)
return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
def return_something(modality):
filename_categories = 'something/v1/category.txt'
if modality == 'RGB':
root_data = ROOT_DATASET + 'something/v1/20bn-something-something-v1'
filename_imglist_train = 'something/v1/train_videofolder.txt'
filename_imglist_val = 'something/v1/val_videofolder.txt'
prefix = '{:05d}.jpg'
elif modality == 'Flow':
root_data = ROOT_DATASET + 'something/v1/20bn-something-something-v1-flow'
filename_imglist_train = 'something/v1/train_videofolder_flow.txt'
filename_imglist_val = 'something/v1/val_videofolder_flow.txt'
prefix = '{:06d}-{}_{:05d}.jpg'
else:
print('no such modality:'+modality)
raise NotImplementedError
return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
def return_somethingv2(modality):
filename_categories = 'something/v2/category.txt'
if modality == 'RGB':
root_data = ROOT_DATASET + 'something/v2/20bn-something-something-v2-frames'
filename_imglist_train = 'something/v2/train_videofolder.txt'
filename_imglist_val = 'something/v2/val_videofolder.txt'
prefix = '{:06d}.jpg'
elif modality == 'Flow':
root_data = ROOT_DATASET + 'something/v2/20bn-something-something-v2-flow'
filename_imglist_train = 'something/v2/train_videofolder_flow.txt'
filename_imglist_val = 'something/v2/val_videofolder_flow.txt'
prefix = '{:06d}.jpg'
else:
raise NotImplementedError('no such modality:'+modality)
return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
def return_jester(modality):
filename_categories = 'jester/category.txt'
if modality == 'RGB':
prefix = '{:05d}.jpg'
root_data = ROOT_DATASET + 'jester/20bn-jester-v1'
filename_imglist_train = 'jester/train_videofolder.txt'
filename_imglist_val = 'jester/val_videofolder.txt'
else:
raise NotImplementedError('no such modality:'+modality)
return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
def return_kinetics(modality):
filename_categories = 400
if modality == 'RGB':
root_data = ROOT_DATASET + 'kinetics/images'
filename_imglist_train = 'kinetics/labels/train_videofolder.txt'
filename_imglist_val = 'kinetics/labels/val_videofolder.txt'
prefix = 'img_{:05d}.jpg'
else:
raise NotImplementedError('no such modality:' + modality)
return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix
def return_dataset(dataset, modality):
dict_single = {'jester': return_jester, 'something': return_something, 'somethingv2': return_somethingv2,
'ucf101': return_ucf101, 'hmdb51': return_hmdb51,
'kinetics': return_kinetics}
if dataset in dict_single:
file_categories, file_imglist_train, file_imglist_val, root_data, prefix = dict_single[dataset](modality)
else:
raise ValueError('Unknown dataset '+dataset)
file_imglist_train = os.path.join(ROOT_DATASET, file_imglist_train)
file_imglist_val = os.path.join(ROOT_DATASET, file_imglist_val)
if isinstance(file_categories, str):
file_categories = os.path.join(ROOT_DATASET, file_categories)
with open(file_categories) as f:
lines = f.readlines()
categories = [item.rstrip() for item in lines]
else: # number of categories
categories = [None] * file_categories
n_class = len(categories)
print('{}: {} classes'.format(dataset, n_class))
return n_class, file_imglist_train, file_imglist_val, root_data, prefix
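# ---------------------------------------------------------------------------
# ops/models.py (file name inferred): the TSN network wrapper with optional
# temporal shift (TSM), non-local blocks, and a partial-BN training policy.
# ---------------------------------------------------------------------------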
# Code for "TSM: Temporal Shift Module for Efficient Video Understanding"
# arXiv:1811.08383
# Ji Lin*, Chuang Gan, Song Han
# {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu
from torch import nn
from ops.basic_ops import ConsensusModule
from ops.transforms import *
from torch.nn.init import normal_, constant_
class TSN(nn.Module):
def __init__(self, num_class, num_segments, modality,
base_model='resnet101', new_length=None,
consensus_type='avg', before_softmax=True,
dropout=0.8, img_feature_dim=256,
crop_num=1, partial_bn=True, print_spec=True, pretrain='imagenet',
is_shift=True, shift_div=8, shift_place='blockres', fc_lr5=False,
temporal_pool=False, non_local=False):
super(TSN, self).__init__()
self.modality = modality
self.num_segments = num_segments
self.reshape = True
self.before_softmax = before_softmax
self.dropout = dropout
self.crop_num = crop_num
self.consensus_type = consensus_type
self.img_feature_dim = img_feature_dim # the dimension of the CNN feature to represent each frame
self.pretrain = pretrain
self.is_shift = is_shift
self.shift_div = shift_div
self.shift_place = shift_place
self.base_model_name = base_model
self.fc_lr5 = fc_lr5
self.temporal_pool = temporal_pool
self.non_local = non_local
if not before_softmax and consensus_type != 'avg':
raise ValueError("Only avg consensus can be used after Softmax")
if new_length is None:
self.new_length = 1 if modality == "RGB" else 5
else:
self.new_length = new_length
if print_spec:
print(("""
Initializing TSN with base model: {}.
TSN Configurations:
input_modality: {}
num_segments: {}
new_length: {}
consensus_module: {}
dropout_ratio: {}
img_feature_dim: {}
""".format(base_model, self.modality, self.num_segments, self.new_length, consensus_type, self.dropout, self.img_feature_dim)))
self._prepare_base_model(base_model)
feature_dim = self._prepare_tsn(num_class)
if self.modality == 'Flow':
print("Converting the ImageNet model to a flow init model")
self.base_model = self._construct_flow_model(self.base_model)
print("Done. Flow model ready...")
elif self.modality == 'RGBDiff':
print("Converting the ImageNet model to RGB+Diff init model")
self.base_model = self._construct_diff_model(self.base_model)
print("Done. RGBDiff model ready.")
self.consensus = ConsensusModule(consensus_type)
if not self.before_softmax:
self.softmax = nn.Softmax()
self._enable_pbn = partial_bn
if partial_bn:
self.partialBN(True)
def _prepare_tsn(self, num_class):
feature_dim = getattr(self.base_model, self.base_model.last_layer_name).in_features
if self.dropout == 0:
setattr(self.base_model, self.base_model.last_layer_name, nn.Linear(feature_dim, num_class))
self.new_fc = None
else:
setattr(self.base_model, self.base_model.last_layer_name, nn.Dropout(p=self.dropout))
self.new_fc = nn.Linear(feature_dim, num_class)
std = 0.001
if self.new_fc is None:
normal_(getattr(self.base_model, self.base_model.last_layer_name).weight, 0, std)
constant_(getattr(self.base_model, self.base_model.last_layer_name).bias, 0)
else:
if hasattr(self.new_fc, 'weight'):
normal_(self.new_fc.weight, 0, std)
constant_(self.new_fc.bias, 0)
return feature_dim
def _prepare_base_model(self, base_model):
print('=> base model: {}'.format(base_model))
if 'resnet' in base_model:
self.base_model = getattr(torchvision.models, base_model)(True if self.pretrain == 'imagenet' else False)
if self.is_shift:
print('Adding temporal shift...')
from ops.temporal_shift import make_temporal_shift
make_temporal_shift(self.base_model, self.num_segments,
n_div=self.shift_div, place=self.shift_place, temporal_pool=self.temporal_pool)
if self.non_local:
print('Adding non-local module...')
from ops.non_local import make_non_local
make_non_local(self.base_model, self.num_segments)
self.base_model.last_layer_name = 'fc'
self.input_size = 224
self.input_mean = [0.485, 0.456, 0.406]
self.input_std = [0.229, 0.224, 0.225]
self.base_model.avgpool = nn.AdaptiveAvgPool2d(1)
if self.modality == 'Flow':
self.input_mean = [0.5]
self.input_std = [np.mean(self.input_std)]
elif self.modality == 'RGBDiff':
self.input_mean = [0.485, 0.456, 0.406] + [0] * 3 * self.new_length
self.input_std = self.input_std + [np.mean(self.input_std) * 2] * 3 * self.new_length
elif base_model == 'mobilenetv2':
from archs.mobilenet_v2 import mobilenet_v2, InvertedResidual
self.base_model = mobilenet_v2(True if self.pretrain == 'imagenet' else False)
self.base_model.last_layer_name = 'classifier'
self.input_size = 224
self.input_mean = [0.485, 0.456, 0.406]
self.input_std = [0.229, 0.224, 0.225]
self.base_model.avgpool = nn.AdaptiveAvgPool2d(1)
if self.is_shift:
from ops.temporal_shift import TemporalShift
for m in self.base_model.modules():
if isinstance(m, InvertedResidual) and len(m.conv) == 8 and m.use_res_connect:
if self.print_spec:
print('Adding temporal shift... {}'.format(m.use_res_connect))
m.conv[0] = TemporalShift(m.conv[0], n_segment=self.num_segments, n_div=self.shift_div)
if self.modality == 'Flow':
self.input_mean = [0.5]
self.input_std = [np.mean(self.input_std)]
elif self.modality == 'RGBDiff':
self.input_mean = [0.485, 0.456, 0.406] + [0] * 3 * self.new_length
self.input_std = self.input_std + [np.mean(self.input_std) * 2] * 3 * self.new_length
elif base_model == 'BNInception':
from archs.bn_inception import bninception
self.base_model = bninception(pretrained=self.pretrain)
self.input_size = self.base_model.input_size
self.input_mean = self.base_model.mean
self.input_std = self.base_model.std
self.base_model.last_layer_name = 'fc'
if self.modality == 'Flow':
self.input_mean = [128]
elif self.modality == 'RGBDiff':
self.input_mean = self.input_mean * (1 + self.new_length)
if self.is_shift:
print('Adding temporal shift...')
self.base_model.build_temporal_ops(
self.num_segments, is_temporal_shift=self.shift_place, shift_div=self.shift_div)
else:
raise ValueError('Unknown base model: {}'.format(base_model))
def train(self, mode=True):
"""
Override the default train() to freeze the BN parameters
:return:
"""
super(TSN, self).train(mode)
count = 0
if self._enable_pbn and mode:
print("Freezing BatchNorm2D except the first one.")
for m in self.base_model.modules():
if isinstance(m, nn.BatchNorm2d):
count += 1
if count >= (2 if self._enable_pbn else 1):
m.eval()
# shutdown update in frozen mode
m.weight.requires_grad = False
m.bias.requires_grad = False
def partialBN(self, enable):
self._enable_pbn = enable
def get_optim_policies(self):
first_conv_weight = []
first_conv_bias = []
normal_weight = []
normal_bias = []
lr5_weight = []
lr10_bias = []
bn = []
custom_ops = []
conv_cnt = 0
bn_cnt = 0
for m in self.modules():
if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv3d):
ps = list(m.parameters())
conv_cnt += 1
if conv_cnt == 1:
first_conv_weight.append(ps[0])
if len(ps) == 2:
first_conv_bias.append(ps[1])
else:
normal_weight.append(ps[0])
if len(ps) == 2:
normal_bias.append(ps[1])
elif isinstance(m, torch.nn.Linear):
ps = list(m.parameters())
if self.fc_lr5:
lr5_weight.append(ps[0])
else:
normal_weight.append(ps[0])
if len(ps) == 2:
if self.fc_lr5:
lr10_bias.append(ps[1])
else:
normal_bias.append(ps[1])
elif isinstance(m, torch.nn.BatchNorm2d):
bn_cnt += 1
# later BN's are frozen
if not self._enable_pbn or bn_cnt == 1:
bn.extend(list(m.parameters()))
elif isinstance(m, torch.nn.BatchNorm3d):
bn_cnt += 1
# later BN's are frozen
if not self._enable_pbn or bn_cnt == 1:
bn.extend(list(m.parameters()))
elif len(m._modules) == 0:
if len(list(m.parameters())) > 0:
raise ValueError("New atomic module type: {}. Need to give it a learning policy".format(type(m)))
return [
{'params': first_conv_weight, 'lr_mult': 5 if self.modality == 'Flow' else 1, 'decay_mult': 1,
'name': "first_conv_weight"},
{'params': first_conv_bias, 'lr_mult': 10 if self.modality == 'Flow' else 2, 'decay_mult': 0,
'name': "first_conv_bias"},
{'params': normal_weight, 'lr_mult': 1, 'decay_mult': 1,
'name': "normal_weight"},
{'params': normal_bias, 'lr_mult': 2, 'decay_mult': 0,
'name': "normal_bias"},
{'params': bn, 'lr_mult': 1, 'decay_mult': 0,
'name': "BN scale/shift"},
{'params': custom_ops, 'lr_mult': 1, 'decay_mult': 1,
'name': "custom_ops"},
# for fc
{'params': lr5_weight, 'lr_mult': 5, 'decay_mult': 1,
'name': "lr5_weight"},
{'params': lr10_bias, 'lr_mult': 10, 'decay_mult': 0,
'name': "lr10_bias"},
]
def forward(self, input, no_reshape=False):
if not no_reshape:
sample_len = (3 if self.modality == "RGB" else 2) * self.new_length
if self.modality == 'RGBDiff':
sample_len = 3 * self.new_length
input = self._get_diff(input)
base_out = self.base_model(input.view((-1, sample_len) + input.size()[-2:]))
else:
base_out = self.base_model(input)
        # keep a flattened copy of the base-model output so it can be returned
        # alongside the logits (when dropout == 0 the base model's own fc has
        # already produced the class scores, so new_fc is skipped)
        feature = base_out.view(base_out.size(0), -1)
        if self.dropout > 0:
            base_out = self.new_fc(feature)
if not self.before_softmax:
base_out = self.softmax(base_out)
if self.reshape:
if self.is_shift and self.temporal_pool:
base_out = base_out.view((-1, self.num_segments // 2) + base_out.size()[1:])
else:
base_out = base_out.view((-1, self.num_segments) + base_out.size()[1:])
output = self.consensus(base_out)
return output.squeeze(1), feature
def _get_diff(self, input, keep_rgb=False):
input_c = 3 if self.modality in ["RGB", "RGBDiff"] else 2
input_view = input.view((-1, self.num_segments, self.new_length + 1, input_c,) + input.size()[2:])
if keep_rgb:
new_data = input_view.clone()
else:
new_data = input_view[:, :, 1:, :, :, :].clone()
for x in reversed(list(range(1, self.new_length + 1))):
if keep_rgb:
new_data[:, :, x, :, :, :] = input_view[:, :, x, :, :, :] - input_view[:, :, x - 1, :, :, :]
else:
new_data[:, :, x - 1, :, :, :] = input_view[:, :, x, :, :, :] - input_view[:, :, x - 1, :, :, :]
return new_data
def _construct_flow_model(self, base_model):
# modify the convolution layers
# Torch models are usually defined in a hierarchical way.
# nn.modules.children() return all sub modules in a DFS manner
modules = list(self.base_model.modules())
first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv2d), list(range(len(modules)))))[0]
conv_layer = modules[first_conv_idx]
container = modules[first_conv_idx - 1]
# modify parameters, assume the first blob contains the convolution kernels
params = [x.clone() for x in conv_layer.parameters()]
kernel_size = params[0].size()
new_kernel_size = kernel_size[:1] + (2 * self.new_length, ) + kernel_size[2:]
new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()
new_conv = nn.Conv2d(2 * self.new_length, conv_layer.out_channels,
conv_layer.kernel_size, conv_layer.stride, conv_layer.padding,
bias=True if len(params) == 2 else False)
new_conv.weight.data = new_kernels
if len(params) == 2:
            new_conv.bias.data = params[1].data  # add bias if necessary
layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
# replace the first convlution layer
setattr(container, layer_name, new_conv)
if self.base_model_name == 'BNInception':
import torch.utils.model_zoo as model_zoo
sd = model_zoo.load_url('https://www.dropbox.com/s/35ftw2t4mxxgjae/BNInceptionFlow-ef652051.pth.tar?dl=1')
base_model.load_state_dict(sd)
print('=> Loading pretrained Flow weight done...')
else:
            print('#' * 30, 'Warning! No Flow pretrained model found')
return base_model
def _construct_diff_model(self, base_model, keep_rgb=False):
# modify the convolution layers
# Torch models are usually defined in a hierarchical way.
# nn.modules.children() return all sub modules in a DFS manner
modules = list(self.base_model.modules())
        first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv2d), list(range(len(modules)))))[0]
conv_layer = modules[first_conv_idx]
container = modules[first_conv_idx - 1]
# modify parameters, assume the first blob contains the convolution kernels
params = [x.clone() for x in conv_layer.parameters()]
kernel_size = params[0].size()
if not keep_rgb:
new_kernel_size = kernel_size[:1] + (3 * self.new_length,) + kernel_size[2:]
new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()
else:
new_kernel_size = kernel_size[:1] + (3 * self.new_length,) + kernel_size[2:]
new_kernels = torch.cat((params[0].data, params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()),
1)
new_kernel_size = kernel_size[:1] + (3 + 3 * self.new_length,) + kernel_size[2:]
new_conv = nn.Conv2d(new_kernel_size[1], conv_layer.out_channels,
conv_layer.kernel_size, conv_layer.stride, conv_layer.padding,
bias=True if len(params) == 2 else False)
new_conv.weight.data = new_kernels
if len(params) == 2:
            new_conv.bias.data = params[1].data  # add bias if necessary
layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name
# replace the first convolution layer
setattr(container, layer_name, new_conv)
return base_model
@property
def crop_size(self):
return self.input_size
@property
def scale_size(self):
return self.input_size * 256 // 224
def get_augmentation(self, flip=True):
if self.modality == 'RGB':
if flip:
return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75, .66]),
GroupRandomHorizontalFlip(is_flow=False)])
else:
print('#' * 20, 'NO FLIP!!!')
return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75, .66])])
elif self.modality == 'Flow':
return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75]),
GroupRandomHorizontalFlip(is_flow=True)])
elif self.modality == 'RGBDiff':
return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75]),
GroupRandomHorizontalFlip(is_flow=False)])
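# Illustrative usage sketch (added here for reference; not part of the upstream
# TSM code). It mirrors the keyword arguments the filter scripts below pass to
# TSN; the concrete choices (3 classes, 8 segments, a resnet50 backbone) are
# assumptions for illustration only.
def _example_build_tsn():
    net = TSN(3, 8, 'RGB',
              base_model='resnet50',
              consensus_type='avg',
              img_feature_dim=256,
              pretrain='imagenet',
              is_shift=True, shift_div=8, shift_place='blockres',
              non_local=False)
    # forward() returns the consensus prediction together with the flattened
    # base-model features (see TSN.forward above)
    return net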
# Non-local block using embedded gaussian
# Code from
# https://github.com/AlexHex7/Non-local_pytorch/blob/master/Non-Local_pytorch_0.3.1/lib/non_local_embedded_gaussian.py
import torch
from torch import nn
from torch.nn import functional as F
class _NonLocalBlockND(nn.Module):
def __init__(self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True):
super(_NonLocalBlockND, self).__init__()
assert dimension in [1, 2, 3]
self.dimension = dimension
self.sub_sample = sub_sample
self.in_channels = in_channels
self.inter_channels = inter_channels
if self.inter_channels is None:
self.inter_channels = in_channels // 2
if self.inter_channels == 0:
self.inter_channels = 1
if dimension == 3:
conv_nd = nn.Conv3d
max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
bn = nn.BatchNorm3d
elif dimension == 2:
conv_nd = nn.Conv2d
max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
bn = nn.BatchNorm2d
else:
conv_nd = nn.Conv1d
max_pool_layer = nn.MaxPool1d(kernel_size=(2))
bn = nn.BatchNorm1d
self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
kernel_size=1, stride=1, padding=0)
if bn_layer:
self.W = nn.Sequential(
conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
kernel_size=1, stride=1, padding=0),
bn(self.in_channels)
)
nn.init.constant_(self.W[1].weight, 0)
nn.init.constant_(self.W[1].bias, 0)
else:
self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels,
kernel_size=1, stride=1, padding=0)
nn.init.constant_(self.W.weight, 0)
nn.init.constant_(self.W.bias, 0)
self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
kernel_size=1, stride=1, padding=0)
self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels,
kernel_size=1, stride=1, padding=0)
if sub_sample:
self.g = nn.Sequential(self.g, max_pool_layer)
self.phi = nn.Sequential(self.phi, max_pool_layer)
def forward(self, x):
'''
:param x: (b, c, t, h, w)
:return:
'''
batch_size = x.size(0)
g_x = self.g(x).view(batch_size, self.inter_channels, -1)
g_x = g_x.permute(0, 2, 1)
theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
theta_x = theta_x.permute(0, 2, 1)
phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
f = torch.matmul(theta_x, phi_x)
f_div_C = F.softmax(f, dim=-1)
y = torch.matmul(f_div_C, g_x)
y = y.permute(0, 2, 1).contiguous()
y = y.view(batch_size, self.inter_channels, *x.size()[2:])
W_y = self.W(y)
z = W_y + x
return z
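# Note added for clarity (not in the referenced upstream file): forward() above
# is the embedded-Gaussian non-local operation: pairwise similarities
# f = theta(x) @ phi(x)^T are normalised with a softmax over the last dimension,
# used to aggregate g(x), and the result is projected by W and added back to x
# as a residual connection.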
class NONLocalBlock1D(_NonLocalBlockND):
def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
super(NONLocalBlock1D, self).__init__(in_channels,
inter_channels=inter_channels,
dimension=1, sub_sample=sub_sample,
bn_layer=bn_layer)
class NONLocalBlock2D(_NonLocalBlockND):
def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
super(NONLocalBlock2D, self).__init__(in_channels,
inter_channels=inter_channels,
dimension=2, sub_sample=sub_sample,
bn_layer=bn_layer)
class NONLocalBlock3D(_NonLocalBlockND):
def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
super(NONLocalBlock3D, self).__init__(in_channels,
inter_channels=inter_channels,
dimension=3, sub_sample=sub_sample,
bn_layer=bn_layer)
class NL3DWrapper(nn.Module):
def __init__(self, block, n_segment):
super(NL3DWrapper, self).__init__()
self.block = block
self.nl = NONLocalBlock3D(block.bn3.num_features)
self.n_segment = n_segment
def forward(self, x):
x = self.block(x)
nt, c, h, w = x.size()
x = x.view(nt // self.n_segment, self.n_segment, c, h, w).transpose(1, 2) # n, c, t, h, w
x = self.nl(x)
x = x.transpose(1, 2).contiguous().view(nt, c, h, w)
return x
def make_non_local(net, n_segment):
import torchvision
import archs
if isinstance(net, torchvision.models.ResNet):
net.layer2 = nn.Sequential(
NL3DWrapper(net.layer2[0], n_segment),
net.layer2[1],
NL3DWrapper(net.layer2[2], n_segment),
net.layer2[3],
)
net.layer3 = nn.Sequential(
NL3DWrapper(net.layer3[0], n_segment),
net.layer3[1],
NL3DWrapper(net.layer3[2], n_segment),
net.layer3[3],
NL3DWrapper(net.layer3[4], n_segment),
net.layer3[5],
)
else:
raise NotImplementedError
if __name__ == '__main__':
    import torch  # Variable is no longer needed; plain tensors work directly
sub_sample = True
bn_layer = True
    img = torch.zeros(2, 3, 20)
net = NONLocalBlock1D(3, sub_sample=sub_sample, bn_layer=bn_layer)
out = net(img)
print(out.size())
    img = torch.zeros(2, 3, 20, 20)
net = NONLocalBlock2D(3, sub_sample=sub_sample, bn_layer=bn_layer)
out = net(img)
print(out.size())
    img = torch.randn(2, 3, 10, 20, 20)
net = NONLocalBlock3D(3, sub_sample=sub_sample, bn_layer=bn_layer)
out = net(img)
print(out.size())
# Code for "TSM: Temporal Shift Module for Efficient Video Understanding"
# arXiv:1811.08383
# Ji Lin*, Chuang Gan, Song Han
# {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu
import torch
import torch.nn as nn
import torch.nn.functional as F
class TemporalShift(nn.Module):
def __init__(self, net, n_segment=3, n_div=8, inplace=False):
super(TemporalShift, self).__init__()
self.net = net
self.n_segment = n_segment
self.fold_div = n_div
self.inplace = inplace
if inplace:
print('=> Using in-place shift...')
print('=> Using fold div: {}'.format(self.fold_div))
def forward(self, x):
x = self.shift(x, self.n_segment, fold_div=self.fold_div, inplace=self.inplace)
return self.net(x)
@staticmethod
def shift(x, n_segment, fold_div=3, inplace=False):
nt, c, h, w = x.size()
n_batch = nt // n_segment
x = x.view(n_batch, n_segment, c, h, w)
fold = c // fold_div
if inplace:
# Due to some out of order error when performing parallel computing.
# May need to write a CUDA kernel.
raise NotImplementedError
# out = InplaceShift.apply(x, fold)
else:
out = torch.zeros_like(x)
out[:, :-1, :fold] = x[:, 1:, :fold] # shift left
out[:, 1:, fold: 2 * fold] = x[:, :-1, fold: 2 * fold] # shift right
out[:, :, 2 * fold:] = x[:, :, 2 * fold:] # not shift
return out.view(nt, c, h, w)
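# Tiny worked example (an illustrative addition, not from the original file):
# with n_segment=4 and fold_div=4, fold = 1, so frame t takes channel 0 from
# frame t+1 (shift left), channel 1 from frame t-1 (shift right), and keeps
# channels 2-3 unchanged.
def _demo_shift():
    x = torch.arange(16, dtype=torch.float32).view(4, 4, 1, 1)  # nt=4, c=4
    return TemporalShift.shift(x, n_segment=4, fold_div=4)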
class InplaceShift(torch.autograd.Function):
# Special thanks to @raoyongming for the help to this function
@staticmethod
def forward(ctx, input, fold):
# not support higher order gradient
# input = input.detach_()
ctx.fold_ = fold
n, t, c, h, w = input.size()
buffer = input.data.new(n, t, fold, h, w).zero_()
buffer[:, :-1] = input.data[:, 1:, :fold]
input.data[:, :, :fold] = buffer
buffer.zero_()
buffer[:, 1:] = input.data[:, :-1, fold: 2 * fold]
input.data[:, :, fold: 2 * fold] = buffer
return input
@staticmethod
def backward(ctx, grad_output):
# grad_output = grad_output.detach_()
fold = ctx.fold_
n, t, c, h, w = grad_output.size()
buffer = grad_output.data.new(n, t, fold, h, w).zero_()
buffer[:, 1:] = grad_output.data[:, :-1, :fold]
grad_output.data[:, :, :fold] = buffer
buffer.zero_()
buffer[:, :-1] = grad_output.data[:, 1:, fold: 2 * fold]
grad_output.data[:, :, fold: 2 * fold] = buffer
return grad_output, None
class TemporalPool(nn.Module):
def __init__(self, net, n_segment):
super(TemporalPool, self).__init__()
self.net = net
self.n_segment = n_segment
def forward(self, x):
x = self.temporal_pool(x, n_segment=self.n_segment)
return self.net(x)
@staticmethod
def temporal_pool(x, n_segment):
nt, c, h, w = x.size()
n_batch = nt // n_segment
x = x.view(n_batch, n_segment, c, h, w).transpose(1, 2) # n, c, t, h, w
x = F.max_pool3d(x, kernel_size=(3, 1, 1), stride=(2, 1, 1), padding=(1, 0, 0))
x = x.transpose(1, 2).contiguous().view(nt // 2, c, h, w)
return x
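# Note (added for clarity): temporal_pool() halves the number of frames per
# clip (nt -> nt // 2), which is why TSN.forward() reshapes with
# num_segments // 2 and make_temporal_shift() below uses n_segment // 2 for the
# later stages when temporal pooling is enabled.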
def make_temporal_shift(net, n_segment, n_div=8, place='blockres', temporal_pool=False):
if temporal_pool:
n_segment_list = [n_segment, n_segment // 2, n_segment // 2, n_segment // 2]
else:
n_segment_list = [n_segment] * 4
assert n_segment_list[-1] > 0
print('=> n_segment per stage: {}'.format(n_segment_list))
import torchvision
if isinstance(net, torchvision.models.ResNet):
if place == 'block':
def make_block_temporal(stage, this_segment):
blocks = list(stage.children())
print('=> Processing stage with {} blocks'.format(len(blocks)))
for i, b in enumerate(blocks):
blocks[i] = TemporalShift(b, n_segment=this_segment, n_div=n_div)
return nn.Sequential(*(blocks))
net.layer1 = make_block_temporal(net.layer1, n_segment_list[0])
net.layer2 = make_block_temporal(net.layer2, n_segment_list[1])
net.layer3 = make_block_temporal(net.layer3, n_segment_list[2])
net.layer4 = make_block_temporal(net.layer4, n_segment_list[3])
elif 'blockres' in place:
n_round = 1
if len(list(net.layer3.children())) >= 23:
n_round = 2
print('=> Using n_round {} to insert temporal shift'.format(n_round))
def make_block_temporal(stage, this_segment):
blocks = list(stage.children())
print('=> Processing stage with {} blocks residual'.format(len(blocks)))
for i, b in enumerate(blocks):
if i % n_round == 0:
blocks[i].conv1 = TemporalShift(b.conv1, n_segment=this_segment, n_div=n_div)
return nn.Sequential(*blocks)
net.layer1 = make_block_temporal(net.layer1, n_segment_list[0])
net.layer2 = make_block_temporal(net.layer2, n_segment_list[1])
net.layer3 = make_block_temporal(net.layer3, n_segment_list[2])
net.layer4 = make_block_temporal(net.layer4, n_segment_list[3])
else:
raise NotImplementedError(place)
def make_temporal_pool(net, n_segment):
import torchvision
if isinstance(net, torchvision.models.ResNet):
        print('=> Injecting temporal pooling')
net.layer2 = TemporalPool(net.layer2, n_segment)
else:
raise NotImplementedError
if __name__ == '__main__':
# test inplace shift v.s. vanilla shift
tsm1 = TemporalShift(nn.Sequential(), n_segment=8, n_div=8, inplace=False)
tsm2 = TemporalShift(nn.Sequential(), n_segment=8, n_div=8, inplace=True)
print('=> Testing CPU...')
# test forward
with torch.no_grad():
for i in range(10):
x = torch.rand(2 * 8, 3, 224, 224)
y1 = tsm1(x)
y2 = tsm2(x)
assert torch.norm(y1 - y2).item() < 1e-5
# test backward
with torch.enable_grad():
for i in range(10):
x1 = torch.rand(2 * 8, 3, 224, 224)
x1.requires_grad_()
x2 = x1.clone()
y1 = tsm1(x1)
y2 = tsm2(x2)
grad1 = torch.autograd.grad((y1 ** 2).mean(), [x1])[0]
grad2 = torch.autograd.grad((y2 ** 2).mean(), [x2])[0]
assert torch.norm(grad1 - grad2).item() < 1e-5
print('=> Testing GPU...')
tsm1.cuda()
tsm2.cuda()
# test forward
with torch.no_grad():
for i in range(10):
x = torch.rand(2 * 8, 3, 224, 224).cuda()
y1 = tsm1(x)
y2 = tsm2(x)
assert torch.norm(y1 - y2).item() < 1e-5
# test backward
with torch.enable_grad():
for i in range(10):
x1 = torch.rand(2 * 8, 3, 224, 224).cuda()
x1.requires_grad_()
x2 = x1.clone()
y1 = tsm1(x1)
y2 = tsm2(x2)
grad1 = torch.autograd.grad((y1 ** 2).mean(), [x1])[0]
grad2 = torch.autograd.grad((y2 ** 2).mean(), [x2])[0]
assert torch.norm(grad1 - grad2).item() < 1e-5
print('Test passed.')
import torchvision
import random
from PIL import Image, ImageOps
import numpy as np
import numbers
import math
import torch
class GroupRandomCrop(object):
def __init__(self, size):
if isinstance(size, numbers.Number):
self.size = (int(size), int(size))
else:
self.size = size
def __call__(self, img_group):
w, h = img_group[0].size
th, tw = self.size
out_images = list()
x1 = random.randint(0, w - tw)
y1 = random.randint(0, h - th)
for img in img_group:
assert(img.size[0] == w and img.size[1] == h)
if w == tw and h == th:
out_images.append(img)
else:
out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))
return out_images
class GroupCenterCrop(object):
def __init__(self, size):
self.worker = torchvision.transforms.CenterCrop(size)
def __call__(self, img_group):
return [self.worker(img) for img in img_group]
class GroupRandomHorizontalFlip(object):
"""Randomly horizontally flips the given PIL.Image with a probability of 0.5
"""
def __init__(self, is_flow=False):
self.is_flow = is_flow
def __call__(self, img_group, is_flow=False):
v = random.random()
if v < 0.5:
ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
if self.is_flow:
for i in range(0, len(ret), 2):
ret[i] = ImageOps.invert(ret[i]) # invert flow pixel values when flipping
return ret
else:
return img_group
class GroupNormalize(object):
def __init__(self, mean, std):
self.mean = mean
self.std = std
def __call__(self, tensor):
rep_mean = self.mean * (tensor.size()[0]//len(self.mean))
rep_std = self.std * (tensor.size()[0]//len(self.std))
# TODO: make efficient
for t, m, s in zip(tensor, rep_mean, rep_std):
t.sub_(m).div_(s)
return tensor
class GroupScale(object):
""" Rescales the input PIL.Image to the given 'size'.
'size' will be the size of the smaller edge.
    For example, if height > width, then the image will be
    rescaled to (size * height / width, size).
size: size of the smaller edge
interpolation: Default: PIL.Image.BILINEAR
"""
def __init__(self, size, interpolation=Image.BILINEAR):
self.worker = torchvision.transforms.Resize(size, interpolation)
def __call__(self, img_group):
return [self.worker(img) for img in img_group]
class GroupOverSample(object):
def __init__(self, crop_size, scale_size=None, flip=True):
self.crop_size = crop_size if not isinstance(crop_size, int) else (crop_size, crop_size)
if scale_size is not None:
self.scale_worker = GroupScale(scale_size)
else:
self.scale_worker = None
self.flip = flip
def __call__(self, img_group):
if self.scale_worker is not None:
img_group = self.scale_worker(img_group)
image_w, image_h = img_group[0].size
crop_w, crop_h = self.crop_size
offsets = GroupMultiScaleCrop.fill_fix_offset(False, image_w, image_h, crop_w, crop_h)
oversample_group = list()
for o_w, o_h in offsets:
normal_group = list()
flip_group = list()
for i, img in enumerate(img_group):
crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
normal_group.append(crop)
flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)
if img.mode == 'L' and i % 2 == 0:
flip_group.append(ImageOps.invert(flip_crop))
else:
flip_group.append(flip_crop)
oversample_group.extend(normal_group)
if self.flip:
oversample_group.extend(flip_group)
return oversample_group
class GroupFullResSample(object):
def __init__(self, crop_size, scale_size=None, flip=True):
self.crop_size = crop_size if not isinstance(crop_size, int) else (crop_size, crop_size)
if scale_size is not None:
self.scale_worker = GroupScale(scale_size)
else:
self.scale_worker = None
self.flip = flip
def __call__(self, img_group):
if self.scale_worker is not None:
img_group = self.scale_worker(img_group)
image_w, image_h = img_group[0].size
crop_w, crop_h = self.crop_size
w_step = (image_w - crop_w) // 4
h_step = (image_h - crop_h) // 4
offsets = list()
offsets.append((0 * w_step, 2 * h_step)) # left
offsets.append((4 * w_step, 2 * h_step)) # right
offsets.append((2 * w_step, 2 * h_step)) # center
oversample_group = list()
for o_w, o_h in offsets:
normal_group = list()
flip_group = list()
for i, img in enumerate(img_group):
crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
normal_group.append(crop)
if self.flip:
flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)
if img.mode == 'L' and i % 2 == 0:
flip_group.append(ImageOps.invert(flip_crop))
else:
flip_group.append(flip_crop)
oversample_group.extend(normal_group)
oversample_group.extend(flip_group)
return oversample_group
class GroupMultiScaleCrop(object):
def __init__(self, input_size, scales=None, max_distort=1, fix_crop=True, more_fix_crop=True):
self.scales = scales if scales is not None else [1, .875, .75, .66]
self.max_distort = max_distort
self.fix_crop = fix_crop
self.more_fix_crop = more_fix_crop
self.input_size = input_size if not isinstance(input_size, int) else [input_size, input_size]
self.interpolation = Image.BILINEAR
def __call__(self, img_group):
im_size = img_group[0].size
crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size)
crop_img_group = [img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) for img in img_group]
ret_img_group = [img.resize((self.input_size[0], self.input_size[1]), self.interpolation)
for img in crop_img_group]
return ret_img_group
def _sample_crop_size(self, im_size):
image_w, image_h = im_size[0], im_size[1]
# find a crop size
base_size = min(image_w, image_h)
crop_sizes = [int(base_size * x) for x in self.scales]
crop_h = [self.input_size[1] if abs(x - self.input_size[1]) < 3 else x for x in crop_sizes]
crop_w = [self.input_size[0] if abs(x - self.input_size[0]) < 3 else x for x in crop_sizes]
pairs = []
for i, h in enumerate(crop_h):
for j, w in enumerate(crop_w):
if abs(i - j) <= self.max_distort:
pairs.append((w, h))
crop_pair = random.choice(pairs)
if not self.fix_crop:
w_offset = random.randint(0, image_w - crop_pair[0])
h_offset = random.randint(0, image_h - crop_pair[1])
else:
w_offset, h_offset = self._sample_fix_offset(image_w, image_h, crop_pair[0], crop_pair[1])
return crop_pair[0], crop_pair[1], w_offset, h_offset
def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h):
offsets = self.fill_fix_offset(self.more_fix_crop, image_w, image_h, crop_w, crop_h)
return random.choice(offsets)
@staticmethod
def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h):
w_step = (image_w - crop_w) // 4
h_step = (image_h - crop_h) // 4
ret = list()
ret.append((0, 0)) # upper left
ret.append((4 * w_step, 0)) # upper right
ret.append((0, 4 * h_step)) # lower left
ret.append((4 * w_step, 4 * h_step)) # lower right
ret.append((2 * w_step, 2 * h_step)) # center
if more_fix_crop:
ret.append((0, 2 * h_step)) # center left
ret.append((4 * w_step, 2 * h_step)) # center right
ret.append((2 * w_step, 4 * h_step)) # lower center
ret.append((2 * w_step, 0 * h_step)) # upper center
ret.append((1 * w_step, 1 * h_step)) # upper left quarter
ret.append((3 * w_step, 1 * h_step)) # upper right quarter
ret.append((1 * w_step, 3 * h_step)) # lower left quarter
            ret.append((3 * w_step, 3 * h_step))  # lower right quarter
return ret
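# Worked example (added for illustration): for a 256x256 image and a 224x224
# crop, w_step = h_step = (256 - 224) // 4 = 8, so the five base offsets are
# (0, 0), (32, 0), (0, 32), (32, 32) and (16, 16).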
class GroupRandomSizedCrop(object):
"""Random crop the given PIL.Image to a random size of (0.08 to 1.0) of the original size
and and a random aspect ratio of 3/4 to 4/3 of the original aspect ratio
This is popularly used to train the Inception networks
size: size of the smaller edge
interpolation: Default: PIL.Image.BILINEAR
"""
def __init__(self, size, interpolation=Image.BILINEAR):
self.size = size
self.interpolation = interpolation
def __call__(self, img_group):
for attempt in range(10):
area = img_group[0].size[0] * img_group[0].size[1]
target_area = random.uniform(0.08, 1.0) * area
aspect_ratio = random.uniform(3. / 4, 4. / 3)
w = int(round(math.sqrt(target_area * aspect_ratio)))
h = int(round(math.sqrt(target_area / aspect_ratio)))
if random.random() < 0.5:
w, h = h, w
if w <= img_group[0].size[0] and h <= img_group[0].size[1]:
x1 = random.randint(0, img_group[0].size[0] - w)
y1 = random.randint(0, img_group[0].size[1] - h)
found = True
break
else:
found = False
x1 = 0
y1 = 0
if found:
out_group = list()
for img in img_group:
img = img.crop((x1, y1, x1 + w, y1 + h))
assert(img.size == (w, h))
out_group.append(img.resize((self.size, self.size), self.interpolation))
return out_group
else:
# Fallback
scale = GroupScale(self.size, interpolation=self.interpolation)
crop = GroupRandomCrop(self.size)
return crop(scale(img_group))
class Stack(object):
def __init__(self, roll=False):
self.roll = roll
def __call__(self, img_group):
if img_group[0].mode == 'L':
return np.concatenate([np.expand_dims(x, 2) for x in img_group], axis=2)
elif img_group[0].mode == 'RGB':
if self.roll:
return np.concatenate([np.array(x)[:, :, ::-1] for x in img_group], axis=2)
else:
return np.concatenate(img_group, axis=2)
class ToTorchFormatTensor(object):
""" Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255]
to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """
def __init__(self, div=True):
self.div = div
def __call__(self, pic):
if isinstance(pic, np.ndarray):
# handle numpy array
img = torch.from_numpy(pic).permute(2, 0, 1).contiguous()
else:
# handle PIL Image
img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
img = img.view(pic.size[1], pic.size[0], len(pic.mode))
# put it from HWC to CHW format
# yikes, this transpose takes 80% of the loading time/CPU
img = img.transpose(0, 1).transpose(0, 2).contiguous()
return img.float().div(255) if self.div else img.float()
class IdentityTransform(object):
def __call__(self, data):
return data
if __name__ == "__main__":
trans = torchvision.transforms.Compose([
GroupScale(256),
GroupRandomCrop(224),
Stack(),
ToTorchFormatTensor(),
GroupNormalize(
mean=[.485, .456, .406],
std=[.229, .224, .225]
)]
)
im = Image.open('../tensorflow-model-zoo.torch/lena_299.png')
color_group = [im] * 3
rst = trans(color_group)
gray_group = [im.convert('L')] * 9
gray_rst = trans(gray_group)
trans2 = torchvision.transforms.Compose([
GroupRandomSizedCrop(256),
Stack(),
ToTorchFormatTensor(),
GroupNormalize(
mean=[.485, .456, .406],
std=[.229, .224, .225])
])
print(trans2(color_group))
import numpy as np
def softmax(scores):
es = np.exp(scores - scores.max(axis=-1)[..., None])
return es / es.sum(axis=-1)[..., None]
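# Worked example (added for illustration): subtracting the row-wise max keeps
# the exponentials bounded, so softmax(np.array([1000., 1001., 1002.]))
# evaluates to roughly [0.090, 0.245, 0.665] instead of overflowing.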
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
        # reshape (rather than view) because the slice of the transposed
        # prediction tensor may be non-contiguous
        correct_k = correct[:k].reshape(-1).float().sum(0)
res.append(correct_k.mul_(100.0 / batch_size))
return res
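# Minimal sanity check (an illustrative addition, not part of the original
# file): with top-1 predictions correct for 3 of the 4 samples, accuracy()
# returns roughly 75%.
def _demo_accuracy():
    import torch  # torch is only needed for this sketch
    output = torch.tensor([[0.9, 0.1], [0.8, 0.2], [0.3, 0.7], [0.4, 0.6]])
    target = torch.tensor([0, 0, 1, 0])
    return accuracy(output, target, topk=(1,))  # -> [tensor(75.)]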
import os
import cv2
import numpy as np
import pickle
def start_filter(config):
cls_class_path = config['MODEL']['CLS_PERSON']
feature_save_dir = config['VIDEO']['FACE_FEATURE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['PERSON']['RESULT_FILE']
feature_name = config['PERSON']['DATA_NAME']
xgboost_model = pickle.load(open(cls_class_path, "rb"))
result_file_path = os.path.join(frame_list_dir, result_file_name)
result_file = open(result_file_path, 'w')
feature_path = os.path.join(feature_save_dir, feature_name)
val_annotation_pairs = np.load(feature_path, allow_pickle=True, encoding='latin1')
X_val = []
Y_val = []
Y_names = []
for j in range(len(val_annotation_pairs)):
pair = val_annotation_pairs[j]
X_val.append(np.squeeze(pair[0]))
Y_val.append(pair[1])
Y_names.append(pair[2])
X_val = np.array(X_val)
y_pred = xgboost_model.predict_proba(X_val)
for i, Y_name in enumerate(Y_names):
result_file.write(Y_name + ' ')
result_file.write(str(y_pred[i][0]) + ',' + str(y_pred[i][1]) + ',' + str(y_pred[i][2]) + '\n')
result_file.close()
import os
import torch.optim
import numpy as np
import torch.nn.parallel
from ops.models import TSN
from ops.transforms import *
from ops.dataset import TSNDataSet
from torch.nn import functional as F
def gen_file_list(frame_save_dir, frame_list_dir):
val_path = os.path.join(frame_list_dir, 'val.txt')
video_names = os.listdir(frame_save_dir)
ucf101_rgb_val_file = open(val_path, 'w')
for video_name in video_names:
images_dir = os.path.join(frame_save_dir, video_name)
ucf101_rgb_val_file.write(video_name)
ucf101_rgb_val_file.write(' ')
ucf101_rgb_val_file.write(str(len(os.listdir(images_dir))))
ucf101_rgb_val_file.write('\n')
ucf101_rgb_val_file.close()
return val_path
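# Note (added for clarity): val.txt ends up with one "<video_name> <frame_count>"
# line per extracted frame directory; TSNDataSet below consumes this list.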
def start_filter(config):
arch = config['FIGHTING']['ARCH']
prefix = config['VIDEO']['PREFIX']
modality = config['POSE']['MODALITY']
test_crop = config['POSE']['TEST_CROP']
batch_size = config['POSE']['BATCH_SIZE']
weights_path = config['MODEL']['CLS_POSE']
test_segment = config['POSE']['TEST_SEGMENT']
frame_save_dir = config['VIDEO']['POSE_FRAME_SAVE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['POSE']['RESULT_FILE']
workers = 8
num_class = 3
shift_div = 8
img_feature_dim = 256
softmax = False
is_shift = True
full_res = False
non_local = False
dense_sample = False
twice_sample = False
val_list = gen_file_list(frame_save_dir, frame_list_dir)
result_file_path = os.path.join(frame_list_dir, result_file_name)
pretrain = 'imagenet'
shift_place = 'blockres'
crop_fusion_type = 'avg'
net = TSN(num_class, test_segment if is_shift else 1, modality,
base_model=arch,
consensus_type=crop_fusion_type,
img_feature_dim=img_feature_dim,
pretrain=pretrain,
is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
non_local=non_local,
)
checkpoint = torch.load(weights_path)
checkpoint = checkpoint['state_dict']
base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
'base_model.classifier.bias': 'new_fc.bias',
}
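    # Note (added for clarity): the comprehension above drops the first dotted
    # component of every checkpoint key (typically the "module." prefix added by
    # nn.DataParallel), and replace_dict renames the backbone classifier weights
    # to the new_fc layer created in TSN._prepare_tsn.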
for k, v in replace_dict.items():
if k in base_dict:
base_dict[v] = base_dict.pop(k)
net.load_state_dict(base_dict)
input_size = net.scale_size if full_res else net.input_size
if test_crop == 1:
cropping = torchvision.transforms.Compose([
GroupScale(net.scale_size),
GroupCenterCrop(input_size),
])
    elif test_crop == 3:  # do not flip, so only 3 crops
cropping = torchvision.transforms.Compose([
GroupFullResSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 5: # do not flip, so only 5 crops
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 10:
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size)
])
else:
raise ValueError("Only 1, 5, 10 crops are supported while we got {}".format(test_crop))
data_loader = torch.utils.data.DataLoader(
TSNDataSet(frame_save_dir, val_list, num_segments=test_segment,
new_length=1 if modality == "RGB" else 5,
modality=modality,
image_tmpl=prefix,
test_mode=True,
remove_missing=False,
transform=torchvision.transforms.Compose([
cropping,
Stack(roll=(arch in ['BNInception', 'InceptionV3'])),
ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])),
GroupNormalize(net.input_mean, net.input_std),
]), dense_sample=dense_sample, twice_sample=twice_sample),
batch_size=batch_size, shuffle=False,
num_workers=workers, pin_memory=True,
)
net = torch.nn.DataParallel(net.cuda())
net.eval()
data_gen = enumerate(data_loader)
max_num = len(data_loader.dataset)
result_file = open(result_file_path, 'w')
for i, data_pair in data_gen:
directory, data = data_pair
with torch.no_grad():
if i >= max_num:
break
num_crop = test_crop
if dense_sample:
num_crop *= 10 # 10 clips for testing when using dense sample
if twice_sample:
num_crop *= 2
if modality == 'RGB':
length = 3
elif modality == 'Flow':
length = 10
elif modality == 'RGBDiff':
length = 18
else:
raise ValueError("Unknown modality " + modality)
data_in = data.view(-1, length, data.size(2), data.size(3))
if is_shift:
data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3))
rst, feature = net(data_in)
rst = rst.reshape(batch_size, num_crop, -1).mean(1)
if softmax:
# take the softmax to normalize the output to probability
rst = F.softmax(rst, dim=1)
rst = rst.data.cpu().numpy().copy()
if net.module.is_shift:
rst = rst.reshape(batch_size, num_class)
else:
rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class))
proba = np.squeeze(rst)
proba = np.exp(proba)/sum(np.exp(proba))
result_file.write(str(directory[0]) + ' ')
result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')
result_file.close()
    print('pose filter end')
import os
import cv2
import load_util
import media_util
import numpy as np
from sklearn.metrics import confusion_matrix
import fighting_filter, emotion_filter, argue_filter, audio_filter, class_filter
import video_filter, pose_filter, flow_filter
def accuracy_cal(config):
label_file_path = config['VIDEO']['LABEL_PATH']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
final_file_name = config['AUDIO']['RESULT_FILE']
final_file_path = os.path.join(frame_list_dir, final_file_name)
final_file_lines = open(final_file_path).readlines()
label_file_lines = open(label_file_path).readlines()
final_pairs = {line.strip().split(' ')[0]: line.strip().split(' ')[1] for line in final_file_lines}
lines_num = len(label_file_lines) - 1
hit = 0
for i, label_line in enumerate(label_file_lines):
if i == 0:
continue
file, label = label_line.strip().split(' ')
final_pre = final_pairs[file]
        # compare the probabilities numerically rather than as strings
        final_pre_class = np.argmax(np.array(final_pre.split(','), dtype=np.float32)) + 1
print(final_pre_class, label)
if final_pre_class == int(label):
hit += 1
return hit/lines_num
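# Note (added for clarity, inferred from the writers above): result-file lines
# look like "<video_name> <p1>,<p2>,<p3>", and the label file has a header line
# followed by "<video_name> <label>" lines with 1-based labels, which is why
# argmax + 1 is compared against the label.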
def main():
config_path = r'config.yaml'
config = load_util.load_config(config_path)
media_util.extract_wav(config)
media_util.extract_frame(config)
media_util.extract_frame_pose(config)
media_util.extract_is10(config)
media_util.extract_random_face_feature(config)
media_util.extract_mirror(config)
    fighting_filter.start_filter(config)
emotion_filter.start_filter(config)
audio_filter.start_filter(config)
class_filter.start_filter(config)
video_filter.start_filter(config)
pose_filter.start_filter(config)
flow_filter.start_filter(config)
acc = accuracy_cal(config)
print(acc)
if __name__ == '__main__':
main()
import os
import torch.optim
import numpy as np
import torch.nn.parallel
from ops.models import TSN
from ops.transforms import *
from ops.dataset import TSNDataSet
from torch.nn import functional as F
def gen_file_list(frame_save_dir, frame_list_dir):
val_path = os.path.join(frame_list_dir, 'val.txt')
video_names = os.listdir(frame_save_dir)
ucf101_rgb_val_file = open(val_path, 'w')
for video_name in video_names:
images_dir = os.path.join(frame_save_dir, video_name)
ucf101_rgb_val_file.write(video_name)
ucf101_rgb_val_file.write(' ')
ucf101_rgb_val_file.write(str(len(os.listdir(images_dir))))
ucf101_rgb_val_file.write('\n')
ucf101_rgb_val_file.close()
return val_path
def start_filter(config):
arch = config['FIGHTING']['ARCH']
prefix = config['VIDEO']['PREFIX']
modality = config['VIDEO_FILTER']['MODALITY']
test_crop = config['VIDEO_FILTER']['TEST_CROP']
batch_size = config['VIDEO_FILTER']['BATCH_SIZE']
weights_path = config['MODEL']['CLS_VIDEO']
test_segment = config['VIDEO_FILTER']['TEST_SEGMENT']
frame_save_dir = config['VIDEO']['FRAME_SAVE_DIR']
frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
result_file_name = config['VIDEO_FILTER']['RESULT_FILE']
workers = 8
num_class = 3
shift_div = 8
img_feature_dim = 256
softmax = False
is_shift = True
full_res = False
non_local = False
dense_sample = False
twice_sample = False
val_list = gen_file_list(frame_save_dir, frame_list_dir)
result_file_path = os.path.join(frame_list_dir, result_file_name)
pretrain = 'imagenet'
shift_place = 'blockres'
crop_fusion_type = 'avg'
net = TSN(num_class, test_segment if is_shift else 1, modality,
base_model=arch,
consensus_type=crop_fusion_type,
img_feature_dim=img_feature_dim,
pretrain=pretrain,
is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
non_local=non_local,
)
checkpoint = torch.load(weights_path)
checkpoint = checkpoint['state_dict']
base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
'base_model.classifier.bias': 'new_fc.bias',
}
for k, v in replace_dict.items():
if k in base_dict:
base_dict[v] = base_dict.pop(k)
net.load_state_dict(base_dict)
input_size = net.scale_size if full_res else net.input_size
if test_crop == 1:
cropping = torchvision.transforms.Compose([
GroupScale(net.scale_size),
GroupCenterCrop(input_size),
])
    elif test_crop == 3:  # do not flip, so only 3 crops
cropping = torchvision.transforms.Compose([
GroupFullResSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 5: # do not flip, so only 5 crops
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size, flip=False)
])
elif test_crop == 10:
cropping = torchvision.transforms.Compose([
GroupOverSample(input_size, net.scale_size)
])
else:
raise ValueError("Only 1, 5, 10 crops are supported while we got {}".format(test_crop))
data_loader = torch.utils.data.DataLoader(
TSNDataSet(frame_save_dir, val_list, num_segments=test_segment,
new_length=1 if modality == "RGB" else 5,
modality=modality,
image_tmpl=prefix,
test_mode=True,
remove_missing=False,
transform=torchvision.transforms.Compose([
cropping,
Stack(roll=(arch in ['BNInception', 'InceptionV3'])),
ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])),
GroupNormalize(net.input_mean, net.input_std),
]), dense_sample=dense_sample, twice_sample=twice_sample),
batch_size=batch_size, shuffle=False,
num_workers=workers, pin_memory=True,
)
net = torch.nn.DataParallel(net.cuda())
net.eval()
data_gen = enumerate(data_loader)
max_num = len(data_loader.dataset)
result_file = open(result_file_path, 'w')
for i, data_pair in data_gen:
directory, data = data_pair
with torch.no_grad():
if i >= max_num:
break
num_crop = test_crop
if dense_sample:
num_crop *= 10 # 10 clips for testing when using dense sample
if twice_sample:
num_crop *= 2
if modality == 'RGB':
length = 3
elif modality == 'Flow':
length = 10
elif modality == 'RGBDiff':
length = 18
else:
raise ValueError("Unknown modality " + modality)
data_in = data.view(-1, length, data.size(2), data.size(3))
if is_shift:
data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3))
rst, feature = net(data_in)
rst = rst.reshape(batch_size, num_crop, -1).mean(1)
if softmax:
# take the softmax to normalize the output to probability
rst = F.softmax(rst, dim=1)
rst = rst.data.cpu().numpy().copy()
if net.module.is_shift:
rst = rst.reshape(batch_size, num_class)
else:
rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class))
proba = np.squeeze(rst)
proba = np.exp(proba)/sum(np.exp(proba))
result_file.write(str(directory[0]) + ' ')
result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')
result_file.close()
print('video filter end')