first commit
Showing 46 changed files with 3340 additions and 0 deletions.
__init__.py  0 → 100644  (file mode changed)
__pycache__/argue_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/audio_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/bg_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/class_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/emotion_1_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/emotion_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/fighting_2_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/fighting_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/flow_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/load_util.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/media_util.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/meeting_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/person_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/pose_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/troops_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/video_1_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
__pycache__/video_filter.cpython-36.pyc  0 → 100644  (binary, no preview)
audio_filter.py
0 → 100644
import os
import csv
import pickle
import numpy as np
from sklearn.externals import joblib  # on newer scikit-learn, use `import joblib` instead


def start_filter(config):
    cls_audio_path = config['MODEL']['CLS_AUDIO']
    feature_save_dir = config['VIDEO']['IS10_FEATURE_NP_DIR']
    frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
    result_file_name = config['AUDIO']['RESULT_FILE']
    feature_name = config['AUDIO']['DATA_NAME']

    svm_clf = joblib.load(cls_audio_path)

    result_file_path = os.path.join(frame_list_dir, result_file_name)
    result_file = open(result_file_path, 'w')

    feature_path = os.path.join(feature_save_dir, feature_name)
    val_annotation_pairs = np.load(feature_path, allow_pickle=True, encoding='latin1')

    for pair in val_annotation_pairs:

        v = pair[0]   # IS10 feature vector
        n = pair[2]   # clip file name

        feature_np = np.reshape(v, (1, -1))
        res = svm_clf.predict_proba(feature_np)
        proba = np.squeeze(res)

        # class_pre = svm_clf.predict(feature_np)

        result_file.write(str(n)[:-4] + ' ')
        result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')

    result_file.close()


def start_filter_xgboost(config):
    cls_class_path = config['MODEL']['CLS_AUDIO']
    feature_save_dir = config['VIDEO']['IS10_FEATURE_NP_DIR']
    frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
    result_file_name = config['AUDIO']['RESULT_FILE']
    feature_name = config['AUDIO']['DATA_NAME']

    xgboost_model = pickle.load(open(cls_class_path, "rb"))

    result_file_path = os.path.join(frame_list_dir, result_file_name)
    result_file = open(result_file_path, 'w')

    feature_path = os.path.join(feature_save_dir, feature_name)
    val_annotation_pairs = np.load(feature_path, allow_pickle=True, encoding='latin1')

    X_val = []
    Y_names = []
    for pair in val_annotation_pairs:
        # each entry is assumed to be a single {clip_name: feature_vector} mapping
        # (the original `n, v = pair.items()` cannot unpack a one-item dict view)
        (n, v), = pair.items()
        X_val.append(v)
        Y_names.append(n)

    X_val = np.array(X_val)
    y_pred = xgboost_model.predict_proba(X_val)

    for i, Y_name in enumerate(Y_names):
        result_file.write(Y_name + ' ')
        result_file.write(str(y_pred[i][0]) + ',' + str(y_pred[i][1]) + ',' + str(y_pred[i][2]) + '\n')

    result_file.close()
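Note (not part of the commit): every filter in this repo writes one line per clip of the form "<clip_id> p0,p1,p2" (the fighting filters emit two probabilities). A minimal sketch of reading such a result file back, with the path derived from FRAME_LIST_DIR and AUDIO.RESULT_FILE in config.yaml below:

# Minimal sketch: parse a filter result file into {clip_id: np.ndarray of probabilities}.
import numpy as np

def load_filter_result(result_file_path):
    scores = {}
    with open(result_file_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            clip_id, proba_str = line.split(' ')
            scores[clip_id] = np.array([float(p) for p in proba_str.split(',')])
    return scores

# e.g. scores = load_filter_result('/home/jwq/Desktop/tmp/file_list/audio.txt')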
bg_filter.py
0 → 100644
import os
import cv2
import numpy as np
import pickle


def start_filter(config):
    cls_class_path = config['MODEL']['CLS_BG']
    feature_save_dir = config['VIDEO']['FACE_FEATURE_DIR']
    frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
    result_file_name = config['BG']['RESULT_FILE']
    feature_name = config['BG']['DATA_NAME']

    xgboost_model = pickle.load(open(cls_class_path, "rb"))

    result_file_path = os.path.join(frame_list_dir, result_file_name)
    result_file = open(result_file_path, 'w')

    feature_path = os.path.join(feature_save_dir, feature_name)
    val_annotation_pairs = np.load(feature_path, allow_pickle=True)

    X_val = []
    Y_val = []
    Y_names = []
    for j in range(len(val_annotation_pairs)):
        pair = val_annotation_pairs[j]
        X_val.append(np.squeeze(pair[0]))
        Y_val.append(pair[1])
        Y_names.append(pair[2])

    X_val = np.array(X_val)
    y_pred = xgboost_model.predict_proba(X_val)

    for i, Y_name in enumerate(Y_names):
        result_file.write(Y_name + ' ')
        result_file.write(str(y_pred[i][0]) + ',' + str(y_pred[i][1]) + ',' + str(y_pred[i][2]) + '\n')

    result_file.close()
class_filter.py
0 → 100644
import os
import pickle
import numpy as np


def start_filter(config):

    cls_class_path = config['MODEL']['CLS_CLASS']
    feature_save_dir = config['VIDEO']['CLASS_FEATURE_DIR']
    frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
    result_file_name = config['CLASS']['RESULT_FILE']
    feature_name = config['CLASS']['DATA_NAME']

    xgboost_model = pickle.load(open(cls_class_path, "rb"))

    result_file_path = os.path.join(frame_list_dir, result_file_name)
    result_file = open(result_file_path, 'w')

    feature_path = os.path.join(feature_save_dir, feature_name)
    val_annotation_pairs = np.load(feature_path, allow_pickle=True)

    X_val = []
    Y_val = []
    Y_names = []
    for j in range(len(val_annotation_pairs)):
        pair = val_annotation_pairs[j]
        X_val.append(pair[0])
        Y_val.append(pair[1])
        Y_names.append(pair[2])

    X_val = np.array(X_val)
    y_pred = xgboost_model.predict(X_val)

    for i, Y_name in enumerate(Y_names):
        result_file.write(Y_name + ' ')
        result_file.write(str(y_pred[i]) + '\n')

    result_file.close()
config.yaml
0 → 100644
MODEL:
  CLS_FIGHTING_2: '/home/jwq/models/cls_fighting_2/cls_fighting_2_v0.0.1.pth'
  CLS_EMOTION: '/home/jwq/models/cls_emotion/v0.1.0.m'
  FEATURE_EMOTION: '/home/jwq/models/feature_emotion/FerPlus3.h5'
  CLS_AUDIO: '/home/jwq/models/cls_audio/v0.0.1.m'
  CLS_CLASS: '/home/jwq/models/cls_class/v_0.0.1_xgb.pkl'
  CLS_VIDEO: '/home/jwq/models/cls_video/v0.4.1.pth'
  CLS_POSE: '/home/jwq/models/cls_pose/v0.0.1.pth'
  CLS_FLOW: '/home/jwq/models/cls_flow/v0.1.1.pth'
  CLS_BG: '/home/jwq/models/cls_bg/v0.1.1.pkl'
  CLS_PERSON: '/home/jwq/models/cls_person/v0.1.1.pkl'

THRESHOLD:
  FACES_THRESHOLD: 0.6

FILTER:


VIDEO:
  VIDEO_DIR: '/home/jwq/Desktop/VGAF_EmotiW/Val'
  LABEL_PATH: '/home/jwq/Desktop/VGAF_EmotiW/Val_labels.txt'
  VIDEO_SAVE_DIR: '/home/jwq/Desktop/tmp/video'
  AUDIO_SAVE_DIR: '/home/jwq/npys/'
  FRAME_SAVE_DIR: '/home/jwq/Desktop/tmp/frame'
  # FRAME_SAVE_DIR: '/home/jwq/Desktop/VGAF_EmotiW_class/train_frame'
  FLOW_SAVE_DIR: '/home/jwq/Desktop/tmp/flow'
  POSE_FRAME_SAVE_DIR: '/home/jwq/Desktop/tmp/pose_frame'
  FRAME_LIST_DIR: '/home/jwq/Desktop/tmp/file_list'
  IS10_FEATURE_NP_DIR: '/home/jwq/npys'
  IS10_FEATURE_CSV_DIR: '/home/jwq/Desktop/tmp/is10'
  # FACE_FEATURE_DIR: '/home/jwq/Desktop/tmp/face_feature_retina'
  # FACE_FEATURE_DIR: '/data2/retinaface/random_face_frame_features/'
  FACE_FEATURE_DIR: '/data1/segment/'
  # FACE_FEATURE_DIR: '/home/jwq/npys/'
  FACE_IMAGE_DIR: '/data2/retinaface/train/'
  CLASS_FEATURE_DIR: '/home/jwq/Desktop/tmp/class'
  PREFIX: 'img_{:05d}.jpg'
  FLOW_PREFIX: 'flow_{}_{:05d}.jpg'
  THREAD_NUM: 10
  FPS: 5

VIDEO_FILTER:
  TEST_SEGMENT: 8
  TEST_CROP: 1
  BATCH_SIZE: 1
  INPUT_SIZE: 224
  MODALITY: 'RGB'
  ARCH: 'resnet50'
  RESULT_FILE: 'video_filter.txt'

VIDEO_1_FILTER:
  TEST_SEGMENT: 8
  TEST_CROP: 1
  BATCH_SIZE: 1
  INPUT_SIZE: 224
  MODALITY: 'RGB'
  ARCH: 'resnet34'
  RESULT_FILE: 'video_1_filter.txt'

EMOTION:
  INTERVAL: 1
  INPUT_SIZE: 224
  RESULT_FILE: 'emotion_filter.txt'

EMOTION_1:
  RESULT_FILE: 'emotion_1_filter.txt'
  DATA_NAME: 'val.npy'

ARGUE:
  DIMENSION: 1582
  RESULT_FILE: 'argue_filter.txt'

FIGHTING:
  TEST_SEGMENT: 8
  TEST_CROP: 1
  BATCH_SIZE: 1
  INPUT_SIZE: 224
  MODALITY: 'RGB'
  ARCH: 'resnet50'
  RESULT_FILE: 'fighting_filter.txt'

FIGHTING_2:
  TEST_SEGMENT: 8
  TEST_CROP: 1
  BATCH_SIZE: 1
  INPUT_SIZE: 224
  MODALITY: 'RGB'
  ARCH: 'resnet50'
  RESULT_FILE: 'fighting_2_filter.txt'

MEETING:
  TEST_SEGMENT: 8
  TEST_CROP: 1
  BATCH_SIZE: 1
  INPUT_SIZE: 224
  MODALITY: 'RGB'
  ARCH: 'resnet50'
  RESULT_FILE: 'meeting_filter.txt'

TROOPS:
  TEST_SEGMENT: 8
  TEST_CROP: 1
  BATCH_SIZE: 1
  INPUT_SIZE: 224
  MODALITY: 'RGB'
  ARCH: 'resnet50'
  RESULT_FILE: 'troops_filter.txt'

FLOW:
  TEST_SEGMENT: 8
  TEST_CROP: 1
  BATCH_SIZE: 1
  INPUT_SIZE: 224
  MODALITY: 'Flow'
  ARCH: 'resnet50'
  RESULT_FILE: 'flow_filter.txt'


FINAL:
  RESULT_FILE: 'final.txt'
  ERROR_FILE: 'error.txt'
  SIM_FILE: 'image_sim.txt'

AUDIO:
  RESULT_FILE: 'audio.txt'
  OPENSMILE_DIR: '/home/jwq/Downloads/opensmile-2.3.0'
  DATA_NAME: 'val.npy'

CLASS:
  RESULT_FILE: 'class.txt'
  DATA_NAME: 'val _reannotation.npy'

POSE:
  TEST_SEGMENT: 8
  TEST_CROP: 1
  BATCH_SIZE: 1
  INPUT_SIZE: 224
  MODALITY: 'RGB'
  ARCH: 'resnet50'
  RESULT_FILE: 'pose_filter.txt'

BG:
  RESULT_FILE: 'bg_filter.txt'
  DATA_NAME: 'bg_val_feature.npy'

PERSON:
  RESULT_FILE: 'person_filter.txt'
  DATA_NAME: 'person_val_feature.npy'
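Note (not part of the commit): the YAML above is consumed as a plain nested dict (see load_util.load_config below). A small sketch of how the values and the frame-name templates are used, assuming the file is read from the working directory:

# Minimal sketch: load config.yaml and expand the frame-name templates.
import yaml

with open('config.yaml') as cf:
    config = yaml.load(cf, Loader=yaml.FullLoader)

print(config['MODEL']['CLS_AUDIO'])                   # '/home/jwq/models/cls_audio/v0.0.1.m'
print(config['VIDEO']['PREFIX'].format(1))            # 'img_00001.jpg'
print(config['VIDEO']['FLOW_PREFIX'].format('x', 1))  # 'flow_x_00001.jpg'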
emotion_filter.py
0 → 100644
import os
import cv2
import numpy as np
from keras.models import Model
from keras.models import load_model
from sklearn.externals import joblib
from tensorflow.keras.preprocessing.image import img_to_array

os.environ["CUDA_VISIBLE_DEVICES"] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'


class FeatureExtractor(object):
    def __init__(self, input_size=224, out_put_layer='avg_pool', model_path='FerPlus3.h5'):
        self.model = load_model(model_path)
        self.input_size = input_size
        self.model_inter = Model(inputs=self.model.input, outputs=self.model.get_layer(out_put_layer).output)

    def inference(self, image):
        image = cv2.resize(image, (self.input_size, self.input_size))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = image.astype("float") / 255.0
        image = img_to_array(image)
        image = np.expand_dims(image, axis=0)
        feature = self.model_inter.predict(image)[0]
        return feature


def features2feature(pics_features):

    pics_features = np.array(pics_features)
    fea_mean = pics_features.mean(axis=0)
    fea_max = np.amax(pics_features, axis=0)
    fea_min = np.amin(pics_features, axis=0)
    fea_std = pics_features.std(axis=0)

    return np.concatenate((fea_mean, fea_max, fea_min, fea_std), axis=1).reshape(1, -1)


def start_filter(config):

    cls_emotion_path = config['MODEL']['CLS_EMOTION']
    face_feature_dir = config['VIDEO']['FACE_FEATURE_DIR']
    frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
    result_file_name = config['EMOTION']['RESULT_FILE']

    svm_clf = joblib.load(cls_emotion_path)

    result_file_path = os.path.join(frame_list_dir, result_file_name)
    result_file = open(result_file_path, 'w')

    face_feature_names = os.listdir(face_feature_dir)
    for face_feature in face_feature_names:
        face_feature_path = os.path.join(face_feature_dir, face_feature)

        features_np = np.load(face_feature_path, allow_pickle=True)

        feature = features2feature(features_np)
        res = svm_clf.predict_proba(feature)
        proba = np.squeeze(res)
        # class_pre = svm_clf.predict(feature)

        result_file.write(face_feature[:-4] + ' ')
        result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')

    result_file.close()
fighting_2_filter.py
0 → 100644
import os
import numpy as np
import torch.optim
import torch.nn.parallel
from ops.models import TSN
from ops.transforms import *
from ops.dataset import TSNDataSet
from torch.nn import functional as F


def gen_file_list(frame_save_dir, frame_list_dir):

    val_path = os.path.join(frame_list_dir, 'val.txt')
    video_names = os.listdir(frame_save_dir)
    ucf101_rgb_val_file = open(val_path, 'w')

    for video_name in video_names:
        images_dir = os.path.join(frame_save_dir, video_name)
        ucf101_rgb_val_file.write(video_name)
        ucf101_rgb_val_file.write(' ')
        ucf101_rgb_val_file.write(str(len(os.listdir(images_dir))))
        ucf101_rgb_val_file.write('\n')

    ucf101_rgb_val_file.close()

    return val_path


def start_filter(config):
    arch = config['FIGHTING_2']['ARCH']
    prefix = config['VIDEO']['PREFIX']
    modality = config['FIGHTING_2']['MODALITY']
    test_crop = config['FIGHTING_2']['TEST_CROP']
    batch_size = config['FIGHTING_2']['BATCH_SIZE']
    weights_path = config['MODEL']['CLS_FIGHTING_2']
    test_segment = config['FIGHTING_2']['TEST_SEGMENT']
    frame_save_dir = config['VIDEO']['FRAME_SAVE_DIR']
    frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
    result_file_name = config['FIGHTING_2']['RESULT_FILE']

    workers = 8
    num_class = 2
    shift_div = 8
    img_feature_dim = 256

    softmax = False
    is_shift = True
    full_res = False
    non_local = False
    dense_sample = False
    twice_sample = False

    val_list = gen_file_list(frame_save_dir, frame_list_dir)
    result_file_path = os.path.join(frame_list_dir, result_file_name)

    pretrain = 'imagenet'
    shift_place = 'blockres'
    crop_fusion_type = 'avg'

    net = TSN(num_class, test_segment if is_shift else 1, modality,
              base_model=arch,
              consensus_type=crop_fusion_type,
              img_feature_dim=img_feature_dim,
              pretrain=pretrain,
              is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
              non_local=non_local,
              )

    checkpoint = torch.load(weights_path)
    checkpoint = checkpoint['state_dict']

    base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
    replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
                    'base_model.classifier.bias': 'new_fc.bias',
                    }
    for k, v in replace_dict.items():
        if k in base_dict:
            base_dict[v] = base_dict.pop(k)

    net.load_state_dict(base_dict)

    input_size = net.scale_size if full_res else net.input_size

    if test_crop == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(input_size),
        ])
    elif test_crop == 3:  # do not flip, so only 3 crops
        cropping = torchvision.transforms.Compose([
            GroupFullResSample(input_size, net.scale_size, flip=False)
        ])
    elif test_crop == 5:  # do not flip, so only 5 crops
        cropping = torchvision.transforms.Compose([
            GroupOverSample(input_size, net.scale_size, flip=False)
        ])
    elif test_crop == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(input_size, net.scale_size)
        ])
    else:
        raise ValueError("Only 1, 3, 5, 10 crops are supported while we got {}".format(test_crop))

    data_loader = torch.utils.data.DataLoader(
        TSNDataSet(frame_save_dir, val_list, num_segments=test_segment,
                   new_length=1 if modality == "RGB" else 5,
                   modality=modality,
                   image_tmpl=prefix,
                   test_mode=True,
                   remove_missing=False,
                   transform=torchvision.transforms.Compose([
                       cropping,
                       Stack(roll=(arch in ['BNInception', 'InceptionV3'])),
                       ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])),
                       GroupNormalize(net.input_mean, net.input_std),
                   ]), dense_sample=dense_sample, twice_sample=twice_sample),
        batch_size=batch_size, shuffle=False,
        num_workers=workers, pin_memory=True,
    )

    net = torch.nn.DataParallel(net.cuda())
    net.eval()
    data_gen = enumerate(data_loader)
    max_num = len(data_loader.dataset)

    result_file = open(result_file_path, 'w')

    for i, data_pair in data_gen:
        directory, data = data_pair
        with torch.no_grad():
            if i >= max_num:
                break
            num_crop = test_crop
            if dense_sample:
                num_crop *= 10  # 10 clips for testing when using dense sample

            if twice_sample:
                num_crop *= 2

            if modality == 'RGB':
                length = 3
            elif modality == 'Flow':
                length = 10
            elif modality == 'RGBDiff':
                length = 18
            else:
                raise ValueError("Unknown modality " + modality)

            data_in = data.view(-1, length, data.size(2), data.size(3))
            if is_shift:
                data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3))
            rst, feature = net(data_in)
            rst = rst.reshape(batch_size, num_crop, -1).mean(1)

            if softmax:
                # take the softmax to normalize the output to probability
                rst = F.softmax(rst, dim=1)

            rst = rst.data.cpu().numpy().copy()

            if net.module.is_shift:
                rst = rst.reshape(batch_size, num_class)
            else:
                rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class))

            proba = np.squeeze(rst)
            print(proba)
            proba = np.exp(proba) / sum(np.exp(proba))
            result_file.write(str(directory[0]) + ' ')
            result_file.write(str(proba[0]) + ',' + str(proba[1]) + '\n')

    result_file.close()
    print('fighting filter end')
\ No newline at end of file
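Note (not part of the commit): gen_file_list writes one line per clip directory in the form "<video_name> <frame_count>", which TSNDataSet later parses into VideoRecord objects (see ops/dataset.py). A tiny sketch to preview that list format:

# Minimal sketch: print the first few lines gen_file_list would produce.
import os

def preview_file_list(frame_save_dir, max_lines=3):
    for video_name in sorted(os.listdir(frame_save_dir))[:max_lines]:
        images_dir = os.path.join(frame_save_dir, video_name)
        print('{} {}'.format(video_name, len(os.listdir(images_dir))))

# a line might look like (hypothetical clip name): "video_00042 87"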
flow_filter.py
0 → 100644
import os
import numpy as np
import torch.optim
import torch.nn.parallel
from ops.models import TSN
from ops.transforms import *
from ops.dataset import TSNDataSet
from torch.nn import functional as F


def gen_file_list(frame_save_dir, frame_list_dir):

    val_path = os.path.join(frame_list_dir, 'flow_val.txt')
    video_names = os.listdir(frame_save_dir)
    ucf101_rgb_val_file = open(val_path, 'w')

    for video_name in video_names:
        images_dir = os.path.join(frame_save_dir, video_name)
        ucf101_rgb_val_file.write(video_name)
        ucf101_rgb_val_file.write(' ')
        ori_list = os.listdir(images_dir)
        select_list = [element for element in ori_list if 'x' in element]
        ucf101_rgb_val_file.write(str(len(select_list)))
        ucf101_rgb_val_file.write('\n')

    ucf101_rgb_val_file.close()

    return val_path


def start_filter(config):
    arch = config['FLOW']['ARCH']
    prefix = config['VIDEO']['FLOW_PREFIX']
    modality = config['FLOW']['MODALITY']
    test_crop = config['FLOW']['TEST_CROP']
    batch_size = config['FLOW']['BATCH_SIZE']
    weights_path = config['MODEL']['CLS_FLOW']
    test_segment = config['FLOW']['TEST_SEGMENT']
    frame_save_dir = config['VIDEO']['FLOW_SAVE_DIR']
    frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
    result_file_name = config['FLOW']['RESULT_FILE']

    workers = 8
    num_class = 3
    shift_div = 8
    img_feature_dim = 256

    softmax = False
    is_shift = True
    full_res = False
    non_local = False
    dense_sample = False
    twice_sample = False

    val_list = gen_file_list(frame_save_dir, frame_list_dir)
    result_file_path = os.path.join(frame_list_dir, result_file_name)

    pretrain = 'imagenet'
    shift_place = 'blockres'
    crop_fusion_type = 'avg'

    net = TSN(num_class, test_segment if is_shift else 1, modality,
              base_model=arch,
              consensus_type=crop_fusion_type,
              img_feature_dim=img_feature_dim,
              pretrain=pretrain,
              is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
              non_local=non_local,
              )

    checkpoint = torch.load(weights_path)
    checkpoint = checkpoint['state_dict']

    base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
    replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
                    'base_model.classifier.bias': 'new_fc.bias',
                    }
    for k, v in replace_dict.items():
        if k in base_dict:
            base_dict[v] = base_dict.pop(k)

    net.load_state_dict(base_dict)

    input_size = net.scale_size if full_res else net.input_size

    if test_crop == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(input_size),
        ])
    elif test_crop == 3:  # do not flip, so only 3 crops
        cropping = torchvision.transforms.Compose([
            GroupFullResSample(input_size, net.scale_size, flip=False)
        ])
    elif test_crop == 5:  # do not flip, so only 5 crops
        cropping = torchvision.transforms.Compose([
            GroupOverSample(input_size, net.scale_size, flip=False)
        ])
    elif test_crop == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(input_size, net.scale_size)
        ])
    else:
        raise ValueError("Only 1, 3, 5, 10 crops are supported while we got {}".format(test_crop))

    data_loader = torch.utils.data.DataLoader(
        TSNDataSet(frame_save_dir, val_list, num_segments=test_segment,
                   new_length=1 if modality == "RGB" else 5,
                   modality=modality,
                   image_tmpl=prefix,
                   test_mode=True,
                   remove_missing=False,
                   transform=torchvision.transforms.Compose([
                       cropping,
                       Stack(roll=(arch in ['BNInception', 'InceptionV3'])),
                       ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])),
                       GroupNormalize(net.input_mean, net.input_std),
                   ]), dense_sample=dense_sample, twice_sample=twice_sample),
        batch_size=batch_size, shuffle=False,
        num_workers=workers, pin_memory=True,
    )

    net = torch.nn.DataParallel(net.cuda())
    net.eval()
    data_gen = enumerate(data_loader)
    max_num = len(data_loader.dataset)

    result_file = open(result_file_path, 'w')

    for i, data_pair in data_gen:
        directory, data = data_pair
        with torch.no_grad():
            if i >= max_num:
                break
            num_crop = test_crop
            if dense_sample:
                num_crop *= 10  # 10 clips for testing when using dense sample

            if twice_sample:
                num_crop *= 2

            if modality == 'RGB':
                length = 3
            elif modality == 'Flow':
                length = 10
            elif modality == 'RGBDiff':
                length = 18
            else:
                raise ValueError("Unknown modality " + modality)

            data_in = data.view(-1, length, data.size(2), data.size(3))
            if is_shift:
                data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3))
            rst, feature = net(data_in)
            rst = rst.reshape(batch_size, num_crop, -1).mean(1)

            if softmax:
                # take the softmax to normalize the output to probability
                rst = F.softmax(rst, dim=1)

            rst = rst.data.cpu().numpy().copy()

            if net.module.is_shift:
                rst = rst.reshape(batch_size, num_class)
            else:
                rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class))

            proba = np.squeeze(rst)
            proba = np.exp(proba) / sum(np.exp(proba))
            result_file.write(str(directory[0]) + ' ')
            result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n')

    result_file.close()
    print('flow filter end')
\ No newline at end of file
load_util.py
0 → 100644
import os
import cv2
import yaml
import tensorflow as tf


def load_config(config_path):
    with open(config_path, 'r') as cf:
        config_obj = yaml.load(cf, Loader=yaml.FullLoader)
        print(config_obj)
        return config_obj


def load_argue_model(config):

    cls_argue_path = config['MODEL']['CLS_ARGUE']
    with tf.Graph().as_default():

        if os.path.isfile(cls_argue_path):
            print('Model filename: %s' % cls_argue_path)
            with tf.gfile.GFile(cls_argue_path, 'rb') as f:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(f.read())
                tf.import_graph_def(graph_def, name='')

        x = tf.get_default_graph().get_tensor_by_name("x_batch:0")
        output = tf.get_default_graph().get_tensor_by_name("output/BiasAdd:0")

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = False
        sess = tf.Session(config=config)

        return x, output, sess
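Note (not part of the commit): the commit does not include an entry-point script, but the pieces are wired together through load_config. A minimal driver sketch, assuming config.yaml sits next to the scripts:

# Minimal sketch: load the YAML config and run two of the filters above.
import audio_filter
import class_filter
from load_util import load_config

if __name__ == '__main__':
    config = load_config('config.yaml')
    audio_filter.start_filter(config)   # writes FRAME_LIST_DIR/audio.txt
    class_filter.start_filter(config)   # writes FRAME_LIST_DIR/class.txt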
media_util.py
0 → 100644
import os
import cv2
import random
import shutil
import subprocess
import numpy as np
import torch.optim
from tqdm import tqdm
import torch.nn.parallel
from ops.models import TSN
from ops.transforms import *
from functools import partial
from mtcnn.mtcnn import MTCNN
from keras.models import Model
from multiprocessing import Pool
from keras.models import load_model
from sklearn.externals import joblib
from tensorflow.keras.preprocessing.image import img_to_array

from ops.dataset import TSNDataSet
from torch.nn import functional as F

os.environ["CUDA_VISIBLE_DEVICES"] = '1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'


class FeatureExtractor(object):
    def __init__(self, input_size=224, out_put_layer='global_average_pooling2d_1', model_path='InceptionResNetV2-final.h5'):
        self.model = load_model(model_path)
        self.input_size = input_size
        self.model_inter = Model(inputs=self.model.input, outputs=self.model.get_layer(out_put_layer).output)

    def inference(self, image):
        image = cv2.resize(image, (self.input_size, self.input_size))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = image.astype("float") / 255.0
        image = img_to_array(image)
        image = np.expand_dims(image, axis=0)
        feature = self.model_inter.predict(image)[0]
        return feature


def extract_wav(config):
    video_dir = config['VIDEO']['VIDEO_DIR']
    video_save_dir = config['VIDEO']['VIDEO_SAVE_DIR']
    audio_save_dir = config['VIDEO']['AUDIO_SAVE_DIR']

    assert os.path.exists(video_dir)
    video_names = os.listdir(video_dir)
    for video_index, video_name in enumerate(video_names):
        file_name = video_name.split('.')[0]
        video_path = os.path.join(video_dir, video_name)

        assert os.path.exists(audio_save_dir)
        assert os.path.exists(video_save_dir)

        audio_name = file_name + '.wav'
        audio_save_path = os.path.join(audio_save_dir, audio_name)
        video_save_path = os.path.join(video_save_dir, video_name)

        command = 'ffmpeg -i {} -f wav -ar 16000 {}'.format(video_path, audio_save_path)
        os.popen(command)
        shutil.copyfile(video_path, video_save_path)


def video2frame(file_name, class_path, dst_class_path):
    if '.mp4' not in file_name:
        return
    name, ext = os.path.splitext(file_name)
    dst_directory_path = os.path.join(dst_class_path, name)

    video_file_path = os.path.join(class_path, file_name)
    try:
        if os.path.exists(dst_directory_path):
            if not os.path.exists(os.path.join(dst_directory_path, 'img_00001.jpg')):
                subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True)
                print('remove {}'.format(dst_directory_path))
                os.mkdir(dst_directory_path)
            else:
                print('*** convert has been done: {}'.format(dst_directory_path))
                return
        else:
            os.mkdir(dst_directory_path)
    except:
        print(dst_directory_path)
        return
    cmd = 'ffmpeg -i \"{}\" -threads 1 -vf scale=-1:331 -q:v 0 \"{}/img_%05d.jpg\"'.format(video_file_path,
                                                                                           dst_directory_path)
    # print(cmd)
    subprocess.call(cmd, shell=True,
                    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


def extract_frame(config):
    video_save_dir = config['VIDEO']['VIDEO_SAVE_DIR']
    frame_save_dir = config['VIDEO']['FRAME_SAVE_DIR']
    n_thread = config['VIDEO']['THREAD_NUM']

    assert os.path.exists(video_save_dir)
    video_names = os.listdir(video_save_dir)

    if not os.path.exists(frame_save_dir):
        os.mkdir(frame_save_dir)

    p = Pool(n_thread)
    worker = partial(video2frame, class_path=video_save_dir, dst_class_path=frame_save_dir)
    for _ in tqdm(p.imap_unordered(worker, video_names), total=len(video_names)):
        pass

    p.close()
    p.join()


def extract_frame_pose(config):
    video_save_dir = config['VIDEO']['VIDEO_SAVE_DIR']
    frame_save_dir = config['VIDEO']['POSE_FRAME_SAVE_DIR']
    n_thread = config['VIDEO']['THREAD_NUM']

    assert os.path.exists(video_save_dir)
    video_names = os.listdir(video_save_dir)

    if not os.path.exists(frame_save_dir):
        os.mkdir(frame_save_dir)

    p = Pool(n_thread)
    worker = partial(video2frame, class_path=video_save_dir, dst_class_path=frame_save_dir)
    for _ in tqdm(p.imap_unordered(worker, video_names), total=len(video_names)):
        pass

    p.close()
    p.join()


def extract_is10(config):
    open_smile_dir = config['AUDIO']['OPENSMILE_DIR']
    audio_save_dir = config['VIDEO']['AUDIO_SAVE_DIR']
    is10_save_dir = config['VIDEO']['IS10_FEATURE_CSV_DIR']

    assert os.path.exists(audio_save_dir)
    audio_names = os.listdir(audio_save_dir)

    if not os.path.exists(is10_save_dir):
        os.mkdir(is10_save_dir)

    for audio_name in audio_names:
        audio_save_path = os.path.join(audio_save_dir, audio_name)
        csv_name = audio_name[:-4] + '.csv'
        csv_path = os.path.join(is10_save_dir, csv_name)

        config = '{}/config/IS10_paraling.conf'.format(open_smile_dir)
        command = '{}/SMILExtract -C {} -I {} -O {}'.format(open_smile_dir, config, audio_save_path, csv_path)
        os.popen(command)


def extract_face_feature(config):
    feature_emotion_path = config['MODEL']['FEATURE_EMOTION']
    frame_save_dir = config['VIDEO']['FRAME_SAVE_DIR']
    face_feature_dir = config['VIDEO']['FACE_FEATURE_DIR']
    interval = config['EMOTION']['INTERVAL']
    input_size = config['EMOTION']['INPUT_SIZE']
    prefix = config['VIDEO']['PREFIX']

    feature_extractor = FeatureExtractor(
        input_size=input_size, out_put_layer='global_average_pooling2d_1', model_path=feature_emotion_path)
    mtcnn_detector = MTCNN()

    video_names = os.listdir(frame_save_dir)
    for video_index, video_name in enumerate(video_names):
        print('{}/{}'.format(video_index, len(video_names)))
        video_dir = os.path.join(frame_save_dir, video_name)
        frame_names = os.listdir(video_dir)
        end = 0
        features = []

        while end < len(frame_names):

            if end % interval == 0:
                frame_name = prefix.format(end + 1)
                frame_path = os.path.join(video_dir, frame_name)

                frame = cv2.imread(frame_path)
                img_h, img_w, img_c = frame.shape
                detect_faces = mtcnn_detector.detect_faces(frame)
                for i, e in enumerate(detect_faces):
                    x1, y1, w, h = e['box']
                    x1 = x1 if x1 > 0 else 0
                    y1 = y1 if y1 > 0 else 0
                    x1 = x1 if x1 < img_w else img_w
                    y1 = y1 if y1 < img_h else img_h

                    face = frame[y1:y1 + h, x1:x1 + w, :]
                    if face.size == 0:  # the original `face is []` check never triggers; skip empty crops
                        continue
                    features.append(feature_extractor.inference(face)[0])
                # top_5 = {}
                # for i, e in enumerate(detect_faces):
                #     x1, y1, w, h = e['box']
                #     x1 = x1 if x1 > 0 else 0
                #     y1 = y1 if y1 > 0 else 0
                #     x1 = x1 if x1 < img_w else img_w
                #     y1 = y1 if y1 < img_h else img_h
                #
                #     top_5[w*h] = [x1, y1, w, h]
                #
                # top_5 = sorted(top_5.items(), key=lambda d:d[0], reverse=True)
                # j = 0
                # for v in top_5:
                #     if j > 5:
                #         break
                #     x1, y1, w, h = v[1]
                #     face = frame[y1:y1+h, x1:x1+w, :]
                #     if face is []:
                #         continue
                #     features.append(feature_extractor.inference(face)[0])

            end += 1
        if len(features) == 0:
            continue
        features_np = np.array(features)
        face_feature_path = os.path.join(face_feature_dir, video_name + '.npy')
        np.save(face_feature_path, features_np)


def extract_random_face_feature(config):
    feature_emotion_path = config['MODEL']['FEATURE_EMOTION']
    face_save_dir = config['VIDEO']['FACE_IMAGE_DIR']
    face_feature_dir = config['VIDEO']['FACE_FEATURE_DIR']
    input_size = config['EMOTION']['INPUT_SIZE']

    feature_extractor = FeatureExtractor(
        input_size=input_size, out_put_layer='avg_pool', model_path=feature_emotion_path)

    video_dirs = []
    class_names = os.listdir(face_save_dir)
    for class_name in class_names:
        class_dir = os.path.join(face_save_dir, class_name)
        video_names = os.listdir(class_dir)
        for video_name in video_names:
            video_dir = os.path.join(class_dir, video_name)
            video_dirs.append(video_dir)

    for video_dir_index, video_dir in enumerate(video_dirs):
        print('{}/{}'.format(video_dir_index, len(video_dirs)))
        class_name, video_name = video_dir.split('/')[-2], video_dir.split('/')[-1]

        video_file_name = video_name.split('.')[0]
        save_class_dir = os.path.join(face_feature_dir, class_name)
        face_feature_path = os.path.join(save_class_dir, video_file_name + '.npy')
        if os.path.exists(face_feature_path):
            print('file already exists')
            continue

        image_names = os.listdir(video_dir)
        image_dirs = []
        for image_name in image_names:
            image_dir = os.path.join(video_dir, image_name)
            image_dirs.append(image_dir)

        features = []
        for image_dir_index, image_dir in enumerate(image_dirs):
            sub_face_names = os.listdir(image_dir)
            sub_face_num = len(sub_face_names)
            for face_index in range(sub_face_num):
                face_path = os.path.join(image_dir, sub_face_names[face_index])
                face_image = cv2.imread(face_path)
                features.append(feature_extractor.inference(face_image)[0])

        face_num = len(features)
        random_1 = random.sample(range(face_num), int(0.8 * face_num))
        features_random_1 = [features[c] for c in random_1]

        random_2 = random.sample(range(face_num), int(0.6 * face_num))
        features_random_2 = [features[d] for d in random_2]

        random_3 = random.sample(range(face_num), int(0.4 * face_num))
        features_random_3 = [features[e] for e in random_3]

        if len(features) == 0:
            continue

        if not os.path.exists(save_class_dir):
            os.mkdir(save_class_dir)

        features_np = np.array(features)
        face_feature_path = os.path.join(save_class_dir, video_file_name + '.npy')
        np.save(face_feature_path, features_np)

        features_np_random_1 = np.array(features_random_1)
        face_feature_1_path = os.path.join(save_class_dir, video_file_name + '_1.npy')
        np.save(face_feature_1_path, features_np_random_1)

        features_np_random_2 = np.array(features_random_2)
        face_feature_2_path = os.path.join(save_class_dir, video_file_name + '_2.npy')
        np.save(face_feature_2_path, features_np_random_2)

        features_np_random_3 = np.array(features_random_3)
        face_feature_3_path = os.path.join(save_class_dir, video_file_name + '_3.npy')
        np.save(face_feature_3_path, features_np_random_3)


def get_vid_fea(pics_features):
    pics_features = np.array(pics_features)
    fea_mean = pics_features.mean(axis=0)
    fea_max = np.amax(pics_features, axis=0)
    fea_min = np.amin(pics_features, axis=0)
    fea_std = pics_features.std(axis=0)

    feature_concate = np.concatenate((fea_mean, fea_max, fea_min, fea_std), axis=1)
    return np.squeeze(feature_concate)


def extract_random_face_and_frame_feature_():
    face_feature_dir = r'/data2/3_log-ResNet50/train_mirror/'
    new_face_feature_dir = r'/data2/retinaface/random_face_frame_features_train_mirror/'

    video_dirs = []
    class_names = os.listdir(face_feature_dir)
    for class_name in class_names:
        class_dir = os.path.join(face_feature_dir, class_name)
        video_names = os.listdir(class_dir)
        for video_name in video_names:
            video_dir = os.path.join(class_dir, video_name)
            video_dirs.append(video_dir)

    for video_dir in video_dirs:
        video_name = video_dir.split('/')[-1]
        frame_names = os.listdir(video_dir)
        feature = []
        for frame_name in frame_names:
            feature_dir = os.path.join(video_dir, frame_name)
            face_features_names = os.listdir(feature_dir)
            for face_features_name in face_features_names:
                face_features_path = os.path.join(feature_dir, face_features_name)
                feature_np = np.load(face_features_path, allow_pickle=True)
                feature.append(feature_np)

        feature_num = len(feature)
        if feature_num < 4:
            continue

        random_1 = random.sample(range(feature_num), int(0.9 * feature_num))
        features_random_1 = [feature[c] for c in random_1]

        random_2 = random.sample(range(feature_num), int(0.7 * feature_num))
        features_random_2 = [feature[d] for d in random_2]

        random_3 = random.sample(range(feature_num), int(0.5 * feature_num))
        features_random_3 = [feature[e] for e in random_3]

        video_file_name = video_name.split('.')[0]

        features_np = get_vid_fea(feature)
        face_feature_path = os.path.join(new_face_feature_dir, video_file_name + '.npy')
        np.save(face_feature_path, features_np)

        features_np_random_1 = get_vid_fea(features_random_1)
        face_feature_1_path = os.path.join(new_face_feature_dir, video_file_name + '_1.npy')
        np.save(face_feature_1_path, features_np_random_1)

        features_np_random_2 = get_vid_fea(features_random_2)
        face_feature_2_path = os.path.join(new_face_feature_dir, video_file_name + '_2.npy')
        np.save(face_feature_2_path, features_np_random_2)

        features_np_random_3 = get_vid_fea(features_random_3)
        face_feature_3_path = os.path.join(new_face_feature_dir, video_file_name + '_3.npy')
        np.save(face_feature_3_path, features_np_random_3)


def extract_random_face_and_frame_feature(config):
    feature_emotion_path = config['MODEL']['FEATURE_EMOTION']
    input_size = config['EMOTION']['INPUT_SIZE']
    face_dir = r'/data2/retinaface/train/'
    new_face_feature_dir = r'/data2/3_log-ResNet50/train_mirror/'

    feature_extractor = FeatureExtractor(
        input_size=input_size, out_put_layer='avg_pool', model_path=feature_emotion_path)

    sub_face_paths = []
    class_names = os.listdir(face_dir)
    for class_name in class_names:
        class_dir = os.path.join(face_dir, class_name)
        video_names = os.listdir(class_dir)
        for video_name in video_names:
            video_dir = os.path.join(class_dir, video_name)
            frame_names = os.listdir(video_dir)
            for frame_name in frame_names:
                frame_dir = os.path.join(video_dir, frame_name)
                sub_face_names = os.listdir(frame_dir)
                for sub_face_name in sub_face_names:
                    sub_face_path = os.path.join(frame_dir, sub_face_name)
                    sub_face_paths.append(sub_face_path)

    for face_index, sub_face_path in enumerate(sub_face_paths):
        print('{}/{}'.format(face_index + 1, len(sub_face_paths)))
        class_name, video_name, frame_name, sub_face_name = sub_face_path.split('/')[-4]\
            , sub_face_path.split('/')[-3], sub_face_path.split('/')[-2], sub_face_path.split('/')[-1]

        class_dir = os.path.join(new_face_feature_dir, class_name)
        video_dir = os.path.join(class_dir, video_name)
        frame_dir = os.path.join(video_dir, frame_name)
        sub_face_name = sub_face_name.split('.')[0] + '.npy'
        face_feature_save_path = os.path.join(frame_dir, sub_face_name)
        if os.path.exists(face_feature_save_path):
            print('file exists')
            continue

        face_image = cv2.imread(sub_face_path)
        mirror_face_image = cv2.flip(face_image, 0)  # note: flip code 0 flips vertically; a horizontal mirror would use flip code 1
        feature = feature_extractor.inference(mirror_face_image)[0]

        if not os.path.exists(class_dir):
            os.mkdir(class_dir)

        if not os.path.exists(video_dir):
            os.mkdir(video_dir)

        if not os.path.exists(frame_dir):
            os.mkdir(frame_dir)

        np.save(face_feature_save_path, feature)


def gen_file_list(frame_save_dir, frame_list_dir):

    val_path = os.path.join(frame_list_dir, 'train.txt')
    video_names = os.listdir(frame_save_dir)
    ucf101_rgb_val_file = open(val_path, 'w')

    for video_name in video_names:
        images_dir = os.path.join(frame_save_dir, video_name)
        ucf101_rgb_val_file.write(video_name)
        ucf101_rgb_val_file.write(' ')
        ucf101_rgb_val_file.write(str(len(os.listdir(images_dir))))
        ucf101_rgb_val_file.write('\n')

    ucf101_rgb_val_file.close()

    return val_path


def extract_video_features(config):
    arch = config['FIGHTING']['ARCH']
    prefix = config['VIDEO']['PREFIX']
    modality = config['VIDEO_FILTER']['MODALITY']
    test_crop = config['VIDEO_FILTER']['TEST_CROP']
    batch_size = config['VIDEO_FILTER']['BATCH_SIZE']
    weights_path = config['MODEL']['CLS_VIDEO']
    test_segment = config['VIDEO_FILTER']['TEST_SEGMENT']
    frame_save_dir = config['VIDEO']['FRAME_SAVE_DIR']
    frame_list_dir = config['VIDEO']['FRAME_LIST_DIR']
    feature_save_dir = r'/home/jwq/Desktop/tmp/video2np/train/'

    workers = 8
    num_class = 3
    shift_div = 8
    img_feature_dim = 256

    softmax = False
    is_shift = True
    full_res = False
    non_local = False
    dense_sample = False
    twice_sample = False

    val_list = gen_file_list(frame_save_dir, frame_list_dir)

    pretrain = 'imagenet'
    shift_place = 'blockres'
    crop_fusion_type = 'avg'

    net = TSN(num_class, test_segment if is_shift else 1, modality,
              base_model=arch,
              consensus_type=crop_fusion_type,
              img_feature_dim=img_feature_dim,
              pretrain=pretrain,
              is_shift=is_shift, shift_div=shift_div, shift_place=shift_place,
              non_local=non_local,
              )

    checkpoint = torch.load(weights_path)
    checkpoint = checkpoint['state_dict']

    base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
    replace_dict = {'base_model.classifier.weight': 'new_fc.weight',
                    'base_model.classifier.bias': 'new_fc.bias',
                    }
    for k, v in replace_dict.items():
        if k in base_dict:
            base_dict[v] = base_dict.pop(k)

    net.load_state_dict(base_dict)

    input_size = net.scale_size if full_res else net.input_size

    if test_crop == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(input_size),
        ])
    elif test_crop == 3:  # do not flip, so only 3 crops
        cropping = torchvision.transforms.Compose([
            GroupFullResSample(input_size, net.scale_size, flip=False)
        ])
    elif test_crop == 5:  # do not flip, so only 5 crops
        cropping = torchvision.transforms.Compose([
            GroupOverSample(input_size, net.scale_size, flip=False)
        ])
    elif test_crop == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(input_size, net.scale_size)
        ])
    else:
        raise ValueError("Only 1, 3, 5, 10 crops are supported while we got {}".format(test_crop))

    data_loader = torch.utils.data.DataLoader(
        TSNDataSet(frame_save_dir, val_list, num_segments=test_segment,
                   new_length=1 if modality == "RGB" else 5,
                   modality=modality,
                   image_tmpl=prefix,
                   test_mode=True,
                   remove_missing=False,
                   transform=torchvision.transforms.Compose([
                       cropping,
                       Stack(roll=(arch in ['BNInception', 'InceptionV3'])),
                       ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])),
                       GroupNormalize(net.input_mean, net.input_std),
                   ]), dense_sample=dense_sample, twice_sample=twice_sample),
        batch_size=batch_size, shuffle=False,
        num_workers=workers, pin_memory=True,
    )

    net = torch.nn.DataParallel(net.cuda())
    net.eval()
    data_gen = enumerate(data_loader)
    max_num = len(data_loader.dataset)

    for i, data_pair in data_gen:
        directory, data = data_pair
        with torch.no_grad():
            if i >= max_num:
                break
            num_crop = test_crop
            if dense_sample:
                num_crop *= 10  # 10 clips for testing when using dense sample

            if twice_sample:
                num_crop *= 2

            if modality == 'RGB':
                length = 3
            elif modality == 'Flow':
                length = 10
            elif modality == 'RGBDiff':
                length = 18
            else:
                raise ValueError("Unknown modality " + modality)

            data_in = data.view(-1, length, data.size(2), data.size(3))
            if is_shift:
                data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3))
            rst, feature = net(data_in)

            feature = np.squeeze(feature.cpu())
            print(feature.shape)
            feature_name = str(directory[0]) + '.npy'
            feature_save_path = os.path.join(feature_save_dir, feature_name)
            np.save(feature_save_path, feature)


if __name__ == '__main__':
    extract_random_face_and_frame_feature_()
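Note (not part of the commit): get_vid_fea pools a clip's per-face features into one descriptor by concatenating their mean, max, min, and std. A small sketch with dummy data, assuming each stored feature has shape (1, D) so that the axis=1 concatenation is valid:

# Minimal sketch: statistics pooling in get_vid_fea.
# With N faces of shape (1, D), the stacked array is (N, 1, D), each statistic is
# (1, D), and the pooled clip descriptor has shape (4 * D,) after the squeeze.
import numpy as np
from media_util import get_vid_fea

dummy_faces = [np.random.rand(1, 512) for _ in range(12)]  # 12 faces, D = 512
clip_descriptor = get_vid_fea(dummy_faces)
print(clip_descriptor.shape)  # (2048,)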
ops/__init__.py
0 → 100755
from ops.basic_ops import *
\ No newline at end of file
ops/__pycache__/__init__.cpython-36.pyc  0 → 100644  (binary, no preview)
ops/__pycache__/basic_ops.cpython-36.pyc  0 → 100644  (binary, no preview)
ops/__pycache__/dataset.cpython-36.pyc  0 → 100644  (binary, no preview)
ops/__pycache__/models.cpython-36.pyc  0 → 100644  (binary, no preview)
ops/__pycache__/transforms.cpython-36.pyc  0 → 100644  (binary, no preview)
ops/basic_ops.py
0 → 100755
import torch


class Identity(torch.nn.Module):
    def forward(self, input):
        return input


class SegmentConsensus(torch.nn.Module):

    def __init__(self, consensus_type, dim=1):
        super(SegmentConsensus, self).__init__()
        self.consensus_type = consensus_type
        self.dim = dim
        self.shape = None

    def forward(self, input_tensor):
        self.shape = input_tensor.size()
        if self.consensus_type == 'avg':
            output = input_tensor.mean(dim=self.dim, keepdim=True)
        elif self.consensus_type == 'identity':
            output = input_tensor
        else:
            output = None

        return output


class ConsensusModule(torch.nn.Module):

    def __init__(self, consensus_type, dim=1):
        super(ConsensusModule, self).__init__()
        self.consensus_type = consensus_type if consensus_type != 'rnn' else 'identity'
        self.dim = dim

    def forward(self, input):
        return SegmentConsensus(self.consensus_type, self.dim)(input)
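Note (not part of the commit): ConsensusModule with consensus_type='avg' (the crop_fusion_type used by the filters above) simply averages per-segment predictions along dim=1. A tiny usage sketch:

# Minimal sketch: average segment-level scores with ConsensusModule.
import torch
from ops.basic_ops import ConsensusModule

segment_scores = torch.randn(2, 8, 3)    # (batch, num_segments, num_classes)
consensus = ConsensusModule('avg', dim=1)
video_scores = consensus(segment_scores)  # shape (2, 1, 3): mean over the 8 segments
print(video_scores.shape)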
ops/dataset.py
0 → 100755
1 | # Code for "TSM: Temporal Shift Module for Efficient Video Understanding" | ||
2 | # arXiv:1811.08383 | ||
3 | # Ji Lin*, Chuang Gan, Song Han | ||
4 | # {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu | ||
5 | |||
6 | import torch.utils.data as data | ||
7 | |||
8 | from PIL import Image | ||
9 | import os | ||
10 | import numpy as np | ||
11 | from numpy.random import randint | ||
12 | |||
13 | |||
14 | class VideoRecord(object): | ||
15 | def __init__(self, row): | ||
16 | self._data = row | ||
17 | |||
18 | @property | ||
19 | def path(self): | ||
20 | return self._data[0] | ||
21 | |||
22 | @property | ||
23 | def num_frames(self): | ||
24 | return int(self._data[1]) | ||
25 | |||
26 | |||
27 | class TSNDataSet(data.Dataset): | ||
28 | def __init__(self, root_path, list_file, | ||
29 | num_segments=3, new_length=1, modality='RGB', | ||
30 | image_tmpl='img_{:05d}.jpg', transform=None, | ||
31 | random_shift=True, test_mode=False, | ||
32 | remove_missing=False, dense_sample=False, twice_sample=False): | ||
33 | |||
34 | self.root_path = root_path | ||
35 | self.list_file = list_file | ||
36 | self.num_segments = num_segments | ||
37 | self.new_length = new_length | ||
38 | self.modality = modality | ||
39 | self.image_tmpl = image_tmpl | ||
40 | self.transform = transform | ||
41 | self.random_shift = random_shift | ||
42 | self.test_mode = test_mode | ||
43 | self.remove_missing = remove_missing | ||
44 | self.dense_sample = dense_sample # using dense sample as I3D | ||
45 | self.twice_sample = twice_sample # twice sample for more validation | ||
46 | if self.dense_sample: | ||
47 | print('=> Using dense sample for the dataset...') | ||
48 | if self.twice_sample: | ||
49 | print('=> Using twice sample for the dataset...') | ||
50 | |||
51 | if self.modality == 'RGBDiff': | ||
52 | self.new_length += 1 # Diff needs one more image to calculate diff | ||
53 | |||
54 | self._parse_list() | ||
55 | |||
56 | def _load_image(self, directory, idx): | ||
57 | if self.modality == 'RGB' or self.modality == 'RGBDiff': | ||
58 | try: | ||
59 | return [Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(idx))).convert('RGB')] | ||
60 | except Exception: | ||
61 | print('error loading image:', os.path.join(self.root_path, directory, self.image_tmpl.format(idx))) | ||
62 | return [Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(1))).convert('RGB')] | ||
63 | elif self.modality == 'Flow': | ||
64 | if self.image_tmpl == 'flow_{}_{:05d}.jpg': # ucf | ||
65 | x_img = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format('x', idx))).convert( | ||
66 | 'L') | ||
67 | y_img = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format('y', idx))).convert( | ||
68 | 'L') | ||
69 | elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg': # something v1 flow | ||
70 | x_img = Image.open(os.path.join(self.root_path, '{:06d}'.format(int(directory)), self.image_tmpl. | ||
71 | format(int(directory), 'x', idx))).convert('L') | ||
72 | y_img = Image.open(os.path.join(self.root_path, '{:06d}'.format(int(directory)), self.image_tmpl. | ||
73 | format(int(directory), 'y', idx))).convert('L') | ||
74 | else: | ||
75 | try: | ||
76 | # idx_skip = 1 + (idx-1)*5 | ||
77 | flow = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(idx))).convert( | ||
78 | 'RGB') | ||
79 | except Exception: | ||
80 | print('error loading flow file:', | ||
81 | os.path.join(self.root_path, directory, self.image_tmpl.format(idx))) | ||
82 | flow = Image.open(os.path.join(self.root_path, directory, self.image_tmpl.format(1))).convert('RGB') | ||
83 | # the input flow file is RGB image with (flow_x, flow_y, blank) for each channel | ||
84 | flow_x, flow_y, _ = flow.split() | ||
85 | x_img = flow_x.convert('L') | ||
86 | y_img = flow_y.convert('L') | ||
87 | |||
88 | return [x_img, y_img] | ||
89 | |||
90 | def _parse_list(self): | ||
91 | # keep only records with at least 3 frames | ||
92 | tmp = [x.strip().split(' ') for x in open(self.list_file)] | ||
93 | if not self.test_mode or self.remove_missing: | ||
94 | tmp = [item for item in tmp if int(item[1]) >= 3] | ||
95 | self.video_list = [VideoRecord(item) for item in tmp] | ||
96 | |||
97 | if self.image_tmpl == '{:06d}-{}_{:05d}.jpg': | ||
98 | for v in self.video_list: | ||
99 | v._data[1] = int(v._data[1]) // 2  # integer division keeps the frame count an int | ||
100 | print('video number:%d' % (len(self.video_list))) | ||
101 | |||
102 | def _sample_indices(self, record): | ||
103 | """ | ||
104 | |||
105 | :param record: VideoRecord | ||
106 | :return: list | ||
107 | """ | ||
108 | if self.dense_sample: # i3d dense sample | ||
109 | sample_pos = max(1, 1 + record.num_frames - 64) | ||
110 | t_stride = 64 // self.num_segments | ||
111 | start_idx = 0 if sample_pos == 1 else np.random.randint(0, sample_pos - 1) | ||
112 | offsets = [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)] | ||
113 | return np.array(offsets) + 1 | ||
114 | else: # normal sample | ||
115 | average_duration = (record.num_frames - self.new_length + 1) // self.num_segments | ||
116 | if average_duration > 0: | ||
117 | offsets = np.multiply(list(range(self.num_segments)), average_duration) + randint(average_duration, | ||
118 | size=self.num_segments) | ||
119 | elif record.num_frames > self.num_segments: | ||
120 | offsets = np.sort(randint(record.num_frames - self.new_length + 1, size=self.num_segments)) | ||
121 | else: | ||
122 | offsets = np.zeros((self.num_segments,)) | ||
123 | return offsets + 1 | ||
124 | |||
125 | def _get_val_indices(self, record): | ||
126 | if self.dense_sample: # i3d dense sample | ||
127 | sample_pos = max(1, 1 + record.num_frames - 64) | ||
128 | t_stride = 64 // self.num_segments | ||
129 | start_idx = 0 if sample_pos == 1 else np.random.randint(0, sample_pos - 1) | ||
130 | offsets = [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)] | ||
131 | return np.array(offsets) + 1 | ||
132 | else: | ||
133 | if record.num_frames > self.num_segments + self.new_length - 1: | ||
134 | tick = (record.num_frames - self.new_length + 1) / float(self.num_segments) | ||
135 | offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)]) | ||
136 | else: | ||
137 | offsets = np.zeros((self.num_segments,)) | ||
138 | return offsets + 1 | ||
139 | |||
140 | def _get_test_indices(self, record): | ||
141 | if self.dense_sample: | ||
142 | sample_pos = max(1, 1 + record.num_frames - 64) | ||
143 | t_stride = 64 // self.num_segments | ||
144 | start_list = np.linspace(0, sample_pos - 1, num=10, dtype=int) | ||
145 | offsets = [] | ||
146 | for start_idx in start_list.tolist(): | ||
147 | offsets += [(idx * t_stride + start_idx) % record.num_frames for idx in range(self.num_segments)] | ||
148 | return np.array(offsets) + 1 | ||
149 | elif self.twice_sample: | ||
150 | tick = (record.num_frames - self.new_length + 1) / float(self.num_segments) | ||
151 | |||
152 | offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)] + | ||
153 | [int(tick * x) for x in range(self.num_segments)]) | ||
154 | |||
155 | return offsets + 1 | ||
156 | else: | ||
157 | tick = (record.num_frames - self.new_length + 1) / float(self.num_segments) | ||
158 | offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)]) | ||
159 | return offsets + 1 | ||
160 | |||
161 | def __getitem__(self, index): | ||
162 | record = self.video_list[index] | ||
163 | # check this is a legit video folder | ||
164 | |||
165 | if self.image_tmpl == 'flow_{}_{:05d}.jpg': | ||
166 | file_name = self.image_tmpl.format('x', 1) | ||
167 | full_path = os.path.join(self.root_path, record.path, file_name) | ||
168 | elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg': | ||
169 | file_name = self.image_tmpl.format(int(record.path), 'x', 1) | ||
170 | full_path = os.path.join(self.root_path, '{:06d}'.format(int(record.path)), file_name) | ||
171 | else: | ||
172 | file_name = self.image_tmpl.format(1) | ||
173 | full_path = os.path.join(self.root_path, record.path, file_name) | ||
174 | |||
175 | while not os.path.exists(full_path): | ||
176 | print('################## Not Found:', os.path.join(self.root_path, record.path, file_name)) | ||
177 | index = np.random.randint(len(self.video_list)) | ||
178 | record = self.video_list[index] | ||
179 | if self.image_tmpl == 'flow_{}_{:05d}.jpg': | ||
180 | file_name = self.image_tmpl.format('x', 1) | ||
181 | full_path = os.path.join(self.root_path, record.path, file_name) | ||
182 | elif self.image_tmpl == '{:06d}-{}_{:05d}.jpg': | ||
183 | file_name = self.image_tmpl.format(int(record.path), 'x', 1) | ||
184 | full_path = os.path.join(self.root_path, '{:06d}'.format(int(record.path)), file_name) | ||
185 | else: | ||
186 | file_name = self.image_tmpl.format(1) | ||
187 | full_path = os.path.join(self.root_path, record.path, file_name) | ||
188 | |||
189 | if not self.test_mode: | ||
190 | segment_indices = self._sample_indices(record) if self.random_shift else self._get_val_indices(record) | ||
191 | else: | ||
192 | segment_indices = self._get_test_indices(record) | ||
193 | return self.get(record, segment_indices) | ||
194 | |||
195 | def get(self, record, indices): | ||
196 | |||
197 | images = list() | ||
198 | for seg_ind in indices: | ||
199 | p = int(seg_ind) | ||
200 | for i in range(self.new_length): | ||
201 | seg_imgs = self._load_image(record.path, p) | ||
202 | images.extend(seg_imgs) | ||
203 | if p < record.num_frames: | ||
204 | p += 1 | ||
205 | |||
206 | process_data = self.transform(images) | ||
207 | return record.path, process_data | ||
208 | |||
209 | def __len__(self): | ||
210 | return len(self.video_list) |
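A minimal usage sketch for the dataset class above; '/data/frames' and 'val_list.txt' are placeholders, and the list file is assumed to hold '<folder> <num_frames>' per line as _parse_list expects. The group transforms come from ops/transforms.py later in this commit.

import torchvision
from torch.utils.data import DataLoader
from ops.dataset import TSNDataSet
from ops.transforms import GroupScale, GroupCenterCrop, Stack, ToTorchFormatTensor, GroupNormalize

val_transform = torchvision.transforms.Compose([
    GroupScale(256),
    GroupCenterCrop(224),
    Stack(roll=False),
    ToTorchFormatTensor(div=True),
    GroupNormalize(mean=[.485, .456, .406], std=[.229, .224, .225]),
])

dataset = TSNDataSet('/data/frames', 'val_list.txt',          # placeholder paths
                     num_segments=8, modality='RGB',
                     image_tmpl='img_{:05d}.jpg', transform=val_transform,
                     random_shift=False, test_mode=True)
loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
for video_path, frames in loader:
    print(video_path, frames.shape)   # frames: (1, 8 * 3, 224, 224) after Stack
    break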
ops/dataset_config.py
0 → 100755
1 | # Code for "TSM: Temporal Shift Module for Efficient Video Understanding" | ||
2 | # arXiv:1811.08383 | ||
3 | # Ji Lin*, Chuang Gan, Song Han | ||
4 | # {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu | ||
5 | |||
6 | import os | ||
7 | |||
8 | ROOT_DATASET = '/data1/action_1_images/' # '/data/jilin/' | ||
9 | |||
10 | |||
11 | def return_ucf101(modality): | ||
12 | filename_categories = 'labels/classInd.txt' | ||
13 | if modality == 'RGB': | ||
14 | root_data = ROOT_DATASET + 'images' | ||
15 | filename_imglist_train = 'file_list/ucf101_rgb_train_split_1.txt' | ||
16 | filename_imglist_val = 'file_list/ucf101_rgb_val_split_1.txt' | ||
17 | prefix = 'img_{:05d}.jpg' | ||
18 | elif modality == 'Flow': | ||
19 | root_data = ROOT_DATASET + 'UCF101/jpg' | ||
20 | filename_imglist_train = 'UCF101/file_list/ucf101_flow_train_split_1.txt' | ||
21 | filename_imglist_val = 'UCF101/file_list/ucf101_flow_val_split_1.txt' | ||
22 | prefix = 'flow_{}_{:05d}.jpg' | ||
23 | else: | ||
24 | raise NotImplementedError('no such modality:' + modality) | ||
25 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix | ||
26 | |||
27 | |||
28 | def return_hmdb51(modality): | ||
29 | filename_categories = 51 | ||
30 | if modality == 'RGB': | ||
31 | root_data = ROOT_DATASET + 'HMDB51/images' | ||
32 | filename_imglist_train = 'HMDB51/splits/hmdb51_rgb_train_split_1.txt' | ||
33 | filename_imglist_val = 'HMDB51/splits/hmdb51_rgb_val_split_1.txt' | ||
34 | prefix = 'img_{:05d}.jpg' | ||
35 | elif modality == 'Flow': | ||
36 | root_data = ROOT_DATASET + 'HMDB51/images' | ||
37 | filename_imglist_train = 'HMDB51/splits/hmdb51_flow_train_split_1.txt' | ||
38 | filename_imglist_val = 'HMDB51/splits/hmdb51_flow_val_split_1.txt' | ||
39 | prefix = 'flow_{}_{:05d}.jpg' | ||
40 | else: | ||
41 | raise NotImplementedError('no such modality:' + modality) | ||
42 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix | ||
43 | |||
44 | |||
45 | def return_something(modality): | ||
46 | filename_categories = 'something/v1/category.txt' | ||
47 | if modality == 'RGB': | ||
48 | root_data = ROOT_DATASET + 'something/v1/20bn-something-something-v1' | ||
49 | filename_imglist_train = 'something/v1/train_videofolder.txt' | ||
50 | filename_imglist_val = 'something/v1/val_videofolder.txt' | ||
51 | prefix = '{:05d}.jpg' | ||
52 | elif modality == 'Flow': | ||
53 | root_data = ROOT_DATASET + 'something/v1/20bn-something-something-v1-flow' | ||
54 | filename_imglist_train = 'something/v1/train_videofolder_flow.txt' | ||
55 | filename_imglist_val = 'something/v1/val_videofolder_flow.txt' | ||
56 | prefix = '{:06d}-{}_{:05d}.jpg' | ||
57 | else: | ||
58 | print('no such modality:'+modality) | ||
59 | raise NotImplementedError | ||
60 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix | ||
61 | |||
62 | |||
63 | def return_somethingv2(modality): | ||
64 | filename_categories = 'something/v2/category.txt' | ||
65 | if modality == 'RGB': | ||
66 | root_data = ROOT_DATASET + 'something/v2/20bn-something-something-v2-frames' | ||
67 | filename_imglist_train = 'something/v2/train_videofolder.txt' | ||
68 | filename_imglist_val = 'something/v2/val_videofolder.txt' | ||
69 | prefix = '{:06d}.jpg' | ||
70 | elif modality == 'Flow': | ||
71 | root_data = ROOT_DATASET + 'something/v2/20bn-something-something-v2-flow' | ||
72 | filename_imglist_train = 'something/v2/train_videofolder_flow.txt' | ||
73 | filename_imglist_val = 'something/v2/val_videofolder_flow.txt' | ||
74 | prefix = '{:06d}.jpg' | ||
75 | else: | ||
76 | raise NotImplementedError('no such modality:'+modality) | ||
77 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix | ||
78 | |||
79 | |||
80 | def return_jester(modality): | ||
81 | filename_categories = 'jester/category.txt' | ||
82 | if modality == 'RGB': | ||
83 | prefix = '{:05d}.jpg' | ||
84 | root_data = ROOT_DATASET + 'jester/20bn-jester-v1' | ||
85 | filename_imglist_train = 'jester/train_videofolder.txt' | ||
86 | filename_imglist_val = 'jester/val_videofolder.txt' | ||
87 | else: | ||
88 | raise NotImplementedError('no such modality:'+modality) | ||
89 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix | ||
90 | |||
91 | |||
92 | def return_kinetics(modality): | ||
93 | filename_categories = 400 | ||
94 | if modality == 'RGB': | ||
95 | root_data = ROOT_DATASET + 'kinetics/images' | ||
96 | filename_imglist_train = 'kinetics/labels/train_videofolder.txt' | ||
97 | filename_imglist_val = 'kinetics/labels/val_videofolder.txt' | ||
98 | prefix = 'img_{:05d}.jpg' | ||
99 | else: | ||
100 | raise NotImplementedError('no such modality:' + modality) | ||
101 | return filename_categories, filename_imglist_train, filename_imglist_val, root_data, prefix | ||
102 | |||
103 | |||
104 | def return_dataset(dataset, modality): | ||
105 | dict_single = {'jester': return_jester, 'something': return_something, 'somethingv2': return_somethingv2, | ||
106 | 'ucf101': return_ucf101, 'hmdb51': return_hmdb51, | ||
107 | 'kinetics': return_kinetics} | ||
108 | if dataset in dict_single: | ||
109 | file_categories, file_imglist_train, file_imglist_val, root_data, prefix = dict_single[dataset](modality) | ||
110 | else: | ||
111 | raise ValueError('Unknown dataset '+dataset) | ||
112 | |||
113 | file_imglist_train = os.path.join(ROOT_DATASET, file_imglist_train) | ||
114 | file_imglist_val = os.path.join(ROOT_DATASET, file_imglist_val) | ||
115 | if isinstance(file_categories, str): | ||
116 | file_categories = os.path.join(ROOT_DATASET, file_categories) | ||
117 | with open(file_categories) as f: | ||
118 | lines = f.readlines() | ||
119 | categories = [item.rstrip() for item in lines] | ||
120 | else: # number of categories | ||
121 | categories = [None] * file_categories | ||
122 | n_class = len(categories) | ||
123 | print('{}: {} classes'.format(dataset, n_class)) | ||
124 | return n_class, file_imglist_train, file_imglist_val, root_data, prefix |
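As a small sketch, dataset names are resolved through return_dataset; the call below assumes the UCF101 frame folders and label files really exist under ROOT_DATASET, since the class list is read from disk.

from ops.dataset_config import return_dataset

n_class, train_list, val_list, root_data, prefix = return_dataset('ucf101', 'RGB')
print(n_class, prefix)    # class count parsed from labels/classInd.txt, plus 'img_{:05d}.jpg'
print(train_list)         # absolute path to file_list/ucf101_rgb_train_split_1.txt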
ops/models.py
0 → 100755
1 | # Code for "TSM: Temporal Shift Module for Efficient Video Understanding" | ||
2 | # arXiv:1811.08383 | ||
3 | # Ji Lin*, Chuang Gan, Song Han | ||
4 | # {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu | ||
5 | |||
6 | from torch import nn | ||
7 | |||
8 | from ops.basic_ops import ConsensusModule | ||
9 | from ops.transforms import * | ||
10 | from torch.nn.init import normal_, constant_ | ||
11 | |||
12 | |||
13 | class TSN(nn.Module): | ||
14 | def __init__(self, num_class, num_segments, modality, | ||
15 | base_model='resnet101', new_length=None, | ||
16 | consensus_type='avg', before_softmax=True, | ||
17 | dropout=0.8, img_feature_dim=256, | ||
18 | crop_num=1, partial_bn=True, print_spec=True, pretrain='imagenet', | ||
19 | is_shift=True, shift_div=8, shift_place='blockres', fc_lr5=False, | ||
20 | temporal_pool=False, non_local=False): | ||
21 | super(TSN, self).__init__() | ||
22 | self.modality = modality | ||
23 | self.num_segments = num_segments | ||
24 | self.reshape = True | ||
25 | self.before_softmax = before_softmax | ||
26 | self.dropout = dropout | ||
27 | self.crop_num = crop_num | ||
28 | self.consensus_type = consensus_type | ||
29 | self.img_feature_dim = img_feature_dim # the dimension of the CNN feature to represent each frame | ||
30 | self.pretrain = pretrain | ||
31 | self.print_spec = print_spec  # stored so the mobilenetv2 branch below can read self.print_spec | ||
32 | self.is_shift = is_shift | ||
33 | self.shift_div = shift_div | ||
34 | self.shift_place = shift_place | ||
35 | self.base_model_name = base_model | ||
36 | self.fc_lr5 = fc_lr5 | ||
37 | self.temporal_pool = temporal_pool | ||
38 | self.non_local = non_local | ||
39 | |||
40 | if not before_softmax and consensus_type != 'avg': | ||
41 | raise ValueError("Only avg consensus can be used after Softmax") | ||
42 | |||
43 | if new_length is None: | ||
44 | self.new_length = 1 if modality == "RGB" else 5 | ||
45 | else: | ||
46 | self.new_length = new_length | ||
47 | if print_spec: | ||
48 | print((""" | ||
49 | Initializing TSN with base model: {}. | ||
50 | TSN Configurations: | ||
51 | input_modality: {} | ||
52 | num_segments: {} | ||
53 | new_length: {} | ||
54 | consensus_module: {} | ||
55 | dropout_ratio: {} | ||
56 | img_feature_dim: {} | ||
57 | """.format(base_model, self.modality, self.num_segments, self.new_length, consensus_type, self.dropout, self.img_feature_dim))) | ||
58 | |||
59 | self._prepare_base_model(base_model) | ||
60 | |||
61 | feature_dim = self._prepare_tsn(num_class) | ||
62 | |||
63 | if self.modality == 'Flow': | ||
64 | print("Converting the ImageNet model to a flow init model") | ||
65 | self.base_model = self._construct_flow_model(self.base_model) | ||
66 | print("Done. Flow model ready...") | ||
67 | elif self.modality == 'RGBDiff': | ||
68 | print("Converting the ImageNet model to RGB+Diff init model") | ||
69 | self.base_model = self._construct_diff_model(self.base_model) | ||
70 | print("Done. RGBDiff model ready.") | ||
71 | |||
72 | self.consensus = ConsensusModule(consensus_type) | ||
73 | |||
74 | if not self.before_softmax: | ||
75 | self.softmax = nn.Softmax(dim=1)  # explicit dim over the class axis | ||
76 | |||
77 | self._enable_pbn = partial_bn | ||
78 | if partial_bn: | ||
79 | self.partialBN(True) | ||
80 | |||
81 | def _prepare_tsn(self, num_class): | ||
82 | feature_dim = getattr(self.base_model, self.base_model.last_layer_name).in_features | ||
83 | if self.dropout == 0: | ||
84 | setattr(self.base_model, self.base_model.last_layer_name, nn.Linear(feature_dim, num_class)) | ||
85 | self.new_fc = None | ||
86 | else: | ||
87 | setattr(self.base_model, self.base_model.last_layer_name, nn.Dropout(p=self.dropout)) | ||
88 | self.new_fc = nn.Linear(feature_dim, num_class) | ||
89 | |||
90 | std = 0.001 | ||
91 | if self.new_fc is None: | ||
92 | normal_(getattr(self.base_model, self.base_model.last_layer_name).weight, 0, std) | ||
93 | constant_(getattr(self.base_model, self.base_model.last_layer_name).bias, 0) | ||
94 | else: | ||
95 | if hasattr(self.new_fc, 'weight'): | ||
96 | normal_(self.new_fc.weight, 0, std) | ||
97 | constant_(self.new_fc.bias, 0) | ||
98 | return feature_dim | ||
99 | |||
100 | def _prepare_base_model(self, base_model): | ||
101 | print('=> base model: {}'.format(base_model)) | ||
102 | |||
103 | if 'resnet' in base_model: | ||
104 | self.base_model = getattr(torchvision.models, base_model)(True if self.pretrain == 'imagenet' else False) | ||
105 | if self.is_shift: | ||
106 | print('Adding temporal shift...') | ||
107 | from ops.temporal_shift import make_temporal_shift | ||
108 | make_temporal_shift(self.base_model, self.num_segments, | ||
109 | n_div=self.shift_div, place=self.shift_place, temporal_pool=self.temporal_pool) | ||
110 | |||
111 | if self.non_local: | ||
112 | print('Adding non-local module...') | ||
113 | from ops.non_local import make_non_local | ||
114 | make_non_local(self.base_model, self.num_segments) | ||
115 | |||
116 | self.base_model.last_layer_name = 'fc' | ||
117 | self.input_size = 224 | ||
118 | self.input_mean = [0.485, 0.456, 0.406] | ||
119 | self.input_std = [0.229, 0.224, 0.225] | ||
120 | |||
121 | self.base_model.avgpool = nn.AdaptiveAvgPool2d(1) | ||
122 | |||
123 | if self.modality == 'Flow': | ||
124 | self.input_mean = [0.5] | ||
125 | self.input_std = [np.mean(self.input_std)] | ||
126 | elif self.modality == 'RGBDiff': | ||
127 | self.input_mean = [0.485, 0.456, 0.406] + [0] * 3 * self.new_length | ||
128 | self.input_std = self.input_std + [np.mean(self.input_std) * 2] * 3 * self.new_length | ||
129 | |||
130 | elif base_model == 'mobilenetv2': | ||
131 | from archs.mobilenet_v2 import mobilenet_v2, InvertedResidual | ||
132 | self.base_model = mobilenet_v2(True if self.pretrain == 'imagenet' else False) | ||
133 | |||
134 | self.base_model.last_layer_name = 'classifier' | ||
135 | self.input_size = 224 | ||
136 | self.input_mean = [0.485, 0.456, 0.406] | ||
137 | self.input_std = [0.229, 0.224, 0.225] | ||
138 | |||
139 | self.base_model.avgpool = nn.AdaptiveAvgPool2d(1) | ||
140 | if self.is_shift: | ||
141 | from ops.temporal_shift import TemporalShift | ||
142 | for m in self.base_model.modules(): | ||
143 | if isinstance(m, InvertedResidual) and len(m.conv) == 8 and m.use_res_connect: | ||
144 | if self.print_spec: | ||
145 | print('Adding temporal shift... {}'.format(m.use_res_connect)) | ||
146 | m.conv[0] = TemporalShift(m.conv[0], n_segment=self.num_segments, n_div=self.shift_div) | ||
147 | if self.modality == 'Flow': | ||
148 | self.input_mean = [0.5] | ||
149 | self.input_std = [np.mean(self.input_std)] | ||
150 | elif self.modality == 'RGBDiff': | ||
151 | self.input_mean = [0.485, 0.456, 0.406] + [0] * 3 * self.new_length | ||
152 | self.input_std = self.input_std + [np.mean(self.input_std) * 2] * 3 * self.new_length | ||
153 | |||
154 | elif base_model == 'BNInception': | ||
155 | from archs.bn_inception import bninception | ||
156 | self.base_model = bninception(pretrained=self.pretrain) | ||
157 | self.input_size = self.base_model.input_size | ||
158 | self.input_mean = self.base_model.mean | ||
159 | self.input_std = self.base_model.std | ||
160 | self.base_model.last_layer_name = 'fc' | ||
161 | if self.modality == 'Flow': | ||
162 | self.input_mean = [128] | ||
163 | elif self.modality == 'RGBDiff': | ||
164 | self.input_mean = self.input_mean * (1 + self.new_length) | ||
165 | if self.is_shift: | ||
166 | print('Adding temporal shift...') | ||
167 | self.base_model.build_temporal_ops( | ||
168 | self.num_segments, is_temporal_shift=self.shift_place, shift_div=self.shift_div) | ||
169 | else: | ||
170 | raise ValueError('Unknown base model: {}'.format(base_model)) | ||
171 | |||
172 | def train(self, mode=True): | ||
173 | """ | ||
174 | Override the default train() to freeze the BN parameters | ||
175 | :return: | ||
176 | """ | ||
177 | super(TSN, self).train(mode) | ||
178 | count = 0 | ||
179 | if self._enable_pbn and mode: | ||
180 | print("Freezing BatchNorm2D except the first one.") | ||
181 | for m in self.base_model.modules(): | ||
182 | if isinstance(m, nn.BatchNorm2d): | ||
183 | count += 1 | ||
184 | if count >= (2 if self._enable_pbn else 1): | ||
185 | m.eval() | ||
186 | # shutdown update in frozen mode | ||
187 | m.weight.requires_grad = False | ||
188 | m.bias.requires_grad = False | ||
189 | |||
190 | def partialBN(self, enable): | ||
191 | self._enable_pbn = enable | ||
192 | |||
193 | def get_optim_policies(self): | ||
194 | first_conv_weight = [] | ||
195 | first_conv_bias = [] | ||
196 | normal_weight = [] | ||
197 | normal_bias = [] | ||
198 | lr5_weight = [] | ||
199 | lr10_bias = [] | ||
200 | bn = [] | ||
201 | custom_ops = [] | ||
202 | |||
203 | conv_cnt = 0 | ||
204 | bn_cnt = 0 | ||
205 | for m in self.modules(): | ||
206 | if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv3d): | ||
207 | ps = list(m.parameters()) | ||
208 | conv_cnt += 1 | ||
209 | if conv_cnt == 1: | ||
210 | first_conv_weight.append(ps[0]) | ||
211 | if len(ps) == 2: | ||
212 | first_conv_bias.append(ps[1]) | ||
213 | else: | ||
214 | normal_weight.append(ps[0]) | ||
215 | if len(ps) == 2: | ||
216 | normal_bias.append(ps[1]) | ||
217 | elif isinstance(m, torch.nn.Linear): | ||
218 | ps = list(m.parameters()) | ||
219 | if self.fc_lr5: | ||
220 | lr5_weight.append(ps[0]) | ||
221 | else: | ||
222 | normal_weight.append(ps[0]) | ||
223 | if len(ps) == 2: | ||
224 | if self.fc_lr5: | ||
225 | lr10_bias.append(ps[1]) | ||
226 | else: | ||
227 | normal_bias.append(ps[1]) | ||
228 | |||
229 | elif isinstance(m, torch.nn.BatchNorm2d): | ||
230 | bn_cnt += 1 | ||
231 | # later BN's are frozen | ||
232 | if not self._enable_pbn or bn_cnt == 1: | ||
233 | bn.extend(list(m.parameters())) | ||
234 | elif isinstance(m, torch.nn.BatchNorm3d): | ||
235 | bn_cnt += 1 | ||
236 | # later BN's are frozen | ||
237 | if not self._enable_pbn or bn_cnt == 1: | ||
238 | bn.extend(list(m.parameters())) | ||
239 | elif len(m._modules) == 0: | ||
240 | if len(list(m.parameters())) > 0: | ||
241 | raise ValueError("New atomic module type: {}. Need to give it a learning policy".format(type(m))) | ||
242 | |||
243 | return [ | ||
244 | {'params': first_conv_weight, 'lr_mult': 5 if self.modality == 'Flow' else 1, 'decay_mult': 1, | ||
245 | 'name': "first_conv_weight"}, | ||
246 | {'params': first_conv_bias, 'lr_mult': 10 if self.modality == 'Flow' else 2, 'decay_mult': 0, | ||
247 | 'name': "first_conv_bias"}, | ||
248 | {'params': normal_weight, 'lr_mult': 1, 'decay_mult': 1, | ||
249 | 'name': "normal_weight"}, | ||
250 | {'params': normal_bias, 'lr_mult': 2, 'decay_mult': 0, | ||
251 | 'name': "normal_bias"}, | ||
252 | {'params': bn, 'lr_mult': 1, 'decay_mult': 0, | ||
253 | 'name': "BN scale/shift"}, | ||
254 | {'params': custom_ops, 'lr_mult': 1, 'decay_mult': 1, | ||
255 | 'name': "custom_ops"}, | ||
256 | # for fc | ||
257 | {'params': lr5_weight, 'lr_mult': 5, 'decay_mult': 1, | ||
258 | 'name': "lr5_weight"}, | ||
259 | {'params': lr10_bias, 'lr_mult': 10, 'decay_mult': 0, | ||
260 | 'name': "lr10_bias"}, | ||
261 | ] | ||
262 | |||
263 | def forward(self, input, no_reshape=False): | ||
264 | if not no_reshape: | ||
265 | sample_len = (3 if self.modality == "RGB" else 2) * self.new_length | ||
266 | |||
267 | if self.modality == 'RGBDiff': | ||
268 | sample_len = 3 * self.new_length | ||
269 | input = self._get_diff(input) | ||
270 | |||
271 | base_out = self.base_model(input.view((-1, sample_len) + input.size()[-2:])) | ||
272 | else: | ||
273 | base_out = self.base_model(input) | ||
274 | |||
275 | feature = base_out.view(base_out.size(0), -1)  # per-frame feature, defined for any dropout setting | ||
276 | if self.dropout > 0: | ||
277 | base_out = self.new_fc(base_out) | ||
278 | |||
279 | if not self.before_softmax: | ||
280 | base_out = self.softmax(base_out) | ||
281 | |||
282 | if self.reshape: | ||
283 | if self.is_shift and self.temporal_pool: | ||
284 | base_out = base_out.view((-1, self.num_segments // 2) + base_out.size()[1:]) | ||
285 | else: | ||
286 | base_out = base_out.view((-1, self.num_segments) + base_out.size()[1:]) | ||
287 | output = self.consensus(base_out) | ||
288 | return output.squeeze(1), feature | ||
289 | |||
290 | def _get_diff(self, input, keep_rgb=False): | ||
291 | input_c = 3 if self.modality in ["RGB", "RGBDiff"] else 2 | ||
292 | input_view = input.view((-1, self.num_segments, self.new_length + 1, input_c,) + input.size()[2:]) | ||
293 | if keep_rgb: | ||
294 | new_data = input_view.clone() | ||
295 | else: | ||
296 | new_data = input_view[:, :, 1:, :, :, :].clone() | ||
297 | |||
298 | for x in reversed(list(range(1, self.new_length + 1))): | ||
299 | if keep_rgb: | ||
300 | new_data[:, :, x, :, :, :] = input_view[:, :, x, :, :, :] - input_view[:, :, x - 1, :, :, :] | ||
301 | else: | ||
302 | new_data[:, :, x - 1, :, :, :] = input_view[:, :, x, :, :, :] - input_view[:, :, x - 1, :, :, :] | ||
303 | |||
304 | return new_data | ||
305 | |||
306 | def _construct_flow_model(self, base_model): | ||
307 | # modify the convolution layers | ||
308 | # Torch models are usually defined in a hierarchical way. | ||
309 | # nn.modules.children() returns all sub-modules in a DFS manner | ||
310 | modules = list(self.base_model.modules()) | ||
311 | first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv2d), list(range(len(modules)))))[0] | ||
312 | conv_layer = modules[first_conv_idx] | ||
313 | container = modules[first_conv_idx - 1] | ||
314 | |||
315 | # modify parameters, assume the first blob contains the convolution kernels | ||
316 | params = [x.clone() for x in conv_layer.parameters()] | ||
317 | kernel_size = params[0].size() | ||
318 | new_kernel_size = kernel_size[:1] + (2 * self.new_length, ) + kernel_size[2:] | ||
319 | new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous() | ||
320 | |||
321 | new_conv = nn.Conv2d(2 * self.new_length, conv_layer.out_channels, | ||
322 | conv_layer.kernel_size, conv_layer.stride, conv_layer.padding, | ||
323 | bias=True if len(params) == 2 else False) | ||
324 | new_conv.weight.data = new_kernels | ||
325 | if len(params) == 2: | ||
326 | new_conv.bias.data = params[1].data # add bias if necessary | ||
327 | layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name | ||
328 | |||
329 | # replace the first convolution layer | ||
330 | setattr(container, layer_name, new_conv) | ||
331 | |||
332 | if self.base_model_name == 'BNInception': | ||
333 | import torch.utils.model_zoo as model_zoo | ||
334 | sd = model_zoo.load_url('https://www.dropbox.com/s/35ftw2t4mxxgjae/BNInceptionFlow-ef652051.pth.tar?dl=1') | ||
335 | base_model.load_state_dict(sd) | ||
336 | print('=> Loading pretrained Flow weight done...') | ||
337 | else: | ||
338 | print('#' * 30, 'Warning! No Flow pretrained model is found') | ||
339 | return base_model | ||
340 | |||
341 | def _construct_diff_model(self, base_model, keep_rgb=False): | ||
342 | # modify the convolution layers | ||
343 | # Torch models are usually defined in a hierarchical way. | ||
344 | # nn.modules.children() returns all sub-modules in a DFS manner | ||
345 | modules = list(self.base_model.modules()) | ||
346 | first_conv_idx = list(filter(lambda x: isinstance(modules[x], nn.Conv2d), list(range(len(modules)))))[0]  # wrap in list(): filter is lazy in Python 3 | ||
347 | conv_layer = modules[first_conv_idx] | ||
348 | container = modules[first_conv_idx - 1] | ||
349 | |||
350 | # modify parameters, assume the first blob contains the convolution kernels | ||
351 | params = [x.clone() for x in conv_layer.parameters()] | ||
352 | kernel_size = params[0].size() | ||
353 | if not keep_rgb: | ||
354 | new_kernel_size = kernel_size[:1] + (3 * self.new_length,) + kernel_size[2:] | ||
355 | new_kernels = params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous() | ||
356 | else: | ||
357 | new_kernel_size = kernel_size[:1] + (3 * self.new_length,) + kernel_size[2:] | ||
358 | new_kernels = torch.cat((params[0].data, params[0].data.mean(dim=1, keepdim=True).expand(new_kernel_size).contiguous()), | ||
359 | 1) | ||
360 | new_kernel_size = kernel_size[:1] + (3 + 3 * self.new_length,) + kernel_size[2:] | ||
361 | |||
362 | new_conv = nn.Conv2d(new_kernel_size[1], conv_layer.out_channels, | ||
363 | conv_layer.kernel_size, conv_layer.stride, conv_layer.padding, | ||
364 | bias=True if len(params) == 2 else False) | ||
365 | new_conv.weight.data = new_kernels | ||
366 | if len(params) == 2: | ||
367 | new_conv.bias.data = params[1].data # add bias if necessary | ||
368 | layer_name = list(container.state_dict().keys())[0][:-7] # remove .weight suffix to get the layer name | ||
369 | |||
370 | # replace the first convolution layer | ||
371 | setattr(container, layer_name, new_conv) | ||
372 | return base_model | ||
373 | |||
374 | @property | ||
375 | def crop_size(self): | ||
376 | return self.input_size | ||
377 | |||
378 | @property | ||
379 | def scale_size(self): | ||
380 | return self.input_size * 256 // 224 | ||
381 | |||
382 | def get_augmentation(self, flip=True): | ||
383 | if self.modality == 'RGB': | ||
384 | if flip: | ||
385 | return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75, .66]), | ||
386 | GroupRandomHorizontalFlip(is_flow=False)]) | ||
387 | else: | ||
388 | print('#' * 20, 'NO FLIP!!!') | ||
389 | return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75, .66])]) | ||
390 | elif self.modality == 'Flow': | ||
391 | return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75]), | ||
392 | GroupRandomHorizontalFlip(is_flow=True)]) | ||
393 | elif self.modality == 'RGBDiff': | ||
394 | return torchvision.transforms.Compose([GroupMultiScaleCrop(self.input_size, [1, .875, .75]), | ||
395 | GroupRandomHorizontalFlip(is_flow=False)]) |
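A minimal forward-pass sketch for the TSN wrapper above on a randomly initialized ResNet-50 with the temporal shift enabled; pretrain=None avoids downloading ImageNet weights, and the shapes follow the reshape logic in forward().

import torch
from ops.models import TSN

net = TSN(num_class=3, num_segments=8, modality='RGB',
          base_model='resnet50', consensus_type='avg',
          is_shift=True, shift_div=8, shift_place='blockres',
          pretrain=None, print_spec=False)
net.eval()

# Two videos, each flattened to 8 segments x 3 RGB channels along dim 1.
frames = torch.randn(2, 8 * 3, 224, 224)
with torch.no_grad():
    scores, feature = net(frames)
print(scores.shape, feature.shape)   # torch.Size([2, 3]) and torch.Size([16, 2048])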
ops/non_local.py
0 → 100644
1 | # Non-local block using embedded gaussian | ||
2 | # Code from | ||
3 | # https://github.com/AlexHex7/Non-local_pytorch/blob/master/Non-Local_pytorch_0.3.1/lib/non_local_embedded_gaussian.py | ||
4 | import torch | ||
5 | from torch import nn | ||
6 | from torch.nn import functional as F | ||
7 | |||
8 | |||
9 | class _NonLocalBlockND(nn.Module): | ||
10 | def __init__(self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True): | ||
11 | super(_NonLocalBlockND, self).__init__() | ||
12 | |||
13 | assert dimension in [1, 2, 3] | ||
14 | |||
15 | self.dimension = dimension | ||
16 | self.sub_sample = sub_sample | ||
17 | |||
18 | self.in_channels = in_channels | ||
19 | self.inter_channels = inter_channels | ||
20 | |||
21 | if self.inter_channels is None: | ||
22 | self.inter_channels = in_channels // 2 | ||
23 | if self.inter_channels == 0: | ||
24 | self.inter_channels = 1 | ||
25 | |||
26 | if dimension == 3: | ||
27 | conv_nd = nn.Conv3d | ||
28 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) | ||
29 | bn = nn.BatchNorm3d | ||
30 | elif dimension == 2: | ||
31 | conv_nd = nn.Conv2d | ||
32 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) | ||
33 | bn = nn.BatchNorm2d | ||
34 | else: | ||
35 | conv_nd = nn.Conv1d | ||
36 | max_pool_layer = nn.MaxPool1d(kernel_size=(2)) | ||
37 | bn = nn.BatchNorm1d | ||
38 | |||
39 | self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, | ||
40 | kernel_size=1, stride=1, padding=0) | ||
41 | |||
42 | if bn_layer: | ||
43 | self.W = nn.Sequential( | ||
44 | conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, | ||
45 | kernel_size=1, stride=1, padding=0), | ||
46 | bn(self.in_channels) | ||
47 | ) | ||
48 | nn.init.constant_(self.W[1].weight, 0) | ||
49 | nn.init.constant_(self.W[1].bias, 0) | ||
50 | else: | ||
51 | self.W = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, | ||
52 | kernel_size=1, stride=1, padding=0) | ||
53 | nn.init.constant_(self.W.weight, 0) | ||
54 | nn.init.constant_(self.W.bias, 0) | ||
55 | |||
56 | self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, | ||
57 | kernel_size=1, stride=1, padding=0) | ||
58 | self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, | ||
59 | kernel_size=1, stride=1, padding=0) | ||
60 | |||
61 | if sub_sample: | ||
62 | self.g = nn.Sequential(self.g, max_pool_layer) | ||
63 | self.phi = nn.Sequential(self.phi, max_pool_layer) | ||
64 | |||
65 | def forward(self, x): | ||
66 | ''' | ||
67 | :param x: (b, c, t, h, w) | ||
68 | :return: | ||
69 | ''' | ||
70 | |||
71 | batch_size = x.size(0) | ||
72 | |||
73 | g_x = self.g(x).view(batch_size, self.inter_channels, -1) | ||
74 | g_x = g_x.permute(0, 2, 1) | ||
75 | |||
76 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1) | ||
77 | theta_x = theta_x.permute(0, 2, 1) | ||
78 | phi_x = self.phi(x).view(batch_size, self.inter_channels, -1) | ||
79 | f = torch.matmul(theta_x, phi_x) | ||
80 | f_div_C = F.softmax(f, dim=-1) | ||
81 | |||
82 | y = torch.matmul(f_div_C, g_x) | ||
83 | y = y.permute(0, 2, 1).contiguous() | ||
84 | y = y.view(batch_size, self.inter_channels, *x.size()[2:]) | ||
85 | W_y = self.W(y) | ||
86 | z = W_y + x | ||
87 | |||
88 | return z | ||
89 | |||
90 | |||
91 | class NONLocalBlock1D(_NonLocalBlockND): | ||
92 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): | ||
93 | super(NONLocalBlock1D, self).__init__(in_channels, | ||
94 | inter_channels=inter_channels, | ||
95 | dimension=1, sub_sample=sub_sample, | ||
96 | bn_layer=bn_layer) | ||
97 | |||
98 | |||
99 | class NONLocalBlock2D(_NonLocalBlockND): | ||
100 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): | ||
101 | super(NONLocalBlock2D, self).__init__(in_channels, | ||
102 | inter_channels=inter_channels, | ||
103 | dimension=2, sub_sample=sub_sample, | ||
104 | bn_layer=bn_layer) | ||
105 | |||
106 | |||
107 | class NONLocalBlock3D(_NonLocalBlockND): | ||
108 | def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): | ||
109 | super(NONLocalBlock3D, self).__init__(in_channels, | ||
110 | inter_channels=inter_channels, | ||
111 | dimension=3, sub_sample=sub_sample, | ||
112 | bn_layer=bn_layer) | ||
113 | |||
114 | |||
115 | class NL3DWrapper(nn.Module): | ||
116 | def __init__(self, block, n_segment): | ||
117 | super(NL3DWrapper, self).__init__() | ||
118 | self.block = block | ||
119 | self.nl = NONLocalBlock3D(block.bn3.num_features) | ||
120 | self.n_segment = n_segment | ||
121 | |||
122 | def forward(self, x): | ||
123 | x = self.block(x) | ||
124 | |||
125 | nt, c, h, w = x.size() | ||
126 | x = x.view(nt // self.n_segment, self.n_segment, c, h, w).transpose(1, 2) # n, c, t, h, w | ||
127 | x = self.nl(x) | ||
128 | x = x.transpose(1, 2).contiguous().view(nt, c, h, w) | ||
129 | return x | ||
130 | |||
131 | |||
132 | def make_non_local(net, n_segment): | ||
133 | import torchvision | ||
134 | import archs | ||
135 | if isinstance(net, torchvision.models.ResNet): | ||
136 | net.layer2 = nn.Sequential( | ||
137 | NL3DWrapper(net.layer2[0], n_segment), | ||
138 | net.layer2[1], | ||
139 | NL3DWrapper(net.layer2[2], n_segment), | ||
140 | net.layer2[3], | ||
141 | ) | ||
142 | net.layer3 = nn.Sequential( | ||
143 | NL3DWrapper(net.layer3[0], n_segment), | ||
144 | net.layer3[1], | ||
145 | NL3DWrapper(net.layer3[2], n_segment), | ||
146 | net.layer3[3], | ||
147 | NL3DWrapper(net.layer3[4], n_segment), | ||
148 | net.layer3[5], | ||
149 | ) | ||
150 | else: | ||
151 | raise NotImplementedError | ||
152 | |||
153 | |||
154 | if __name__ == '__main__': | ||
155 | from torch.autograd import Variable | ||
156 | import torch | ||
157 | |||
158 | sub_sample = True | ||
159 | bn_layer = True | ||
160 | |||
161 | img = Variable(torch.zeros(2, 3, 20)) | ||
162 | net = NONLocalBlock1D(3, sub_sample=sub_sample, bn_layer=bn_layer) | ||
163 | out = net(img) | ||
164 | print(out.size()) | ||
165 | |||
166 | img = Variable(torch.zeros(2, 3, 20, 20)) | ||
167 | net = NONLocalBlock2D(3, sub_sample=sub_sample, bn_layer=bn_layer) | ||
168 | out = net(img) | ||
169 | print(out.size()) | ||
170 | |||
171 | img = Variable(torch.randn(2, 3, 10, 20, 20)) | ||
172 | net = NONLocalBlock3D(3, sub_sample=sub_sample, bn_layer=bn_layer) | ||
173 | out = net(img) | ||
174 | print(out.size()) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
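Beyond the self-test above, the usual entry point is make_non_local, sketched here on a torchvision ResNet-50 (whose layer2/layer3 block counts match the hard-coded wrapping). Note the assumption that the repo's archs package is importable, since make_non_local imports it unconditionally.

import torch
import torchvision
from ops.non_local import make_non_local

backbone = torchvision.models.resnet50(pretrained=False)   # random weights suffice for a shape check
make_non_local(backbone, n_segment=8)                      # wraps alternate blocks of layer2 and layer3

clip = torch.randn(8, 3, 224, 224)                         # one clip of 8 frames laid out as a batch
with torch.no_grad():
    out = backbone(clip)
print(out.shape)                                           # torch.Size([8, 1000])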
ops/temporal_shift.py
0 → 100755
1 | # Code for "TSM: Temporal Shift Module for Efficient Video Understanding" | ||
2 | # arXiv:1811.08383 | ||
3 | # Ji Lin*, Chuang Gan, Song Han | ||
4 | # {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu | ||
5 | |||
6 | import torch | ||
7 | import torch.nn as nn | ||
8 | import torch.nn.functional as F | ||
9 | |||
10 | |||
11 | class TemporalShift(nn.Module): | ||
12 | def __init__(self, net, n_segment=3, n_div=8, inplace=False): | ||
13 | super(TemporalShift, self).__init__() | ||
14 | self.net = net | ||
15 | self.n_segment = n_segment | ||
16 | self.fold_div = n_div | ||
17 | self.inplace = inplace | ||
18 | if inplace: | ||
19 | print('=> Using in-place shift...') | ||
20 | print('=> Using fold div: {}'.format(self.fold_div)) | ||
21 | |||
22 | def forward(self, x): | ||
23 | x = self.shift(x, self.n_segment, fold_div=self.fold_div, inplace=self.inplace) | ||
24 | return self.net(x) | ||
25 | |||
26 | @staticmethod | ||
27 | def shift(x, n_segment, fold_div=3, inplace=False): | ||
28 | nt, c, h, w = x.size() | ||
29 | n_batch = nt // n_segment | ||
30 | x = x.view(n_batch, n_segment, c, h, w) | ||
31 | |||
32 | fold = c // fold_div | ||
33 | if inplace: | ||
34 | # The in-place shift produced out-of-order errors under parallel execution, | ||
35 | # so it is disabled here; a dedicated CUDA kernel may be needed. | ||
36 | raise NotImplementedError | ||
37 | # out = InplaceShift.apply(x, fold) | ||
38 | else: | ||
39 | out = torch.zeros_like(x) | ||
40 | out[:, :-1, :fold] = x[:, 1:, :fold] # shift left | ||
41 | out[:, 1:, fold: 2 * fold] = x[:, :-1, fold: 2 * fold] # shift right | ||
42 | out[:, :, 2 * fold:] = x[:, :, 2 * fold:] # not shift | ||
43 | |||
44 | return out.view(nt, c, h, w) | ||
45 | |||
46 | |||
47 | class InplaceShift(torch.autograd.Function): | ||
48 | # Special thanks to @raoyongming for the help to this function | ||
49 | @staticmethod | ||
50 | def forward(ctx, input, fold): | ||
51 | # not support higher order gradient | ||
52 | # input = input.detach_() | ||
53 | ctx.fold_ = fold | ||
54 | n, t, c, h, w = input.size() | ||
55 | buffer = input.data.new(n, t, fold, h, w).zero_() | ||
56 | buffer[:, :-1] = input.data[:, 1:, :fold] | ||
57 | input.data[:, :, :fold] = buffer | ||
58 | buffer.zero_() | ||
59 | buffer[:, 1:] = input.data[:, :-1, fold: 2 * fold] | ||
60 | input.data[:, :, fold: 2 * fold] = buffer | ||
61 | return input | ||
62 | |||
63 | @staticmethod | ||
64 | def backward(ctx, grad_output): | ||
65 | # grad_output = grad_output.detach_() | ||
66 | fold = ctx.fold_ | ||
67 | n, t, c, h, w = grad_output.size() | ||
68 | buffer = grad_output.data.new(n, t, fold, h, w).zero_() | ||
69 | buffer[:, 1:] = grad_output.data[:, :-1, :fold] | ||
70 | grad_output.data[:, :, :fold] = buffer | ||
71 | buffer.zero_() | ||
72 | buffer[:, :-1] = grad_output.data[:, 1:, fold: 2 * fold] | ||
73 | grad_output.data[:, :, fold: 2 * fold] = buffer | ||
74 | return grad_output, None | ||
75 | |||
76 | |||
77 | class TemporalPool(nn.Module): | ||
78 | def __init__(self, net, n_segment): | ||
79 | super(TemporalPool, self).__init__() | ||
80 | self.net = net | ||
81 | self.n_segment = n_segment | ||
82 | |||
83 | def forward(self, x): | ||
84 | x = self.temporal_pool(x, n_segment=self.n_segment) | ||
85 | return self.net(x) | ||
86 | |||
87 | @staticmethod | ||
88 | def temporal_pool(x, n_segment): | ||
89 | nt, c, h, w = x.size() | ||
90 | n_batch = nt // n_segment | ||
91 | x = x.view(n_batch, n_segment, c, h, w).transpose(1, 2) # n, c, t, h, w | ||
92 | x = F.max_pool3d(x, kernel_size=(3, 1, 1), stride=(2, 1, 1), padding=(1, 0, 0)) | ||
93 | x = x.transpose(1, 2).contiguous().view(nt // 2, c, h, w) | ||
94 | return x | ||
95 | |||
96 | |||
97 | def make_temporal_shift(net, n_segment, n_div=8, place='blockres', temporal_pool=False): | ||
98 | if temporal_pool: | ||
99 | n_segment_list = [n_segment, n_segment // 2, n_segment // 2, n_segment // 2] | ||
100 | else: | ||
101 | n_segment_list = [n_segment] * 4 | ||
102 | assert n_segment_list[-1] > 0 | ||
103 | print('=> n_segment per stage: {}'.format(n_segment_list)) | ||
104 | |||
105 | import torchvision | ||
106 | if isinstance(net, torchvision.models.ResNet): | ||
107 | if place == 'block': | ||
108 | def make_block_temporal(stage, this_segment): | ||
109 | blocks = list(stage.children()) | ||
110 | print('=> Processing stage with {} blocks'.format(len(blocks))) | ||
111 | for i, b in enumerate(blocks): | ||
112 | blocks[i] = TemporalShift(b, n_segment=this_segment, n_div=n_div) | ||
113 | return nn.Sequential(*(blocks)) | ||
114 | |||
115 | net.layer1 = make_block_temporal(net.layer1, n_segment_list[0]) | ||
116 | net.layer2 = make_block_temporal(net.layer2, n_segment_list[1]) | ||
117 | net.layer3 = make_block_temporal(net.layer3, n_segment_list[2]) | ||
118 | net.layer4 = make_block_temporal(net.layer4, n_segment_list[3]) | ||
119 | |||
120 | elif 'blockres' in place: | ||
121 | n_round = 1 | ||
122 | if len(list(net.layer3.children())) >= 23: | ||
123 | n_round = 2 | ||
124 | print('=> Using n_round {} to insert temporal shift'.format(n_round)) | ||
125 | |||
126 | def make_block_temporal(stage, this_segment): | ||
127 | blocks = list(stage.children()) | ||
128 | print('=> Processing stage with {} blocks residual'.format(len(blocks))) | ||
129 | for i, b in enumerate(blocks): | ||
130 | if i % n_round == 0: | ||
131 | blocks[i].conv1 = TemporalShift(b.conv1, n_segment=this_segment, n_div=n_div) | ||
132 | return nn.Sequential(*blocks) | ||
133 | |||
134 | net.layer1 = make_block_temporal(net.layer1, n_segment_list[0]) | ||
135 | net.layer2 = make_block_temporal(net.layer2, n_segment_list[1]) | ||
136 | net.layer3 = make_block_temporal(net.layer3, n_segment_list[2]) | ||
137 | net.layer4 = make_block_temporal(net.layer4, n_segment_list[3]) | ||
138 | else: | ||
139 | raise NotImplementedError(place) | ||
140 | |||
141 | |||
142 | def make_temporal_pool(net, n_segment): | ||
143 | import torchvision | ||
144 | if isinstance(net, torchvision.models.ResNet): | ||
145 | print('=> Injecting nonlocal pooling') | ||
146 | net.layer2 = TemporalPool(net.layer2, n_segment) | ||
147 | else: | ||
148 | raise NotImplementedError | ||
149 | |||
150 | |||
151 | if __name__ == '__main__': | ||
152 | # test inplace shift v.s. vanilla shift | ||
153 | tsm1 = TemporalShift(nn.Sequential(), n_segment=8, n_div=8, inplace=False) | ||
154 | tsm2 = TemporalShift(nn.Sequential(), n_segment=8, n_div=8, inplace=True) | ||
155 | |||
156 | print('=> Testing CPU...') | ||
157 | # test forward | ||
158 | with torch.no_grad(): | ||
159 | for i in range(10): | ||
160 | x = torch.rand(2 * 8, 3, 224, 224) | ||
161 | y1 = tsm1(x) | ||
162 | y2 = tsm2(x) | ||
163 | assert torch.norm(y1 - y2).item() < 1e-5 | ||
164 | |||
165 | # test backward | ||
166 | with torch.enable_grad(): | ||
167 | for i in range(10): | ||
168 | x1 = torch.rand(2 * 8, 3, 224, 224) | ||
169 | x1.requires_grad_() | ||
170 | x2 = x1.clone() | ||
171 | y1 = tsm1(x1) | ||
172 | y2 = tsm2(x2) | ||
173 | grad1 = torch.autograd.grad((y1 ** 2).mean(), [x1])[0] | ||
174 | grad2 = torch.autograd.grad((y2 ** 2).mean(), [x2])[0] | ||
175 | assert torch.norm(grad1 - grad2).item() < 1e-5 | ||
176 | |||
177 | print('=> Testing GPU...') | ||
178 | tsm1.cuda() | ||
179 | tsm2.cuda() | ||
180 | # test forward | ||
181 | with torch.no_grad(): | ||
182 | for i in range(10): | ||
183 | x = torch.rand(2 * 8, 3, 224, 224).cuda() | ||
184 | y1 = tsm1(x) | ||
185 | y2 = tsm2(x) | ||
186 | assert torch.norm(y1 - y2).item() < 1e-5 | ||
187 | |||
188 | # test backward | ||
189 | with torch.enable_grad(): | ||
190 | for i in range(10): | ||
191 | x1 = torch.rand(2 * 8, 3, 224, 224).cuda() | ||
192 | x1.requires_grad_() | ||
193 | x2 = x1.clone() | ||
194 | y1 = tsm1(x1) | ||
195 | y2 = tsm2(x2) | ||
196 | grad1 = torch.autograd.grad((y1 ** 2).mean(), [x1])[0] | ||
197 | grad2 = torch.autograd.grad((y2 ** 2).mean(), [x2])[0] | ||
198 | assert torch.norm(grad1 - grad2).item() < 1e-5 | ||
199 | print('Test passed.') | ||
200 | |||
201 | |||
202 | |||
203 |
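To make the shift semantics concrete, a tiny sketch using the static shift() helper directly: with fold_div=4 and four channels, the first channel moves one step back in time, the second moves one step forward, and the remaining channels stay put.

import torch
from ops.temporal_shift import TemporalShift

# One video of 4 frames with 4 channels on a 1x1 spatial grid, values 0..15.
x = torch.arange(4 * 4, dtype=torch.float32).view(4, 4, 1, 1)
shifted = TemporalShift.shift(x, n_segment=4, fold_div=4)
print(x.view(4, 4))
print(shifted.view(4, 4))   # channel 0: frame t takes frame t+1's value; channel 1: the reverse; channels 2-3 unchanged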
ops/transforms.py
0 → 100755
1 | import torchvision | ||
2 | import random | ||
3 | from PIL import Image, ImageOps | ||
4 | import numpy as np | ||
5 | import numbers | ||
6 | import math | ||
7 | import torch | ||
8 | |||
9 | |||
10 | class GroupRandomCrop(object): | ||
11 | def __init__(self, size): | ||
12 | if isinstance(size, numbers.Number): | ||
13 | self.size = (int(size), int(size)) | ||
14 | else: | ||
15 | self.size = size | ||
16 | |||
17 | def __call__(self, img_group): | ||
18 | |||
19 | w, h = img_group[0].size | ||
20 | th, tw = self.size | ||
21 | |||
22 | out_images = list() | ||
23 | |||
24 | x1 = random.randint(0, w - tw) | ||
25 | y1 = random.randint(0, h - th) | ||
26 | |||
27 | for img in img_group: | ||
28 | assert(img.size[0] == w and img.size[1] == h) | ||
29 | if w == tw and h == th: | ||
30 | out_images.append(img) | ||
31 | else: | ||
32 | out_images.append(img.crop((x1, y1, x1 + tw, y1 + th))) | ||
33 | |||
34 | return out_images | ||
35 | |||
36 | |||
37 | class GroupCenterCrop(object): | ||
38 | def __init__(self, size): | ||
39 | self.worker = torchvision.transforms.CenterCrop(size) | ||
40 | |||
41 | def __call__(self, img_group): | ||
42 | return [self.worker(img) for img in img_group] | ||
43 | |||
44 | |||
45 | class GroupRandomHorizontalFlip(object): | ||
46 | """Randomly horizontally flips the given PIL.Image with a probability of 0.5 | ||
47 | """ | ||
48 | def __init__(self, is_flow=False): | ||
49 | self.is_flow = is_flow | ||
50 | |||
51 | def __call__(self, img_group, is_flow=False): | ||
52 | v = random.random() | ||
53 | if v < 0.5: | ||
54 | ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group] | ||
55 | if self.is_flow: | ||
56 | for i in range(0, len(ret), 2): | ||
57 | ret[i] = ImageOps.invert(ret[i]) # invert flow pixel values when flipping | ||
58 | return ret | ||
59 | else: | ||
60 | return img_group | ||
61 | |||
62 | |||
63 | class GroupNormalize(object): | ||
64 | def __init__(self, mean, std): | ||
65 | self.mean = mean | ||
66 | self.std = std | ||
67 | |||
68 | def __call__(self, tensor): | ||
69 | rep_mean = self.mean * (tensor.size()[0]//len(self.mean)) | ||
70 | rep_std = self.std * (tensor.size()[0]//len(self.std)) | ||
71 | |||
72 | # TODO: make efficient | ||
73 | for t, m, s in zip(tensor, rep_mean, rep_std): | ||
74 | t.sub_(m).div_(s) | ||
75 | return tensor | ||
76 | |||
77 | |||
78 | class GroupScale(object): | ||
79 | """ Rescales the input PIL.Image to the given 'size'. | ||
80 | 'size' will be the size of the smaller edge. | ||
81 | For example, if height > width, then image will be | ||
82 | rescaled to (size * height / width, size) | ||
83 | size: size of the smaller edge | ||
84 | interpolation: Default: PIL.Image.BILINEAR | ||
85 | """ | ||
86 | |||
87 | def __init__(self, size, interpolation=Image.BILINEAR): | ||
88 | self.worker = torchvision.transforms.Resize(size, interpolation) | ||
89 | |||
90 | def __call__(self, img_group): | ||
91 | return [self.worker(img) for img in img_group] | ||
92 | |||
93 | |||
94 | class GroupOverSample(object): | ||
95 | def __init__(self, crop_size, scale_size=None, flip=True): | ||
96 | self.crop_size = crop_size if not isinstance(crop_size, int) else (crop_size, crop_size) | ||
97 | |||
98 | if scale_size is not None: | ||
99 | self.scale_worker = GroupScale(scale_size) | ||
100 | else: | ||
101 | self.scale_worker = None | ||
102 | self.flip = flip | ||
103 | |||
104 | def __call__(self, img_group): | ||
105 | |||
106 | if self.scale_worker is not None: | ||
107 | img_group = self.scale_worker(img_group) | ||
108 | |||
109 | image_w, image_h = img_group[0].size | ||
110 | crop_w, crop_h = self.crop_size | ||
111 | |||
112 | offsets = GroupMultiScaleCrop.fill_fix_offset(False, image_w, image_h, crop_w, crop_h) | ||
113 | oversample_group = list() | ||
114 | for o_w, o_h in offsets: | ||
115 | normal_group = list() | ||
116 | flip_group = list() | ||
117 | for i, img in enumerate(img_group): | ||
118 | crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h)) | ||
119 | normal_group.append(crop) | ||
120 | flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT) | ||
121 | |||
122 | if img.mode == 'L' and i % 2 == 0: | ||
123 | flip_group.append(ImageOps.invert(flip_crop)) | ||
124 | else: | ||
125 | flip_group.append(flip_crop) | ||
126 | |||
127 | oversample_group.extend(normal_group) | ||
128 | if self.flip: | ||
129 | oversample_group.extend(flip_group) | ||
130 | return oversample_group | ||
131 | |||
132 | |||
133 | class GroupFullResSample(object): | ||
134 | def __init__(self, crop_size, scale_size=None, flip=True): | ||
135 | self.crop_size = crop_size if not isinstance(crop_size, int) else (crop_size, crop_size) | ||
136 | |||
137 | if scale_size is not None: | ||
138 | self.scale_worker = GroupScale(scale_size) | ||
139 | else: | ||
140 | self.scale_worker = None | ||
141 | self.flip = flip | ||
142 | |||
143 | def __call__(self, img_group): | ||
144 | |||
145 | if self.scale_worker is not None: | ||
146 | img_group = self.scale_worker(img_group) | ||
147 | |||
148 | image_w, image_h = img_group[0].size | ||
149 | crop_w, crop_h = self.crop_size | ||
150 | |||
151 | w_step = (image_w - crop_w) // 4 | ||
152 | h_step = (image_h - crop_h) // 4 | ||
153 | |||
154 | offsets = list() | ||
155 | offsets.append((0 * w_step, 2 * h_step)) # left | ||
156 | offsets.append((4 * w_step, 2 * h_step)) # right | ||
157 | offsets.append((2 * w_step, 2 * h_step)) # center | ||
158 | |||
159 | oversample_group = list() | ||
160 | for o_w, o_h in offsets: | ||
161 | normal_group = list() | ||
162 | flip_group = list() | ||
163 | for i, img in enumerate(img_group): | ||
164 | crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h)) | ||
165 | normal_group.append(crop) | ||
166 | if self.flip: | ||
167 | flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT) | ||
168 | |||
169 | if img.mode == 'L' and i % 2 == 0: | ||
170 | flip_group.append(ImageOps.invert(flip_crop)) | ||
171 | else: | ||
172 | flip_group.append(flip_crop) | ||
173 | |||
174 | oversample_group.extend(normal_group) | ||
175 | oversample_group.extend(flip_group) | ||
176 | return oversample_group | ||
177 | |||
178 | |||
179 | class GroupMultiScaleCrop(object): | ||
180 | |||
181 | def __init__(self, input_size, scales=None, max_distort=1, fix_crop=True, more_fix_crop=True): | ||
182 | self.scales = scales if scales is not None else [1, .875, .75, .66] | ||
183 | self.max_distort = max_distort | ||
184 | self.fix_crop = fix_crop | ||
185 | self.more_fix_crop = more_fix_crop | ||
186 | self.input_size = input_size if not isinstance(input_size, int) else [input_size, input_size] | ||
187 | self.interpolation = Image.BILINEAR | ||
188 | |||
189 | def __call__(self, img_group): | ||
190 | |||
191 | im_size = img_group[0].size | ||
192 | |||
193 | crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size) | ||
194 | crop_img_group = [img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h)) for img in img_group] | ||
195 | ret_img_group = [img.resize((self.input_size[0], self.input_size[1]), self.interpolation) | ||
196 | for img in crop_img_group] | ||
197 | return ret_img_group | ||
198 | |||
199 | def _sample_crop_size(self, im_size): | ||
200 | image_w, image_h = im_size[0], im_size[1] | ||
201 | |||
202 | # find a crop size | ||
203 | base_size = min(image_w, image_h) | ||
204 | crop_sizes = [int(base_size * x) for x in self.scales] | ||
205 | crop_h = [self.input_size[1] if abs(x - self.input_size[1]) < 3 else x for x in crop_sizes] | ||
206 | crop_w = [self.input_size[0] if abs(x - self.input_size[0]) < 3 else x for x in crop_sizes] | ||
207 | |||
208 | pairs = [] | ||
209 | for i, h in enumerate(crop_h): | ||
210 | for j, w in enumerate(crop_w): | ||
211 | if abs(i - j) <= self.max_distort: | ||
212 | pairs.append((w, h)) | ||
213 | |||
214 | crop_pair = random.choice(pairs) | ||
215 | if not self.fix_crop: | ||
216 | w_offset = random.randint(0, image_w - crop_pair[0]) | ||
217 | h_offset = random.randint(0, image_h - crop_pair[1]) | ||
218 | else: | ||
219 | w_offset, h_offset = self._sample_fix_offset(image_w, image_h, crop_pair[0], crop_pair[1]) | ||
220 | |||
221 | return crop_pair[0], crop_pair[1], w_offset, h_offset | ||
222 | |||
223 | def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h): | ||
224 | offsets = self.fill_fix_offset(self.more_fix_crop, image_w, image_h, crop_w, crop_h) | ||
225 | return random.choice(offsets) | ||
226 | |||
227 | @staticmethod | ||
228 | def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h): | ||
229 | w_step = (image_w - crop_w) // 4 | ||
230 | h_step = (image_h - crop_h) // 4 | ||
231 | |||
232 | ret = list() | ||
233 | ret.append((0, 0)) # upper left | ||
234 | ret.append((4 * w_step, 0)) # upper right | ||
235 | ret.append((0, 4 * h_step)) # lower left | ||
236 | ret.append((4 * w_step, 4 * h_step)) # lower right | ||
237 | ret.append((2 * w_step, 2 * h_step)) # center | ||
238 | |||
239 | if more_fix_crop: | ||
240 | ret.append((0, 2 * h_step)) # center left | ||
241 | ret.append((4 * w_step, 2 * h_step)) # center right | ||
242 | ret.append((2 * w_step, 4 * h_step)) # lower center | ||
243 | ret.append((2 * w_step, 0 * h_step)) # upper center | ||
244 | |||
245 | ret.append((1 * w_step, 1 * h_step)) # upper left quarter | ||
246 | ret.append((3 * w_step, 1 * h_step)) # upper right quarter | ||
247 | ret.append((1 * w_step, 3 * h_step)) # lower left quarter | ||
248 | ret.append((3 * w_step, 3 * h_step)) # lower right quarter | ||
249 | |||
250 | return ret | ||
251 | |||
252 | |||
253 | class GroupRandomSizedCrop(object): | ||
254 | """Random crop the given PIL.Image to a random size of (0.08 to 1.0) of the original size | ||
255 | and and a random aspect ratio of 3/4 to 4/3 of the original aspect ratio | ||
256 | This is popularly used to train the Inception networks | ||
257 | size: size of the smaller edge | ||
258 | interpolation: Default: PIL.Image.BILINEAR | ||
259 | """ | ||
260 | def __init__(self, size, interpolation=Image.BILINEAR): | ||
261 | self.size = size | ||
262 | self.interpolation = interpolation | ||
263 | |||
264 | def __call__(self, img_group): | ||
265 | for attempt in range(10): | ||
266 | area = img_group[0].size[0] * img_group[0].size[1] | ||
267 | target_area = random.uniform(0.08, 1.0) * area | ||
268 | aspect_ratio = random.uniform(3. / 4, 4. / 3) | ||
269 | |||
270 | w = int(round(math.sqrt(target_area * aspect_ratio))) | ||
271 | h = int(round(math.sqrt(target_area / aspect_ratio))) | ||
272 | |||
273 | if random.random() < 0.5: | ||
274 | w, h = h, w | ||
275 | |||
276 | if w <= img_group[0].size[0] and h <= img_group[0].size[1]: | ||
277 | x1 = random.randint(0, img_group[0].size[0] - w) | ||
278 | y1 = random.randint(0, img_group[0].size[1] - h) | ||
279 | found = True | ||
280 | break | ||
281 | else: | ||
282 | found = False | ||
283 | x1 = 0 | ||
284 | y1 = 0 | ||
285 | |||
286 | if found: | ||
287 | out_group = list() | ||
288 | for img in img_group: | ||
289 | img = img.crop((x1, y1, x1 + w, y1 + h)) | ||
290 | assert(img.size == (w, h)) | ||
291 | out_group.append(img.resize((self.size, self.size), self.interpolation)) | ||
292 | return out_group | ||
293 | else: | ||
294 | # Fallback | ||
295 | scale = GroupScale(self.size, interpolation=self.interpolation) | ||
296 | crop = GroupRandomCrop(self.size) | ||
297 | return crop(scale(img_group)) | ||
298 | |||
299 | |||
300 | class Stack(object): | ||
301 | |||
302 | def __init__(self, roll=False): | ||
303 | self.roll = roll | ||
304 | |||
305 | def __call__(self, img_group): | ||
306 | if img_group[0].mode == 'L': | ||
307 | return np.concatenate([np.expand_dims(x, 2) for x in img_group], axis=2) | ||
308 | elif img_group[0].mode == 'RGB': | ||
309 | if self.roll: | ||
310 | return np.concatenate([np.array(x)[:, :, ::-1] for x in img_group], axis=2) | ||
311 | else: | ||
312 | return np.concatenate(img_group, axis=2) | ||
313 | |||
314 | |||
315 | class ToTorchFormatTensor(object): | ||
316 | """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255] | ||
317 | to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """ | ||
318 | def __init__(self, div=True): | ||
319 | self.div = div | ||
320 | |||
321 | def __call__(self, pic): | ||
322 | if isinstance(pic, np.ndarray): | ||
323 | # handle numpy array | ||
324 | img = torch.from_numpy(pic).permute(2, 0, 1).contiguous() | ||
325 | else: | ||
326 | # handle PIL Image | ||
327 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) | ||
328 | img = img.view(pic.size[1], pic.size[0], len(pic.mode)) | ||
329 | # put it from HWC to CHW format | ||
330 | # yikes, this transpose takes 80% of the loading time/CPU | ||
331 | img = img.transpose(0, 1).transpose(0, 2).contiguous() | ||
332 | return img.float().div(255) if self.div else img.float() | ||
333 | |||
334 | |||
335 | class IdentityTransform(object): | ||
336 | |||
337 | def __call__(self, data): | ||
338 | return data | ||
339 | |||
340 | |||
341 | if __name__ == "__main__": | ||
342 | trans = torchvision.transforms.Compose([ | ||
343 | GroupScale(256), | ||
344 | GroupRandomCrop(224), | ||
345 | Stack(), | ||
346 | ToTorchFormatTensor(), | ||
347 | GroupNormalize( | ||
348 | mean=[.485, .456, .406], | ||
349 | std=[.229, .224, .225] | ||
350 | )] | ||
351 | ) | ||
352 | |||
353 | im = Image.open('../tensorflow-model-zoo.torch/lena_299.png') | ||
354 | |||
355 | color_group = [im] * 3 | ||
356 | rst = trans(color_group) | ||
357 | |||
358 | gray_group = [im.convert('L')] * 9 | ||
359 | gray_rst = trans(gray_group) | ||
360 | |||
361 | trans2 = torchvision.transforms.Compose([ | ||
362 | GroupRandomSizedCrop(256), | ||
363 | Stack(), | ||
364 | ToTorchFormatTensor(), | ||
365 | GroupNormalize( | ||
366 | mean=[.485, .456, .406], | ||
367 | std=[.229, .224, .225]) | ||
368 | ]) | ||
369 | print(trans2(color_group)) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
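Note: the __main__ demo above also illustrates the shape flow of the last two group transforms, and why pose_filter.py and video_filter.py toggle the roll/div flags for BNInception/InceptionV3 backbones (those checkpoints expect BGR inputs in [0, 255]). A minimal sketch of that flow, assuming the repo root is on PYTHONPATH and using synthetic 224x224 frames:

import numpy as np
from PIL import Image
from ops.transforms import Stack, ToTorchFormatTensor

# three synthetic RGB frames, like the color_group demo above
frames = [Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8)) for _ in range(3)]

stacked = Stack()(frames)                # channels concatenated: ndarray of shape (224, 224, 9)
tensor = ToTorchFormatTensor()(stacked)  # CHW float tensor in [0, 1]: torch.Size([9, 224, 224])
print(stacked.shape, tensor.shape)

# Stack(roll=True) reverses RGB -> BGR per frame, and ToTorchFormatTensor(div=False)
# keeps the [0, 255] range, matching the BNInception/InceptionV3 settings used in the filters.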
ops/utils.py
0 → 100755
1 | import numpy as np | ||
2 | |||
3 | |||
4 | def softmax(scores): | ||
5 | es = np.exp(scores - scores.max(axis=-1)[..., None]) | ||
6 | return es / es.sum(axis=-1)[..., None] | ||
7 | |||
8 | |||
9 | class AverageMeter(object): | ||
10 | """Computes and stores the average and current value""" | ||
11 | |||
12 | def __init__(self): | ||
13 | self.reset() | ||
14 | |||
15 | def reset(self): | ||
16 | self.val = 0 | ||
17 | self.avg = 0 | ||
18 | self.sum = 0 | ||
19 | self.count = 0 | ||
20 | |||
21 | def update(self, val, n=1): | ||
22 | self.val = val | ||
23 | self.sum += val * n | ||
24 | self.count += n | ||
25 | self.avg = self.sum / self.count | ||
26 | |||
27 | |||
28 | def accuracy(output, target, topk=(1,)): | ||
29 | """Computes the precision@k for the specified values of k""" | ||
30 | maxk = max(topk) | ||
31 | batch_size = target.size(0) | ||
32 | |||
33 | _, pred = output.topk(maxk, 1, True, True) | ||
34 | pred = pred.t() | ||
35 | correct = pred.eq(target.view(1, -1).expand_as(pred)) | ||
36 | |||
37 | res = [] | ||
38 | for k in topk: | ||
39 | correct_k = correct[:k].reshape(-1).float().sum(0) | ||
40 | res.append(correct_k.mul_(100.0 / batch_size)) | ||
41 | return res | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
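A minimal sketch of how AverageMeter and accuracy above are typically combined during validation; the scores below are random placeholders, and the 3 classes match num_class in the filters of this repo:

import torch
from ops.utils import AverageMeter, accuracy

top1 = AverageMeter()

output = torch.randn(8, 3)          # unnormalised scores for a batch of 8 clips, 3 classes
target = torch.randint(0, 3, (8,))  # ground-truth class indices

prec1, = accuracy(output, target, topk=(1,))
top1.update(prec1.item(), n=target.size(0))
print('running top-1: {:.2f}%'.format(top1.avg))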
person_filter.py
0 → 100644
1 | import os | ||
2 | import cv2 | ||
3 | import numpy as np | ||
4 | import pickle | ||
5 | |||
6 | def start_filter(config): | ||
7 | cls_class_path = config['MODEL']['CLS_PERSON'] | ||
8 | feature_save_dir = config['VIDEO']['FACE_FEATURE_DIR'] | ||
9 | frame_list_dir = config['VIDEO']['FRAME_LIST_DIR'] | ||
10 | result_file_name = config['PERSON']['RESULT_FILE'] | ||
11 | feature_name = config['PERSON']['DATA_NAME'] | ||
12 | |||
13 | xgboost_model = pickle.load(open(cls_class_path, "rb")) | ||
14 | |||
15 | result_file_path = os.path.join(frame_list_dir, result_file_name) | ||
16 | result_file = open(result_file_path, 'w') | ||
17 | |||
18 | feature_path = os.path.join(feature_save_dir, feature_name) | ||
19 | val_annotation_pairs = np.load(feature_path, allow_pickle=True, encoding='latin1') | ||
20 | |||
21 | X_val = [] | ||
22 | Y_val = [] | ||
23 | Y_names = [] | ||
24 | for j in range(len(val_annotation_pairs)): | ||
25 | pair = val_annotation_pairs[j] | ||
26 | X_val.append(np.squeeze(pair[0])) | ||
27 | Y_val.append(pair[1]) | ||
28 | Y_names.append(pair[2]) | ||
29 | |||
30 | X_val = np.array(X_val) | ||
31 | y_pred = xgboost_model.predict_proba(X_val) | ||
32 | |||
33 | for i, Y_name in enumerate(Y_names): | ||
34 | result_file.write(Y_name + ' ') | ||
35 | result_file.write(str(y_pred[i][0]) + ',' + str(y_pred[i][1]) + ',' + str(y_pred[i][2]) + '\n') | ||
36 | |||
37 | result_file.close() | ||
38 | |||
39 | |||
40 | |||
41 | |||
42 |
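start_filter above assumes the face-feature .npy file holds an array of (feature, label, name) triples: pair[0] is the feature vector, pair[1] the label and pair[2] the clip name. A minimal sketch of producing a compatible file; the feature dimension (128) and the file and video names are illustrative only:

import numpy as np

rows = [
    (np.random.rand(1, 128), 1, 'video_0001'),   # (feature, label, name)
    (np.random.rand(1, 128), 2, 'video_0002'),
]
pairs = np.array(rows, dtype=object)             # object array; rows stay (feature, label, name)
np.save('face_feature_val.npy', pairs, allow_pickle=True)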
pose_filter.py
0 → 100644
1 | import os | ||
2 | import torch.optim | ||
3 | import numpy as np | ||
4 | import torch.utils.data | ||
5 | import torch.nn.parallel | ||
6 | from ops.models import TSN | ||
7 | from ops.transforms import * | ||
8 | from ops.dataset import TSNDataSet | ||
9 | from torch.nn import functional as F | ||
10 | |||
11 | |||
12 | def gen_file_list(frame_save_dir, frame_list_dir): | ||
13 | |||
14 | val_path = os.path.join(frame_list_dir, 'val.txt') | ||
15 | video_names = os.listdir(frame_save_dir) | ||
16 | ucf101_rgb_val_file = open(val_path, 'w') | ||
17 | |||
18 | for video_name in video_names: | ||
19 | images_dir = os.path.join(frame_save_dir, video_name) | ||
20 | ucf101_rgb_val_file.write(video_name) | ||
21 | ucf101_rgb_val_file.write(' ') | ||
22 | ucf101_rgb_val_file.write(str(len(os.listdir(images_dir)))) | ||
23 | ucf101_rgb_val_file.write('\n') | ||
24 | |||
25 | ucf101_rgb_val_file.close() | ||
26 | |||
27 | return val_path | ||
28 | |||
29 | |||
30 | def start_filter(config): | ||
31 | arch = config['FIGHTING']['ARCH'] | ||
32 | prefix = config['VIDEO']['PREFIX'] | ||
33 | modality = config['POSE']['MODALITY'] | ||
34 | test_crop = config['POSE']['TEST_CROP'] | ||
35 | batch_size = config['POSE']['BATCH_SIZE'] | ||
36 | weights_path = config['MODEL']['CLS_POSE'] | ||
37 | test_segment = config['POSE']['TEST_SEGMENT'] | ||
38 | frame_save_dir = config['VIDEO']['POSE_FRAME_SAVE_DIR'] | ||
39 | frame_list_dir = config['VIDEO']['FRAME_LIST_DIR'] | ||
40 | result_file_name = config['POSE']['RESULT_FILE'] | ||
41 | |||
42 | workers = 8 | ||
43 | num_class = 3 | ||
44 | shift_div = 8 | ||
45 | img_feature_dim = 256 | ||
46 | |||
47 | softmax = False | ||
48 | is_shift = True | ||
49 | full_res = False | ||
50 | non_local = False | ||
51 | dense_sample = False | ||
52 | twice_sample = False | ||
53 | |||
54 | val_list = gen_file_list(frame_save_dir, frame_list_dir) | ||
55 | result_file_path = os.path.join(frame_list_dir, result_file_name) | ||
56 | |||
57 | pretrain = 'imagenet' | ||
58 | shift_place = 'blockres' | ||
59 | crop_fusion_type = 'avg' | ||
60 | |||
61 | net = TSN(num_class, test_segment if is_shift else 1, modality, | ||
62 | base_model=arch, | ||
63 | consensus_type=crop_fusion_type, | ||
64 | img_feature_dim=img_feature_dim, | ||
65 | pretrain=pretrain, | ||
66 | is_shift=is_shift, shift_div=shift_div, shift_place=shift_place, | ||
67 | non_local=non_local, | ||
68 | ) | ||
69 | |||
70 | checkpoint = torch.load(weights_path) | ||
71 | checkpoint = checkpoint['state_dict'] | ||
72 | |||
73 | base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())} | ||
74 | replace_dict = {'base_model.classifier.weight': 'new_fc.weight', | ||
75 | 'base_model.classifier.bias': 'new_fc.bias', | ||
76 | } | ||
77 | for k, v in replace_dict.items(): | ||
78 | if k in base_dict: | ||
79 | base_dict[v] = base_dict.pop(k) | ||
80 | |||
81 | net.load_state_dict(base_dict) | ||
82 | |||
83 | input_size = net.scale_size if full_res else net.input_size | ||
84 | |||
85 | if test_crop == 1: | ||
86 | cropping = torchvision.transforms.Compose([ | ||
87 | GroupScale(net.scale_size), | ||
88 | GroupCenterCrop(input_size), | ||
89 | ]) | ||
90 | elif test_crop == 3: # do not flip, so only 3 crops | ||
91 | cropping = torchvision.transforms.Compose([ | ||
92 | GroupFullResSample(input_size, net.scale_size, flip=False) | ||
93 | ]) | ||
94 | elif test_crop == 5: # do not flip, so only 5 crops | ||
95 | cropping = torchvision.transforms.Compose([ | ||
96 | GroupOverSample(input_size, net.scale_size, flip=False) | ||
97 | ]) | ||
98 | elif test_crop == 10: | ||
99 | cropping = torchvision.transforms.Compose([ | ||
100 | GroupOverSample(input_size, net.scale_size) | ||
101 | ]) | ||
102 | else: | ||
103 | raise ValueError("Only 1, 3, 5, 10 crops are supported but got {}".format(test_crop)) | ||
104 | |||
105 | data_loader = torch.utils.data.DataLoader( | ||
106 | TSNDataSet(frame_save_dir, val_list, num_segments=test_segment, | ||
107 | new_length=1 if modality == "RGB" else 5, | ||
108 | modality=modality, | ||
109 | image_tmpl=prefix, | ||
110 | test_mode=True, | ||
111 | remove_missing=False, | ||
112 | transform=torchvision.transforms.Compose([ | ||
113 | cropping, | ||
114 | Stack(roll=(arch in ['BNInception', 'InceptionV3'])), | ||
115 | ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])), | ||
116 | GroupNormalize(net.input_mean, net.input_std), | ||
117 | ]), dense_sample=dense_sample, twice_sample=twice_sample), | ||
118 | batch_size=batch_size, shuffle=False, | ||
119 | num_workers=workers, pin_memory=True, | ||
120 | ) | ||
121 | |||
122 | net = torch.nn.DataParallel(net.cuda()) | ||
123 | net.eval() | ||
124 | data_gen = enumerate(data_loader) | ||
125 | max_num = len(data_loader.dataset) | ||
126 | |||
127 | result_file = open(result_file_path, 'w') | ||
128 | |||
129 | for i, data_pair in data_gen: | ||
130 | directory, data = data_pair | ||
131 | with torch.no_grad(): | ||
132 | if i >= max_num: | ||
133 | break | ||
134 | num_crop = test_crop | ||
135 | if dense_sample: | ||
136 | num_crop *= 10 # 10 clips for testing when using dense sample | ||
137 | |||
138 | if twice_sample: | ||
139 | num_crop *= 2 | ||
140 | |||
141 | if modality == 'RGB': | ||
142 | length = 3 | ||
143 | elif modality == 'Flow': | ||
144 | length = 10 | ||
145 | elif modality == 'RGBDiff': | ||
146 | length = 18 | ||
147 | else: | ||
148 | raise ValueError("Unknown modality " + modality) | ||
149 | |||
150 | data_in = data.view(-1, length, data.size(2), data.size(3)) | ||
151 | if is_shift: | ||
152 | data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3)) | ||
153 | rst, feature = net(data_in) | ||
154 | rst = rst.reshape(batch_size, num_crop, -1).mean(1) | ||
155 | |||
156 | if softmax: | ||
157 | # take the softmax to normalize the output to probability | ||
158 | rst = F.softmax(rst, dim=1) | ||
159 | |||
160 | rst = rst.data.cpu().numpy().copy() | ||
161 | |||
162 | if net.module.is_shift: | ||
163 | rst = rst.reshape(batch_size, num_class) | ||
164 | else: | ||
165 | rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class)) | ||
166 | |||
167 | proba = np.squeeze(rst) | ||
168 | proba = np.exp(proba)/sum(np.exp(proba)) | ||
169 | result_file.write(str(directory[0]) + ' ') | ||
170 | result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n') | ||
171 | |||
172 | result_file.close() | ||
173 | print('video filter end') | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
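Like video_filter.py below, this filter writes one line per clip in the form '<video_name> <p0>,<p1>,<p2>', which is the format accuracy_cal in test.py parses back. A minimal sketch of reading such a result file into a dict of probability vectors (the file name is illustrative):

import numpy as np

def load_result_file(path):
    # parse '<name> <p0>,<p1>,<p2>' lines into {name: np.array([p0, p1, p2])}
    results = {}
    with open(path) as f:
        for line in f:
            name, probs = line.strip().split(' ')
            results[name] = np.array([float(p) for p in probs.split(',')])
    return results

# e.g. probs = load_result_file('pose_result.txt'); probs[name].argmax() + 1 gives the 1-based class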
test.py
0 → 100644
1 | import os | ||
2 | import cv2 | ||
3 | import load_util | ||
4 | import media_util | ||
5 | import numpy as np | ||
6 | from sklearn.metrics import confusion_matrix | ||
7 | import fighting_2_filter, emotion_filter, argue_filter, audio_filter, class_filter | ||
8 | import video_filter, pose_filter, flow_filter | ||
9 | |||
10 | |||
11 | |||
12 | def accuracy_cal(config): | ||
13 | |||
14 | label_file_path = config['VIDEO']['LABEL_PATH'] | ||
15 | frame_list_dir = config['VIDEO']['FRAME_LIST_DIR'] | ||
16 | final_file_name = config['AUDIO']['RESULT_FILE'] | ||
17 | |||
18 | final_file_path = os.path.join(frame_list_dir, final_file_name) | ||
19 | final_file_lines = open(final_file_path).readlines() | ||
20 | label_file_lines = open(label_file_path).readlines() | ||
21 | |||
22 | |||
23 | final_pairs = {line.strip().split(' ')[0]: line.strip().split(' ')[1] for line in final_file_lines} | ||
24 | |||
25 | lines_num = len(label_file_lines) - 1 | ||
26 | hit = 0 | ||
27 | for i, label_line in enumerate(label_file_lines): | ||
28 | if i == 0: | ||
29 | continue | ||
30 | file, label = label_line.strip().split(' ') | ||
31 | final_pre = final_pairs[file] | ||
32 | final_pre_class = np.argmax(np.array(final_pre.split(','))) + 1 | ||
33 | print(final_pre_class, label) | ||
34 | if final_pre_class == int(label): | ||
35 | hit += 1 | ||
36 | |||
37 | return hit/lines_num | ||
38 | |||
39 | |||
40 | def main(): | ||
41 | config_path = r'config.yaml' | ||
42 | config = load_util.load_config(config_path) | ||
43 | |||
44 | media_util.extract_wav(config) | ||
45 | media_util.extract_frame(config) | ||
46 | media_util.extract_frame_pose(config) | ||
47 | media_util.extract_is10(config) | ||
48 | media_util.extract_random_face_feature(config) | ||
49 | media_util.extract_mirror(config) | ||
50 | |||
51 | fighting_2_filter.start_filter(config) | ||
52 | emotion_filter.start_filter(config) | ||
53 | |||
54 | audio_filter.start_filter(config) | ||
55 | |||
56 | class_filter.start_filter(config) | ||
57 | video_filter.start_filter(config) | ||
58 | pose_filter.start_filter(config) | ||
59 | |||
60 | flow_filter.start_filter(config) | ||
61 | |||
62 | acc = accuracy_cal(config) | ||
63 | print(acc) | ||
64 | |||
65 | |||
66 | if __name__ == '__main__': | ||
67 | main() | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
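confusion_matrix is imported above but accuracy_cal only reports overall accuracy. If a per-class breakdown is wanted, a sketch along these lines would fit, reusing the same parsing as accuracy_cal (classes are 1-3 and the first label line is treated as a header, as above):

import numpy as np
from sklearn.metrics import confusion_matrix

def confusion_from_results(final_pairs, label_file_lines):
    # final_pairs: {video: 'p0,p1,p2'}; label lines: '<video> <label>' with a header on line 0
    y_true, y_pred = [], []
    for i, label_line in enumerate(label_file_lines):
        if i == 0:
            continue
        file, label = label_line.strip().split(' ')
        probs = [float(p) for p in final_pairs[file].split(',')]
        y_pred.append(int(np.argmax(probs)) + 1)
        y_true.append(int(label))
    return confusion_matrix(y_true, y_pred, labels=[1, 2, 3])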
video_filter.py
0 → 100644
1 | import os | ||
2 | import torch.optim | ||
3 | import numpy as np | ||
4 | import torch.nn.parallel | ||
5 | from ops.models import TSN | ||
6 | from ops.transforms import * | ||
7 | from ops.dataset import TSNDataSet | ||
8 | from torch.nn import functional as F | ||
9 | |||
10 | |||
11 | def gen_file_list(frame_save_dir, frame_list_dir): | ||
12 | |||
13 | val_path = os.path.join(frame_list_dir, 'val.txt') | ||
14 | video_names = os.listdir(frame_save_dir) | ||
15 | ucf101_rgb_val_file = open(val_path, 'w') | ||
16 | |||
17 | for video_name in video_names: | ||
18 | images_dir = os.path.join(frame_save_dir, video_name) | ||
19 | ucf101_rgb_val_file.write(video_name) | ||
20 | ucf101_rgb_val_file.write(' ') | ||
21 | ucf101_rgb_val_file.write(str(len(os.listdir(images_dir)))) | ||
22 | ucf101_rgb_val_file.write('\n') | ||
23 | |||
24 | ucf101_rgb_val_file.close() | ||
25 | |||
26 | return val_path | ||
27 | |||
28 | |||
29 | def start_filter(config): | ||
30 | arch = config['FIGHTING']['ARCH'] | ||
31 | prefix = config['VIDEO']['PREFIX'] | ||
32 | modality = config['VIDEO_FILTER']['MODALITY'] | ||
33 | test_crop = config['VIDEO_FILTER']['TEST_CROP'] | ||
34 | batch_size = config['VIDEO_FILTER']['BATCH_SIZE'] | ||
35 | weights_path = config['MODEL']['CLS_VIDEO'] | ||
36 | test_segment = config['VIDEO_FILTER']['TEST_SEGMENT'] | ||
37 | frame_save_dir = config['VIDEO']['FRAME_SAVE_DIR'] | ||
38 | frame_list_dir = config['VIDEO']['FRAME_LIST_DIR'] | ||
39 | result_file_name = config['VIDEO_FILTER']['RESULT_FILE'] | ||
40 | |||
41 | workers = 8 | ||
42 | num_class = 3 | ||
43 | shift_div = 8 | ||
44 | img_feature_dim = 256 | ||
45 | |||
46 | softmax = False | ||
47 | is_shift = True | ||
48 | full_res = False | ||
49 | non_local = False | ||
50 | dense_sample = False | ||
51 | twice_sample = False | ||
52 | |||
53 | val_list = gen_file_list(frame_save_dir, frame_list_dir) | ||
54 | result_file_path = os.path.join(frame_list_dir, result_file_name) | ||
55 | |||
56 | pretrain = 'imagenet' | ||
57 | shift_place = 'blockres' | ||
58 | crop_fusion_type = 'avg' | ||
59 | |||
60 | net = TSN(num_class, test_segment if is_shift else 1, modality, | ||
61 | base_model=arch, | ||
62 | consensus_type=crop_fusion_type, | ||
63 | img_feature_dim=img_feature_dim, | ||
64 | pretrain=pretrain, | ||
65 | is_shift=is_shift, shift_div=shift_div, shift_place=shift_place, | ||
66 | non_local=non_local, | ||
67 | ) | ||
68 | |||
69 | checkpoint = torch.load(weights_path) | ||
70 | checkpoint = checkpoint['state_dict'] | ||
71 | |||
72 | base_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())} | ||
73 | replace_dict = {'base_model.classifier.weight': 'new_fc.weight', | ||
74 | 'base_model.classifier.bias': 'new_fc.bias', | ||
75 | } | ||
76 | for k, v in replace_dict.items(): | ||
77 | if k in base_dict: | ||
78 | base_dict[v] = base_dict.pop(k) | ||
79 | |||
80 | net.load_state_dict(base_dict) | ||
81 | |||
82 | input_size = net.scale_size if full_res else net.input_size | ||
83 | |||
84 | if test_crop == 1: | ||
85 | cropping = torchvision.transforms.Compose([ | ||
86 | GroupScale(net.scale_size), | ||
87 | GroupCenterCrop(input_size), | ||
88 | ]) | ||
89 | elif test_crop == 3: # do not flip, so only 3 crops | ||
90 | cropping = torchvision.transforms.Compose([ | ||
91 | GroupFullResSample(input_size, net.scale_size, flip=False) | ||
92 | ]) | ||
93 | elif test_crop == 5: # do not flip, so only 5 crops | ||
94 | cropping = torchvision.transforms.Compose([ | ||
95 | GroupOverSample(input_size, net.scale_size, flip=False) | ||
96 | ]) | ||
97 | elif test_crop == 10: | ||
98 | cropping = torchvision.transforms.Compose([ | ||
99 | GroupOverSample(input_size, net.scale_size) | ||
100 | ]) | ||
101 | else: | ||
102 | raise ValueError("Only 1, 3, 5, 10 crops are supported but got {}".format(test_crop)) | ||
103 | |||
104 | data_loader = torch.utils.data.DataLoader( | ||
105 | TSNDataSet(frame_save_dir, val_list, num_segments=test_segment, | ||
106 | new_length=1 if modality == "RGB" else 5, | ||
107 | modality=modality, | ||
108 | image_tmpl=prefix, | ||
109 | test_mode=True, | ||
110 | remove_missing=False, | ||
111 | transform=torchvision.transforms.Compose([ | ||
112 | cropping, | ||
113 | Stack(roll=(arch in ['BNInception', 'InceptionV3'])), | ||
114 | ToTorchFormatTensor(div=(arch not in ['BNInception', 'InceptionV3'])), | ||
115 | GroupNormalize(net.input_mean, net.input_std), | ||
116 | ]), dense_sample=dense_sample, twice_sample=twice_sample), | ||
117 | batch_size=batch_size, shuffle=False, | ||
118 | num_workers=workers, pin_memory=True, | ||
119 | ) | ||
120 | |||
121 | net = torch.nn.DataParallel(net.cuda()) | ||
122 | net.eval() | ||
123 | data_gen = enumerate(data_loader) | ||
124 | max_num = len(data_loader.dataset) | ||
125 | |||
126 | result_file = open(result_file_path, 'w') | ||
127 | |||
128 | for i, data_pair in data_gen: | ||
129 | directory, data = data_pair | ||
130 | with torch.no_grad(): | ||
131 | if i >= max_num: | ||
132 | break | ||
133 | num_crop = test_crop | ||
134 | if dense_sample: | ||
135 | num_crop *= 10 # 10 clips for testing when using dense sample | ||
136 | |||
137 | if twice_sample: | ||
138 | num_crop *= 2 | ||
139 | |||
140 | if modality == 'RGB': | ||
141 | length = 3 | ||
142 | elif modality == 'Flow': | ||
143 | length = 10 | ||
144 | elif modality == 'RGBDiff': | ||
145 | length = 18 | ||
146 | else: | ||
147 | raise ValueError("Unknown modality " + modality) | ||
148 | |||
149 | data_in = data.view(-1, length, data.size(2), data.size(3)) | ||
150 | if is_shift: | ||
151 | data_in = data_in.view(batch_size * num_crop, test_segment, length, data_in.size(2), data_in.size(3)) | ||
152 | |||
153 | rst, feature = net(data_in) | ||
154 | rst = rst.reshape(batch_size, num_crop, -1).mean(1) | ||
155 | |||
156 | if softmax: | ||
157 | # take the softmax to normalize the output to probability | ||
158 | rst = F.softmax(rst, dim=1) | ||
159 | |||
160 | rst = rst.data.cpu().numpy().copy() | ||
161 | |||
162 | if net.module.is_shift: | ||
163 | rst = rst.reshape(batch_size, num_class) | ||
164 | else: | ||
165 | rst = rst.reshape((batch_size, -1, num_class)).mean(axis=1).reshape((batch_size, num_class)) | ||
166 | |||
167 | proba = np.squeeze(rst) | ||
168 | proba = np.exp(proba)/sum(np.exp(proba)) | ||
169 | result_file.write(str(directory[0]) + ' ') | ||
170 | result_file.write(str(proba[0]) + ',' + str(proba[1]) + ',' + str(proba[2]) + '\n') | ||
171 | |||
172 | result_file.close() | ||
173 | print('video filter end') | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
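The final np.exp(proba) / sum(np.exp(proba)) step is the same normalisation as softmax in ops/utils.py, minus the max-subtraction that guards against overflow. A minimal check, assuming the repo root is on PYTHONPATH:

import numpy as np
from ops.utils import softmax

scores = np.array([2.0, 0.5, -1.0])            # e.g. raw per-class scores for one clip
manual = np.exp(scores) / np.exp(scores).sum()
print(np.allclose(manual, softmax(scores)))    # True: both yield the same probabilities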