# clear_repeat_id.py
import os
import numpy as np
import cv2
from tqdm import tqdm

from face_id import Face_Recognizer


def generate_face_embedding():
    """Compute a face embedding for every image under ``image_dir`` and save
    each one as a ``.npy`` file in a mirrored per-identity folder under
    ``embeddings_dir``.

    Stray ``.npy`` files found next to the images are deleted and skipped.
    Uses the module-level globals ``reg_face_id_model_path``, ``image_dir``
    and ``embeddings_dir``.
    """
    face_recognizer = Face_Recognizer(reg_face_id_model_path)

    for id_name in tqdm(os.listdir(image_dir)):
        id_dir = os.path.join(image_dir, id_name)
        norm_images = []
        kept_names = []  # file names kept in lockstep with norm_images
        for image_name in os.listdir(id_dir):
            image_path = os.path.join(id_dir, image_name)
            # Clean up embedding files accidentally left among the images.
            if image_name.endswith('.npy'):
                print(image_name)
                os.remove(image_path)
                continue

            image = cv2.imread(image_path)
            if image is None:
                # Unreadable/corrupt image: skip so embeddings stay aligned
                # with kept_names instead of passing None to the recognizer.
                print('failed to read', image_path)
                continue
            norm_images.append(image)
            kept_names.append(image_name)

        embeddings = face_recognizer.recognize(norm_images)

        save_id_dir = os.path.join(embeddings_dir, id_name)
        # makedirs(exist_ok=True) replaces the exists()+mkdir race and also
        # creates embeddings_dir itself if missing.
        os.makedirs(save_id_dir, exist_ok=True)

        # BUG FIX: the original indexed the unfiltered image_names list, so
        # whenever a .npy entry was skipped above, every later embedding was
        # saved under the wrong file name. kept_names is aligned 1:1 with
        # norm_images and therefore with embeddings.
        for image_name, embedding in zip(kept_names, embeddings):
            base = os.path.splitext(image_name)[0]  # works for any extension length
            np.save(os.path.join(save_id_dir, base + '.npy'), embedding)


def get_embeddings(id_dir):
    """Load and return every ``.npy`` embedding stored directly in *id_dir*.

    Returns a list of arrays in ``os.listdir`` order.
    """
    return [
        np.load(os.path.join(id_dir, file_name))
        for file_name in os.listdir(id_dir)
    ]


def get_high_similarity():
    """Compare embeddings of every identity pair and record likely duplicates.

    For each unordered pair of identity folders under ``embeddings_dir``, the
    cosine similarity between every embedding of one id and every embedding of
    the other is computed; if any similarity exceeds 0.6 the pair is written
    once to ``clear_result1.txt`` as a line ``id_a,id_b``.

    Fixes vs. the original:
      * ``np.mat`` (deprecated; removed in NumPy 2.0) replaced by plain
        ndarray math — each pairwise comparison is one matrix product.
      * The result file is managed with ``with`` so it is closed on error.
      * A matching id pair is written once instead of once per matching
        embedding pair (the output was redundantly repeated).
      * An identity folder with no embeddings no longer crashes the scan.
    """
    result_file_path = 'clear_result1.txt'

    id_names = os.listdir(embeddings_dir)
    id_names_set = set(id_names)
    # Sanity check: a single directory listing cannot contain duplicates.
    assert len(id_names) == len(id_names_set)

    def unit_matrix(id_name):
        # Stack this id's embeddings into an (n, d) matrix of L2-normalized
        # rows so cosine similarity reduces to a dot product.
        vectors = [np.asarray(e).ravel() for e in
                   get_embeddings(os.path.join(embeddings_dir, id_name))]
        if not vectors:
            return None  # empty identity folder — nothing to compare
        mat = np.stack(vectors).astype(np.float64)
        return mat / np.linalg.norm(mat, axis=1, keepdims=True)

    done_set = set()
    with open(result_file_path, 'w') as result_file:
        for id_name in tqdm(id_names_set):
            done_set.add(id_name)
            base = unit_matrix(id_name)
            if base is None:
                continue
            # Only compare against ids not yet used as the outer id, so each
            # unordered pair is examined exactly once.
            for new_id_name in id_names_set - done_set:
                other = unit_matrix(new_id_name)
                if other is None:
                    continue
                sims = base @ other.T  # (n_base, n_other) cosine similarities
                if np.any(sims > 0.6):
                    print('same file')
                    result_file.write(id_name + ',' + new_id_name + '\n')
      
        
           

# ---- Configuration (module-level globals read by the functions above) ----
# Path to the face-recognition model (MNN file from the arcface_torch
# ms1mv3_r18 work dir) handed to Face_Recognizer.
reg_face_id_model_path = r'/home/jwq/PycharmProjects/situ/src/face_id/insightface/recognition/arcface_torch/work_dirs/ms1mv3_r18/ms1mv3_r18_0.96200/ms1mv3_r18.mnn'
# Root of face images, one sub-directory per identity (the name suggests
# normalized 112x112 crops — confirm against the actual data layout).
image_dir = r'/data2/face_id/situ_other/train_norm_112_mix_add'
# Root where per-identity embedding .npy files are written and read back.
embeddings_dir = r'/data2/face_id/situ_other/train_norm_embeddings'

# Entry point: scan embeddings pairwise and log likely-duplicate identities.
# NOTE(review): runs on import as well; generate_face_embedding() is never
# called here and must be run separately to populate embeddings_dir.
get_high_similarity()