// speak_detector.cpp (8.9 KB)
#include "speak_detector.h"

/**
 * @brief Creates the three pipeline models and loads each one from its model file.
 *
 * Each member is first reset to a freshly constructed object and then
 * initialised through its own init_model(), in the order: face detector,
 * landmark detector, speaking classifier.
 *
 * @param face_det_model   Passed to RetinaFace::init_model.
 * @param landm_det_model  Passed to FaceLandmarks::init_model.
 * @param speak_cls_model  Passed to SpeakCls::init_model.
 */
void SpeakDetector::init_model(string face_det_model,
                               string landm_det_model,
                               string speak_cls_model)
{
    // Face detector.
    this->face_det = RetinaFace();
    this->face_det.init_model(face_det_model);

    // Facial landmark detector.
    this->landm_det = FaceLandmarks();
    this->landm_det.init_model(landm_det_model);

    // Speaking / not-speaking classifier.
    this->speak_cls = SpeakCls();
    this->speak_cls.init_model(speak_cls_model);
}

/**
 * @brief Intersection-over-union of two axis-aligned boxes.
 *
 * The overlap extent on each axis is clamped at zero, so disjoint boxes give
 * an IoU of 0.
 *
 * @param b1 First box (xmin, ymin, xmax, ymax are read).
 * @param b2 Second box (xmin, ymin, xmax, ymax are read).
 * @return intersection area / union area, or 0 when the union area is not
 *         positive (e.g. both boxes are degenerate), instead of the NaN a
 *         0/0 division would produce.
 */
float SpeakDetector::iou_compute(Bbox b1, Bbox b2)
{
    // Signed overlap extents; negative means the boxes do not overlap on that axis.
    const float overlap_w = min(b1.xmax, b2.xmax) - max(b1.xmin, b2.xmin);
    const float overlap_h = min(b1.ymax, b2.ymax) - max(b1.ymin, b2.ymin);
    const float w = max(overlap_w, float(0));
    const float h = max(overlap_h, float(0));
    const float intersection = w * h;

    const auto area1 = (b1.xmax - b1.xmin) * (b1.ymax - b1.ymin);
    const auto area2 = (b2.xmax - b2.xmin) * (b2.ymax - b2.ymin);
    const auto union_area = area1 + area2 - intersection;

    // Written as !(x > 0) so that a NaN union is rejected as well.
    if (!(union_area > 0)) {
        return 0.0f;
    }
    return intersection / union_area;
}
/**
 * @brief Crops, resizes and rotation-aligns the mouth region of every face.
 *
 * For each face the bounding box of landmarks 84..103 is computed (after
 * rounding the coordinates), enlarged by 20% of its width on the left/right
 * and 10% of its height on the top/bottom, cropped from the image, resized to
 * 112x112 and rotated about the crop centre by the angle of the line from
 * landmark 84 to landmark 90.
 * NOTE(review): indices 84..103 are presumably the mouth points of the
 * landmark model's layout, with 84/90 the two mouth corners - confirm against
 * the landmark model.
 *
 * The result is index-aligned with the input: output[i][j] belongs to
 * batch_landmarks[i][j]. To keep that alignment, a face whose mouth cannot be
 * cropped (too few landmarks, or a crop rectangle that is empty after being
 * clamped to the image) yields an all-zero 112x112 image of the frame's type
 * instead of raising an OpenCV exception.
 *
 * @param batch_landmarks Per image, per face, a list of points; element 0 of
 *                        a point is used as x and element 1 as y, in image
 *                        coordinates.
 * @param batch_images    The images the landmarks refer to.
 * @return One list of aligned mouth crops per image in batch_images.
 */
vector<vector<cv::Mat>> SpeakDetector::mouth_process(vector<vector<vector<vector<float>>>> batch_landmarks, vector<cv::Mat> batch_images){
    const int input_size = 112;
    const std::size_t mouth_begin = 84;   // first landmark index of the mouth range
    const std::size_t mouth_end = 104;    // one past the last landmark index
    const std::size_t corner_a = 84;      // the two landmarks defining the rotation angle
    const std::size_t corner_b = 90;

    vector<vector<cv::Mat>> align_mouths;
    align_mouths.reserve(batch_images.size());

    for(std::size_t i = 0; i < batch_images.size(); ++i){
        const cv::Mat& image = batch_images[i];
        const cv::Rect image_bounds(0, 0, image.cols, image.rows);
        vector<cv::Mat> tmp_mouths;

        // An image without a landmark entry simply has no faces.
        const std::size_t face_count = i < batch_landmarks.size() ? batch_landmarks[i].size() : 0;
        tmp_mouths.reserve(face_count);

        for(std::size_t j = 0; j < face_count; ++j){
            const vector<vector<float>>& landmarks = batch_landmarks[i][j];
            const cv::Mat placeholder = cv::Mat::zeros(input_size, input_size, image.type());

            // Every landmark in the mouth range must exist and carry x and y.
            bool usable = landmarks.size() >= mouth_end;
            for(std::size_t k = mouth_begin; usable && k < mouth_end; ++k){
                usable = landmarks[k].size() >= 2;
            }
            if(!usable){
                tmp_mouths.push_back(placeholder);
                continue;
            }

            vector<float> mouth_xs;
            vector<float> mouth_ys;
            mouth_xs.reserve(mouth_end - mouth_begin);
            mouth_ys.reserve(mouth_end - mouth_begin);
            for(std::size_t k = mouth_begin; k < mouth_end; ++k){
                mouth_xs.push_back(round(landmarks[k][0]));
                mouth_ys.push_back(round(landmarks[k][1]));
            }

            // One pass per axis instead of four separate min/max scans.
            const auto x_range = std::minmax_element(mouth_xs.begin(), mouth_xs.end());
            const auto y_range = std::minmax_element(mouth_ys.begin(), mouth_ys.end());
            const float min_x = *x_range.first;
            const float max_x = *x_range.second;
            const float min_y = *y_range.first;
            const float max_y = *y_range.second;
            const float mouth_width = max_x - min_x;
            const float mouth_height = max_y - min_y;

            const int mouth_min_x = ceil(min_x - mouth_width * 0.2);
            const int mouth_min_y = ceil(min_y - mouth_height * 0.1);
            const int mouth_max_x = ceil(max_x + mouth_width * 0.2);
            const int mouth_max_y = ceil(max_y + mouth_height * 0.1);

            // Clamp on all four sides: a mouth near the right/bottom border
            // would otherwise produce an out-of-range ROI.
            cv::Rect mouth_rect(mouth_min_x, mouth_min_y, mouth_max_x - mouth_min_x, mouth_max_y - mouth_min_y);
            mouth_rect &= image_bounds;
            if(mouth_rect.width <= 0 || mouth_rect.height <= 0){
                tmp_mouths.push_back(placeholder);
                continue;
            }

            cv::Mat resize_mouth_crop;
            cv::resize(image(mouth_rect), resize_mouth_crop, cv::Size(input_size, input_size));

            // Rotation angle (degrees) of the line between the two reference landmarks.
            const float dx = landmarks[corner_b][0] - landmarks[corner_a][0];
            const float dy = landmarks[corner_b][1] - landmarks[corner_a][1];
            const double angle = atan2(dy, dx) * 180 / float(M_PI);

            const cv::Point center(input_size / 2, input_size / 2);
            const cv::Mat rotate_matrix = cv::getRotationMatrix2D(center, angle, 1);
            cv::Mat rot_img;
            cv::warpAffine(resize_mouth_crop, rot_img, rotate_matrix, cv::Size(input_size, input_size));
            tmp_mouths.push_back(rot_img);
        }
        align_mouths.push_back(tmp_mouths);
    }
    return align_mouths;
}
// Video / image data slicing
// Image input
/**
 * @brief Loads the images matching a glob pattern and picks evenly spaced frame indices.
 *
 * Every readable image matched by file_path (non-recursive cv::glob) is
 * appended to bgr_frames in the order cv::glob returns it; files cv::imread
 * cannot decode are skipped so that no empty frame reaches the caller.
 * The loaded frames are then divided into segment_num equal spans and the
 * index at the middle of each span is collected; that list of segment_num
 * indices is appended to indices as one entry.
 *
 * The indices are 0-based positions among the frames appended by this call,
 * i.e. they index bgr_frames directly when it is empty on entry.
 *
 * Nothing is appended to indices when segment_num is not positive or when no
 * image could be loaded, so callers never receive an index into an empty
 * frame list.
 *
 * @param file_path   Glob pattern handed to cv::glob.
 * @param segment_num Number of indices to sample.
 * @param bgr_frames  Output: loaded images are appended here.
 * @param indices     Output: one list of sampled frame indices is appended.
 */
void SpeakDetector::image_reader(string file_path,int segment_num,vector<Mat> &bgr_frames,vector<vector<int>> &indices){
    const int new_length = 1;   // frames consumed per sampled position

    vector<cv::String> image_files;
    cv::glob(file_path, image_files, false);

    int loaded_count = 0;
    for(const cv::String& im_file : image_files){
        cv::Mat bgr_img = cv::imread(im_file);
        if(bgr_img.empty()){
            // Unreadable or non-image file: skip it.
            continue;
        }
        bgr_frames.push_back(bgr_img);
        ++loaded_count;
    }

    const int usable_frames = loaded_count - new_length + 1;
    if(segment_num <= 0 || usable_frames <= 0){
        return;
    }

    const float tick = float(usable_frames) / float(segment_num);
    vector<int> indice;
    indice.reserve(segment_num);
    for(int x = 0; x < segment_num; ++x){
        const int idx = int(tick / 2.0 + tick * x);
        // Guard against floating-point rounding pushing the last index past the end.
        indice.push_back(idx < loaded_count ? idx : loaded_count - 1);
    }
    indices.push_back(indice);
}
/**
 * @brief Runs the speaking-detection pipeline on the images matched by image_path.
 *
 * Steps, per list of sampled frame indices returned by image_reader (10
 * segments are requested):
 *   1. Gather the sampled frames; frames whose size differs from the first
 *      gathered frame are resized to that size so that boxes from different
 *      frames share one coordinate system. Out-of-range indices and empty
 *      frames are skipped.
 *   2. Detect faces on every BGR frame and crop them. Boxes are clamped to
 *      the frame; boxes with no area inside the frame are dropped.
 *   3. Detect landmarks on every face crop and shift them into frame
 *      coordinates using the crop's top-left corner.
 *   4. Build aligned mouth crops from the RGB frames via mouth_process.
 *   5. For every face of the first frame, collect from each later frame the
 *      first face whose IoU with that first-frame box is at least 0.4.
 *   6. Classify each collected mouth sequence with speak_cls and print the
 *      running is_talk flag (0/1) to stdout after each classification.
 *
 * The function returns nothing; its only visible output is what it prints.
 * TODO: the original author sketched a per-speaker result record (is_talk,
 * speak_duration, speaker face image, position); it was never implemented.
 *
 * @param image_path Glob pattern of the input images, forwarded to image_reader.
 */
void SpeakDetector::speak_recognize(string image_path){
    vector<cv::Mat> all_bgr_images;
    vector<vector<int>> total_split_indices;
    image_reader(image_path,10,all_bgr_images,total_split_indices);

    bool is_talk=false;

    for(const vector<int>& split_indices : total_split_indices){
        // ---- 1. gather frames ------------------------------------------------
        vector<cv::Mat> rgb_frames;
        vector<cv::Mat> bgr_frames;
        cv::Size reference_size;
        for(const int frame_idx : split_indices){
            if(frame_idx < 0 || frame_idx >= static_cast<int>(all_bgr_images.size())){
                continue;
            }
            cv::Mat frame = all_bgr_images[frame_idx];
            if(frame.empty()){
                continue;   // cvtColor would throw on an empty image
            }
            if(bgr_frames.empty()){
                reference_size = frame.size();
            }else if(frame.size() != reference_size){
                // Separate destination so the cached original is left untouched.
                cv::Mat resized;
                cv::resize(frame, resized, reference_size);
                frame = resized;
            }
            cv::Mat rgb_frame;
            cv::cvtColor(frame, rgb_frame, cv::COLOR_BGR2RGB);
            bgr_frames.push_back(frame);
            rgb_frames.push_back(rgb_frame);
        }
        if(bgr_frames.empty()){
            continue;   // nothing to analyse in this segment list
        }

        // ---- 2. face detection -----------------------------------------------
        // face_list, bbox_list and crop_origins stay index-aligned: [frame][face].
        vector<vector<cv::Mat>> face_list;
        vector<vector<Bbox>> bbox_list;
        vector<vector<cv::Point>> crop_origins;
        for(cv::Mat bgr_frame : bgr_frames){
            const cv::Rect frame_bounds(0, 0, bgr_frame.cols, bgr_frame.rows);
            vector<Bbox> boxes = face_det.inference(bgr_frame);

            vector<cv::Mat> frame_faces;
            vector<Bbox> frame_boxes;
            vector<cv::Point> frame_origins;
            for(const Bbox& box : boxes){
                // Detector boxes may extend past the frame; an unclamped ROI throws.
                const cv::Rect face_rect =
                    cv::Rect(box.xmin, box.ymin, box.xmax - box.xmin, box.ymax - box.ymin) & frame_bounds;
                if(face_rect.width <= 0 || face_rect.height <= 0){
                    continue;
                }
                frame_boxes.push_back(box);
                frame_faces.push_back(bgr_frame(face_rect));
                frame_origins.push_back(face_rect.tl());
            }
            face_list.push_back(frame_faces);
            bbox_list.push_back(frame_boxes);
            crop_origins.push_back(frame_origins);
        }

        // ---- 3. landmarks, shifted from crop to frame coordinates --------------
        vector<vector<vector<vector<float>>>> landms_list;
        for(std::size_t i = 0; i < face_list.size(); ++i){
            vector<vector<vector<float>>> frame_landms;
            for(std::size_t j = 0; j < face_list[i].size(); ++j){
                vector<vector<float>> face_landms = landm_det.inference(face_list[i][j]);
                const cv::Point& origin = crop_origins[i][j];
                for(vector<float>& point : face_landms){
                    if(point.size() < 2){
                        continue;
                    }
                    point[0] += origin.x;
                    point[1] += origin.y;
                }
                frame_landms.push_back(face_landms);
            }
            landms_list.push_back(frame_landms);
        }

        // ---- 4. aligned mouth crops, indexed [frame][face] ---------------------
        vector<vector<cv::Mat>> mouth_list = mouth_process(landms_list, rgb_frames);

        // ---- 5. follow every first-frame face through the later frames ---------
        const vector<Bbox>& first_bboxes = bbox_list[0];
        vector<vector<cv::Mat>> all_mouth_list;
        for(std::size_t i = 0; i < first_bboxes.size(); ++i){
            const Bbox& first_bbox = first_bboxes[i];
            vector<cv::Mat> trace_mouth_list;
            trace_mouth_list.push_back(mouth_list[0][i]);
            for(std::size_t f = 1; f < bbox_list.size(); ++f){
                for(std::size_t k = 0; k < bbox_list[f].size(); ++k){
                    const float iou = iou_compute(first_bbox, bbox_list[f][k]);
                    if(iou >= 0.4){
                        // Only the first sufficiently overlapping face per frame is taken.
                        trace_mouth_list.push_back(mouth_list[f][k]);
                        break;
                    }
                }
            }
            all_mouth_list.push_back(trace_mouth_list);
        }

        // ---- 6. classification --------------------------------------------------
        for(std::size_t j = 0; j < all_mouth_list.size(); ++j){
            vector<cv::Mat> select_mouth_list = all_mouth_list[j];

            // Model inference: result is 1 (true) for speaking, 0 (false) for not speaking.
            const bool result = speak_cls.inference(select_mouth_list);
            if(result){
                is_talk = true;
            }
            // The running flag, not the individual result, is what gets printed.
            std::cout << is_talk << std::endl;
        }
    }
}