the first submit
Showing 14 changed files with 582 additions and 0 deletions
CMakeLists.txt
0 → 100644
cmake_minimum_required(VERSION 3.10)
project(main)
set(CMAKE_CXX_STANDARD 11)
find_package(OpenCV REQUIRED)
set(MNN_DIR /home/situ/MNN/MNN1.0/MNN)
include_directories(${MNN_DIR}/include)
link_directories(${MNN_DIR}/build)
include_directories(/home/situ/qfs/sdk_project/mnn_projects/speak_recognize_mnn/include)
aux_source_directory(/home/situ/qfs/sdk_project/mnn_projects/speak_recognize_mnn/tools SOURCE_CPP)
link_directories(/home/situ/qfs/sdk_project/mnn_projects/speak_recognize_mnn/lib)
# add_library(speakrecognize SHARED speak_detector.cpp ${SOURCE_CPP})
# add_executable(speakrecognize main.cpp retinaface.cpp facelandmarks.cpp speakcls.cpp speak_detector.cpp)
add_executable(speakrecognize main.cpp)
# target_link_libraries(speakrecognize -lMNN ${OpenCV_LIBS})
target_link_libraries(speakrecognize -lspeakrecognize -lMNN ${OpenCV_LIBS})
include/facelandmarks.h
0 → 100644
#ifndef FACELANDMARKS_H
#define FACELANDMARKS_H
#include <opencv2/opencv.hpp>
#include <MNN/Interpreter.hpp>
#include <MNN/ImageProcess.hpp>
#include <iostream>
#include <memory>

using namespace std;
using namespace cv;
using namespace MNN;

class FaceLandmarks{
public:
    int num_thread = 2;
    MNNForwardType forward_type = MNN_FORWARD_CPU;

public:
    FaceLandmarks(){};
    // ~FaceLandmarks();
    bool init_model(string model_path);
    vector<vector<float>> inference(string image_path);
    vector<vector<float>> inference(Mat image);

private:
    bool model_init;
    float normal[3] = {1.0f/256.f, 1.0f/256.f, 1.0f/256.f};
    std::shared_ptr<MNN::Interpreter> pfld_interpreter = nullptr;
    MNN::Session* session = nullptr;
    MNN::Tensor* input_tensor = nullptr;
    shared_ptr<MNN::CV::ImageProcess> pretreat;
};
#endif
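A minimal usage sketch of this landmark API, not part of the commit: the image path below is a hypothetical placeholder, and init_model/inference are used only as declared above.

#include "facelandmarks.h"

int main() {
    FaceLandmarks landm;
    // Hypothetical model path; any 106-point PFLD-style MNN model is assumed.
    if (!landm.init_model("./model/det_landmarks_106_v0.0.1.mnn")) {
        std::cerr << "landmark model init failed" << std::endl;
        return 1;
    }
    cv::Mat face = cv::imread("./face_crop.jpg");   // a single face crop
    std::vector<std::vector<float>> pts = landm.inference(face);
    // Each entry is an (x, y) pair in the coordinate system of the input crop.
    std::cout << "got " << pts.size() << " landmarks" << std::endl;
    return 0;
}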
include/retinaface.h
0 → 100644
#ifndef RETINAFACE_H
#define RETINAFACE_H
#include <opencv2/opencv.hpp>
#include <MNN/Interpreter.hpp>
#include <MNN/ImageProcess.hpp>
#include <iostream>
#include <memory>

using namespace MNN;
using namespace std;
using namespace cv;

struct Bbox{
    float xmin;
    float ymin;
    float xmax;
    float ymax;
    float score;
    // five facial landmark points (x1, y1) .. (x5, y5)
    float x1;
    float y1;
    float x2;
    float y2;
    float x3;
    float y3;
    float x4;
    float y4;
    float x5;
    float y5;
};

class RetinaFace{
public:
    float confidence_threshold = 0.5;
    bool is_bbox_process = true;
    int num_thread = 2;
    MNNForwardType forward_type = MNN_FORWARD_CPU;

private:
    bool model_init = false;
    vector<int> input_size = {640, 640};
    vector<float> variances = {0.1, 0.2};
    float mean[3] = {104.0f, 117.0f, 123.0f};
    float keep_top_k = 100;
    float nms_threshold = 0.4;
    float resize_scale = 1.0;

    std::shared_ptr<MNN::Interpreter> net;
    Session *session = nullptr;
    MNN::Tensor* input_tensor = nullptr;
    shared_ptr<MNN::CV::ImageProcess> pretreat;
    vector<vector<float>> anchors;

private:
    // generate prior anchors
    vector<vector<float>> priorBox(vector<int> image_size);
    // decode bounding boxes and landmarks (including confidence scores)
    vector<Bbox> decode(float *loc, float *score, float *pre, vector<vector<float>> priors, vector<float> variances);
    // decode landmarks
    // vector<vector<float>> decode_landm(vector<vector<float>> pre, vector<vector<float>> priors, vector<float> variances);
    // NMS
    void nms_cpu(std::vector<Bbox> &bboxes, float threshold);
    // filter detections by score threshold
    vector<Bbox> select_score(vector<Bbox> bboxes, float threshold, float w_r, float h_r);
    // post-process boxes back to the original frame
    vector<Bbox> bbox_process(vector<Bbox> bboxes, float frame_w, float frame_h);

public:
    RetinaFace(){};
    // ~RetinaFace();
    bool init_model(string model_path);

    // inference
    vector<Bbox> inference(string image_path);
    vector<Bbox> inference(Mat image);
};
#endif
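A minimal usage sketch of the detector API above, not part of the commit: the model and image paths are hypothetical, and only the declared public members are used.

#include "retinaface.h"

int main() {
    RetinaFace det;
    det.confidence_threshold = 0.6f;   // tune per scene
    // Hypothetical model path.
    if (!det.init_model("./model/det_face_retina_mnn_1.0.0_v0.1.1.mnn")) return 1;
    cv::Mat frame = cv::imread("./frame.jpg");
    std::vector<Bbox> faces = det.inference(frame);
    for (const Bbox& b : faces) {
        // xmin/ymin/xmax/ymax are in frame coordinates; x1..y5 hold the five landmark points.
        cv::rectangle(frame,
                      cv::Point(int(b.xmin), int(b.ymin)),
                      cv::Point(int(b.xmax), int(b.ymax)),
                      cv::Scalar(0, 255, 0), 2);
    }
    cv::imwrite("./frame_det.jpg", frame);
    return 0;
}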
include/speak_detector.h
0 → 100644
#ifndef SPEAK_DETECTOR_H
#define SPEAK_DETECTOR_H
#include "speakcls.h"
#include "retinaface.h"
#include "facelandmarks.h"

class SpeakDetector{
private:
    RetinaFace face_det;
    FaceLandmarks landm_det;
    SpeakCls speak_cls;

public:
    SpeakDetector(){};
    void init_model(string face_det_model, string landm_det_model, string speak_cls_model);
    float iou_compute(Bbox b1, Bbox b2);
    // crop, align and resize the mouth region of every detected face in every frame
    vector<vector<cv::Mat>> mouth_process(vector<vector<vector<vector<float>>>> batch_landmarks, vector<cv::Mat> batch_images);
    // read a directory of frames and build the sampling indices for one clip
    void image_reader(string file_path, int segment_num, vector<Mat> &bgr_frames, vector<vector<int>> &indices);

    void speak_recognize(string image_path);
};

#endif
include/speakcls.h
0 → 100644
#ifndef SPEAKCLS_H
#define SPEAKCLS_H
#include <opencv2/opencv.hpp>
#include <MNN/Interpreter.hpp>
#include <MNN/ImageProcess.hpp>
#include <memory>

using namespace std;
using namespace cv;
using namespace MNN;

class SpeakCls{
private:
    std::shared_ptr<MNN::Interpreter> net;
    MNN::Session* session = nullptr;
    MNN::Tensor* input_tensor = nullptr;
    ScheduleConfig config;
    int split_nums = 10;   // number of mouth crops per clip

public:
    SpeakCls(){};
    bool init_model(string model_path);
    bool inference(vector<Mat> images);

private:
    // per-image normalization: scale to [0,1], then subtract mean / divide by std
    cv::Mat standardize(cv::Mat image);
    // stack the clip into a single multi-channel Mat (3 * split_nums channels)
    cv::Mat data_process(vector<Mat> images);
    vector<double> softmax(vector<double> input);
};
#endif
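A minimal usage sketch of the classifier API above, not part of the commit: in the real pipeline the ten 112x112 mouth crops come from SpeakDetector::mouth_process; the file names below are hypothetical placeholders.

#include <iostream>
#include <string>
#include "speakcls.h"

int main() {
    SpeakCls cls;
    // Hypothetical model path.
    if (!cls.init_model("./model/cls_speak_v0.2.2.mnn")) return 1;
    // The classifier expects a clip of 10 mouth crops, each resized to 112x112.
    std::vector<cv::Mat> clip;
    for (int i = 0; i < 10; ++i) {
        cv::Mat crop = cv::imread("./mouth_" + std::to_string(i) + ".jpg");
        cv::resize(crop, crop, cv::Size(112, 112));
        clip.push_back(crop);
    }
    bool speaking = cls.inference(clip);
    std::cout << (speaking ? "speaking" : "not speaking") << std::endl;
    return 0;
}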
lib/libspeakrecognize.so
0 → 100755
No preview for this file type
main.cpp
0 → 100644
1 | #include "speak_detector.h" | ||
2 | |||
3 | int main(){ | ||
4 | |||
5 | SpeakDetector speak = SpeakDetector(); | ||
6 | string face_det_model = "/home/situ/qfs/sdk_project/mnn_projects/speak_recognize_mnn/model/det_face_retina_mnn_1.0.0_v0.1.1.mnn"; | ||
7 | string face_landm_model = "/home/situ/qfs/sdk_project/mnn_projects/speak_recognize_mnn/model/det_landmarks_106_v0.0.1.mnn"; | ||
8 | string speakcls_model = "/home/situ/qfs/sdk_project/mnn_projects/speak_recognize_mnn/model/cls_speak_v0.2.2.mnn"; | ||
9 | speak.init_model(face_det_model,face_landm_model,speakcls_model); | ||
10 | speak.speak_recognize("/data/speak/bank_test/no_speak/2395QUESTION_ANSWER"); | ||
11 | return 0; | ||
12 | } |
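The model and data paths above are hard-coded for one machine. A hedged sketch of an equivalent entry point that takes them from the command line, using the same SpeakDetector API (the usage string is an assumption, not part of the commit):

#include "speak_detector.h"

// Sketch: pass the three model paths and the frame directory as arguments
// instead of hard-coding absolute paths.
int main(int argc, char** argv){
    if (argc != 5) {
        std::cerr << "usage: speakrecognize <face_det.mnn> <landmarks.mnn> <speak_cls.mnn> <frame_dir>" << std::endl;
        return 1;
    }
    SpeakDetector speak;
    speak.init_model(argv[1], argv[2], argv[3]);
    speak.speak_recognize(argv[4]);
    return 0;
}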
model/cls_speak_v0.2.2.mnn
0 → 100644
No preview for this file type
model/det_face_retina_mnn_1.0.0_v0.1.1.mnn
0 → 100644
No preview for this file type
model/det_landmarks_106_v0.0.1.mnn
0 → 100644
No preview for this file type
speak_detector.cpp
0 → 100644
1 | #include "speak_detector.h" | ||
2 | |||
3 | void SpeakDetector::init_model(string face_det_model,string landm_det_model,string speak_cls_model){ | ||
4 | face_det = RetinaFace(); | ||
5 | face_det.init_model(face_det_model); | ||
6 | landm_det = FaceLandmarks(); | ||
7 | landm_det.init_model(landm_det_model); | ||
8 | speak_cls = SpeakCls(); | ||
9 | speak_cls.init_model(speak_cls_model); | ||
10 | } | ||
11 | |||
12 | float SpeakDetector::iou_compute(Bbox b1, Bbox b2) | ||
13 | { | ||
14 | float tmp_w=min(b1.xmax,b2.xmax) - max(b1.xmin, b2.xmin); | ||
15 | float tmp_h=min(b1.ymax, b2.ymax) - max(b1.ymin, b2.ymin); | ||
16 | float w = max(tmp_w, float(0)); | ||
17 | float h = max(tmp_h, float(0)); | ||
18 | return w*h / ((b1.xmax-b1.xmin)*(b1.ymax-b1.ymin) + (b2.xmax-b2.xmin)*(b2.ymax-b2.ymin) - w*h); | ||
19 | } | ||
20 | vector<vector<cv::Mat>> SpeakDetector::mouth_process(vector<vector<vector<vector<float>>>> batch_landmarks, vector<cv::Mat> batch_images){ | ||
21 | int input_size=112; | ||
22 | vector<vector<cv::Mat>> align_mouths; | ||
23 | for(int i=0;i<batch_images.size();++i){ | ||
24 | cv::Mat image = batch_images[i]; | ||
25 | vector<cv::Mat> tmp_mouths; | ||
26 | for(int j=0;j<batch_landmarks[i].size();++j){ | ||
27 | vector<float> mouth_xs; | ||
28 | vector<float> mouth_ys; | ||
29 | for(int k=84;k<int(104);++k){ | ||
30 | float x_q = round(batch_landmarks[i][j][k][0]); | ||
31 | float y_q = round(batch_landmarks[i][j][k][1]); | ||
32 | mouth_xs.push_back(x_q); | ||
33 | mouth_ys.push_back(y_q); | ||
34 | } | ||
35 | float mouth_width=*max_element(mouth_xs.begin(),mouth_xs.end())-*min_element(mouth_xs.begin(),mouth_xs.end()); | ||
36 | float mouth_height=*max_element(mouth_ys.begin(),mouth_ys.end())-*min_element(mouth_ys.begin(),mouth_ys.end()); | ||
37 | int mouth_min_x=ceil(*min_element(mouth_xs.begin(),mouth_xs.end())-mouth_width*0.2); | ||
38 | int mouth_min_y=ceil(*min_element(mouth_ys.begin(),mouth_ys.end())-mouth_height*0.1); | ||
39 | int mouth_max_x=ceil(*max_element(mouth_xs.begin(),mouth_xs.end())+mouth_width*0.2); | ||
40 | int mouth_max_y=ceil(*max_element(mouth_ys.begin(),mouth_ys.end())+mouth_height*0.1); | ||
41 | |||
42 | mouth_min_x=mouth_min_x>0?mouth_min_x:0; | ||
43 | mouth_min_y=mouth_min_y>0?mouth_min_y:0; | ||
44 | cv::Rect mouth_rect = Rect(mouth_min_x,mouth_min_y,mouth_max_x-mouth_min_x,mouth_max_y-mouth_min_y); | ||
45 | cv::Mat mouth_crop = image(mouth_rect); | ||
46 | cv::Mat resize_mouth_crop; | ||
47 | cv::resize(mouth_crop,resize_mouth_crop,Size(input_size,input_size)); | ||
48 | Point center=Point(input_size/2,input_size/2); | ||
49 | float dx = batch_landmarks[i][j][90][0]-batch_landmarks[i][j][84][0]; | ||
50 | float dy = batch_landmarks[i][j][90][1]-batch_landmarks[i][j][84][1]; | ||
51 | double angle = atan2(dy,dx)*180/float(M_PI); | ||
52 | cv::Mat rotate_matrix = cv::getRotationMatrix2D(center,double(angle),1); | ||
53 | cv::Mat rot_img; | ||
54 | cv::warpAffine(resize_mouth_crop,rot_img,rotate_matrix,Size(input_size,input_size)); | ||
55 | tmp_mouths.push_back(rot_img); | ||
56 | } | ||
57 | align_mouths.push_back(tmp_mouths); | ||
58 | } | ||
59 | return align_mouths; | ||
60 | } | ||
61 | //视频/图像数据切片 | ||
62 | //图像 | ||
63 | void SpeakDetector::image_reader(string file_path,int segment_num,vector<Mat> &bgr_frames,vector<vector<int>> &indices){ | ||
64 | int new_length = 1; | ||
65 | vector<String> image_files; | ||
66 | glob(file_path, image_files, false); | ||
67 | int total_frames_num = (int)image_files.size(); | ||
68 | float tick = float(total_frames_num - new_length + 1) / float(segment_num); | ||
69 | vector<int> indice; | ||
70 | for(int x=0;x<segment_num;++x){ | ||
71 | indice.push_back(int(tick / 2.0 + tick * x)); | ||
72 | } | ||
73 | indices.push_back(indice); | ||
74 | |||
75 | for(auto im_file:image_files){ | ||
76 | Mat bgr_img=cv::imread(im_file); | ||
77 | bgr_frames.push_back(bgr_img); | ||
78 | } | ||
79 | } | ||
80 | void SpeakDetector::speak_recognize(string image_path){ | ||
81 | vector<Mat> all_bgr_images; | ||
82 | vector<vector<int>> total_split_indices; | ||
83 | image_reader(image_path,10,all_bgr_images,total_split_indices); | ||
84 | // vector<json> all_results; | ||
85 | |||
86 | bool is_talk=false; | ||
87 | |||
88 | for(int im_i=0;im_i<total_split_indices.size();++im_i){ | ||
89 | vector<vector<cv::Mat>> face_list; | ||
90 | vector<vector<Bbox>> bbox_list; | ||
91 | vector<cv::Mat> rgb_frames; | ||
92 | vector<cv::Mat> bgr_frames; | ||
93 | int tmp_rows,tmp_cols; | ||
94 | for(int im_j=0;im_j<total_split_indices[im_i].size();++im_j){ | ||
95 | Mat tmp_img=all_bgr_images[total_split_indices[im_i][im_j]]; | ||
96 | if(im_j !=0){ | ||
97 | if(tmp_img.rows!=tmp_rows&&tmp_img.cols!=tmp_cols){ | ||
98 | cv::resize(tmp_img,tmp_img,Size(int(tmp_img.cols),int(tmp_img.rows))); | ||
99 | } | ||
100 | } | ||
101 | tmp_rows=tmp_img.rows; | ||
102 | tmp_cols=tmp_img.cols; | ||
103 | Mat rgb_tmp_img; | ||
104 | cv::cvtColor(tmp_img,rgb_tmp_img,cv::COLOR_BGR2RGB); | ||
105 | bgr_frames.push_back(tmp_img); | ||
106 | rgb_frames.push_back(rgb_tmp_img); | ||
107 | |||
108 | } | ||
109 | for(auto bgr_frame:bgr_frames){ | ||
110 | vector<Bbox> boxes=face_det.inference(bgr_frame); | ||
111 | vector<cv::Mat> tmp_face_areas; | ||
112 | vector<Bbox> tmp_bbox_list; | ||
113 | for(auto box:boxes){ | ||
114 | tmp_bbox_list.push_back(box); | ||
115 | |||
116 | // cout<<box.xmin<<" "<<box.ymin<<" "<<box.xmax-box.xmin<<" "<<box.ymax-box.ymin<<endl; | ||
117 | |||
118 | Rect m_select = Rect(box.xmin,box.ymin,box.xmax-box.xmin,box.ymax-box.ymin); | ||
119 | |||
120 | cv::Mat face_area=bgr_frame(m_select); | ||
121 | tmp_face_areas.push_back(face_area); | ||
122 | // cv::waitKey(0); | ||
123 | } | ||
124 | face_list.push_back(tmp_face_areas); | ||
125 | bbox_list.push_back(tmp_bbox_list); | ||
126 | } | ||
127 | // cout<<123<<endl; | ||
128 | vector<vector<vector<vector<float>>>> landms_list; | ||
129 | for(int i=0;i<face_list.size();++i){ | ||
130 | vector<vector<vector<float>>> tmp_landm_list; | ||
131 | for(int j=0;j<face_list[i].size();++j){ | ||
132 | vector<vector<float>> tmp_landms=landm_det.inference(face_list[i][j]); | ||
133 | for(int k=0;k<tmp_landms.size();++k){ | ||
134 | tmp_landms[k][0]=tmp_landms[k][0]+bbox_list[i][j].xmin; | ||
135 | tmp_landms[k][1]=tmp_landms[k][1]+bbox_list[i][j].ymin; | ||
136 | } | ||
137 | tmp_landm_list.push_back(tmp_landms); | ||
138 | } | ||
139 | landms_list.push_back(tmp_landm_list); | ||
140 | } | ||
141 | vector<vector<cv::Mat>> mouth_list=mouth_process(landms_list,rgb_frames); | ||
142 | |||
143 | vector<vector<Bbox>> last_bboxes=bbox_list; | ||
144 | vector<Bbox> first_bboxes = bbox_list[0]; | ||
145 | vector<vector<Bbox>>::iterator k = last_bboxes.begin(); | ||
146 | last_bboxes.erase(k); | ||
147 | |||
148 | vector<vector<Bbox>> all_track_bbox_list; | ||
149 | vector<vector<cv::Mat>> all_face_list,all_mouth_list; | ||
150 | |||
151 | for(int i=0;i<first_bboxes.size();++i){ | ||
152 | Bbox first_bbox=first_bboxes[i]; | ||
153 | vector<Bbox> track_bbox_list; | ||
154 | vector<cv::Mat> trace_face_list,trace_mouth_list; | ||
155 | track_bbox_list.push_back(first_bbox); | ||
156 | trace_face_list.push_back(face_list[0][i]); | ||
157 | trace_mouth_list.push_back(mouth_list[0][i]); | ||
158 | for(int j=0;j<last_bboxes.size();++j){ | ||
159 | vector<Bbox> next_bboxes=last_bboxes[j]; | ||
160 | for(int k=0;k<next_bboxes.size();++k){ | ||
161 | |||
162 | Bbox next_bbox = next_bboxes[k]; | ||
163 | float iou=iou_compute(first_bbox,next_bbox); | ||
164 | if(iou>=0.4){ | ||
165 | track_bbox_list.push_back(next_bbox); | ||
166 | trace_face_list.push_back(face_list[j+1][k]); | ||
167 | trace_mouth_list.push_back(mouth_list[j+1][k]); | ||
168 | break; | ||
169 | } | ||
170 | } | ||
171 | } | ||
172 | all_track_bbox_list.push_back(track_bbox_list); | ||
173 | all_face_list.push_back(trace_face_list); | ||
174 | all_mouth_list.push_back(trace_mouth_list); | ||
175 | } | ||
176 | for(int j=0;j<all_mouth_list.size();j++){ | ||
177 | vector<cv::Mat> select_mouth_list=all_mouth_list[j]; | ||
178 | |||
179 | /** | ||
180 | * @brief 模型推理部分代码,返回result 0/1 ,其中1为说话,0为未说话 | ||
181 | * | ||
182 | */ | ||
183 | bool result=speak_cls.inference(select_mouth_list); | ||
184 | |||
185 | // bool result=true; | ||
186 | if(result){ | ||
187 | is_talk=true; | ||
188 | // speak_duration = (split_indices[0], split_indices[-1]) | ||
189 | // Mat speaker = all_face_list[j][0]; | ||
190 | // speaker_str = cv::imencode('.jpg', speaker)[1].tostring() | ||
191 | // speaker_str = base64.b64encode(speaker_str).decode() | ||
192 | // int position = j | ||
193 | // json cur_output={ | ||
194 | // "is_talk":true, | ||
195 | // "speak_duration":[str(speak_duration[0]), str(speak_duration[1])], | ||
196 | // "speaker":speaker_str, | ||
197 | // "position":position | ||
198 | // } | ||
199 | // all_results.push_back(cur_output); | ||
200 | cout<<is_talk<<endl; | ||
201 | }else{ | ||
202 | cout<<is_talk<<endl; | ||
203 | } | ||
204 | |||
205 | } | ||
206 | |||
207 | // return 0; | ||
208 | } | ||
209 | } | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
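The tracking loop in speak_recognize associates faces across frames by greedy IoU matching against the first frame with a 0.4 threshold. A small standalone sanity check of iou_compute with hypothetical boxes (not part of the commit):

#include "speak_detector.h"

int main() {
    SpeakDetector det;
    Bbox a{}, b{};
    a.xmin = 0; a.ymin = 0; a.xmax = 10; a.ymax = 10;   // 10x10 box
    b.xmin = 5; b.ymin = 0; b.xmax = 15; b.ymax = 10;   // same box shifted right by 5
    // Intersection = 5*10 = 50, union = 100 + 100 - 50 = 150, IoU = 1/3.
    std::cout << det.iou_compute(a, b) << std::endl;    // prints ~0.3333
    return 0;
}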
tools/facelandmarks.cpp
0 → 100644
1 | #include "facelandmarks.h" | ||
2 | |||
3 | |||
4 | // FaceLandmarks::~FaceLandmarks(){ | ||
5 | // pfld_interpreter->releaseModel(); | ||
6 | // pfld_interpreter->releaseSession(session); | ||
7 | // } | ||
8 | |||
9 | bool FaceLandmarks::init_model(string model_path){ | ||
10 | pfld_interpreter = unique_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile(model_path.c_str())); | ||
11 | if(nullptr==pfld_interpreter){ | ||
12 | return false; | ||
13 | } | ||
14 | //创建session | ||
15 | MNN::ScheduleConfig schedule_config; | ||
16 | schedule_config.type = forward_type; | ||
17 | schedule_config.numThread = num_thread; | ||
18 | MNN::BackendConfig backend_config; | ||
19 | backend_config.memory = MNN::BackendConfig::Memory_Normal; | ||
20 | backend_config.power = MNN::BackendConfig::Power_Normal; | ||
21 | backend_config.precision = MNN::BackendConfig::Precision_Normal; | ||
22 | schedule_config.backendConfig = &backend_config; | ||
23 | session = pfld_interpreter->createSession(schedule_config); | ||
24 | input_tensor = pfld_interpreter->getSessionInput(session,NULL); | ||
25 | pfld_interpreter->resizeTensor(input_tensor,{1,3,112,112}); | ||
26 | pfld_interpreter->resizeSession(session); | ||
27 | |||
28 | //数据预处理 | ||
29 | MNN::CV::ImageProcess::Config image_config; | ||
30 | ::memcpy(image_config.normal,normal,sizeof(normal)); | ||
31 | image_config.sourceFormat = MNN::CV::BGR; | ||
32 | image_config.destFormat = MNN::CV::BGR; | ||
33 | |||
34 | pretreat = shared_ptr<MNN::CV::ImageProcess>(MNN::CV::ImageProcess::create(image_config)); | ||
35 | // pretreat->setMatrix(transforms); | ||
36 | |||
37 | return true; | ||
38 | } | ||
39 | |||
40 | vector<vector<float>> FaceLandmarks::inference(string image_path){ | ||
41 | Mat image = cv::imread(image_path); | ||
42 | vector<vector<float>> landmarks; | ||
43 | int width = image.cols; | ||
44 | int height = image.rows; | ||
45 | Mat resize_image; | ||
46 | cv::resize(image,resize_image,Size(112,112)); | ||
47 | float ws = float(width)/float(112.0); | ||
48 | float hs = float(height)/float(112.0); | ||
49 | |||
50 | pretreat->convert(resize_image.data,112,112,0,input_tensor); | ||
51 | |||
52 | pfld_interpreter->runSession(session); | ||
53 | |||
54 | auto output_landmark = pfld_interpreter->getSessionOutput(session, NULL); | ||
55 | MNN::Tensor landmark_tensor(output_landmark, output_landmark->getDimensionType()); | ||
56 | output_landmark->copyToHostTensor(&landmark_tensor); | ||
57 | float* result = landmark_tensor.host<float>(); | ||
58 | for (int i = 0; i < 106; ++i) { | ||
59 | vector<float> curr_pt={result[2 * i + 0] * ws,result[2 * i + 1] * hs}; | ||
60 | landmarks.push_back(curr_pt); | ||
61 | } | ||
62 | return landmarks; | ||
63 | } | ||
64 | |||
65 | vector<vector<float>> FaceLandmarks::inference(Mat image){ | ||
66 | vector<vector<float>> landmarks; | ||
67 | int width = image.cols; | ||
68 | int height = image.rows; | ||
69 | Mat resize_image; | ||
70 | cv::resize(image,resize_image,Size(112,112)); | ||
71 | float ws = float(width)/float(112.0); | ||
72 | float hs = float(height)/float(112.0); | ||
73 | |||
74 | pretreat->convert(resize_image.data,112,112,0,input_tensor); | ||
75 | |||
76 | pfld_interpreter->runSession(session); | ||
77 | |||
78 | auto output_landmark = pfld_interpreter->getSessionOutput(session, NULL); | ||
79 | MNN::Tensor landmark_tensor(output_landmark, output_landmark->getDimensionType()); | ||
80 | output_landmark->copyToHostTensor(&landmark_tensor); | ||
81 | float* result = landmark_tensor.host<float>(); | ||
82 | for (int i = 0; i < 106; ++i) { | ||
83 | vector<float> curr_pt={result[2 * i + 0] * ws,result[2 * i + 1] * hs}; | ||
84 | landmarks.push_back(curr_pt); | ||
85 | } | ||
86 | return landmarks; | ||
87 | } | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
tools/retinaface.cpp
0 → 100644
This diff is collapsed.
tools/speakcls.cpp
0 → 100644
1 | #include "speakcls.h" | ||
2 | |||
3 | |||
4 | bool SpeakCls::init_model(string model_path){ | ||
5 | net= std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile(model_path.c_str()));//创建解释器 | ||
6 | config.numThread = 2; | ||
7 | config.type = MNN_FORWARD_CPU; | ||
8 | session = net->createSession(config); | ||
9 | input_tensor = net->getSessionInput(session,NULL); | ||
10 | net->resizeTensor(input_tensor,{1,3*split_nums,112,112}); | ||
11 | net->resizeSession(session); | ||
12 | } | ||
13 | |||
14 | cv::Mat SpeakCls::standardize(cv::Mat image){ | ||
15 | cv::Mat image_f,dst; | ||
16 | image.convertTo(image_f, CV_32F); | ||
17 | Scalar max_pix = Scalar(255.0f,255.0f,255.0f); | ||
18 | Scalar mean = Scalar(0.485f, 0.456f, 0.406f); | ||
19 | Scalar std = Scalar(0.229f, 0.224f, 0.225f); | ||
20 | dst=image_f/max_pix; | ||
21 | dst = (dst-mean)/std; | ||
22 | return dst; | ||
23 | } | ||
24 | |||
25 | cv::Mat SpeakCls::data_process(vector<Mat> images){ | ||
26 | std::vector<cv::Mat> all_image_channels; | ||
27 | for(auto f:images){ | ||
28 | Mat tmp_image = standardize(f); | ||
29 | std::vector<cv::Mat> tmp_channels; | ||
30 | cv::split(tmp_image,tmp_channels); | ||
31 | all_image_channels.push_back(tmp_channels[0]); | ||
32 | all_image_channels.push_back(tmp_channels[1]); | ||
33 | all_image_channels.push_back(tmp_channels[2]); | ||
34 | } | ||
35 | Mat input_data; | ||
36 | cv::merge(all_image_channels,input_data); | ||
37 | return input_data; | ||
38 | } | ||
39 | |||
40 | vector<double> SpeakCls::softmax(vector<double> input){ | ||
41 | double total=0; | ||
42 | for(auto x:input) | ||
43 | { | ||
44 | total+=exp(x); | ||
45 | } | ||
46 | vector<double> result; | ||
47 | for(auto x:input) | ||
48 | { | ||
49 | result.push_back(exp(x)/total); | ||
50 | } | ||
51 | return result; | ||
52 | } | ||
53 | |||
54 | bool SpeakCls::inference(vector<Mat> images){ | ||
55 | |||
56 | Mat input_data=data_process(images); | ||
57 | // cout << _Tensor->elementSize() << endl; | ||
58 | std::vector<std::vector<cv::Mat>> nChannels; | ||
59 | std::vector<cv::Mat> rgbChannels(3*split_nums); | ||
60 | cv::split(input_data, rgbChannels); | ||
61 | nChannels.push_back(rgbChannels); // NHWC 转NCHW | ||
62 | auto *pvData = malloc(1 * 3*split_nums * 112 * 112 *sizeof(float)); | ||
63 | int nPlaneSize = 112 * 112; | ||
64 | for (int c = 0; c < 3*split_nums; ++c) | ||
65 | { | ||
66 | cv::Mat matPlane = nChannels[0][c]; | ||
67 | memcpy((float *)(pvData) + c * nPlaneSize,\ | ||
68 | matPlane.data, nPlaneSize * sizeof(float)); | ||
69 | } | ||
70 | |||
71 | auto nchwTensor = new Tensor(input_tensor, Tensor::CAFFE); | ||
72 | ::memcpy(nchwTensor->host<float>(), pvData, nPlaneSize * 3*split_nums * sizeof(float)); | ||
73 | |||
74 | input_tensor->copyFromHostTensor(nchwTensor); | ||
75 | //推理 | ||
76 | net->runSession(session); | ||
77 | auto output= net->getSessionOutput(session, NULL); | ||
78 | |||
79 | MNN::Tensor feat_tensor(output, output->getDimensionType()); | ||
80 | output->copyToHostTensor(&feat_tensor); | ||
81 | |||
82 | auto scores_dataPtr = feat_tensor.host<float>(); | ||
83 | |||
84 | cout<<scores_dataPtr[0]<<" "<<scores_dataPtr[1]<<endl; | ||
85 | vector<double> outputs={scores_dataPtr[0],scores_dataPtr[1]}; | ||
86 | // softmax | ||
87 | vector<double> result=softmax(outputs); | ||
88 | |||
89 | printf("output belong to class: %f %f\n", result[0],result[1]); | ||
90 | if(result[0]>result[1]){ | ||
91 | return false; | ||
92 | }else{ | ||
93 | return true; | ||
94 | } | ||
95 | } | ||
... | \ No newline at end of file | ... | \ No newline at end of file |