974e6763 by 乔峰昇

the first submit

1 parent 32d03d1b
cmake_minimum_required(VERSION 3.10)
project(main)
set(CMAKE_CXX_STANDARD 11)
find_package(OpenCV REQUIRED)
set(MNN_DIR /home/situ/MNN/MNN1.0/MNN)
include_directories(${MNN_DIR}/include)
link_directories(${MNN_DIR}/build)
include_directories(/home/situ/qfs/sdk_project/mnn_projects/speak_recognize_mnn/include)
aux_source_directory(/home/situ/qfs/sdk_project/mnn_projects/speak_recognize_mnn/tools SOURCE_CPP)
link_directories(/home/situ/qfs/sdk_project/mnn_projects/speak_recognize_mnn/lib)
# add_library(speakrecognize SHARED speak_detector.cpp ${SOURCE_CPP})
# add_executable(speakrecognize main.cpp retinaface.cpp facelandmarks.cpp speakcls.cpp speak_detector.cpp)
add_executable(speakrecognize main.cpp)
# target_link_libraries(speakrecognize -lMNN ${OpenCV_LIBS})
target_link_libraries(speakrecognize -lspeakrecognize -lMNN ${OpenCV_LIBS})
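# A typical out-of-source build might look like the following (a sketch only:
# the MNN_DIR and project paths above are machine-specific and must be adjusted):
#   mkdir build && cd build
#   cmake .. && make
#   ./speakrecognize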
#ifndef FACELANDMARKS_H
#define FACELANDMARKS_H
#include <opencv2/opencv.hpp>
#include <MNN/Interpreter.hpp>
#include <MNN/ImageProcess.hpp>
#include <iostream>
#include <memory>
using namespace std;
using namespace cv;
using namespace MNN;

class FaceLandmarks{
public:
    int num_thread = 2;
    MNNForwardType forward_type = MNN_FORWARD_CPU;
public:
    FaceLandmarks(){};
    // ~FaceLandmarks();
    bool init_model(string model_path);
    vector<vector<float>> inference(string image_path);
    vector<vector<float>> inference(Mat image);
private:
    bool model_init = false;
    float normal[3] = {1.0f/256.f, 1.0f/256.f, 1.0f/256.f};
    std::shared_ptr<MNN::Interpreter> pfld_interpreter = nullptr;
    MNN::Session* session = nullptr;
    MNN::Tensor* input_tensor = nullptr;
    shared_ptr<MNN::CV::ImageProcess> pretreat;
};
#endif
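// A minimal usage sketch for FaceLandmarks (illustrative only; the model file
// name is an assumption taken from main.cpp, not part of this header):
//   FaceLandmarks landm;
//   if (landm.init_model("det_landmarks_106_v0.0.1.mnn")) {
//       vector<vector<float>> pts = landm.inference(cv::imread("face.jpg"));
//       // pts holds 106 (x, y) points, scaled back to the input image size.
//   }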
#ifndef RETINAFACE_H
#define RETINAFACE_H
#include <opencv2/opencv.hpp>
#include <MNN/Interpreter.hpp>
#include <MNN/ImageProcess.hpp>
#include <iostream>
#include <memory>
using namespace MNN;
using namespace std;
using namespace cv;

struct Bbox{
    // bounding box and detection confidence
    float xmin;
    float ymin;
    float xmax;
    float ymax;
    float score;
    // five facial landmark points output by RetinaFace
    float x1;
    float y1;
    float x2;
    float y2;
    float x3;
    float y3;
    float x4;
    float y4;
    float x5;
    float y5;
};

class RetinaFace{
public:
    float confidence_threshold = 0.5;
    bool is_bbox_process = true;
    int num_thread = 2;
    MNNForwardType forward_type = MNN_FORWARD_CPU;
private:
    bool model_init = false;
    vector<int> input_size = {640, 640};
    vector<float> variances = {0.1, 0.2};
    float mean[3] = {104.0f, 117.0f, 123.0f};
    float keep_top_k = 100;
    float nms_threshold = 0.4;
    float resize_scale = 1.0;
    std::shared_ptr<MNN::Interpreter> net;
    Session* session = nullptr;
    MNN::Tensor* input_tensor = nullptr;
    shared_ptr<MNN::CV::ImageProcess> pretreat;
    vector<vector<float>> anchors;
private:
    // Generate prior anchors
    vector<vector<float>> priorBox(vector<int> image_size);
    // Decode bounding boxes and landmarks, including confidence scores
    vector<Bbox> decode(float* loc, float* score, float* pre, vector<vector<float>> priors, vector<float> variances);
    // Decode landmarks
    // vector<vector<float>> decode_landm(vector<vector<float>> pre, vector<vector<float>> priors, vector<float> variances);
    // Non-maximum suppression
    void nms_cpu(std::vector<Bbox>& bboxes, float threshold);
    // Filter boxes by score threshold
    vector<Bbox> select_score(vector<Bbox> bboxes, float threshold, float w_r, float h_r);
    // Post-process boxes back to frame coordinates
    vector<Bbox> bbox_process(vector<Bbox> bboxes, float frame_w, float frame_h);
public:
    RetinaFace(){};
    // ~RetinaFace();
    bool init_model(string model_path);
    // Inference
    vector<Bbox> inference(string image_path);
    vector<Bbox> inference(Mat image);
};
#endif
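// A minimal usage sketch for RetinaFace (illustrative only; the model file
// name is an assumption taken from main.cpp):
//   RetinaFace det;
//   det.init_model("det_face_retina_mnn_1.0.0_v0.1.1.mnn");
//   vector<Bbox> faces = det.inference(cv::imread("frame.jpg"));
//   // Each Bbox carries box corners, a confidence score, and five landmark
//   // points; boxes below confidence_threshold (0.5 by default) are filtered.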
#ifndef SPEAKCLS_DETECTOR
#define SPEAKCLS_DETECTOR
#include "speakcls.h"
#include "retinaface.h"
#include "facelandmarks.h"

class SpeakDetector{
private:
    RetinaFace face_det;
    FaceLandmarks landm_det;
    SpeakCls speak_cls;
public:
    SpeakDetector(){};
    void init_model(string face_det_model, string landm_det_model, string speak_cls_model);
    float iou_compute(Bbox b1, Bbox b2);
    vector<vector<cv::Mat>> mouth_process(vector<vector<vector<vector<float>>>> batch_landmarks, vector<cv::Mat> batch_images);
    void image_reader(string file_path, int segment_num, vector<Mat>& bgr_frames, vector<vector<int>>& indices);
    void speak_recognize(string image_path);
};
#endif
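// Pipeline overview (as implemented in speak_detector.cpp below): RetinaFace
// detects faces in sampled frames, FaceLandmarks predicts 106 points per face,
// mouth_process crops and deskews the mouth region, faces are linked across
// frames by IoU, and SpeakCls classifies each tracked mouth sequence as
// speaking / not speaking.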
#ifndef SPEAKCLS_H
#define SPEAKCLS_H
#include <opencv2/opencv.hpp>
#include <MNN/Interpreter.hpp>
#include <MNN/ImageProcess.hpp>
#include <memory>
using namespace std;
using namespace cv;
using namespace MNN;

class SpeakCls{
private:
    std::shared_ptr<MNN::Interpreter> net;
    MNN::Session* session = nullptr;
    MNN::Tensor* input_tensor = nullptr;
    ScheduleConfig config;
    int split_nums = 10;
public:
    SpeakCls(){};
    bool init_model(string model_path);
    bool inference(vector<Mat> images);
private:
    cv::Mat standardize(cv::Mat image);
    cv::Mat data_process(vector<Mat> images);
    vector<double> softmax(vector<double> input);
};
#endif
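// Note on the classifier input: the model consumes a single tensor of shape
// {1, 3*split_nums, 112, 112}, i.e. split_nums (10) RGB mouth crops stacked
// along the channel axis, as set up in SpeakCls::init_model and data_process.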
#include "speak_detector.h"
int main(){
SpeakDetector speak = SpeakDetector();
string face_det_model = "/home/situ/qfs/sdk_project/mnn_projects/speak_recognize_mnn/model/det_face_retina_mnn_1.0.0_v0.1.1.mnn";
string face_landm_model = "/home/situ/qfs/sdk_project/mnn_projects/speak_recognize_mnn/model/det_landmarks_106_v0.0.1.mnn";
string speakcls_model = "/home/situ/qfs/sdk_project/mnn_projects/speak_recognize_mnn/model/cls_speak_v0.2.2.mnn";
speak.init_model(face_det_model,face_landm_model,speakcls_model);
speak.speak_recognize("/data/speak/bank_test/no_speak/2395QUESTION_ANSWER");
return 0;
}
#include "speak_detector.h"
void SpeakDetector::init_model(string face_det_model,string landm_det_model,string speak_cls_model){
face_det = RetinaFace();
face_det.init_model(face_det_model);
landm_det = FaceLandmarks();
landm_det.init_model(landm_det_model);
speak_cls = SpeakCls();
speak_cls.init_model(speak_cls_model);
}
float SpeakDetector::iou_compute(Bbox b1, Bbox b2)
{
float tmp_w=min(b1.xmax,b2.xmax) - max(b1.xmin, b2.xmin);
float tmp_h=min(b1.ymax, b2.ymax) - max(b1.ymin, b2.ymin);
float w = max(tmp_w, float(0));
float h = max(tmp_h, float(0));
return w*h / ((b1.xmax-b1.xmin)*(b1.ymax-b1.ymin) + (b2.xmax-b2.xmin)*(b2.ymax-b2.ymin) - w*h);
}
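// Worked example for iou_compute (values chosen for illustration; only the
// box fields matter here):
//   Bbox a{0, 0, 2, 2};  Bbox b{1, 0, 3, 2};
//   intersection: w = min(2,3) - max(0,1) = 1, h = min(2,2) - max(0,0) = 2 → area 2
//   union: 4 + 4 - 2 = 6, so IoU = 2/6 ≈ 0.33 — below the 0.4 tracking
//   threshold used in speak_recognize, so these two boxes would not be linked.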
vector<vector<cv::Mat>> SpeakDetector::mouth_process(vector<vector<vector<vector<float>>>> batch_landmarks, vector<cv::Mat> batch_images){
    int input_size = 112;
    vector<vector<cv::Mat>> align_mouths;
    for(int i = 0; i < batch_images.size(); ++i){
        cv::Mat image = batch_images[i];
        vector<cv::Mat> tmp_mouths;
        for(int j = 0; j < batch_landmarks[i].size(); ++j){
            // points 84-103 of the 106-point layout are used as the mouth contour
            vector<float> mouth_xs;
            vector<float> mouth_ys;
            for(int k = 84; k < 104; ++k){
                float x_q = round(batch_landmarks[i][j][k][0]);
                float y_q = round(batch_landmarks[i][j][k][1]);
                mouth_xs.push_back(x_q);
                mouth_ys.push_back(y_q);
            }
            float mouth_width = *max_element(mouth_xs.begin(), mouth_xs.end()) - *min_element(mouth_xs.begin(), mouth_xs.end());
            float mouth_height = *max_element(mouth_ys.begin(), mouth_ys.end()) - *min_element(mouth_ys.begin(), mouth_ys.end());
            // expand the mouth box by 20% horizontally and 10% vertically
            int mouth_min_x = ceil(*min_element(mouth_xs.begin(), mouth_xs.end()) - mouth_width*0.2);
            int mouth_min_y = ceil(*min_element(mouth_ys.begin(), mouth_ys.end()) - mouth_height*0.1);
            int mouth_max_x = ceil(*max_element(mouth_xs.begin(), mouth_xs.end()) + mouth_width*0.2);
            int mouth_max_y = ceil(*max_element(mouth_ys.begin(), mouth_ys.end()) + mouth_height*0.1);
            // clamp to the image so the crop below cannot throw
            mouth_min_x = max(mouth_min_x, 0);
            mouth_min_y = max(mouth_min_y, 0);
            mouth_max_x = min(mouth_max_x, image.cols);
            mouth_max_y = min(mouth_max_y, image.rows);
            cv::Rect mouth_rect = Rect(mouth_min_x, mouth_min_y, mouth_max_x - mouth_min_x, mouth_max_y - mouth_min_y);
            cv::Mat mouth_crop = image(mouth_rect);
            cv::Mat resize_mouth_crop;
            cv::resize(mouth_crop, resize_mouth_crop, Size(input_size, input_size));
            // rotate about the crop center by the angle of the line through
            // points 84 and 90 so the mouth ends up roughly horizontal
            Point center = Point(input_size/2, input_size/2);
            float dx = batch_landmarks[i][j][90][0] - batch_landmarks[i][j][84][0];
            float dy = batch_landmarks[i][j][90][1] - batch_landmarks[i][j][84][1];
            double angle = atan2(dy, dx)*180/float(M_PI);
            cv::Mat rotate_matrix = cv::getRotationMatrix2D(center, double(angle), 1);
            cv::Mat rot_img;
            cv::warpAffine(resize_mouth_crop, rot_img, rotate_matrix, Size(input_size, input_size));
            tmp_mouths.push_back(rot_img);
        }
        align_mouths.push_back(tmp_mouths);
    }
    return align_mouths;
}
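// Note: the mouth indices used above (84-103 for the contour, 84/90 for the
// rotation reference) follow the 106-point layout produced by
// det_landmarks_106_v0.0.1.mnn; a different landmark model would need
// different offsets.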
// Slice video/image data into temporal segments.
// Image-directory variant: samples segment_num frame indices uniformly,
// taking the middle frame of each segment.
void SpeakDetector::image_reader(string file_path, int segment_num, vector<Mat>& bgr_frames, vector<vector<int>>& indices){
    int new_length = 1;
    vector<String> image_files;
    glob(file_path, image_files, false);
    int total_frames_num = (int)image_files.size();
    float tick = float(total_frames_num - new_length + 1) / float(segment_num);
    vector<int> indice;
    for(int x = 0; x < segment_num; ++x){
        indice.push_back(int(tick / 2.0 + tick * x));
    }
    indices.push_back(indice);
    for(auto im_file : image_files){
        Mat bgr_img = cv::imread(im_file);
        bgr_frames.push_back(bgr_img);
    }
}
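// Worked example: with 30 frames on disk and segment_num = 10,
// tick = (30 - 1 + 1) / 10 = 3.0, so the sampled indices are int(1.5 + 3x)
// for x = 0..9, i.e. {1, 4, 7, 10, 13, 16, 19, 22, 25, 28} — the middle
// frame of each of the 10 equal segments.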
void SpeakDetector::speak_recognize(string image_path){
    vector<Mat> all_bgr_images;
    vector<vector<int>> total_split_indices;
    image_reader(image_path, 10, all_bgr_images, total_split_indices);
    // vector<json> all_results;
    bool is_talk = false;
    for(int im_i = 0; im_i < total_split_indices.size(); ++im_i){
        vector<vector<cv::Mat>> face_list;
        vector<vector<Bbox>> bbox_list;
        vector<cv::Mat> rgb_frames;
        vector<cv::Mat> bgr_frames;
        int tmp_rows = 0, tmp_cols = 0;
        for(int im_j = 0; im_j < total_split_indices[im_i].size(); ++im_j){
            Mat tmp_img = all_bgr_images[total_split_indices[im_i][im_j]];
            // resize any frame whose size differs from the previous frame's
            if(im_j != 0){
                if(tmp_img.rows != tmp_rows || tmp_img.cols != tmp_cols){
                    cv::resize(tmp_img, tmp_img, Size(tmp_cols, tmp_rows));
                }
            }
            tmp_rows = tmp_img.rows;
            tmp_cols = tmp_img.cols;
            Mat rgb_tmp_img;
            cv::cvtColor(tmp_img, rgb_tmp_img, cv::COLOR_BGR2RGB);
            bgr_frames.push_back(tmp_img);
            rgb_frames.push_back(rgb_tmp_img);
        }
        // face detection on every sampled frame
        for(auto bgr_frame : bgr_frames){
            vector<Bbox> boxes = face_det.inference(bgr_frame);
            vector<cv::Mat> tmp_face_areas;
            vector<Bbox> tmp_bbox_list;
            for(auto box : boxes){
                tmp_bbox_list.push_back(box);
                Rect m_select = Rect(box.xmin, box.ymin, box.xmax - box.xmin, box.ymax - box.ymin);
                // clip the face rect to the frame so the crop cannot throw
                m_select &= Rect(0, 0, bgr_frame.cols, bgr_frame.rows);
                cv::Mat face_area = bgr_frame(m_select);
                tmp_face_areas.push_back(face_area);
            }
            face_list.push_back(tmp_face_areas);
            bbox_list.push_back(tmp_bbox_list);
        }
        // landmarks per face, shifted back into frame coordinates
        vector<vector<vector<vector<float>>>> landms_list;
        for(int i = 0; i < face_list.size(); ++i){
            vector<vector<vector<float>>> tmp_landm_list;
            for(int j = 0; j < face_list[i].size(); ++j){
                vector<vector<float>> tmp_landms = landm_det.inference(face_list[i][j]);
                for(int k = 0; k < tmp_landms.size(); ++k){
                    tmp_landms[k][0] = tmp_landms[k][0] + bbox_list[i][j].xmin;
                    tmp_landms[k][1] = tmp_landms[k][1] + bbox_list[i][j].ymin;
                }
                tmp_landm_list.push_back(tmp_landms);
            }
            landms_list.push_back(tmp_landm_list);
        }
        vector<vector<cv::Mat>> mouth_list = mouth_process(landms_list, rgb_frames);
        // greedy IoU tracking: match each face in the first frame against the
        // remaining frames
        vector<vector<Bbox>> last_bboxes = bbox_list;
        vector<Bbox> first_bboxes = bbox_list[0];
        last_bboxes.erase(last_bboxes.begin());
        vector<vector<Bbox>> all_track_bbox_list;
        vector<vector<cv::Mat>> all_face_list, all_mouth_list;
        for(int i = 0; i < first_bboxes.size(); ++i){
            Bbox first_bbox = first_bboxes[i];
            vector<Bbox> track_bbox_list;
            vector<cv::Mat> trace_face_list, trace_mouth_list;
            track_bbox_list.push_back(first_bbox);
            trace_face_list.push_back(face_list[0][i]);
            trace_mouth_list.push_back(mouth_list[0][i]);
            for(int j = 0; j < last_bboxes.size(); ++j){
                vector<Bbox> next_bboxes = last_bboxes[j];
                for(int k = 0; k < next_bboxes.size(); ++k){
                    Bbox next_bbox = next_bboxes[k];
                    float iou = iou_compute(first_bbox, next_bbox);
                    if(iou >= 0.4){
                        track_bbox_list.push_back(next_bbox);
                        trace_face_list.push_back(face_list[j+1][k]);
                        trace_mouth_list.push_back(mouth_list[j+1][k]);
                        break;
                    }
                }
            }
            all_track_bbox_list.push_back(track_bbox_list);
            all_face_list.push_back(trace_face_list);
            all_mouth_list.push_back(trace_mouth_list);
        }
        for(int j = 0; j < all_mouth_list.size(); j++){
            vector<cv::Mat> select_mouth_list = all_mouth_list[j];
            /**
             * @brief Model inference: returns 0/1, where 1 means speaking
             * and 0 means not speaking.
             */
            bool result = speak_cls.inference(select_mouth_list);
            if(result){
                is_talk = true;
                // speak_duration = (split_indices[0], split_indices[-1])
                // Mat speaker = all_face_list[j][0];
                // speaker_str = cv::imencode('.jpg', speaker)[1].tostring()
                // speaker_str = base64.b64encode(speaker_str).decode()
                // int position = j
                // json cur_output={
                //     "is_talk":true,
                //     "speak_duration":[str(speak_duration[0]), str(speak_duration[1])],
                //     "speaker":speaker_str,
                //     "position":position
                // }
                // all_results.push_back(cur_output);
                cout << is_talk << endl;
            }else{
                cout << is_talk << endl;
            }
        }
    }
}
#include "facelandmarks.h"
// FaceLandmarks::~FaceLandmarks(){
// pfld_interpreter->releaseModel();
// pfld_interpreter->releaseSession(session);
// }
bool FaceLandmarks::init_model(string model_path){
pfld_interpreter = unique_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile(model_path.c_str()));
if(nullptr==pfld_interpreter){
return false;
}
//创建session
MNN::ScheduleConfig schedule_config;
schedule_config.type = forward_type;
schedule_config.numThread = num_thread;
MNN::BackendConfig backend_config;
backend_config.memory = MNN::BackendConfig::Memory_Normal;
backend_config.power = MNN::BackendConfig::Power_Normal;
backend_config.precision = MNN::BackendConfig::Precision_Normal;
schedule_config.backendConfig = &backend_config;
session = pfld_interpreter->createSession(schedule_config);
input_tensor = pfld_interpreter->getSessionInput(session,NULL);
pfld_interpreter->resizeTensor(input_tensor,{1,3,112,112});
pfld_interpreter->resizeSession(session);
//数据预处理
MNN::CV::ImageProcess::Config image_config;
::memcpy(image_config.normal,normal,sizeof(normal));
image_config.sourceFormat = MNN::CV::BGR;
image_config.destFormat = MNN::CV::BGR;
pretreat = shared_ptr<MNN::CV::ImageProcess>(MNN::CV::ImageProcess::create(image_config));
// pretreat->setMatrix(transforms);
return true;
}
vector<vector<float>> FaceLandmarks::inference(string image_path){
Mat image = cv::imread(image_path);
vector<vector<float>> landmarks;
int width = image.cols;
int height = image.rows;
Mat resize_image;
cv::resize(image,resize_image,Size(112,112));
float ws = float(width)/float(112.0);
float hs = float(height)/float(112.0);
pretreat->convert(resize_image.data,112,112,0,input_tensor);
pfld_interpreter->runSession(session);
auto output_landmark = pfld_interpreter->getSessionOutput(session, NULL);
MNN::Tensor landmark_tensor(output_landmark, output_landmark->getDimensionType());
output_landmark->copyToHostTensor(&landmark_tensor);
float* result = landmark_tensor.host<float>();
for (int i = 0; i < 106; ++i) {
vector<float> curr_pt={result[2 * i + 0] * ws,result[2 * i + 1] * hs};
landmarks.push_back(curr_pt);
}
return landmarks;
}
vector<vector<float>> FaceLandmarks::inference(Mat image){
vector<vector<float>> landmarks;
int width = image.cols;
int height = image.rows;
Mat resize_image;
cv::resize(image,resize_image,Size(112,112));
float ws = float(width)/float(112.0);
float hs = float(height)/float(112.0);
pretreat->convert(resize_image.data,112,112,0,input_tensor);
pfld_interpreter->runSession(session);
auto output_landmark = pfld_interpreter->getSessionOutput(session, NULL);
MNN::Tensor landmark_tensor(output_landmark, output_landmark->getDimensionType());
output_landmark->copyToHostTensor(&landmark_tensor);
float* result = landmark_tensor.host<float>();
for (int i = 0; i < 106; ++i) {
vector<float> curr_pt={result[2 * i + 0] * ws,result[2 * i + 1] * hs};
landmarks.push_back(curr_pt);
}
return landmarks;
}
#include "speakcls.h"
bool SpeakCls::init_model(string model_path){
net= std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile(model_path.c_str()));//创建解释器
config.numThread = 2;
config.type = MNN_FORWARD_CPU;
session = net->createSession(config);
input_tensor = net->getSessionInput(session,NULL);
net->resizeTensor(input_tensor,{1,3*split_nums,112,112});
net->resizeSession(session);
}
cv::Mat SpeakCls::standardize(cv::Mat image){
cv::Mat image_f,dst;
image.convertTo(image_f, CV_32F);
Scalar max_pix = Scalar(255.0f,255.0f,255.0f);
Scalar mean = Scalar(0.485f, 0.456f, 0.406f);
Scalar std = Scalar(0.229f, 0.224f, 0.225f);
dst=image_f/max_pix;
dst = (dst-mean)/std;
return dst;
}
cv::Mat SpeakCls::data_process(vector<Mat> images){
std::vector<cv::Mat> all_image_channels;
for(auto f:images){
Mat tmp_image = standardize(f);
std::vector<cv::Mat> tmp_channels;
cv::split(tmp_image,tmp_channels);
all_image_channels.push_back(tmp_channels[0]);
all_image_channels.push_back(tmp_channels[1]);
all_image_channels.push_back(tmp_channels[2]);
}
Mat input_data;
cv::merge(all_image_channels,input_data);
return input_data;
}
vector<double> SpeakCls::softmax(vector<double> input){
double total=0;
for(auto x:input)
{
total+=exp(x);
}
vector<double> result;
for(auto x:input)
{
result.push_back(exp(x)/total);
}
return result;
}
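// Worked example for softmax: logits {1.0, 2.0} give
// e^1/(e^1+e^2) ≈ 0.2689 and e^2/(e^1+e^2) ≈ 0.7311, so inference() below
// would report class 1 (speaking) for these scores.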
bool SpeakCls::inference(vector<Mat> images){
Mat input_data=data_process(images);
// cout << _Tensor->elementSize() << endl;
std::vector<std::vector<cv::Mat>> nChannels;
std::vector<cv::Mat> rgbChannels(3*split_nums);
cv::split(input_data, rgbChannels);
nChannels.push_back(rgbChannels); // NHWC 转NCHW
auto *pvData = malloc(1 * 3*split_nums * 112 * 112 *sizeof(float));
int nPlaneSize = 112 * 112;
for (int c = 0; c < 3*split_nums; ++c)
{
cv::Mat matPlane = nChannels[0][c];
memcpy((float *)(pvData) + c * nPlaneSize,\
matPlane.data, nPlaneSize * sizeof(float));
}
auto nchwTensor = new Tensor(input_tensor, Tensor::CAFFE);
::memcpy(nchwTensor->host<float>(), pvData, nPlaneSize * 3*split_nums * sizeof(float));
input_tensor->copyFromHostTensor(nchwTensor);
//推理
net->runSession(session);
auto output= net->getSessionOutput(session, NULL);
MNN::Tensor feat_tensor(output, output->getDimensionType());
output->copyToHostTensor(&feat_tensor);
auto scores_dataPtr = feat_tensor.host<float>();
cout<<scores_dataPtr[0]<<" "<<scores_dataPtr[1]<<endl;
vector<double> outputs={scores_dataPtr[0],scores_dataPtr[1]};
// softmax
vector<double> result=softmax(outputs);
printf("output belong to class: %f %f\n", result[0],result[1]);
if(result[0]>result[1]){
return false;
}else{
return true;
}
}