license part 1
Showing 2 changed files with 98 additions and 13 deletions
| ... | @@ -478,3 +478,14 @@ BC_FIELD = (('CardNum', '银行卡号'), | ... | @@ -478,3 +478,14 @@ BC_FIELD = (('CardNum', '银行卡号'), |
| 478 | 478 | ||
| 479 | SUCCESS_CODE_SET = {'0', 0} | 479 | SUCCESS_CODE_SET = {'0', 0} |
| 480 | 480 | ||
| 481 | BC_PID = 4 | ||
| 482 | |||
| 483 | OTHER_SET = {0, 1, 2} | ||
| 484 | BS_SET = {10, 11, 12} | ||
| 485 | LICENSE_SET_1 = {110, 111, 112} | ||
| 486 | LICENSE_SET_2 = {1110, 1111, 1112} | ||
| 487 | |||
| 488 | CLASSIFY_PID_DICT = { | ||
| 489 | 0: (4, BC_KEY) # 银行卡 | ||
| 490 | } | ||
| 491 | ... | ... |
| ... | @@ -4,6 +4,7 @@ import signal | ... | @@ -4,6 +4,7 @@ import signal |
| 4 | import asyncio | 4 | import asyncio |
| 5 | import aiohttp | 5 | import aiohttp |
| 6 | import difflib | 6 | import difflib |
| 7 | import base64 | ||
| 7 | import requests | 8 | import requests |
| 8 | from datetime import datetime, date | 9 | from datetime import datetime, date |
| 9 | from collections import Counter | 10 | from collections import Counter |
| ... | @@ -30,7 +31,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -30,7 +31,8 @@ class Command(BaseCommand, LoggerMixin): |
| 30 | # 数据目录 | 31 | # 数据目录 |
| 31 | self.data_dir = conf.DATA_DIR | 32 | self.data_dir = conf.DATA_DIR |
| 32 | # ocr相关 | 33 | # ocr相关 |
| 33 | self.ocr_url = conf.OCR_URL | 34 | self.ocr_url_1 = conf.OCR_URL_1 |
| 35 | self.ocr_url_2 = conf.OCR_URL_2 | ||
| 34 | # EDMS web_service_api | 36 | # EDMS web_service_api |
| 35 | self.edms = EDMS(conf.EDMS_USER, conf.EDMS_PWD) | 37 | self.edms = EDMS(conf.EDMS_USER, conf.EDMS_PWD) |
| 36 | # 优雅退出信号:15 | 38 | # 优雅退出信号:15 |
| ... | @@ -79,7 +81,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -79,7 +81,7 @@ class Command(BaseCommand, LoggerMixin): |
| 79 | return doc_data_path, excel_path, src_excel_path, pdf_path | 81 | return doc_data_path, excel_path, src_excel_path, pdf_path |
| 80 | 82 | ||
| 81 | @staticmethod | 83 | @staticmethod |
| 82 | def append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence): | 84 | def bs_process(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence): |
| 83 | for i, sheet in enumerate(sheets): | 85 | for i, sheet in enumerate(sheets): |
| 84 | sheet_name = 'page_{0}_img_{1}_{2}'.format(pno, img_idx, i) | 86 | sheet_name = 'page_{0}_img_{1}_{2}'.format(pno, img_idx, i) |
| 85 | # ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'] | 87 | # ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'] |
| ... | @@ -137,7 +139,39 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -137,7 +139,39 @@ class Command(BaseCommand, LoggerMixin): |
| 137 | words = cell.get('words') | 139 | words = cell.get('words') |
| 138 | ws.cell(row=r1+1, column=c1+1, value=words) | 140 | ws.cell(row=r1+1, column=c1+1, value=words) |
| 139 | 141 | ||
| 142 | def license2_process(self, img_path, license_summary, pid, license_key): | ||
| 143 | with open(img_path, 'rb') as f: | ||
| 144 | base64_data = base64.b64encode(f.read()) | ||
| 145 | # 获取解码后的base64值 | ||
| 146 | filedata = base64_data.decode() | ||
| 147 | # pid 产品的pid, key, secret 登录之后能够查看到 | ||
| 148 | datas = { | ||
| 149 | "pid": str(pid), | ||
| 150 | "key": conf.OCR_KEY, | ||
| 151 | "secret": conf.OCR_SECRET, | ||
| 152 | "file": filedata | ||
| 153 | } | ||
| 154 | r = requests.post(self.ocr_url_2, data=datas) | ||
| 155 | if r.status_code == 200: | ||
| 156 | # 识别结果 | ||
| 157 | response = r.json() | ||
| 158 | if response.get('ErrorCode') in consts.SUCCESS_CODE_SET: | ||
| 159 | if pid == consts.BC_PID: | ||
| 160 | # 银行卡 | ||
| 161 | res_list = [] | ||
| 162 | for en_key, chn_key in consts.BC_FIELD: | ||
| 163 | res_list.append((chn_key, response.get(en_key, ''))) | ||
| 164 | license_summary.setdefault(license_key, []).append(res_list) | ||
| 165 | else: | ||
| 166 | # 营业执照、行驶证等 | ||
| 167 | for result_dict in response.get('ResultList', []): | ||
| 168 | res_list = [] | ||
| 169 | for field_dict in result_dict.get('FieldList', []): | ||
| 170 | res_list.append((field_dict.get('chn_key', ''), field_dict.get('value', ''))) | ||
| 171 | license_summary.setdefault(license_key, []).append(res_list) | ||
| 172 | |||
| 140 | def ocr_2_wb(self, res, wb, pno, img_idx, bs_summary, unknown_summary, license_summary): | 173 | def ocr_2_wb(self, res, wb, pno, img_idx, bs_summary, unknown_summary, license_summary): |
| 174 | # # 流水 | ||
| 141 | # res = { | 175 | # res = { |
| 142 | # 'code': 1, | 176 | # 'code': 1, |
| 143 | # 'msg': 'success', | 177 | # 'msg': 'success', |
| ... | @@ -156,18 +190,55 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -156,18 +190,55 @@ class Command(BaseCommand, LoggerMixin): |
| 156 | # ] | 190 | # ] |
| 157 | # } | 191 | # } |
| 158 | # } | 192 | # } |
| 193 | # | ||
| 194 | # # 证件-1 | ||
| 195 | # res = { | ||
| 196 | # 'code': 1, | ||
| 197 | # 'msg': 'success', | ||
| 198 | # 'data': { | ||
| 199 | # 'classify': 0, | ||
| 200 | # 'confidence': 0.999, | ||
| 201 | # 'data': [ | ||
| 202 | # { | ||
| 203 | # 'cn_key': 'value', | ||
| 204 | # 'cn_key': 'value', | ||
| 205 | # }, | ||
| 206 | # { | ||
| 207 | # 'cn_key': 'value', | ||
| 208 | # 'cn_key': 'value', | ||
| 209 | # }, | ||
| 210 | # ] | ||
| 211 | # } | ||
| 212 | # } | ||
| 213 | # | ||
| 214 | # # 证件-2 or 其他类 | ||
| 215 | # res = { | ||
| 216 | # 'code': 1, | ||
| 217 | # 'msg': 'success', | ||
| 218 | # 'data': { | ||
| 219 | # 'classify': 0, | ||
| 220 | # 'confidence': 0.999, | ||
| 221 | # } | ||
| 222 | # } | ||
| 223 | |||
| 159 | data = res.get('data', {}) | 224 | data = res.get('data', {}) |
| 160 | classify = data.get('classify') | 225 | classify = data.get('classify') |
| 161 | if classify is None: | 226 | if classify is None: |
| 162 | return | 227 | return |
| 163 | # if classify in | 228 | elif classify in consts.OTHER_SET: # 其他类 |
| 164 | sheets = data.get('sheets', []) | ||
| 165 | if not sheets: | ||
| 166 | return | 229 | return |
| 167 | confidence = data.get('confidence', 1) | 230 | elif classify in consts.BS_SET: # 流水处理 |
| 168 | self.append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence) | 231 | sheets = data.get('sheets', []) |
| 169 | # else: | 232 | if not sheets: |
| 170 | # pass | 233 | return |
| 234 | confidence = data.get('confidence', 1) | ||
| 235 | self.bs_process(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence) | ||
| 236 | elif classify in consts.LICENSE_SET_1: # 证件1 | ||
| 237 | # self.license1_process() # TODO license1 | ||
| 238 | pass | ||
| 239 | elif classify in consts.LICENSE_SET_2: # 证件2 | ||
| 240 | pid, license_key = consts.CLASSIFY_PID_DICT.get(classify) | ||
| 241 | self.license2_process(license_summary, pid, license_key) # TODO reuse img data? | ||
| 171 | 242 | ||
| 172 | # async def fetch_ocr_result(self, img_path): | 243 | # async def fetch_ocr_result(self, img_path): |
| 173 | # async with aiohttp.ClientSession( | 244 | # async with aiohttp.ClientSession( |
| ... | @@ -188,8 +259,9 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -188,8 +259,9 @@ class Command(BaseCommand, LoggerMixin): |
| 188 | files = [ | 259 | files = [ |
| 189 | ('img', open(img_path, 'rb')) | 260 | ('img', open(img_path, 'rb')) |
| 190 | ] | 261 | ] |
| 191 | response = requests.request("POST", self.ocr_url, files=files) | 262 | response = requests.request("POST", self.ocr_url_1, files=files) |
| 192 | return response.json() | 263 | if response.status_code == 200: |
| 264 | return response.json() | ||
| 193 | 265 | ||
| 194 | def img_2_ocr_2_wb(self, wb, img_info, bs_summary, unknown_summary, license_summary): | 266 | def img_2_ocr_2_wb(self, wb, img_info, bs_summary, unknown_summary, license_summary): |
| 195 | res = self.fetch_ocr_result(img_info[0]) | 267 | res = self.fetch_ocr_result(img_info[0]) |
| ... | @@ -255,7 +327,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -255,7 +327,6 @@ class Command(BaseCommand, LoggerMixin): |
| 255 | summary['role'] = self.get_most(summary['role']) | 327 | summary['role'] = self.get_most(summary['role']) |
| 256 | return bs_summary | 328 | return bs_summary |
| 257 | 329 | ||
| 258 | |||
| 259 | def rebuild_bs_summary(self, bs_summary, unknown_summary): | 330 | def rebuild_bs_summary(self, bs_summary, unknown_summary): |
| 260 | # bs_summary = { | 331 | # bs_summary = { |
| 261 | # '卡号': { | 332 | # '卡号': { |
| ... | @@ -336,9 +407,12 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -336,9 +407,12 @@ class Command(BaseCommand, LoggerMixin): |
| 336 | return merged_bs_summary | 407 | return merged_bs_summary |
| 337 | 408 | ||
| 338 | # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 | 409 | # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 |
| 339 | # TODO 调用接口重试 | ||
| 340 | # TODO 协程异步发送OCR请求 | 410 | # TODO 协程异步发送OCR请求 |
| 411 | # TODO 调用接口重试 | ||
| 341 | # TODO 异常邮件通知 | 412 | # TODO 异常邮件通知 |
| 413 | # 识别失败:普通异常,如PDF异常、构建过程异常 | ||
| 414 | # EDMS异常:下载异常-->回队列-->邮件;上传异常-->重新上传队列-->邮件 | ||
| 415 | # 算法异常:第一道异常-->识别失败-->邮件;第二道异常-->识别失败-->邮件 | ||
| 342 | # TODO 数据库断联问题 | 416 | # TODO 数据库断联问题 |
| 343 | # TODO 非流水证件处理 | 417 | # TODO 非流水证件处理 |
| 344 | # TODO EDMS API GATEWAY | 418 | # TODO EDMS API GATEWAY | ... | ... |
-
Please register or sign in to post a comment