fix bug
Showing
2 changed files
with
66 additions
and
36 deletions
... | @@ -19,7 +19,7 @@ from common.mixins import LoggerMixin | ... | @@ -19,7 +19,7 @@ from common.mixins import LoggerMixin |
19 | from common.tools.pdf_to_img import PDFHandler | 19 | from common.tools.pdf_to_img import PDFHandler |
20 | from apps.doc import consts | 20 | from apps.doc import consts |
21 | from apps.doc.exceptions import OCR1Exception, OCR4Exception | 21 | from apps.doc.exceptions import OCR1Exception, OCR4Exception |
22 | from apps.doc.ocr.wb import BSWorkbook | 22 | from apps.doc.ocr.wb import BSWorkbook, PatternFill |
23 | 23 | ||
24 | 24 | ||
25 | class Finder: | 25 | class Finder: |
... | @@ -85,7 +85,7 @@ class Finder: | ... | @@ -85,7 +85,7 @@ class Finder: |
85 | pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 | 85 | pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 |
86 | ocr_texts = pattern.sub('', ocr_texts) | 86 | ocr_texts = pattern.sub('', ocr_texts) |
87 | 87 | ||
88 | score = fuzz.ratio(page_template, ocr_texts) / 100. | 88 | score = fuzz.ratio(page_template, ocr_texts)/100. |
89 | classes.append([pno, score]) | 89 | classes.append([pno, score]) |
90 | pred = sorted(classes, key=lambda x: x[1], reverse=True)[0] | 90 | pred = sorted(classes, key=lambda x: x[1], reverse=True)[0] |
91 | return pred | 91 | return pred |
... | @@ -111,7 +111,7 @@ class Finder: | ... | @@ -111,7 +111,7 @@ class Finder: |
111 | continue | 111 | continue |
112 | inter = Polygon(g).intersection(Polygon(p)).area | 112 | inter = Polygon(g).intersection(Polygon(p)).area |
113 | union = g.area + p.area - inter | 113 | union = g.area + p.area - inter |
114 | iou = inter / union | 114 | iou = inter/union |
115 | iou_list.append([iou, key]) | 115 | iou_list.append([iou, key]) |
116 | if len(iou_list) == 0: | 116 | if len(iou_list) == 0: |
117 | return -1, -1 | 117 | return -1, -1 |
... | @@ -141,8 +141,8 @@ class Finder: | ... | @@ -141,8 +141,8 @@ class Finder: |
141 | if len(words) == 0: | 141 | if len(words) == 0: |
142 | # 将 bbox 右移一个单位 | 142 | # 将 bbox 右移一个单位 |
143 | x0, y0, x1, y1, x2, y2, x3, y3 = bbox | 143 | x0, y0, x1, y1, x2, y2, x3, y3 = bbox |
144 | rw = abs(x0 - x1) | 144 | rw = abs(x0-x1) |
145 | anchor = [x0 + rw, y0, x1 + rw, y1, x2 + rw, y2, x3 + rw, y3] | 145 | anchor = [x0+rw, y0, x1+rw, y1, x2+rw, y2, x3+rw, y3] |
146 | iou, key = self.get_top_iou(ocr_results, anchor) | 146 | iou, key = self.get_top_iou(ocr_results, anchor) |
147 | if ratio > 0.3: | 147 | if ratio > 0.3: |
148 | bbox, text = ocr_results[key] | 148 | bbox, text = ocr_results[key] |
... | @@ -223,7 +223,7 @@ class Finder: | ... | @@ -223,7 +223,7 @@ class Finder: |
223 | bbox, text = self.ocr_results[pno][key] | 223 | bbox, text = self.ocr_results[pno][key] |
224 | all_texts += text | 224 | all_texts += text |
225 | 225 | ||
226 | searchObj = re.search(r'保证人\[(.*?)\]与甲方', all_texts) | 226 | searchObj = re.search( r'保证人\[(.*?)\]与甲方', all_texts) |
227 | if searchObj: | 227 | if searchObj: |
228 | words = f'[{searchObj.group(1)}]' | 228 | words = f'[{searchObj.group(1)}]' |
229 | words = words.replace('【', '[').replace('】', ']').replace(',', ',').replace('(', '(').replace(')', ')') | 229 | words = words.replace('【', '[').replace('】', ']').replace(',', ',').replace('(', '(').replace(')', ')') |
... | @@ -256,7 +256,9 @@ class Finder: | ... | @@ -256,7 +256,9 @@ class Finder: |
256 | if score > 0.5: | 256 | if score > 0.5: |
257 | if len(self.ocr_results[pno]) > 0: | 257 | if len(self.ocr_results[pno]) > 0: |
258 | # 根据关键词,找这一行字符 | 258 | # 根据关键词,找这一行字符 |
259 | lines = self.get_line(self.ocr_results[pno], 'RMB') | 259 | lines = '' |
260 | for i in ['RMB', 'CNY']: | ||
261 | lines += self.get_line(self.ocr_results[pno], i) | ||
260 | # searchObj = re.search( r'RMB(.*?)in', lines) | 262 | # searchObj = re.search( r'RMB(.*?)in', lines) |
261 | searchObj = re.search(r'[0-9,.]+', lines) | 263 | searchObj = re.search(r'[0-9,.]+', lines) |
262 | if searchObj: | 264 | if searchObj: |
... | @@ -264,7 +266,7 @@ class Finder: | ... | @@ -264,7 +266,7 @@ class Finder: |
264 | amount_eng = words | 266 | amount_eng = words |
265 | 267 | ||
266 | lines = self.get_line(self.ocr_results[pno], '人民币') | 268 | lines = self.get_line(self.ocr_results[pno], '人民币') |
267 | searchObj = re.search(r'大写(.*?)综合', lines) | 269 | searchObj = re.search( r'大写(.*?)综合', lines) |
268 | if searchObj: | 270 | if searchObj: |
269 | words = searchObj.group(1) | 271 | words = searchObj.group(1) |
270 | pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 | 272 | pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 |
... | @@ -280,11 +282,11 @@ class Finder: | ... | @@ -280,11 +282,11 @@ class Finder: |
280 | lines = self.get_line(self.ocr_results[pno], 'ending') | 282 | lines = self.get_line(self.ocr_results[pno], 'ending') |
281 | if len(lines) > 0: | 283 | if len(lines) > 0: |
282 | start, end = lines.split('ending') | 284 | start, end = lines.split('ending') |
283 | searchStart = re.search(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', start) | 285 | searchStart = re.search( r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', start) |
284 | if searchStart: | 286 | if searchStart: |
285 | words = searchStart.group() | 287 | words = searchStart.group() |
286 | term_start_eng = words | 288 | term_start_eng = words |
287 | searchEnd = re.search(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', end) | 289 | searchEnd = re.search( r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', end) |
288 | if searchEnd: | 290 | if searchEnd: |
289 | words = searchEnd.group() | 291 | words = searchEnd.group() |
290 | term_end_eng = words | 292 | term_end_eng = words |
... | @@ -292,23 +294,23 @@ class Finder: | ... | @@ -292,23 +294,23 @@ class Finder: |
292 | lines = self.get_line(self.ocr_results[pno], '至') | 294 | lines = self.get_line(self.ocr_results[pno], '至') |
293 | if len(lines) > 0: | 295 | if len(lines) > 0: |
294 | start, end = lines.split('至') | 296 | start, end = lines.split('至') |
295 | searchStart = re.search(r'[0-9]{4}-[0-9]+-[0-9]+', start) | 297 | searchStart = re.search( r'[0-9]{4}-[0-9]+-[0-9]+', start) |
296 | if searchStart: | 298 | if searchStart: |
297 | words = searchStart.group() | 299 | words = searchStart.group() |
298 | term_start_chn = words | 300 | term_start_chn = words |
299 | searchEnd = re.search(r'[0-9]{4}-[0-9]+-[0-9]+', end) | 301 | searchEnd = re.search( r'[0-9]{4}-[0-9]+-[0-9]+', end) |
300 | if searchEnd: | 302 | if searchEnd: |
301 | words = searchEnd.group() | 303 | words = searchEnd.group() |
302 | term_end_chn = words | 304 | term_end_chn = words |
303 | 305 | ||
304 | lines = self.get_line(self.ocr_results[pno], 'above') | 306 | lines = self.get_line(self.ocr_results[pno], 'above') |
305 | searchObj = re.search(r'aboveto([0-9]+)', lines.replace('O', '0')) | 307 | searchObj = re.search( r'aboveto([0-9]+)', lines.replace('O', '0').replace('too', 'to0')) |
306 | if searchObj: | 308 | if searchObj: |
307 | words = searchObj.group(1) | 309 | words = searchObj.group(1) |
308 | deposit_eng = f'{words}%' | 310 | deposit_eng = f'{words}%' |
309 | 311 | ||
310 | lines = self.get_line(self.ocr_results[pno], '授信额度的') | 312 | lines = self.get_line(self.ocr_results[pno], '授信额度的') |
311 | searchObj = re.search(r'授信额度的([0-9]+)', lines.replace('O', '0')) | 313 | searchObj = re.search( r'授信额度的([0-9]+)', lines.replace('O', '0')) |
312 | if searchObj: | 314 | if searchObj: |
313 | words = searchObj.group(1) | 315 | words = searchObj.group(1) |
314 | deposit_chn = f'{words}%' | 316 | deposit_chn = f'{words}%' |
... | @@ -371,7 +373,6 @@ class Finder: | ... | @@ -371,7 +373,6 @@ class Finder: |
371 | self.init_result["其他约定与条件中文"] = words_chn | 373 | self.init_result["其他约定与条件中文"] = words_chn |
372 | return self.init_result | 374 | return self.init_result |
373 | 375 | ||
374 | |||
375 | class TIFFHandler: | 376 | class TIFFHandler: |
376 | 377 | ||
377 | def __init__(self, path, img_save_path): | 378 | def __init__(self, path, img_save_path): |
... | @@ -409,6 +410,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -409,6 +410,7 @@ class Command(BaseCommand, LoggerMixin): |
409 | self.input_dir = conf.WSC_DIR | 410 | self.input_dir = conf.WSC_DIR |
410 | # ocr相关 | 411 | # ocr相关 |
411 | self.go_ocr_url = conf.WSC_GO_URL | 412 | self.go_ocr_url = conf.WSC_GO_URL |
413 | self.amount_fill = PatternFill("solid", fgColor="00FFFF00") | ||
412 | # 优雅退出信号:15 | 414 | # 优雅退出信号:15 |
413 | signal.signal(signal.SIGTERM, self.signal_handler) | 415 | signal.signal(signal.SIGTERM, self.signal_handler) |
414 | 416 | ||
... | @@ -435,6 +437,19 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -435,6 +437,19 @@ class Command(BaseCommand, LoggerMixin): |
435 | excel_path = os.path.join(wb_output_dir, excel_name) | 437 | excel_path = os.path.join(wb_output_dir, excel_name) |
436 | return img_save_path, excel_path, pdf_save_path | 438 | return img_save_path, excel_path, pdf_save_path |
437 | 439 | ||
@staticmethod
def get_mode_code(code_list):
    """Return the most frequent value in ``code_list``.

    The result is used by the caller to decide which cells differ from the
    majority value, so ``None`` is returned whenever there is nothing to
    compare against.

    Args:
        code_list: Iterable of hashable values (e.g. contract-code strings).

    Returns:
        The value with the highest occurrence count. When several values
        share the highest count, the one that appears first in
        ``code_list`` wins. Returns ``None`` if ``code_list`` is empty or
        contains only one distinct value.
    """
    # Local import keeps this block self-contained; the file's import
    # section is not touched.
    from collections import Counter

    counts = Counter(code_list)
    # ``<= 1`` (instead of ``== 1``) also covers the empty input, which
    # previously fell through to ``sorted(...)[0]`` and raised IndexError.
    if len(counts) <= 1:
        return None
    # most_common() orders equal counts by first occurrence, matching the
    # stable descending sort over an insertion-ordered dict used before.
    return counts.most_common(1)[0][0]
452 | |||
438 | def res_process(self, all_res, excel_path): | 453 | def res_process(self, all_res, excel_path): |
439 | try: | 454 | try: |
440 | self.finder.ocr_results = all_res | 455 | self.finder.ocr_results = all_res |
... | @@ -442,11 +457,26 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -442,11 +457,26 @@ class Command(BaseCommand, LoggerMixin): |
442 | 457 | ||
443 | wb = BSWorkbook(set(), set(), set(), set(), set()) | 458 | wb = BSWorkbook(set(), set(), set(), set(), set()) |
444 | ws = wb.create_sheet(self.sheet_name) | 459 | ws = wb.create_sheet(self.sheet_name) |
460 | row_idx = 1 | ||
461 | code_idx = 1 | ||
462 | mode_code = None | ||
445 | for write_field, field_value in results.items(): | 463 | for write_field, field_value in results.items(): |
464 | row_idx += 1 | ||
446 | if isinstance(field_value, list): | 465 | if isinstance(field_value, list): |
466 | if write_field == '合同编号列表': | ||
467 | code_idx = row_idx | ||
468 | mode_code = self.get_mode_code(field_value) | ||
447 | ws.append((write_field, *field_value)) | 469 | ws.append((write_field, *field_value)) |
448 | else: | 470 | else: |
449 | ws.append((write_field, field_value)) | 471 | ws.append((write_field, field_value)) |
472 | |||
473 | if isinstance(mode_code, str): | ||
474 | for cell in ws[code_idx]: | ||
475 | if cell.value == '合同编号列表': | ||
476 | continue | ||
477 | if cell.value != mode_code: | ||
478 | cell.fill = self.amount_fill | ||
479 | |||
450 | wb.remove_base_sheet() | 480 | wb.remove_base_sheet() |
451 | wb.save(excel_path) | 481 | wb.save(excel_path) |
452 | except Exception as e: | 482 | except Exception as e: | ... | ... |
... | @@ -257,19 +257,19 @@ class PDFHandler: | ... | @@ -257,19 +257,19 @@ class PDFHandler: |
257 | self.page_to_png(page) | 257 | self.page_to_png(page) |
258 | 258 | ||
259 | def check_ebank(self, pdf): | 259 | def check_ebank(self, pdf): |
260 | page_text_list = [] | 260 | # page_text_list = [] |
261 | text_item_sum = 0 | 261 | text_item_sum = 0 |
262 | for pno in range(pdf.pageCount): | 262 | for pno in range(pdf.pageCount): |
263 | page = pdf.loadPage(pno) | 263 | page = pdf.loadPage(pno) |
264 | if page.rotation is None: | 264 | # if page.rotation is None: |
265 | rotation = 0 | 265 | # rotation = 0 |
266 | elif isinstance(page.rotation, int): | 266 | # elif isinstance(page.rotation, int): |
267 | divisor, remainder = divmod(page.rotation, 90) | 267 | # divisor, remainder = divmod(page.rotation, 90) |
268 | if remainder != 0: | 268 | # if remainder != 0: |
269 | return | 269 | # return |
270 | rotation = divmod(divisor, 4)[1] | 270 | # rotation = divmod(divisor, 4)[1] |
271 | else: | 271 | # else: |
272 | return | 272 | # return |
273 | textpage = page.getTextPage() | 273 | textpage = page.getTextPage() |
274 | text = textpage.extractDICT() | 274 | text = textpage.extractDICT() |
275 | text_list = [] | 275 | text_list = [] |
... | @@ -284,17 +284,17 @@ class PDFHandler: | ... | @@ -284,17 +284,17 @@ class PDFHandler: |
284 | text_item_sum += len(text_list) | 284 | text_item_sum += len(text_list) |
285 | if text_item_sum < (pno + 1) * 5: | 285 | if text_item_sum < (pno + 1) * 5: |
286 | return | 286 | return |
287 | else: | 287 | # else: |
288 | page_text_list.append( | 288 | # page_text_list.append( |
289 | { | 289 | # { |
290 | 'width': text.get('width'), | 290 | # 'width': text.get('width'), |
291 | 'height': text.get('height'), | 291 | # 'height': text.get('height'), |
292 | 'rotation': rotation, | 292 | # 'rotation': rotation, |
293 | 'text': text_list | 293 | # 'text': text_list |
294 | } | 294 | # } |
295 | ) | 295 | # ) |
296 | self.is_ebank = True | 296 | self.is_ebank = True |
297 | self.page_text_list = page_text_list | 297 | # self.page_text_list = page_text_list |
298 | 298 | ||
299 | def extract_image(self, max_img_count=None): | 299 | def extract_image(self, max_img_count=None): |
300 | self.img_path_list = [] | 300 | self.img_path_list = [] |
... | @@ -310,7 +310,7 @@ class PDFHandler: | ... | @@ -310,7 +310,7 @@ class PDFHandler: |
310 | if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: | 310 | if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: |
311 | self.img_count = pdf.pageCount | 311 | self.img_count = pdf.pageCount |
312 | return | 312 | return |
313 | # self.check_ebank(pdf) | 313 | self.check_ebank(pdf) |
314 | for pno in range(pdf.pageCount): | 314 | for pno in range(pdf.pageCount): |
315 | il = pdf.getPageImageList(pno) # 获取页面图片对象 | 315 | il = pdf.getPageImageList(pno) # 获取页面图片对象 |
316 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) | 316 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) | ... | ... |
-
Please register or sign in to post a comment