fix bug
Showing 2 changed files with 66 additions and 36 deletions
| ... | @@ -19,7 +19,7 @@ from common.mixins import LoggerMixin | ... | @@ -19,7 +19,7 @@ from common.mixins import LoggerMixin |
| 19 | from common.tools.pdf_to_img import PDFHandler | 19 | from common.tools.pdf_to_img import PDFHandler |
| 20 | from apps.doc import consts | 20 | from apps.doc import consts |
| 21 | from apps.doc.exceptions import OCR1Exception, OCR4Exception | 21 | from apps.doc.exceptions import OCR1Exception, OCR4Exception |
| 22 | from apps.doc.ocr.wb import BSWorkbook | 22 | from apps.doc.ocr.wb import BSWorkbook, PatternFill |
| 23 | 23 | ||
| 24 | 24 | ||
| 25 | class Finder: | 25 | class Finder: |
| ... | @@ -85,7 +85,7 @@ class Finder: | ... | @@ -85,7 +85,7 @@ class Finder: |
| 85 | pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 | 85 | pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 |
| 86 | ocr_texts = pattern.sub('', ocr_texts) | 86 | ocr_texts = pattern.sub('', ocr_texts) |
| 87 | 87 | ||
| 88 | score = fuzz.ratio(page_template, ocr_texts) / 100. | 88 | score = fuzz.ratio(page_template, ocr_texts)/100. |
| 89 | classes.append([pno, score]) | 89 | classes.append([pno, score]) |
| 90 | pred = sorted(classes, key=lambda x: x[1], reverse=True)[0] | 90 | pred = sorted(classes, key=lambda x: x[1], reverse=True)[0] |
| 91 | return pred | 91 | return pred |
| ... | @@ -111,7 +111,7 @@ class Finder: | ... | @@ -111,7 +111,7 @@ class Finder: |
| 111 | continue | 111 | continue |
| 112 | inter = Polygon(g).intersection(Polygon(p)).area | 112 | inter = Polygon(g).intersection(Polygon(p)).area |
| 113 | union = g.area + p.area - inter | 113 | union = g.area + p.area - inter |
| 114 | iou = inter / union | 114 | iou = inter/union |
| 115 | iou_list.append([iou, key]) | 115 | iou_list.append([iou, key]) |
| 116 | if len(iou_list) == 0: | 116 | if len(iou_list) == 0: |
| 117 | return -1, -1 | 117 | return -1, -1 |
| ... | @@ -141,8 +141,8 @@ class Finder: | ... | @@ -141,8 +141,8 @@ class Finder: |
| 141 | if len(words) == 0: | 141 | if len(words) == 0: |
| 142 | # 将 bbox 右移一个单位 | 142 | # 将 bbox 右移一个单位 |
| 143 | x0, y0, x1, y1, x2, y2, x3, y3 = bbox | 143 | x0, y0, x1, y1, x2, y2, x3, y3 = bbox |
| 144 | rw = abs(x0 - x1) | 144 | rw = abs(x0-x1) |
| 145 | anchor = [x0 + rw, y0, x1 + rw, y1, x2 + rw, y2, x3 + rw, y3] | 145 | anchor = [x0+rw, y0, x1+rw, y1, x2+rw, y2, x3+rw, y3] |
| 146 | iou, key = self.get_top_iou(ocr_results, anchor) | 146 | iou, key = self.get_top_iou(ocr_results, anchor) |
| 147 | if ratio > 0.3: | 147 | if ratio > 0.3: |
| 148 | bbox, text = ocr_results[key] | 148 | bbox, text = ocr_results[key] |
| ... | @@ -223,7 +223,7 @@ class Finder: | ... | @@ -223,7 +223,7 @@ class Finder: |
| 223 | bbox, text = self.ocr_results[pno][key] | 223 | bbox, text = self.ocr_results[pno][key] |
| 224 | all_texts += text | 224 | all_texts += text |
| 225 | 225 | ||
| 226 | searchObj = re.search(r'保证人\[(.*?)\]与甲方', all_texts) | 226 | searchObj = re.search( r'保证人\[(.*?)\]与甲方', all_texts) |
| 227 | if searchObj: | 227 | if searchObj: |
| 228 | words = f'[{searchObj.group(1)}]' | 228 | words = f'[{searchObj.group(1)}]' |
| 229 | words = words.replace('【', '[').replace('】', ']').replace(',', ',').replace('(', '(').replace(')', ')') | 229 | words = words.replace('【', '[').replace('】', ']').replace(',', ',').replace('(', '(').replace(')', ')') |
| ... | @@ -256,7 +256,9 @@ class Finder: | ... | @@ -256,7 +256,9 @@ class Finder: |
| 256 | if score > 0.5: | 256 | if score > 0.5: |
| 257 | if len(self.ocr_results[pno]) > 0: | 257 | if len(self.ocr_results[pno]) > 0: |
| 258 | # 根据关键词,找这一行字符 | 258 | # 根据关键词,找这一行字符 |
| 259 | lines = self.get_line(self.ocr_results[pno], 'RMB') | 259 | lines = '' |
| 260 | for i in ['RMB', 'CNY']: | ||
| 261 | lines += self.get_line(self.ocr_results[pno], i) | ||
| 260 | # searchObj = re.search( r'RMB(.*?)in', lines) | 262 | # searchObj = re.search( r'RMB(.*?)in', lines) |
| 261 | searchObj = re.search(r'[0-9,.]+', lines) | 263 | searchObj = re.search(r'[0-9,.]+', lines) |
| 262 | if searchObj: | 264 | if searchObj: |
| ... | @@ -264,7 +266,7 @@ class Finder: | ... | @@ -264,7 +266,7 @@ class Finder: |
| 264 | amount_eng = words | 266 | amount_eng = words |
| 265 | 267 | ||
| 266 | lines = self.get_line(self.ocr_results[pno], '人民币') | 268 | lines = self.get_line(self.ocr_results[pno], '人民币') |
| 267 | searchObj = re.search(r'大写(.*?)综合', lines) | 269 | searchObj = re.search( r'大写(.*?)综合', lines) |
| 268 | if searchObj: | 270 | if searchObj: |
| 269 | words = searchObj.group(1) | 271 | words = searchObj.group(1) |
| 270 | pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 | 272 | pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 |
| ... | @@ -280,11 +282,11 @@ class Finder: | ... | @@ -280,11 +282,11 @@ class Finder: |
| 280 | lines = self.get_line(self.ocr_results[pno], 'ending') | 282 | lines = self.get_line(self.ocr_results[pno], 'ending') |
| 281 | if len(lines) > 0: | 283 | if len(lines) > 0: |
| 282 | start, end = lines.split('ending') | 284 | start, end = lines.split('ending') |
| 283 | searchStart = re.search(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', start) | 285 | searchStart = re.search( r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', start) |
| 284 | if searchStart: | 286 | if searchStart: |
| 285 | words = searchStart.group() | 287 | words = searchStart.group() |
| 286 | term_start_eng = words | 288 | term_start_eng = words |
| 287 | searchEnd = re.search(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', end) | 289 | searchEnd = re.search( r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', end) |
| 288 | if searchEnd: | 290 | if searchEnd: |
| 289 | words = searchEnd.group() | 291 | words = searchEnd.group() |
| 290 | term_end_eng = words | 292 | term_end_eng = words |
| ... | @@ -292,23 +294,23 @@ class Finder: | ... | @@ -292,23 +294,23 @@ class Finder: |
| 292 | lines = self.get_line(self.ocr_results[pno], '至') | 294 | lines = self.get_line(self.ocr_results[pno], '至') |
| 293 | if len(lines) > 0: | 295 | if len(lines) > 0: |
| 294 | start, end = lines.split('至') | 296 | start, end = lines.split('至') |
| 295 | searchStart = re.search(r'[0-9]{4}-[0-9]+-[0-9]+', start) | 297 | searchStart = re.search( r'[0-9]{4}-[0-9]+-[0-9]+', start) |
| 296 | if searchStart: | 298 | if searchStart: |
| 297 | words = searchStart.group() | 299 | words = searchStart.group() |
| 298 | term_start_chn = words | 300 | term_start_chn = words |
| 299 | searchEnd = re.search(r'[0-9]{4}-[0-9]+-[0-9]+', end) | 301 | searchEnd = re.search( r'[0-9]{4}-[0-9]+-[0-9]+', end) |
| 300 | if searchEnd: | 302 | if searchEnd: |
| 301 | words = searchEnd.group() | 303 | words = searchEnd.group() |
| 302 | term_end_chn = words | 304 | term_end_chn = words |
| 303 | 305 | ||
| 304 | lines = self.get_line(self.ocr_results[pno], 'above') | 306 | lines = self.get_line(self.ocr_results[pno], 'above') |
| 305 | searchObj = re.search(r'aboveto([0-9]+)', lines.replace('O', '0')) | 307 | searchObj = re.search( r'aboveto([0-9]+)', lines.replace('O', '0').replace('too', 'to0')) |
| 306 | if searchObj: | 308 | if searchObj: |
| 307 | words = searchObj.group(1) | 309 | words = searchObj.group(1) |
| 308 | deposit_eng = f'{words}%' | 310 | deposit_eng = f'{words}%' |
| 309 | 311 | ||
| 310 | lines = self.get_line(self.ocr_results[pno], '授信额度的') | 312 | lines = self.get_line(self.ocr_results[pno], '授信额度的') |
| 311 | searchObj = re.search(r'授信额度的([0-9]+)', lines.replace('O', '0')) | 313 | searchObj = re.search( r'授信额度的([0-9]+)', lines.replace('O', '0')) |
| 312 | if searchObj: | 314 | if searchObj: |
| 313 | words = searchObj.group(1) | 315 | words = searchObj.group(1) |
| 314 | deposit_chn = f'{words}%' | 316 | deposit_chn = f'{words}%' |
| ... | @@ -371,7 +373,6 @@ class Finder: | ... | @@ -371,7 +373,6 @@ class Finder: |
| 371 | self.init_result["其他约定与条件中文"] = words_chn | 373 | self.init_result["其他约定与条件中文"] = words_chn |
| 372 | return self.init_result | 374 | return self.init_result |
| 373 | 375 | ||
| 374 | |||
| 375 | class TIFFHandler: | 376 | class TIFFHandler: |
| 376 | 377 | ||
| 377 | def __init__(self, path, img_save_path): | 378 | def __init__(self, path, img_save_path): |
| ... | @@ -409,6 +410,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -409,6 +410,7 @@ class Command(BaseCommand, LoggerMixin): |
| 409 | self.input_dir = conf.WSC_DIR | 410 | self.input_dir = conf.WSC_DIR |
| 410 | # ocr相关 | 411 | # ocr相关 |
| 411 | self.go_ocr_url = conf.WSC_GO_URL | 412 | self.go_ocr_url = conf.WSC_GO_URL |
| 413 | self.amount_fill = PatternFill("solid", fgColor="00FFFF00") | ||
| 412 | # 优雅退出信号:15 | 414 | # 优雅退出信号:15 |
| 413 | signal.signal(signal.SIGTERM, self.signal_handler) | 415 | signal.signal(signal.SIGTERM, self.signal_handler) |
| 414 | 416 | ||
| ... | @@ -435,6 +437,19 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -435,6 +437,19 @@ class Command(BaseCommand, LoggerMixin): |
| 435 | excel_path = os.path.join(wb_output_dir, excel_name) | 437 | excel_path = os.path.join(wb_output_dir, excel_name) |
| 436 | return img_save_path, excel_path, pdf_save_path | 438 | return img_save_path, excel_path, pdf_save_path |
| 437 | 439 | ||
| 440 | @staticmethod | ||
| 441 | def get_mode_code(code_list): | ||
| 442 | result_dict = {} | ||
| 443 | for code in code_list: | ||
| 444 | if code in result_dict: | ||
| 445 | result_dict[code] += 1 | ||
| 446 | else: | ||
| 447 | result_dict[code] = 1 | ||
| 448 | if len(result_dict) == 1: | ||
| 449 | return None | ||
| 450 | else: | ||
| 451 | return sorted(result_dict.items(), key=lambda x:x[1], reverse=True)[0][0] | ||
| 452 | |||
| 438 | def res_process(self, all_res, excel_path): | 453 | def res_process(self, all_res, excel_path): |
| 439 | try: | 454 | try: |
| 440 | self.finder.ocr_results = all_res | 455 | self.finder.ocr_results = all_res |
| ... | @@ -442,11 +457,26 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -442,11 +457,26 @@ class Command(BaseCommand, LoggerMixin): |
| 442 | 457 | ||
| 443 | wb = BSWorkbook(set(), set(), set(), set(), set()) | 458 | wb = BSWorkbook(set(), set(), set(), set(), set()) |
| 444 | ws = wb.create_sheet(self.sheet_name) | 459 | ws = wb.create_sheet(self.sheet_name) |
| 460 | row_idx = 1 | ||
| 461 | code_idx = 1 | ||
| 462 | mode_code = None | ||
| 445 | for write_field, field_value in results.items(): | 463 | for write_field, field_value in results.items(): |
| 464 | row_idx += 1 | ||
| 446 | if isinstance(field_value, list): | 465 | if isinstance(field_value, list): |
| 466 | if write_field == '合同编号列表': | ||
| 467 | code_idx = row_idx | ||
| 468 | mode_code = self.get_mode_code(field_value) | ||
| 447 | ws.append((write_field, *field_value)) | 469 | ws.append((write_field, *field_value)) |
| 448 | else: | 470 | else: |
| 449 | ws.append((write_field, field_value)) | 471 | ws.append((write_field, field_value)) |
| 472 | |||
| 473 | if isinstance(mode_code, str): | ||
| 474 | for cell in ws[code_idx]: | ||
| 475 | if cell.value == '合同编号列表': | ||
| 476 | continue | ||
| 477 | if cell.value != mode_code: | ||
| 478 | cell.fill = self.amount_fill | ||
| 479 | |||
| 450 | wb.remove_base_sheet() | 480 | wb.remove_base_sheet() |
| 451 | wb.save(excel_path) | 481 | wb.save(excel_path) |
| 452 | except Exception as e: | 482 | except Exception as e: | ... | ... |
| ... | @@ -257,19 +257,19 @@ class PDFHandler: | ... | @@ -257,19 +257,19 @@ class PDFHandler: |
| 257 | self.page_to_png(page) | 257 | self.page_to_png(page) |
| 258 | 258 | ||
| 259 | def check_ebank(self, pdf): | 259 | def check_ebank(self, pdf): |
| 260 | page_text_list = [] | 260 | # page_text_list = [] |
| 261 | text_item_sum = 0 | 261 | text_item_sum = 0 |
| 262 | for pno in range(pdf.pageCount): | 262 | for pno in range(pdf.pageCount): |
| 263 | page = pdf.loadPage(pno) | 263 | page = pdf.loadPage(pno) |
| 264 | if page.rotation is None: | 264 | # if page.rotation is None: |
| 265 | rotation = 0 | 265 | # rotation = 0 |
| 266 | elif isinstance(page.rotation, int): | 266 | # elif isinstance(page.rotation, int): |
| 267 | divisor, remainder = divmod(page.rotation, 90) | 267 | # divisor, remainder = divmod(page.rotation, 90) |
| 268 | if remainder != 0: | 268 | # if remainder != 0: |
| 269 | return | 269 | # return |
| 270 | rotation = divmod(divisor, 4)[1] | 270 | # rotation = divmod(divisor, 4)[1] |
| 271 | else: | 271 | # else: |
| 272 | return | 272 | # return |
| 273 | textpage = page.getTextPage() | 273 | textpage = page.getTextPage() |
| 274 | text = textpage.extractDICT() | 274 | text = textpage.extractDICT() |
| 275 | text_list = [] | 275 | text_list = [] |
| ... | @@ -284,17 +284,17 @@ class PDFHandler: | ... | @@ -284,17 +284,17 @@ class PDFHandler: |
| 284 | text_item_sum += len(text_list) | 284 | text_item_sum += len(text_list) |
| 285 | if text_item_sum < (pno + 1) * 5: | 285 | if text_item_sum < (pno + 1) * 5: |
| 286 | return | 286 | return |
| 287 | else: | 287 | # else: |
| 288 | page_text_list.append( | 288 | # page_text_list.append( |
| 289 | { | 289 | # { |
| 290 | 'width': text.get('width'), | 290 | # 'width': text.get('width'), |
| 291 | 'height': text.get('height'), | 291 | # 'height': text.get('height'), |
| 292 | 'rotation': rotation, | 292 | # 'rotation': rotation, |
| 293 | 'text': text_list | 293 | # 'text': text_list |
| 294 | } | 294 | # } |
| 295 | ) | 295 | # ) |
| 296 | self.is_ebank = True | 296 | self.is_ebank = True |
| 297 | self.page_text_list = page_text_list | 297 | # self.page_text_list = page_text_list |
| 298 | 298 | ||
| 299 | def extract_image(self, max_img_count=None): | 299 | def extract_image(self, max_img_count=None): |
| 300 | self.img_path_list = [] | 300 | self.img_path_list = [] |
| ... | @@ -310,7 +310,7 @@ class PDFHandler: | ... | @@ -310,7 +310,7 @@ class PDFHandler: |
| 310 | if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: | 310 | if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: |
| 311 | self.img_count = pdf.pageCount | 311 | self.img_count = pdf.pageCount |
| 312 | return | 312 | return |
| 313 | # self.check_ebank(pdf) | 313 | self.check_ebank(pdf) |
| 314 | for pno in range(pdf.pageCount): | 314 | for pno in range(pdf.pageCount): |
| 315 | il = pdf.getPageImageList(pno) # 获取页面图片对象 | 315 | il = pdf.getPageImageList(pno) # 获取页面图片对象 |
| 316 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) | 316 | # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) | ... | ... |
-
Please register or sign in to post a comment