92b21d6a by 周伟奇

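fix bug: also search 'CNY' lines for the amount, correct OCR 'too' -> 'to0' in the deposit pattern, highlight contract codes that differ from the most common one, and re-enable check_ebank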

1 parent 6ba8d65d
...@@ -19,16 +19,16 @@ from common.mixins import LoggerMixin ...@@ -19,16 +19,16 @@ from common.mixins import LoggerMixin
19 from common.tools.pdf_to_img import PDFHandler 19 from common.tools.pdf_to_img import PDFHandler
20 from apps.doc import consts 20 from apps.doc import consts
21 from apps.doc.exceptions import OCR1Exception, OCR4Exception 21 from apps.doc.exceptions import OCR1Exception, OCR4Exception
22 from apps.doc.ocr.wb import BSWorkbook 22 from apps.doc.ocr.wb import BSWorkbook, PatternFill
23 23
24 24
25 class Finder: 25 class Finder:
26 """Summary 26 """Summary
27 27
28 Attributes: 28 Attributes:
29 ocr_results (TYPE): Description 29 ocr_results (TYPE): Description
30 """ 30 """
31 31
32 def __init__(self, ocr_results=None): 32 def __init__(self, ocr_results=None):
33 self.ocr_results = ocr_results 33 self.ocr_results = ocr_results
34 34
...@@ -82,15 +82,15 @@ class Finder: ...@@ -82,15 +82,15 @@ class Finder:
82 for key in ocr_results[pno]: 82 for key in ocr_results[pno]:
83 bbox, text = ocr_results[pno][key] 83 bbox, text = ocr_results[pno][key]
84 ocr_texts += text 84 ocr_texts += text
85 pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 85 pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符
86 ocr_texts = pattern.sub('', ocr_texts) 86 ocr_texts = pattern.sub('', ocr_texts)
87 87
88 score = fuzz.ratio(page_template, ocr_texts) / 100. 88 score = fuzz.ratio(page_template, ocr_texts)/100.
89 classes.append([pno, score]) 89 classes.append([pno, score])
90 pred = sorted(classes, key=lambda x: x[1], reverse=True)[0] 90 pred = sorted(classes, key=lambda x: x[1], reverse=True)[0]
91 return pred 91 return pred
92 92
93 def get_top_key(self, ocr_results, key_string): # 加入过滤词典 93 def get_top_key(self, ocr_results, key_string): # 加入过滤词典
94 """找到与 key_string 最匹配的字段的 key 94 """找到与 key_string 最匹配的字段的 key
95 """ 95 """
96 if len(ocr_results) == 0: 96 if len(ocr_results) == 0:
...@@ -111,7 +111,7 @@ class Finder: ...@@ -111,7 +111,7 @@ class Finder:
111 continue 111 continue
112 inter = Polygon(g).intersection(Polygon(p)).area 112 inter = Polygon(g).intersection(Polygon(p)).area
113 union = g.area + p.area - inter 113 union = g.area + p.area - inter
114 iou = inter / union 114 iou = inter/union
115 iou_list.append([iou, key]) 115 iou_list.append([iou, key])
116 if len(iou_list) == 0: 116 if len(iou_list) == 0:
117 return -1, -1 117 return -1, -1
...@@ -128,8 +128,8 @@ class Finder: ...@@ -128,8 +128,8 @@ class Finder:
128 bbox, text = ocr_results[key] 128 bbox, text = ocr_results[key]
129 # 定制化规则, 比如过滤一些词呀什么的 129 # 定制化规则, 比如过滤一些词呀什么的
130 # 该例中, 我们要去掉非中文字符 130 # 该例中, 我们要去掉非中文字符
131 pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 131 pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符
132 text = pattern.sub('', text) 132 text = pattern.sub('', text)
133 tmp_ocr_results[key] = [bbox, text] 133 tmp_ocr_results[key] = [bbox, text]
134 134
135 # 先根据 key_string 找到 key 的位置所在, 再判断该位置是否包含 value 135 # 先根据 key_string 找到 key 的位置所在, 再判断该位置是否包含 value
...@@ -141,8 +141,8 @@ class Finder: ...@@ -141,8 +141,8 @@ class Finder:
141 if len(words) == 0: 141 if len(words) == 0:
142 # 将 bbox 右移一个单位 142 # 将 bbox 右移一个单位
143 x0, y0, x1, y1, x2, y2, x3, y3 = bbox 143 x0, y0, x1, y1, x2, y2, x3, y3 = bbox
144 rw = abs(x0 - x1) 144 rw = abs(x0-x1)
145 anchor = [x0 + rw, y0, x1 + rw, y1, x2 + rw, y2, x3 + rw, y3] 145 anchor = [x0+rw, y0, x1+rw, y1, x2+rw, y2, x3+rw, y3]
146 iou, key = self.get_top_iou(ocr_results, anchor) 146 iou, key = self.get_top_iou(ocr_results, anchor)
147 if ratio > 0.3: 147 if ratio > 0.3:
148 bbox, text = ocr_results[key] 148 bbox, text = ocr_results[key]
...@@ -223,7 +223,7 @@ class Finder: ...@@ -223,7 +223,7 @@ class Finder:
223 bbox, text = self.ocr_results[pno][key] 223 bbox, text = self.ocr_results[pno][key]
224 all_texts += text 224 all_texts += text
225 225
226 searchObj = re.search(r'保证人\[(.*?)\]与甲方', all_texts) 226 searchObj = re.search( r'保证人\[(.*?)\]与甲方', all_texts)
227 if searchObj: 227 if searchObj:
228 words = f'[{searchObj.group(1)}]' 228 words = f'[{searchObj.group(1)}]'
229 words = words.replace('【', '[').replace('】', ']').replace(',', ',').replace('(', '(').replace(')', ')') 229 words = words.replace('【', '[').replace('】', ']').replace(',', ',').replace('(', '(').replace(')', ')')
...@@ -256,7 +256,9 @@ class Finder: ...@@ -256,7 +256,9 @@ class Finder:
256 if score > 0.5: 256 if score > 0.5:
257 if len(self.ocr_results[pno]) > 0: 257 if len(self.ocr_results[pno]) > 0:
258 # 根据关键词,找这一行字符 258 # 根据关键词,找这一行字符
259 lines = self.get_line(self.ocr_results[pno], 'RMB') 259 lines = ''
260 for i in ['RMB', 'CNY']:
261 lines += self.get_line(self.ocr_results[pno], i)
260 # searchObj = re.search( r'RMB(.*?)in', lines) 262 # searchObj = re.search( r'RMB(.*?)in', lines)
261 searchObj = re.search(r'[0-9,.]+', lines) 263 searchObj = re.search(r'[0-9,.]+', lines)
262 if searchObj: 264 if searchObj:
...@@ -264,10 +266,10 @@ class Finder: ...@@ -264,10 +266,10 @@ class Finder:
264 amount_eng = words 266 amount_eng = words
265 267
266 lines = self.get_line(self.ocr_results[pno], '人民币') 268 lines = self.get_line(self.ocr_results[pno], '人民币')
267 searchObj = re.search(r'大写(.*?)综合', lines) 269 searchObj = re.search( r'大写(.*?)综合', lines)
268 if searchObj: 270 if searchObj:
269 words = searchObj.group(1) 271 words = searchObj.group(1)
270 pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符 272 pattern = re.compile("[^\u4e00-\u9fa5]") # 匹配不是中文的其他字符
271 words = pattern.sub('', words) 273 words = pattern.sub('', words)
272 words = words.replace("仔", "仟").replace("任", "仟") 274 words = words.replace("仔", "仟").replace("任", "仟")
273 words = words.replace("值", "佰") 275 words = words.replace("值", "佰")
...@@ -276,15 +278,15 @@ class Finder: ...@@ -276,15 +278,15 @@ class Finder:
276 words = words.replace("政", "玖") 278 words = words.replace("政", "玖")
277 words = words.replace("垒", "叁") 279 words = words.replace("垒", "叁")
278 amount_chn = words 280 amount_chn = words
279 281
280 lines = self.get_line(self.ocr_results[pno], 'ending') 282 lines = self.get_line(self.ocr_results[pno], 'ending')
281 if len(lines) > 0: 283 if len(lines) > 0:
282 start, end = lines.split('ending') 284 start, end = lines.split('ending')
283 searchStart = re.search(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', start) 285 searchStart = re.search( r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', start)
284 if searchStart: 286 if searchStart:
285 words = searchStart.group() 287 words = searchStart.group()
286 term_start_eng = words 288 term_start_eng = words
287 searchEnd = re.search(r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', end) 289 searchEnd = re.search( r'[0-9]+-[0-9a-zA-Z]+-[0-9]{4}', end)
288 if searchEnd: 290 if searchEnd:
289 words = searchEnd.group() 291 words = searchEnd.group()
290 term_end_eng = words 292 term_end_eng = words
...@@ -292,29 +294,29 @@ class Finder: ...@@ -292,29 +294,29 @@ class Finder:
292 lines = self.get_line(self.ocr_results[pno], '至') 294 lines = self.get_line(self.ocr_results[pno], '至')
293 if len(lines) > 0: 295 if len(lines) > 0:
294 start, end = lines.split('至') 296 start, end = lines.split('至')
295 searchStart = re.search(r'[0-9]{4}-[0-9]+-[0-9]+', start) 297 searchStart = re.search( r'[0-9]{4}-[0-9]+-[0-9]+', start)
296 if searchStart: 298 if searchStart:
297 words = searchStart.group() 299 words = searchStart.group()
298 term_start_chn = words 300 term_start_chn = words
299 searchEnd = re.search(r'[0-9]{4}-[0-9]+-[0-9]+', end) 301 searchEnd = re.search( r'[0-9]{4}-[0-9]+-[0-9]+', end)
300 if searchEnd: 302 if searchEnd:
301 words = searchEnd.group() 303 words = searchEnd.group()
302 term_end_chn = words 304 term_end_chn = words
303 305
304 lines = self.get_line(self.ocr_results[pno], 'above') 306 lines = self.get_line(self.ocr_results[pno], 'above')
305 searchObj = re.search(r'aboveto([0-9]+)', lines.replace('O', '0')) 307 searchObj = re.search( r'aboveto([0-9]+)', lines.replace('O', '0').replace('too', 'to0'))
306 if searchObj: 308 if searchObj:
307 words = searchObj.group(1) 309 words = searchObj.group(1)
308 deposit_eng = f'{words}%' 310 deposit_eng = f'{words}%'
309 311
310 lines = self.get_line(self.ocr_results[pno], '授信额度的') 312 lines = self.get_line(self.ocr_results[pno], '授信额度的')
311 searchObj = re.search(r'授信额度的([0-9]+)', lines.replace('O', '0')) 313 searchObj = re.search( r'授信额度的([0-9]+)', lines.replace('O', '0'))
312 if searchObj: 314 if searchObj:
313 words = searchObj.group(1) 315 words = searchObj.group(1)
314 deposit_chn = f'{words}%' 316 deposit_chn = f'{words}%'
315 317
316 return amount_eng, amount_chn, term_start_eng, term_end_eng, \ 318 return amount_eng, amount_chn, term_start_eng, term_end_eng, \
317 term_start_chn, term_end_chn, deposit_eng, deposit_chn 319 term_start_chn, term_end_chn, deposit_eng, deposit_chn
318 320
319 def get_other_arrangements_and_conditions(self): 321 def get_other_arrangements_and_conditions(self):
320 """获取其它约定与条件文本段落 322 """获取其它约定与条件文本段落
...@@ -330,7 +332,7 @@ class Finder: ...@@ -330,7 +332,7 @@ class Finder:
330 searchObj = re.search(r'Conditions:(.*?)其他约定与条件', all_texts, re.I) 332 searchObj = re.search(r'Conditions:(.*?)其他约定与条件', all_texts, re.I)
331 if searchObj: 333 if searchObj:
332 words = searchObj.group(1) 334 words = searchObj.group(1)
333 pattern = re.compile("[\u4e00-\u9fa5]") # 去除中文字符 335 pattern = re.compile("[\u4e00-\u9fa5]") # 去除中文字符
334 words = pattern.sub('', words) 336 words = pattern.sub('', words)
335 other_arrangements_and_conditions_eng = words 337 other_arrangements_and_conditions_eng = words
336 338
...@@ -356,7 +358,7 @@ class Finder: ...@@ -356,7 +358,7 @@ class Finder:
356 self.init_result["保证人"] = guarantor 358 self.init_result["保证人"] = guarantor
357 359
358 amount_eng, amount_chn, term_start_eng, term_end_eng, \ 360 amount_eng, amount_chn, term_start_eng, term_end_eng, \
359 term_start_chn, term_end_chn, deposit_eng, deposit_chn = self.get_info_in_page_39() 361 term_start_chn, term_end_chn, deposit_eng, deposit_chn = self.get_info_in_page_39()
360 self.init_result["综合授信额度金额英文"] = amount_eng 362 self.init_result["综合授信额度金额英文"] = amount_eng
361 self.init_result["综合授信额度金额中文"] = amount_chn 363 self.init_result["综合授信额度金额中文"] = amount_chn
362 self.init_result["综合授信额度期限开始日期英文"] = term_start_eng 364 self.init_result["综合授信额度期限开始日期英文"] = term_start_eng
...@@ -371,7 +373,6 @@ class Finder: ...@@ -371,7 +373,6 @@ class Finder:
371 self.init_result["其他约定与条件中文"] = words_chn 373 self.init_result["其他约定与条件中文"] = words_chn
372 return self.init_result 374 return self.init_result
373 375
374
375 class TIFFHandler: 376 class TIFFHandler:
376 377
377 def __init__(self, path, img_save_path): 378 def __init__(self, path, img_save_path):
...@@ -409,6 +410,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -409,6 +410,7 @@ class Command(BaseCommand, LoggerMixin):
409 self.input_dir = conf.WSC_DIR 410 self.input_dir = conf.WSC_DIR
410 # ocr相关 411 # ocr相关
411 self.go_ocr_url = conf.WSC_GO_URL 412 self.go_ocr_url = conf.WSC_GO_URL
413 self.amount_fill = PatternFill("solid", fgColor="00FFFF00")
412 # 优雅退出信号:15 414 # 优雅退出信号:15
413 signal.signal(signal.SIGTERM, self.signal_handler) 415 signal.signal(signal.SIGTERM, self.signal_handler)
414 416
...@@ -435,6 +437,19 @@ class Command(BaseCommand, LoggerMixin): ...@@ -435,6 +437,19 @@ class Command(BaseCommand, LoggerMixin):
435 excel_path = os.path.join(wb_output_dir, excel_name) 437 excel_path = os.path.join(wb_output_dir, excel_name)
436 return img_save_path, excel_path, pdf_save_path 438 return img_save_path, excel_path, pdf_save_path
437 439
440 @staticmethod
441 def get_mode_code(code_list):
442 result_dict = {}
443 for code in code_list:
444 if code in result_dict:
445 result_dict[code] += 1
446 else:
447 result_dict[code] = 1
448 if len(result_dict) == 1:
449 return None
450 else:
451 return sorted(result_dict.items(), key=lambda x:x[1], reverse=True)[0][0]
452
438 def res_process(self, all_res, excel_path): 453 def res_process(self, all_res, excel_path):
439 try: 454 try:
440 self.finder.ocr_results = all_res 455 self.finder.ocr_results = all_res
...@@ -442,11 +457,26 @@ class Command(BaseCommand, LoggerMixin): ...@@ -442,11 +457,26 @@ class Command(BaseCommand, LoggerMixin):
442 457
443 wb = BSWorkbook(set(), set(), set(), set(), set()) 458 wb = BSWorkbook(set(), set(), set(), set(), set())
444 ws = wb.create_sheet(self.sheet_name) 459 ws = wb.create_sheet(self.sheet_name)
460 row_idx = 1
461 code_idx = 1
462 mode_code = None
445 for write_field, field_value in results.items(): 463 for write_field, field_value in results.items():
464 row_idx += 1
446 if isinstance(field_value, list): 465 if isinstance(field_value, list):
466 if write_field == '合同编号列表':
467 code_idx = row_idx
468 mode_code = self.get_mode_code(field_value)
447 ws.append((write_field, *field_value)) 469 ws.append((write_field, *field_value))
448 else: 470 else:
449 ws.append((write_field, field_value)) 471 ws.append((write_field, field_value))
472
473 if isinstance(mode_code, str):
474 for cell in ws[code_idx]:
475 if cell.value == '合同编号列表':
476 continue
477 if cell.value != mode_code:
478 cell.fill = self.amount_fill
479
450 wb.remove_base_sheet() 480 wb.remove_base_sheet()
451 wb.save(excel_path) 481 wb.save(excel_path)
452 except Exception as e: 482 except Exception as e:
......
...@@ -257,19 +257,19 @@ class PDFHandler: ...@@ -257,19 +257,19 @@ class PDFHandler:
257 self.page_to_png(page) 257 self.page_to_png(page)
258 258
259 def check_ebank(self, pdf): 259 def check_ebank(self, pdf):
260 page_text_list = [] 260 # page_text_list = []
261 text_item_sum = 0 261 text_item_sum = 0
262 for pno in range(pdf.pageCount): 262 for pno in range(pdf.pageCount):
263 page = pdf.loadPage(pno) 263 page = pdf.loadPage(pno)
264 if page.rotation is None: 264 # if page.rotation is None:
265 rotation = 0 265 # rotation = 0
266 elif isinstance(page.rotation, int): 266 # elif isinstance(page.rotation, int):
267 divisor, remainder = divmod(page.rotation, 90) 267 # divisor, remainder = divmod(page.rotation, 90)
268 if remainder != 0: 268 # if remainder != 0:
269 return 269 # return
270 rotation = divmod(divisor, 4)[1] 270 # rotation = divmod(divisor, 4)[1]
271 else: 271 # else:
272 return 272 # return
273 textpage = page.getTextPage() 273 textpage = page.getTextPage()
274 text = textpage.extractDICT() 274 text = textpage.extractDICT()
275 text_list = [] 275 text_list = []
...@@ -284,17 +284,17 @@ class PDFHandler: ...@@ -284,17 +284,17 @@ class PDFHandler:
284 text_item_sum += len(text_list) 284 text_item_sum += len(text_list)
285 if text_item_sum < (pno + 1) * 5: 285 if text_item_sum < (pno + 1) * 5:
286 return 286 return
287 else: 287 # else:
288 page_text_list.append( 288 # page_text_list.append(
289 { 289 # {
290 'width': text.get('width'), 290 # 'width': text.get('width'),
291 'height': text.get('height'), 291 # 'height': text.get('height'),
292 'rotation': rotation, 292 # 'rotation': rotation,
293 'text': text_list 293 # 'text': text_list
294 } 294 # }
295 ) 295 # )
296 self.is_ebank = True 296 self.is_ebank = True
297 self.page_text_list = page_text_list 297 # self.page_text_list = page_text_list
298 298
299 def extract_image(self, max_img_count=None): 299 def extract_image(self, max_img_count=None):
300 self.img_path_list = [] 300 self.img_path_list = []
...@@ -310,7 +310,7 @@ class PDFHandler: ...@@ -310,7 +310,7 @@ class PDFHandler:
310 if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count: 310 if isinstance(max_img_count, int) and pdf.pageCount >= max_img_count:
311 self.img_count = pdf.pageCount 311 self.img_count = pdf.pageCount
312 return 312 return
313 # self.check_ebank(pdf) 313 self.check_ebank(pdf)
314 for pno in range(pdf.pageCount): 314 for pno in range(pdf.pageCount):
315 il = pdf.getPageImageList(pno) # 获取页面图片对象 315 il = pdf.getPageImageList(pno) # 获取页面图片对象
316 # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker) 316 # (xref, smask, width, height, bpc, colorspace, alt.colorspace, name, filter, invoker)
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!