ba0dc000 by 冯轩

Merge branch 'feature/uat-tmp' of gitlab.situdata.com:zhouweiqi/bmw-ocr into feature/uat-tmp

2 parents f3671aab c92067d8
...@@ -32,3 +32,4 @@ data/* ...@@ -32,3 +32,4 @@ data/*
32 32
33 test* 33 test*
34 flow_test.py 34 flow_test.py
35 pdf_test.py
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -2434,14 +2434,14 @@ ECONTRACT_KEYWORDS_MAP = { ...@@ -2434,14 +2434,14 @@ ECONTRACT_KEYWORDS_MAP = {
2434 2434
2435 FSM_ECONTRACT_KEYWORDS_MAP = { 2435 FSM_ECONTRACT_KEYWORDS_MAP = {
2436 AFC_PREFIX: [ 2436 AFC_PREFIX: [
2437 ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY), 2437 ('延长保修服务合约', FSM_CONTRACT_WEP_CLASSIFY),
2438 ('长悦保养套餐服务合约', FSM_CONTRACT_MSI_CLASSIFY), 2438 ('长悦保养套餐服务合约', FSM_CONTRACT_MSI_CLASSIFY),
2439 ('汽车销售合同补充合同', FSM_CONTRACT_SC2_CLASSIFY), 2439 ('汽车销售合同补充合同', FSM_CONTRACT_SC2_CLASSIFY),
2440 ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), 2440 ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
2441 ], 2441 ],
2442 HIL_PREFIX: [ 2442 HIL_PREFIX: [
2443 ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY), 2443 ('延长保修服务合约', FSM_CONTRACT_WEP_CLASSIFY),
2444 ('长悦保养套餐服务合', FSM_CONTRACT_MSI_CLASSIFY), 2444 ('长悦保养套餐服务合', FSM_CONTRACT_MSI_CLASSIFY),
2445 ('汽车销售合同补充合同', FSM_CONTRACT_SC2_CLASSIFY), 2445 ('汽车销售合同补充合同', FSM_CONTRACT_SC2_CLASSIFY),
2446 ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), 2446 ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY),
2447 ] 2447 ]
......
...@@ -1339,8 +1339,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -1339,8 +1339,8 @@ class Command(BaseCommand, LoggerMixin):
1339 pdf_handler.extract_image(max_img_count) 1339 pdf_handler.extract_image(max_img_count)
1340 end_time = time.time() 1340 end_time = time.time()
1341 speed_time = int(end_time - start_time) 1341 speed_time = int(end_time - start_time)
1342 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format( 1342 self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
1343 self.log_base, task_str, times, speed_time)) 1343 self.log_base, task_str, times, speed_time, pdf_handler.is_new_modify))
1344 except Exception as e: 1344 except Exception as e:
1345 self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' 1345 self.online_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
1346 '[error={3}]'.format(self.log_base, task_str, times, 1346 '[error={3}]'.format(self.log_base, task_str, times,
......
...@@ -6,7 +6,7 @@ retriever_list = [Retriever(WEP_FIELD), Retriever(MSI_FIELD), Retriever(SC_FIELD ...@@ -6,7 +6,7 @@ retriever_list = [Retriever(WEP_FIELD), Retriever(MSI_FIELD), Retriever(SC_FIELD
6 6
7 def predict(pdf_info, file_type=0): 7 def predict(pdf_info, file_type=0):
8 retriever = retriever_list[file_type] 8 retriever = retriever_list[file_type]
9 pdf_text_list, pdf_img_list = pdf_info_rebuild(pdf_info) 9 pdf_text_list, pdf_img_list = pdf_info_rebuild(pdf_info, file_type=file_type)
10 return retriever.get_target_fields(pdf_text_list, pdf_img_list) 10 return retriever.get_target_fields(pdf_text_list, pdf_img_list)
11 11
12 12
......
1 def pdf_info_rebuild(pdf_info, fix_bbox=True): 1 def pdf_info_rebuild(pdf_info, fix_bbox=True, file_type=0):
2 pdf_text_info = dict() 2 pdf_text_info = dict()
3 pdf_img_info = dict() 3 pdf_img_info = dict()
4 for pno_str, page_info in pdf_info.items(): 4 for pno_str, page_info in pdf_info.items():
...@@ -11,6 +11,7 @@ def pdf_info_rebuild(pdf_info, fix_bbox=True): ...@@ -11,6 +11,7 @@ def pdf_info_rebuild(pdf_info, fix_bbox=True):
11 for span in line['spans']: 11 for span in line['spans']:
12 bbox, text = span['bbox'], span['text'].strip() 12 bbox, text = span['bbox'], span['text'].strip()
13 if len(text) != 0 and text not in text_set: 13 if len(text) != 0 and text not in text_set:
14 if file_type != 3: # 汽车销售合同补充协议,相同的总价会被过滤,所以取消
14 text_set.add(text) 15 text_set.add(text)
15 # bbox的高,不准 16 # bbox的高,不准
16 if fix_bbox and bbox[-1] - bbox[1] < span['size']: 17 if fix_bbox and bbox[-1] - bbox[1] < span['size']:
......
...@@ -12,8 +12,10 @@ from unicodedata import normalize ...@@ -12,8 +12,10 @@ from unicodedata import normalize
12 # 页面保存为png图片参数 12 # 页面保存为png图片参数
13 ZOOM_X_1 = ZOOM_Y_1 = 1.0 13 ZOOM_X_1 = ZOOM_Y_1 = 1.0
14 ZOOM_X_2 = ZOOM_Y_2 = 2.0 14 ZOOM_X_2 = ZOOM_Y_2 = 2.0
15 ZOOM_X_3 = ZOOM_Y_3 = 3.0
15 trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension 16 trans_1 = fitz.Matrix(ZOOM_X_1, ZOOM_X_1).preRotate(0) # zoom factor 1 in each dimension
16 trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension 17 trans_2 = fitz.Matrix(ZOOM_X_2, ZOOM_X_2).preRotate(0) # zoom factor 2 in each dimension
18 trans_3 = fitz.Matrix(ZOOM_X_3, ZOOM_X_3).preRotate(0) # zoom factor 3 in each dimension
17 19
18 # 特殊filter处理 20 # 特殊filter处理
19 ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'} 21 ADOBE_FILTER_SET = {'FlateDecode', 'JPXDecode', 'JBIG2Decode'}
...@@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100) ...@@ -25,6 +27,12 @@ WH_COUPLE_3 = (100, 100)
25 WH_COUPLE_4 = (100, 300) 27 WH_COUPLE_4 = (100, 300)
26 WH_COUPLE_5 = (100, 200) 28 WH_COUPLE_5 = (100, 200)
27 29
30 # 碎图宽度阈值
31 TINY_IMG_MAX_WIDTH = 1400
32
33 # 大图宽高阈值
34 WH_COUPLE_6 = (1800, 1400)
35 WH_COUPLE_7 = (2500, 3000)
28 36
29 class PDFBuild: 37 class PDFBuild:
30 38
...@@ -55,6 +63,7 @@ class PDFHandler: ...@@ -55,6 +63,7 @@ class PDFHandler:
55 self.img_dir_path = img_dir_path 63 self.img_dir_path = img_dir_path
56 self.img_path_list = [] 64 self.img_path_list = []
57 self.img_count = 0 65 self.img_count = 0
66 self.is_new_modify = 0 # 用于记录受新改动影响的PDF
58 self.xref_set = set() 67 self.xref_set = set()
59 self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'} 68 self.img_suffixs = {'.jpeg', '.jpg', '.png', '.webp', '.bmp'}
60 self.suffix = self.get_suffix(document_name) 69 self.suffix = self.get_suffix(document_name)
...@@ -165,8 +174,10 @@ class PDFHandler: ...@@ -165,8 +174,10 @@ class PDFHandler:
165 except Exception as e: 174 except Exception as e:
166 pass 175 pass
167 176
168 def page_to_png(self, page): 177 def page_to_png(self, page, is_big_img=False):
169 if page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500: 178 if is_big_img:
179 pm = page.getPixmap(matrix=trans_3, alpha=False)
180 elif page.MediaBoxSize.x > 1500 or page.MediaBoxSize.y > 1500:
170 pm = page.getPixmap(matrix=trans_1, alpha=False) 181 pm = page.getPixmap(matrix=trans_1, alpha=False)
171 else: 182 else:
172 pm = page.getPixmap(matrix=trans_2, alpha=False) 183 pm = page.getPixmap(matrix=trans_2, alpha=False)
...@@ -236,8 +247,8 @@ class PDFHandler: ...@@ -236,8 +247,8 @@ class PDFHandler:
236 self.xref_set.add(xref) 247 self.xref_set.add(xref)
237 self.img_path_list.append(img_save_path) 248 self.img_path_list.append(img_save_path)
238 249
239 @staticmethod 250 # @staticmethod
240 def split_il(il): 251 def split_il(self, il):
241 broken_il = [] 252 broken_il = []
242 start = 0 253 start = 0
243 length = len(il) 254 length = len(il)
...@@ -247,6 +258,10 @@ class PDFHandler: ...@@ -247,6 +258,10 @@ class PDFHandler:
247 if il[i][-1] in ADOBE_FILTER_SET: 258 if il[i][-1] in ADOBE_FILTER_SET:
248 page_to_png = True 259 page_to_png = True
249 break 260 break
261 if il[i][2] >= TINY_IMG_MAX_WIDTH:
262 self.is_new_modify = 1
263 page_to_png = True
264 break
250 else: 265 else:
251 for i in range(length): 266 for i in range(length):
252 # 当图片对象够大时,不作碎图合并处理,而是单纯提取 267 # 当图片对象够大时,不作碎图合并处理,而是单纯提取
...@@ -446,6 +461,11 @@ class PDFHandler: ...@@ -446,6 +461,11 @@ class PDFHandler:
446 page = pdf.loadPage(pno) 461 page = pdf.loadPage(pno)
447 self.page_to_png(page) 462 self.page_to_png(page)
448 # 大图 463 # 大图
464 elif width >= WH_COUPLE_6[0] or height >= WH_COUPLE_6[1]:
465 self.is_new_modify = 1
466 is_big_img = (width < WH_COUPLE_7[0] and height < WH_COUPLE_7[1]) # 防止图片过大
467 page = pdf.loadPage(pno)
468 self.page_to_png(page, is_big_img=is_big_img)
449 elif xref not in self.xref_set: 469 elif xref not in self.xref_set:
450 self.extract_single_image(pdf, xref, smask, colorspace, pno) 470 self.extract_single_image(pdf, xref, smask, colorspace, pno)
451 # 3.页面图片对象数目大于1时,特殊处理 471 # 3.页面图片对象数目大于1时,特殊处理
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!