Merge branch 'feature/4058' into 'master'
Feature/4058

See merge request !24
Showing 3 changed files with 7 additions and 6 deletions
... | @@ -2434,14 +2434,14 @@ ECONTRACT_KEYWORDS_MAP = { | ... | @@ -2434,14 +2434,14 @@ ECONTRACT_KEYWORDS_MAP = { |
2434 | 2434 | ||
2435 | FSM_ECONTRACT_KEYWORDS_MAP = { | 2435 | FSM_ECONTRACT_KEYWORDS_MAP = { |
2436 | AFC_PREFIX: [ | 2436 | AFC_PREFIX: [ |
2437 | ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY), | 2437 | ('延长保修服务合约', FSM_CONTRACT_WEP_CLASSIFY), |
2438 | ('长悦保养套餐服务合约', FSM_CONTRACT_MSI_CLASSIFY), | 2438 | ('长悦保养套餐服务合约', FSM_CONTRACT_MSI_CLASSIFY), |
2439 | ('汽车销售合同补充合同', FSM_CONTRACT_SC2_CLASSIFY), | 2439 | ('汽车销售合同补充合同', FSM_CONTRACT_SC2_CLASSIFY), |
2440 | ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), | 2440 | ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), |
2441 | ], | 2441 | ], |
2442 | HIL_PREFIX: [ | 2442 | HIL_PREFIX: [ |
2443 | ('延长保修条款与条件', FSM_CONTRACT_WEP_CLASSIFY), | 2443 | ('延长保修服务合约', FSM_CONTRACT_WEP_CLASSIFY), |
2444 | ('长悦保养套餐服务合同', FSM_CONTRACT_MSI_CLASSIFY), | 2444 | ('长悦保养套餐服务合约', FSM_CONTRACT_MSI_CLASSIFY), |
2445 | ('汽车销售合同补充合同', FSM_CONTRACT_SC2_CLASSIFY), | 2445 | ('汽车销售合同补充合同', FSM_CONTRACT_SC2_CLASSIFY), |
2446 | ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), | 2446 | ('汽车销售合同', FSM_CONTRACT_SC_CLASSIFY), |
2447 | ] | 2447 | ] | ... | ... |
... | @@ -6,7 +6,7 @@ retriever_list = [Retriever(WEP_FIELD), Retriever(MSI_FIELD), Retriever(SC_FIELD | ... | @@ -6,7 +6,7 @@ retriever_list = [Retriever(WEP_FIELD), Retriever(MSI_FIELD), Retriever(SC_FIELD |
6 | 6 | ||
7 | def predict(pdf_info, file_type=0): | 7 | def predict(pdf_info, file_type=0): |
8 | retriever = retriever_list[file_type] | 8 | retriever = retriever_list[file_type] |
9 | pdf_text_list, pdf_img_list = pdf_info_rebuild(pdf_info) | 9 | pdf_text_list, pdf_img_list = pdf_info_rebuild(pdf_info, file_type=file_type) |
10 | return retriever.get_target_fields(pdf_text_list, pdf_img_list) | 10 | return retriever.get_target_fields(pdf_text_list, pdf_img_list) |
11 | 11 | ||
12 | 12 | ... | ... |
1 | def pdf_info_rebuild(pdf_info, fix_bbox=True): | 1 | def pdf_info_rebuild(pdf_info, fix_bbox=True, file_type=0): |
2 | pdf_text_info = dict() | 2 | pdf_text_info = dict() |
3 | pdf_img_info = dict() | 3 | pdf_img_info = dict() |
4 | for pno_str, page_info in pdf_info.items(): | 4 | for pno_str, page_info in pdf_info.items(): |
... | @@ -11,7 +11,8 @@ def pdf_info_rebuild(pdf_info, fix_bbox=True): | ... | @@ -11,7 +11,8 @@ def pdf_info_rebuild(pdf_info, fix_bbox=True): |
11 | for span in line['spans']: | 11 | for span in line['spans']: |
12 | bbox, text = span['bbox'], span['text'].strip() | 12 | bbox, text = span['bbox'], span['text'].strip() |
13 | if len(text) != 0 and text not in text_set: | 13 | if len(text) != 0 and text not in text_set: |
14 | text_set.add(text) | 14 | if file_type != 3: # 汽车销售合同补充协议,相同的总价会被过滤,所以取消 |
| | 15 | text_set.add(text) |
15 | # bbox的高,不准 | 16 | # bbox的高,不准 |
16 | if fix_bbox and bbox[-1] - bbox[1] < span['size']: | 17 | if fix_bbox and bbox[-1] - bbox[1] < span['size']: |
17 | bbox[-1] = bbox[-1] + span['size'] | 18 | bbox[-1] = bbox[-1] + span['size'] | ... | ... |
-
Please register or sign in to post a comment