65b19211 by 周伟奇

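fix sc2: skip text de-duplication when file_type is 3 so that repeated total-price values are not filtered out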

1 parent d656bacc
......@@ -6,7 +6,7 @@ retriever_list = [Retriever(WEP_FIELD), Retriever(MSI_FIELD), Retriever(SC_FIELD
def predict(pdf_info, file_type=0):
retriever = retriever_list[file_type]
pdf_text_list, pdf_img_list = pdf_info_rebuild(pdf_info)
pdf_text_list, pdf_img_list = pdf_info_rebuild(pdf_info, file_type=file_type)
return retriever.get_target_fields(pdf_text_list, pdf_img_list)
......
def pdf_info_rebuild(pdf_info, fix_bbox=True):
def pdf_info_rebuild(pdf_info, fix_bbox=True, file_type=0):
pdf_text_info = dict()
pdf_img_info = dict()
for pno_str, page_info in pdf_info.items():
......@@ -11,7 +11,8 @@ def pdf_info_rebuild(pdf_info, fix_bbox=True):
for span in line['spans']:
bbox, text = span['bbox'], span['text'].strip()
if len(text) != 0 and text not in text_set:
text_set.add(text)
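if file_type != 3: # Auto sales contract supplementary agreement: identical total prices would be filtered out by de-duplication, so it is skipped here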
text_set.add(text)
# The bbox height is inaccurate
if fix_bbox and bbox[-1] - bbox[1] < span['size']:
bbox[-1] = bbox[-1] + span['size']
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!