7a3d093e by 周伟奇

issue list 1117

1 parent ec638e4f
......@@ -29,9 +29,6 @@ sftp-config.json
*.sqlite3
conf/*
data/*
ocr/*
# 脚本
src/*.sh
test*
\ No newline at end of file
test*
flow_test.py
\ No newline at end of file
......
......@@ -140,9 +140,9 @@ RESULT_IDX = FIXED_HEADERS.index('核对结果')
# '借贷': ('贷', '借'), # 竖版-无表格-广发银行
# '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行
# '收/支': ('收入', '支出'), # 横版-表格-北京银行
BORROW_HEADERS_SET = {'借贷', '借贷状态', '收/支'}
BORROW_INCOME_SET = {'贷', '收入'}
BORROW_OUTLAY_SET = {'借', '支出'}
BORROW_HEADERS_SET = {'借贷', '借贷状态', '收/支', '收支标志'}
BORROW_INCOME_SET = {'贷', '收入', '收'}
BORROW_OUTLAY_SET = {'借', '支出', '支'}
INCOME_HEADERS_SET = {'收入金额', '收入', '存入', '存入金额(贷)', '存入金额(贷)'}
OUTLAY_HEADERS_SET = {'支出金额', '支出', '支取金额(借)', '支取金额(借)'}
......@@ -154,6 +154,7 @@ HEADERS_MAPPING.update(
{
'借贷': BORROW_KEY,
'借贷状态': BORROW_KEY,
'收支标志': BORROW_KEY,
'收/支': BORROW_KEY,
}
)
......@@ -911,11 +912,11 @@ WECHART_HEADERS_MAPPING.update(
}
)
PATTERN_LIST = ['收入/支出金额', '收入', '存入', '支出', '支取', '金额', '余额', '发生额', '借贷', '借贷状态', '收/支', '收入金额',
'存入金额(贷)', '存入金额(贷)', '支出金额', '支取金额(借)', '支取金额(借)', '记账日期', '附言', '交易日期', '摘要',
'业务摘要', '工作日期', '交易金额', '账户余额', '交易类型', '金额(元)', '金额(元)', '时间', '名称/备注',
'摘要/附言', '交易发生额', '交易摘要', '借贷发生额(借:-贷:+)', '借贷发生额(借:-贷:+)', '联机余额', '交易金额(元)',
'交易金额(元)', '账户余额(元)', '账户余额(元)', '会计日期', '摘要代码', '摘要信息', '日期', '短摘要', '本次余额',
'交易后余额', '交易说明', '帐户余额', '交易日期 记账日期']
PATTERN_LIST = ['收入/支出金额', '收入', '存入', '支出', '支取', '金额', '余额', '发生额', '借贷', '借贷状态', '收支标志', '收/支',
'收入金额', '存入金额(贷)', '存入金额(贷)', '支出金额', '支取金额(借)', '支取金额(借)', '记账日期', '附言',
'交易日期', '摘要', '业务摘要', '工作日期', '交易金额', '账户余额', '交易类型', '金额(元)', '金额(元)', '时间',
'名称/备注', '摘要/附言', '交易发生额', '交易摘要', '借贷发生额(借:-贷:+)', '借贷发生额(借:-贷:+)', '联机余额',
'交易金额(元)', '交易金额(元)', '账户余额(元)', '账户余额(元)', '会计日期', '摘要代码', '摘要信息', '日期',
'短摘要', '本次余额', '交易后余额', '交易说明', '帐户余额', '交易日期 记账日期']
CN_RE = re.compile(u'[\u4e00-\u9fa5]')
......
......@@ -163,14 +163,19 @@ class Command(BaseCommand, LoggerMixin):
shutil.move(path, img_save_path)
def folder_process(self, input_dir, classify):
while not os.path.isdir(input_dir):
self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
time.sleep(self.sleep_time)
output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
img_output_dir = os.path.join(output_dir, 'image')
wb_output_dir = os.path.join(output_dir, 'excel')
pdf_output_dir = os.path.join(output_dir, 'pdf')
failed_output_dir = os.path.join(output_dir, 'failed')
os.makedirs(output_dir, exist_ok=True)
os.makedirs(img_output_dir, exist_ok=True)
os.makedirs(wb_output_dir, exist_ok=True)
os.makedirs(pdf_output_dir, exist_ok=True)
os.makedirs(failed_output_dir, exist_ok=True)
while self.switch:
# 1. 从input dir获取pdf or image
list_dir = os.listdir(input_dir)
......@@ -178,14 +183,26 @@ class Command(BaseCommand, LoggerMixin):
self.folder_log.info('{0} [input dir empty] [input_dir={1}]'.format(self.log_base, input_dir))
time.sleep(self.sleep_time)
for name in list_dir:
path = os.path.join(input_dir, name)
if os.path.isfile(path):
self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
if name.endswith('.pdf'):
self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
else:
self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
try:
path = os.path.join(input_dir, name)
if os.path.isfile(path):
self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
if name.endswith('.pdf'):
self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
else:
self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
except Exception as e:
try:
path = os.path.join(input_dir, name)
self.folder_log.error('{0} [file error] [path={1}] [error={2}]'.format(self.log_base, path,
traceback.format_exc()))
shutil.move(path, failed_output_dir)
continue
except Exception as e:
self.folder_log.error('{0} [file error] [error={1}]'.format(
self.log_base, traceback.format_exc()))
continue
def handle(self, *args, **kwargs):
process_list = []
......
......@@ -20,6 +20,7 @@ class EDMS:
self.user_name = conf.EDMS_USER
self.pwd = conf.EDMS_PWD
self.session_id = None
self.prefix = 'OCR'
def set_session_id(self):
self.session_id = self.sm_client.service.StartSession(login=self.user_name,
......@@ -83,12 +84,15 @@ class EDMS:
else:
raise Exception
def get_doc_file_name(self, doc_name):
    """Return the file name for a document, prefixed with ``self.prefix``.

    A trailing ``.pdf`` extension (in any letter case) is stripped before
    the prefix is applied; any other name is prefixed unchanged.

    :param doc_name: original document name; values that are not ``str``
        (e.g. ``None``) yield the bare prefix.
    :return: ``self.prefix`` followed by the name without its PDF suffix.
    """
    if not isinstance(doc_name, str):
        return self.prefix
    # One case-insensitive test covers '.pdf', '.PDF', '.Pdf', ... instead of
    # enumerating every upper/lower-case permutation of the extension.
    if doc_name.lower().endswith('.pdf'):
        doc_name, _ = os.path.splitext(doc_name)
    return '{0}{1}'.format(self.prefix, doc_name)
def get_doc_info(self, token, doc, business_type, file_path):
business_type = consts.BUSINESS_TYPE_DICT.get(business_type)
......@@ -140,5 +144,3 @@ class EDMS:
headers.pop('Content-Type')
metadata_version_id = self.add_doc_info(headers, token, doc, business_type, file_path)
return metadata_version_id
......
......@@ -574,12 +574,25 @@ class BSWorkbook(Workbook):
license_list = license_summary.get(classify)
if not license_list:
continue
if classify == consts.IC_CLASSIFY: # 身份证、居住证先正面,后反面
key, _, _ = consts.FIELD_ORDER_MAP.get(classify)
side1_list = []
side2_list = []
for license_dict in license_list:
if key in license_dict:
side2_list.append(license_dict)
else:
side1_list.append(license_dict)
side1_list.extend(side2_list)
license_list = side1_list
side2_list = None
side1_list = None
count = 0
ws = self.create_sheet(name)
if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]:
classify = consts.MVC_CLASSIFY_SE
for license_dict in license_list:
if classify == consts.IC_CLASSIFY and license_dict.get('类别') == '1':
if classify == consts.IC_CLASSIFY and license_dict.get('类别') == '1': # 居住证处理
license_summary.setdefault(consts.RP_CLASSIFY, []).append(license_dict)
continue
if side_diff:
......@@ -632,8 +645,12 @@ class BSWorkbook(Workbook):
def rebuild(self, bs_summary, license_summary, res_list, document_scheme):
count_list = [(consts.MODEL_FIELD_BS, len(self.sheetnames) - 1)]
self.bs_rebuild(bs_summary)
self.license_rebuild(license_summary, document_scheme, count_list)
if document_scheme == consts.DOC_SCHEME_LIST[1]:
self.license_rebuild(license_summary, document_scheme, count_list)
self.bs_rebuild(bs_summary)
else:
self.bs_rebuild(bs_summary)
self.license_rebuild(license_summary, document_scheme, count_list)
self.res_sheet(res_list)
self.remove_base_sheet()
return count_list
......
......@@ -293,8 +293,9 @@ class DocView(GenericView, DocHandler):
metadata_version_id = str(int(time.time()) - random_int)
pdf_file = args.get('pdf_file')
# Reject uploads whose name does not end in "pdf", ignoring letter case.
# The check must be a single case-insensitive test: combining
# `not endswith('pdf')` with `not endswith('PDF')` via `or` is always true
# (no name ends with both), which would reject every file, valid PDFs included.
# Names that are not plain strings skip the check.
if isinstance(pdf_file.name, str):
    if not pdf_file.name.lower().endswith('pdf'):
        self.invalid_params(msg='invalid params: not a PDF file')
business_type = random.choice(consts.BUSINESS_TYPE_LIST)
tmp_save_path = os.path.join(conf.DATA_DIR, business_type, '{0}.pdf'.format(metadata_version_id))
......
......@@ -8,7 +8,7 @@ SLEEP_SECOND_FOLDER = 2
IMG_QUEUE_SIZE = 500
EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
EDMS_UPLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/UploadHandler.ashx
DEALER_CODE = ocr_situ_group
EDMS_DOWNLOAD_URL = http://sccn0639.bmwgroup.net/FH/FileHold/DocumentRepository/DownloadHandler.ashx
EDMS_UPLOAD_URL = http://sccn0639.bmwgroup.net/FH/FileHold/DocumentRepository/UploadHandler.ashx
DEALER_CODE = ocr_group
......
......@@ -8,6 +8,6 @@ SLEEP_SECOND_FOLDER = 2
IMG_QUEUE_SIZE = 500
EDMS_DOWNLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/DownloadHandler.ashx
EDMS_UPLOAD_URL = https://edms-test.bmw.com/FH/FileHold/DocumentRepository/UploadHandler.ashx
EDMS_DOWNLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/DownloadHandler.ashx
EDMS_UPLOAD_URL = http://sccn0637.bmwgroup.net/FH/FileHold/DocumentRepository/UploadHandler.ashx
DEALER_CODE = ocr_situ_group
\ No newline at end of file
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!