1a51bc03 by 周伟奇

Merge branch 'feature/main' into feature/mssql

2 parents 05a27cd8 fe25f273
......@@ -642,7 +642,7 @@ RP_FIELD_ORDER_0 = (('姓名', '姓名'),
('住址', '住址'),
('性别', '性别'),)
RP_FIELD_ORDER_1 = IC_FIELD_ORDER_1
# 增值税
# 增值税
VAT_CN_NAME = 'VAT普票'
VAT_CLASSIFY = 0
VAT_FIELD_ORDER = (('发票代码', '发票代码'),
......@@ -667,6 +667,32 @@ VAT_FIELD_ORDER = (('发票代码', '发票代码'),
('销方开户行及账号', '销售方开户行及账号'),
('下盖章', '销售方:(章)'),
('备注', '备注'),)
# VAT special invoice (增值税专票): display name, classify code and field order.
# Each VATS_FIELD_ORDER entry is a 2-tuple of strings; presumably
# (key in the recognised license dict, column label written to the sheet)
# -- TODO confirm against the code that consumes *_FIELD_ORDER.
VATS_CN_NAME = 'VAT专票'
VATS_CLASSIFY = 10088
VATS_FIELD_ORDER = (
    ('发票代码', '发票代码'),
    ('发票代码_开具', '发票代码(开具)'),
    ('发票号码', '发票号码'),
    ('发票号码_开具', '发票号码(开具)'),
    ('开票日期', '开票日期'),
    ('校验码', '校验码'),
    ('货物或应税劳务、服务名称', '货物或应税劳务、服务名称'),
    ('金额合计', '开具金额合计(不含税)'),
    ('税率', '税率'),
    ('税额合计', '税额合计'),
    ('价税合计小写', '价税合计(小写)'),
    ('价税合计大写', '价税合计(大写)'),
    ('购方名称', '购买方名称'),
    ('购方纳税人识别号', '购买方纳税人识别号'),
    ('购方地址、电话', '购买方地址、电话'),
    ('购方开户行及账号', '购买方开户行及账号'),
    ('销方名称', '销售方名称'),
    ('销方纳税人识别号', '销售方纳税人识别号'),
    ('销方地址、电话', '销售方地址、电话'),
    ('销方开户行及账号', '销售方开户行及账号'),
    ('下盖章', '销售方:(章)'),
    ('车船税', '车船税'),
    ('备注', '备注'),
)
# 机动车登记证书
MVC_CN_NAME = '机动车登记证书'
MVC_CLASSIFY = 28
......@@ -770,7 +796,7 @@ MVI_FIELD_ORDER = (('发票代码', '发票代码'),
('主管税务机关及代码', '主管税务机关及代码'),
('吨位', '吨位'),
('限乘人数', '限乘人数'),)
IC_PID = VAT_PID = MVC_PID = MVI_PID = None
IC_PID = VAT_PID = VATS_PID = MVC_PID = MVI_PID = None
# 营业执照
BL_CN_NAME = '营业执照'
......@@ -909,6 +935,11 @@ LICENSE_ORDER = ((MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, F
(MVC_CLASSIFY, (MVC_PID, MVC_CN_NAME, None, True, True, MODEL_FIELD_MVC)),
(VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT)))
# License types handled by the folder workflow, in sheet-creation order.
# Each entry is (classify, (pid, cn_name, field_order, side_diff, scheme_diff,
# model_field)); field_order is None for the entry whose side_diff flag is True.
# NOTE(review): the VATS entry reuses MODEL_FIELD_VAT -- confirm this is intended.
FOLDER_LICENSE_ORDER = (
    (MVI_CLASSIFY, (MVI_PID, MVI_CN_NAME, MVI_FIELD_ORDER, False, False, MODEL_FIELD_MVI)),
    (IC_CLASSIFY, (IC_PID, IC_CN_NAME, None, True, False, MODEL_FIELD_IC)),
    (VAT_CLASSIFY, (VAT_PID, VAT_CN_NAME, VAT_FIELD_ORDER, False, False, MODEL_FIELD_VAT)),
    (VATS_CLASSIFY, (VATS_PID, VATS_CN_NAME, VATS_FIELD_ORDER, False, False, MODEL_FIELD_VAT)),
)
LICENSE_CLASSIFY_MAPPING = dict(LICENSE_ORDER)
OTHER_CLASSIFY_SET = {OTHER_CLASSIFY}
......
......@@ -165,7 +165,6 @@ class Command(BaseCommand, LoggerMixin):
def folder_process(self, input_dir, classify):
while not os.path.isdir(input_dir):
self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
print(self.switch)
if self.switch:
time.sleep(self.sleep_time)
continue
......@@ -202,6 +201,9 @@ class Command(BaseCommand, LoggerMixin):
else:
self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
else:
self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, input_dir))
shutil.move(path, failed_output_dir)
except Exception as e:
try:
path = os.path.join(input_dir, name)
......
......@@ -102,20 +102,20 @@ class Command(BaseCommand, LoggerMixin):
self.log_base, task_str, is_priority))
return doc, business_type, task_str
def pdf_download(self, doc, pdf_path):
# Download the PDF for `doc` from EDMS into `pdf_path`, retrying on failure.
#
# doc: object exposing `application_id` and `metadata_version_id`.
# pdf_path: destination path handed to `self.edms.download`.
# Raises EDMSException carrying str() of the last caught error when every
# attempt fails.
#
# NOTE(review): indentation is lost in this view. The nesting described in the
# comments below is the only reading under which the code is coherent, but it
# should be confirmed against the real file.
#
# The download is skipped for documents whose application_id starts with
# consts.FIXED_APPLICATION_ID_PREFIX.
if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
# Up to consts.RETRY_TIMES attempts.
for times in range(consts.RETRY_TIMES):
try:
self.edms.download(pdf_path, doc.metadata_version_id)
except Exception as e:
# Log the failed attempt with the full traceback and remember the message
# for the final EDMSException.
# NOTE(review): if cronjob_log is a stdlib logging.Logger, `.warn` is a
# deprecated alias of `.warning` -- confirm before changing.
self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] '
'[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc()))
edms_exc = str(e)
else:
# try/else: the download succeeded, stop retrying.
break
else:
# Presumably the for-loop's else (runs only when no attempt hit `break`).
# NOTE(review): `edms_exc` is assigned only in the except handler, so it would
# be unbound here if consts.RETRY_TIMES were 0 -- verify that cannot happen.
raise EDMSException(edms_exc)
# NOTE(review): cannot tell from this view whether this success log sits inside
# the `if` above or at function level (where it would also run when the
# download is skipped) -- confirm.
self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path))
# def pdf_download(self, doc, pdf_path):
# if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
# for times in range(consts.RETRY_TIMES):
# try:
# self.edms.download(pdf_path, doc.metadata_version_id)
# except Exception as e:
# self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] '
# '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc()))
# edms_exc = str(e)
# else:
# break
# else:
# raise EDMSException(edms_exc)
# self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path))
def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx):
sheets = ocr_data.get('data', [])
......@@ -439,19 +439,35 @@ class Command(BaseCommand, LoggerMixin):
# 2. 从EDMS获取PDF文件
doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
os.makedirs(doc_data_path, exist_ok=True)
img_save_path = os.path.join(doc_data_path, 'img')
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
self.pdf_download(doc, pdf_path)
pdf_handler = PDFHandler(pdf_path, img_save_path)
for times in range(consts.RETRY_TIMES):
try:
if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
self.edms.download(pdf_path, doc.metadata_version_id)
self.cronjob_log.info('{0} [edms download success] [task={1}] [times={2}] '
'[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path))
# 3.PDF文件提取图片
self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str))
self.cronjob_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
self.log_base, task_str, times))
start_time = time.time()
img_save_path = os.path.join(doc_data_path, 'img')
pdf_handler = PDFHandler(pdf_path, img_save_path)
pdf_handler.extract_image()
end_time = time.time()
speed_time = int(end_time - start_time)
self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format(
self.log_base, task_str, speed_time))
self.cronjob_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
self.log_base, task_str, times, speed_time))
except Exception as e:
self.cronjob_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'.format(self.log_base, task_str, times,
traceback.format_exc()))
else:
break
else:
raise Exception('download or pdf to img failed')
img_count = len(pdf_handler.img_path_list)
if img_count == 0:
......@@ -466,25 +482,25 @@ class Command(BaseCommand, LoggerMixin):
self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
time.sleep(self.sleep_time_img_put)
img_queue.put(img_path)
except EDMSException as e:
try:
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
except Exception as e:
self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
error_list.append(1)
return
# except EDMSException as e:
# try:
# doc.status = DocStatus.PROCESS_FAILED.value
# doc.save()
# self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
# self.log_base, task_str, traceback.format_exc()))
# except Exception as e:
# self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
# self.log_base, traceback.format_exc()))
# error_list.append(1)
# return
except Exception as e:
try:
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
self.cronjob_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
'[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
except Exception as e:
self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format(
self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
error_list.append(1)
return
......@@ -523,8 +539,8 @@ class Command(BaseCommand, LoggerMixin):
ocr_1_res = ocr_1_response.json()
end_time = time.time()
speed_time = int(end_time - start_time)
self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}] [speed_time={3}]'.format(
self.log_base, img_path, ocr_1_res, speed_time))
self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [url={2}] [speed_time={3}]'.format(
self.log_base, img_path, url, speed_time))
break
else:
ocr_1_res = {}
......@@ -636,8 +652,8 @@ class Command(BaseCommand, LoggerMixin):
end_time = time.time()
speed_time = int(end_time - start_time)
self.cronjob_log.info(
'{0} [ocr_2 success] [img={1}] [res={2}] [speed_time={3}]'.format(
self.log_base, img_path, ocr_2_res, speed_time))
'{0} [ocr_2 success] [img={1}] [speed_time={2}]'.format(
self.log_base, img_path, speed_time))
if classify == consts.BC_CLASSIFY:
name = '有'
......
......@@ -520,7 +520,7 @@ class BSWorkbook(Workbook):
for row in loan_fill_row:
for cell in new_ws[row]:
cell.fill = self.loan_fill
cell.fill = self.amount_fill
# 3.6.同一天相同进出账高亮
del amount_mapping
......@@ -656,17 +656,24 @@ class BSWorkbook(Workbook):
count_list.append((field_str, count))
def simple_license_rebuild(self, license_summary, document_scheme):
for classify, (_, name, field_order, side_diff, scheme_diff, _) in consts.LICENSE_ORDER:
for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []):
if ic_license_dict.get('类别') == '1':
license_summary.setdefault(consts.RP_CLASSIFY, []).append(ic_license_dict)
continue
for vat_license_dict in license_summary.get(consts.VAT_CLASSIFY, []):
if vat_license_dict.get('发票类型') == 'special':
license_summary.setdefault(consts.VATS_CLASSIFY, []).append(vat_license_dict)
continue
for classify, (_, name, field_order, side_diff, scheme_diff, _) in consts.FOLDER_LICENSE_ORDER:
license_list = license_summary.get(classify)
if not license_list:
continue
ws = self.create_sheet(name)
if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]:
classify = consts.MVC_CLASSIFY_SE
# if scheme_diff and document_scheme == consts.DOC_SCHEME_LIST[1]:
# classify = consts.MVC_CLASSIFY_SE
for license_dict in license_list:
if classify == consts.IC_CLASSIFY and license_dict.get('类别') == '1':
license_summary.setdefault(consts.RP_CLASSIFY, []).append(license_dict)
continue
if side_diff:
key, field_order_yes, field_order_no = consts.FIELD_ORDER_MAP.get(classify)
field_order = field_order_yes if key in license_dict else field_order_no
......
......@@ -187,6 +187,8 @@ class PDFHandler:
self.page_to_png(page)
def extract_image(self):
self.img_path_list = []
self.xref_set = set()
os.makedirs(self.img_dir_path, exist_ok=True)
with fitz.Document(self.path) as pdf:
for pno in range(pdf.pageCount):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!