Skip to content
Toggle navigation
Toggle navigation
This project
Loading...
Sign in
周伟奇
/
bmw-ocr
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Files
Commits
Network
Compare
Branches
Tags
cd509dca
authored
2025-04-27 15:14:11 +0800
by
冯轩
Browse Files
Options
Browse Files
Tag
Download
Email Patches
Plain Diff
add:log
1 parent
0cb79d87
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
26 additions
and
4 deletions
src-0424.zip
src/apps/doc/management/commands/ocr_process.py
src/apps/doc/ocr/wb.py
src-0424.zip
0 → 100644
View file @
cd509dc
No preview for this file type
src/apps/doc/management/commands/ocr_process.py
View file @
cd509dc
...
...
@@ -178,6 +178,8 @@ class Command(BaseCommand, LoggerMixin):
# self.online_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path))
def
bs_process
(
self
,
wb
,
ocr_data
,
bs_summary
,
unknown_summary
,
classify
,
res_list
,
pno
,
ino
,
part_idx
,
income_keywords_dictionary
):
self
.
online_log
.
warn
(
'{0} [bs_process] [ocr_data={1}] [bs_summary={2}] [unknown_summary={3}] [classify={4}] [res_list={5}] [pno={6}] [ino={7}] [part_idx={8}] [income_keywords_dictionary={9}]'
.
format
(
self
.
log_base
,
ocr_data
,
bs_summary
,
unknown_summary
,
classify
,
res_list
,
pno
,
ino
,
part_idx
,
income_keywords_dictionary
))
sheets
=
ocr_data
.
get
(
'data'
,
[])
if
not
sheets
:
res_list
.
append
((
pno
,
ino
,
part_idx
,
consts
.
RES_SUCCESS_EMPTY
))
...
...
@@ -2053,8 +2055,8 @@ class Command(BaseCommand, LoggerMixin):
try
:
# 重构Excel文件
#
src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
#
wb.save(src_excel_path)
src_excel_path
=
os
.
path
.
join
(
doc_data_path
,
'src.xlsx'
)
wb
.
save
(
src_excel_path
)
#need_follow表示在上传edms时文件名是否要添加"关注"两字
count_list
,
need_follow
=
wb
.
rebuild
(
merged_bs_summary
,
license_summary
,
res_list
,
doc
.
document_scheme
,
contract_result
,
doc
.
metadata
,
financial_statement_dict
,
financial_explanation_dict
,
down_payment_dict
)
wb
.
save
(
excel_path
)
...
...
src/apps/doc/ocr/wb.py
View file @
cd509dc
...
...
@@ -122,13 +122,19 @@ class BSWorkbook(Workbook, LoggerMixin):
header_col_list
=
[]
for
first_row
in
ws
.
iter_rows
(
max_row
=
1
,
min_row
=
1
,
values_only
=
True
):
sheet_header_info
.
setdefault
(
ws
.
title
,
{})
.
setdefault
(
consts
.
HEADER_KEY
,
first_row
)
self
.
online_log
.
warn
(
'{0} [header_collect_1] [first_row={1}] [sheet_header_info={2}]'
.
format
(
self
.
log_base
,
first_row
,
sheet_header_info
))
for
idx
,
header_value
in
enumerate
(
first_row
):
header_col
=
self
.
get_header_col
(
header_value
,
classify
)
self
.
online_log
.
warn
(
'{0} [header_collect_2] [idx={1}] [header_value={2}] [header_col={3}]'
.
format
(
self
.
log_base
,
idx
,
header_value
,
header_col
))
if
classify
==
consts
.
MS_CLASSIFY
and
header_col
==
consts
.
OVER_KEY
and
\
header_value
==
'账户余额现转标志'
and
not
first_row
[
idx
-
1
]:
idx
-=
1
if
header_col
is
not
None
:
header_col_list
.
append
((
idx
,
header_col
))
self
.
online_log
.
warn
(
'{0} [header_collect_3] [header_col_list={1}]'
.
format
(
self
.
log_base
,
header_col_list
))
find_count
=
len
(
header_col_list
)
if
find_count
<
2
:
...
...
@@ -136,15 +142,25 @@ class BSWorkbook(Workbook, LoggerMixin):
else
:
for
idx
,
header_col
in
header_col_list
:
sheet_header_info
.
setdefault
(
ws
.
title
,
{})
.
setdefault
(
header_col
,
idx
)
self
.
online_log
.
warn
(
'{0} [header_collect_4] [sheet_header_info={1}]'
.
format
(
self
.
log_base
,
sheet_header_info
))
find_col_set
=
sheet_header_info
.
setdefault
(
ws
.
title
,
{})
.
setdefault
(
consts
.
FIND_COL_KEY
,
set
())
find_col_set
.
add
(
idx
)
self
.
online_log
.
warn
(
'{0} [header_collect_5] [sheet_header_info={1}]'
.
format
(
self
.
log_base
,
sheet_header_info
))
col_count
=
header_info
.
setdefault
(
header_col
,
{})
.
get
(
idx
)
header_info
.
setdefault
(
header_col
,
{})[
idx
]
=
1
if
col_count
is
None
else
col_count
+
1
self
.
online_log
.
warn
(
'{0} [header_collect_6] [header_info={1}]'
.
format
(
self
.
log_base
,
header_info
))
sheet_header_info
.
setdefault
(
ws
.
title
,
{})
.
setdefault
(
consts
.
FIND_COUNT_KEY
,
find_count
)
self
.
online_log
.
warn
(
'{0} [header_collect_7] [sheet_header_info={1}]'
.
format
(
self
.
log_base
,
sheet_header_info
))
min_row
=
1
if
find_count
==
0
else
2
sheet_header_info
.
setdefault
(
ws
.
title
,
{})
.
setdefault
(
consts
.
MIN_ROW_KEY
,
min_row
)
max_column_list
.
append
(
ws
.
max_column
)
self
.
online_log
.
warn
(
'{0} [header_collect_8] [sheet_header_info={1}] [header_info={2}] [max_column_list={3}]'
.
format
(
self
.
log_base
,
sheet_header_info
,
header_info
,
max_column_list
))
@staticmethod
def
header_statistics
(
sheet_header_info
,
header_info
,
classify
,
special_nhzs
):
...
...
@@ -194,7 +210,7 @@ class BSWorkbook(Workbook, LoggerMixin):
return
statistics_header_info
,
max_find_count
@staticmethod
def
get_data_col_min_row
(
sheet
,
sheet_header_info
,
header_info
,
classify
):
def
get_data_col_min_row
(
s
elf
,
s
heet
,
sheet_header_info
,
header_info
,
classify
):
date_col
=
sheet_header_info
.
get
(
sheet
,
{})
.
get
(
consts
.
DATE_KEY
)
if
date_col
is
None
:
date_col_dict
=
header_info
.
get
(
consts
.
DATE_KEY
,
{})
...
...
@@ -273,6 +289,8 @@ class BSWorkbook(Workbook, LoggerMixin):
return
reverse_trend
def
sheet_split
(
self
,
ws
,
date_col
,
min_row
,
month_mapping
,
reverse_trend_list
,
date_list
,
date_statistics
):
self
.
online_log
.
warn
(
'{0} [sheet_split] [date_col={1}] [min_row={2}] [month_mapping={3}] [reverse_trend_list={4}] [date_list={5}] [date_statistics={6}]'
.
format
(
self
.
log_base
,
date_col
,
min_row
,
month_mapping
,
reverse_trend_list
,
date_list
,
date_statistics
))
if
date_col
is
None
:
# month_info process
month_info
=
month_mapping
.
setdefault
(
'xxxx-xx'
,
[])
...
...
@@ -749,7 +767,9 @@ class BSWorkbook(Workbook, LoggerMixin):
reverse_trend_list
=
[]
# 用于判断倒序与正序
for
sheet
in
sheets_list
:
ws
=
self
.
get_sheet_by_name
(
sheet
)
date_col
,
min_row
=
self
.
get_data_col_min_row
(
sheet
,
sheet_header_info
,
header_info
,
classify
)
date_col
,
min_row
=
self
.
get_data_col_min_row
(
self
,
sheet
,
sheet_header_info
,
header_info
,
classify
)
self
.
online_log
.
warn
(
'{0} [bs_rebuild] [date_col={1}] [min_row={2}]'
.
format
(
self
.
log_base
,
date_col
,
min_row
))
self
.
sheet_split
(
ws
,
date_col
,
min_row
,
month_mapping
,
reverse_trend_list
,
date_list
,
date_statistics
)
if
date_statistics
is
True
and
len
(
date_list
)
>
1
:
...
...
Write
Preview
Styling with
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment