update wb build
Showing
6 changed files
with
761 additions
and
151 deletions
| 1 | PAGE_DEFAULT = 1 | 1 | PAGE_DEFAULT = 1 |
| 2 | PAGE_SIZE_DEFAULT = 10 | 2 | PAGE_SIZE_DEFAULT = 10 |
| 3 | 3 | ||
| 4 | TRANS = str.maketrans('Cc((oODlLmAsSbg', '000000011345569') | ||
| 5 | |||
| 6 | CARD_RATIO = 0.9 | ||
| 7 | UNKNOWN_CARD = '未知卡号' | ||
| 8 | UNKNOWN_ROLE = '未知户名' | ||
| 9 | DATE_FORMAT = ['%Y年%m月%d日', '%Y/%m/%d', '%Y-%m-%d', '%Y%m%d'] | ||
| 10 | |||
| 4 | FIXED_APPLICATION_ID_PREFIX = 'CH-S' | 11 | FIXED_APPLICATION_ID_PREFIX = 'CH-S' |
| 5 | 12 | ||
| 6 | DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT'] | 13 | DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT'] |
| 7 | DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT'] | 14 | DATA_SOURCE_LIST = ['POS', 'E-APP', 'ECONTRACT'] |
| 8 | 15 | ||
| 9 | HIL_PREFIX = 'HIL' | 16 | HIL_PREFIX = 'HIL' |
| 10 | AFC_PREFIX = 'AFC' | 17 | AFC_PREFIX = 'AFC' |
| ... | @@ -39,11 +46,33 @@ PROOF_COL_TITLE = '核对结果' | ... | @@ -39,11 +46,33 @@ PROOF_COL_TITLE = '核对结果' |
| 39 | PROOF_RES = ('对', '错') | 46 | PROOF_RES = ('对', '错') |
| 40 | META_SHEET_TITLE = '关键信息提取和展示' | 47 | META_SHEET_TITLE = '关键信息提取和展示' |
| 41 | 48 | ||
| 42 | FIXED_HEADERS = ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果') | 49 | FIXED_HEADERS = ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果', '借贷', '收入', '支出') |
| 43 | FIXED_COL_AMOUNT = len(FIXED_HEADERS) | 50 | FIXED_COL_AMOUNT = len(FIXED_HEADERS) |
| 44 | BASE_HEADERS_MAPPING = {label: idx+1 for idx, label in enumerate(FIXED_HEADERS)} | 51 | BASE_HEADERS_MAPPING = {label: idx+1 for idx, label in enumerate(FIXED_HEADERS)} |
| 52 | BORROW_HEADER_COL = BASE_HEADERS_MAPPING['借贷'] | ||
| 53 | INCOME_HEADER_COL = BASE_HEADERS_MAPPING['收入'] | ||
| 54 | OUTLAY_HEADER_COL = BASE_HEADERS_MAPPING['支出'] | ||
| 55 | RESULT_HEADER_COL = BASE_HEADERS_MAPPING['核对结果'] | ||
| 56 | BORROW_IDX = BORROW_HEADER_COL - 1 | ||
| 57 | INCOME_IDX = INCOME_HEADER_COL - 1 | ||
| 58 | OUTLAY_IDX = OUTLAY_HEADER_COL - 1 | ||
| 59 | SUMMARY_IDX = FIXED_HEADERS.index('附言') | ||
| 60 | DATE_IDX = FIXED_HEADERS.index('记账日期') | ||
| 61 | AMOUNT_IDX = FIXED_HEADERS.index('金额') | ||
| 62 | OVER_IDX = FIXED_HEADERS.index('余额') | ||
| 63 | RESULT_IDX = FIXED_HEADERS.index('核对结果') | ||
| 64 | # '借贷': ('贷', '借'), # 竖版-无表格-广发银行 | ||
| 65 | # '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行 | ||
| 66 | # '收/支': ('收入', '支出'), # 横版-表格-北京银行 | ||
| 67 | BORROW_HEADERS_SET = {'借贷', '借贷状态', '收/支'} | ||
| 68 | BORROW_INCOME_SET = {'贷', '收入'} | ||
| 69 | BORROW_OUTLAY_SET = {'借', '支出'} | ||
| 70 | INCOME_HEADERS_SET = {'收入金额', '收入', '存入', '存入金额(贷)', '存入金额(贷)'} | ||
| 71 | OUTLAY_HEADERS_SET = {'支出金额', '支出', '支取金额(借)', '支取金额(借)'} | ||
| 72 | |||
| 73 | # ------------------普通打印-全格线-------------------------------------------------------------------------------------- | ||
| 45 | HEADERS_MAPPING = {} | 74 | HEADERS_MAPPING = {} |
| 46 | # 中国银行 | 75 | # 横版-表格-中国银行(不规则) |
| 47 | HEADERS_MAPPING.update( | 76 | HEADERS_MAPPING.update( |
| 48 | { | 77 | { |
| 49 | '记账日期': BASE_HEADERS_MAPPING['记账日期'], | 78 | '记账日期': BASE_HEADERS_MAPPING['记账日期'], |
| ... | @@ -57,37 +86,294 @@ HEADERS_MAPPING.update( | ... | @@ -57,37 +86,294 @@ HEADERS_MAPPING.update( |
| 57 | '对方开户行': BASE_HEADERS_MAPPING['对方开户行'], | 86 | '对方开户行': BASE_HEADERS_MAPPING['对方开户行'], |
| 58 | } | 87 | } |
| 59 | ) | 88 | ) |
| 60 | # 竖版-表格-建设银行 | 89 | # 横版-表格-农业银行-中国农业银行个人账户明细 |
| 61 | HEADERS_MAPPING.update( | 90 | HEADERS_MAPPING.update( |
| 62 | { | 91 | { |
| 63 | '交易日期': BASE_HEADERS_MAPPING['记账日期'], | 92 | '交易日期': BASE_HEADERS_MAPPING['记账日期'], |
| 64 | '交易金额': BASE_HEADERS_MAPPING['金额'], | 93 | '存入': BASE_HEADERS_MAPPING['金额'], |
| 65 | '账户余额': BASE_HEADERS_MAPPING['余额'], | 94 | '对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'], |
| 95 | '对方名称': BASE_HEADERS_MAPPING['对方账户名'], | ||
| 66 | '摘要': BASE_HEADERS_MAPPING['附言'], | 96 | '摘要': BASE_HEADERS_MAPPING['附言'], |
| 67 | '对方账号与户名': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
| 68 | } | 97 | } |
| 69 | ) | 98 | ) |
| 70 | # 横版-表格-农业银行 | 99 | # 横版-表格-北京银行 |
| 71 | HEADERS_MAPPING.update( | 100 | HEADERS_MAPPING.update( |
| 72 | { | 101 | { |
| 73 | '存入': BASE_HEADERS_MAPPING['金额'], | 102 | '业务摘要': BASE_HEADERS_MAPPING['附言'], |
| 74 | '对方账号': BASE_HEADERS_MAPPING['对方卡号/账号'], | 103 | '发生额': BASE_HEADERS_MAPPING['金额'], |
| 75 | '对方名称': BASE_HEADERS_MAPPING['对方账户名'], | 104 | '对方户名': BASE_HEADERS_MAPPING['对方账户名'], |
| 76 | } | 105 | } |
| 77 | ) | 106 | ) |
| 78 | # 横版-表格-工商银行 | 107 | # 横版-表格-工商银行 借记卡账户历史明细清单 |
| 108 | # 横版-表格-工商银行-机打验证码 借记卡账户历史明细清单 | ||
| 109 | # 横版-表格-工商银行CH-B008802400 | ||
| 110 | # 横版-表格-工商银行 工资明细清单 | ||
| 111 | # 工商银行历史明细(申请单号:20042501303039397888) | ||
| 79 | HEADERS_MAPPING.update( | 112 | HEADERS_MAPPING.update( |
| 80 | { | 113 | { |
| 81 | '对方户名': BASE_HEADERS_MAPPING['对方账户名'], | ||
| 82 | '收入/支出金额': BASE_HEADERS_MAPPING['金额'], | 114 | '收入/支出金额': BASE_HEADERS_MAPPING['金额'], |
| 83 | '工作日期': BASE_HEADERS_MAPPING['记账日期'], | 115 | '工作日期': BASE_HEADERS_MAPPING['记账日期'], |
| 84 | } | 116 | } |
| 85 | ) | 117 | ) |
| 86 | # 横版-表格-北京银行 | 118 | |
| 119 | # 横版-表格-建设银行-个人活期账户交易明细 | ||
| 120 | # 竖版-表格-建设银行-个人活期账户交易明细 CH-B005832604 | ||
| 121 | # 竖版-表格-建设银行-工资账单CH-B008786812 | ||
| 122 | # 竖版-表格-建设银行-个人活期账户交易明细 CH-B005832604 (2) | ||
| 87 | HEADERS_MAPPING.update( | 123 | HEADERS_MAPPING.update( |
| 88 | { | 124 | { |
| 89 | '业务摘要': BASE_HEADERS_MAPPING['附言'], | 125 | '交易金额': BASE_HEADERS_MAPPING['金额'], |
| 90 | '发生额': BASE_HEADERS_MAPPING['金额'], | 126 | '账户余额': BASE_HEADERS_MAPPING['余额'], |
| 127 | '对方账号与户名': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
| 128 | } | ||
| 129 | ) | ||
| 130 | # 微信 | ||
| 131 | HEADERS_MAPPING.update( | ||
| 132 | { | ||
| 133 | '交易时间': BASE_HEADERS_MAPPING['记账时间'], | ||
| 134 | '交易类型': BASE_HEADERS_MAPPING['附言'], | ||
| 135 | '金额(元)': BASE_HEADERS_MAPPING['金额'], | ||
| 136 | '金额(元)': BASE_HEADERS_MAPPING['金额'], | ||
| 137 | '交易对方': BASE_HEADERS_MAPPING['对方账户名'], | ||
| 91 | } | 138 | } |
| 92 | ) | 139 | ) |
| 140 | # 支付宝 | ||
| 141 | HEADERS_MAPPING.update( | ||
| 142 | { | ||
| 143 | '时间': BASE_HEADERS_MAPPING['记账日期'], | ||
| 144 | '名称/备注': BASE_HEADERS_MAPPING['附言'], | ||
| 145 | } | ||
| 146 | ) | ||
| 147 | |||
| 148 | # ------------普通打印-部分格线------------------------------------------------------------------------------------------- | ||
| 149 | |||
| 150 | # 竖版-无表格-农业银行 | ||
| 151 | # 竖版-无表格-农业银行CH-B008805428 | ||
| 152 | HEADERS_MAPPING.update( | ||
| 153 | { | ||
| 154 | '摘要/附言': BASE_HEADERS_MAPPING['附言'], | ||
| 155 | '交易地点/对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
| 156 | } | ||
| 157 | ) | ||
| 158 | # 竖版-特殊-农商行 | ||
| 159 | HEADERS_MAPPING.update( | ||
| 160 | { | ||
| 161 | '交易发生额': BASE_HEADERS_MAPPING['金额'], | ||
| 162 | } | ||
| 163 | ) | ||
| 164 | # 横版-特殊-中信银行-账户交易明细 | ||
| 165 | HEADERS_MAPPING.update( | ||
| 166 | { | ||
| 167 | '对方银行': BASE_HEADERS_MAPPING['对方开户行'], | ||
| 168 | '交易摘要': BASE_HEADERS_MAPPING['附言'], | ||
| 169 | } | ||
| 170 | ) | ||
| 171 | # 平安电子账单 | ||
| 172 | HEADERS_MAPPING.update( | ||
| 173 | { | ||
| 174 | '借贷发生额(借:-贷:+)': BASE_HEADERS_MAPPING['金额'], | ||
| 175 | } | ||
| 176 | ) | ||
| 177 | |||
| 178 | # ------------普通打印-无格线-------------------------------------------------------------------------------------------- | ||
| 179 | |||
| 180 | # 竖版-无表格-招商银行(略歪) | ||
| 181 | # 竖版-无表格-招商银行账户历史交易明细表 | ||
| 182 | HEADERS_MAPPING.update( | ||
| 183 | { | ||
| 184 | '联机余额': BASE_HEADERS_MAPPING['余额'], | ||
| 185 | } | ||
| 186 | ) | ||
| 187 | # 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户 | ||
| 188 | # 竖版-无表格-邮储银行 账户对账单 | ||
| 189 | # 竖版-无表格-邮储银行-电子章 邮储银行 账户对账单 | ||
| 190 | HEADERS_MAPPING.update( | ||
| 191 | { | ||
| 192 | '交易金额(元)': BASE_HEADERS_MAPPING['金额'], | ||
| 193 | '交易金额(元)': BASE_HEADERS_MAPPING['金额'], | ||
| 194 | '账户余额(元)': BASE_HEADERS_MAPPING['余额'], | ||
| 195 | '账户余额(元)': BASE_HEADERS_MAPPING['余额'], | ||
| 196 | '对手方户名': BASE_HEADERS_MAPPING['对方账户名'], | ||
| 197 | '对手方账户': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
| 198 | } | ||
| 199 | ) | ||
| 200 | # 横版-无表格-广发银行-账户交易历史 --> 已废弃 | ||
| 201 | # 竖版-无表格-广发银行-账户交易历史 --> 已废弃 | ||
| 202 | HEADERS_MAPPING.update( | ||
| 203 | { | ||
| 204 | '会计日期': BASE_HEADERS_MAPPING['记账日期'], | ||
| 205 | '对手户名': BASE_HEADERS_MAPPING['对方账户名'], | ||
| 206 | '对手账号': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
| 207 | } | ||
| 208 | ) | ||
| 209 | # 招行电子账单 TODO 有英文,需测试 | ||
| 210 | HEADERS_MAPPING.update( | ||
| 211 | { | ||
| 212 | '对手信息': BASE_HEADERS_MAPPING['对方账户名'], | ||
| 213 | '摘要代码': BASE_HEADERS_MAPPING['附言'], | ||
| 214 | } | ||
| 215 | ) | ||
| 216 | # 横版-无表格-民生银行-中国民生银行个人账户对账单(客户卡号) | ||
| 217 | # 横版-无表格-民生银行-无标题(客户账户) | ||
| 218 | # 横版-无表格-民生银行 | ||
| 219 | HEADERS_MAPPING.update( | ||
| 220 | { | ||
| 221 | '摘要信息': BASE_HEADERS_MAPPING['附言'], | ||
| 222 | '对方行名': BASE_HEADERS_MAPPING['对方开户行'], | ||
| 223 | } | ||
| 224 | ) | ||
| 225 | # 竖版-无表格-农业银行整数 | ||
| 226 | # 竖版-无表格-农业银行-中国农业银行银行卡交易明细清单 | ||
| 227 | HEADERS_MAPPING.update( | ||
| 228 | { | ||
| 229 | '对方账号和户名': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
| 230 | } | ||
| 231 | ) | ||
| 232 | # 竖版-无表格-农业银行-中国农业银行银行卡活期存折交易明细清单.pdf | ||
| 233 | # 竖版-无表格-农业银行-扩张.pdf | ||
| 234 | # 竖版-无表格-农业银行-缩进.pdf | ||
| 235 | HEADERS_MAPPING.update( | ||
| 236 | { | ||
| 237 | '日期': BASE_HEADERS_MAPPING['记账日期'], | ||
| 238 | '短摘要': BASE_HEADERS_MAPPING['附言'], | ||
| 239 | '本次余额': BASE_HEADERS_MAPPING['余额'], | ||
| 240 | } | ||
| 241 | ) | ||
| 242 | # 竖版-无表格-农业银行-无标题(对手帐号) | ||
| 243 | HEADERS_MAPPING.update( | ||
| 244 | { | ||
| 245 | '交易后余额': BASE_HEADERS_MAPPING['余额'], | ||
| 246 | '对手帐号': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
| 247 | } | ||
| 248 | ) | ||
| 249 | # 竖版-无表格-农商行(非常规) | ||
| 250 | HEADERS_MAPPING.update( | ||
| 251 | { | ||
| 252 | '交易说明': BASE_HEADERS_MAPPING['附言'], | ||
| 253 | } | ||
| 254 | ) | ||
| 255 | # 竖版-无表格-工商银行 抬头三行 活期历史明细清单 | ||
| 256 | HEADERS_MAPPING.update( | ||
| 257 | { | ||
| 258 | '对方账户': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
| 259 | } | ||
| 260 | ) | ||
| 261 | |||
| 262 | # -----------针式打印-全格线-------------------------------------------------------------------------------------------- | ||
| 263 | # 竖版-表格-建设银行-中国建设银行活期账户交易明细 | ||
| 264 | # 竖版-表格-建设银行-中国建设银行活期账户明细清单 | ||
| 265 | # 竖版-表格-建设银行-对私活期账户明细- (1).pdf | ||
| 266 | HEADERS_MAPPING.update( | ||
| 267 | { | ||
| 268 | '帐户余额': BASE_HEADERS_MAPPING['余额'], | ||
| 269 | '对方帐户名称': BASE_HEADERS_MAPPING['对方账户名'], | ||
| 270 | } | ||
| 271 | ) | ||
| 272 | # 竖版-特殊-交通银行 零售客户交易清单 5000以上交易记录 | ||
| 273 | HEADERS_MAPPING.update( | ||
| 274 | { | ||
| 275 | '交易日期 记账日期': BASE_HEADERS_MAPPING['记账日期'], | ||
| 276 | } | ||
| 277 | ) | ||
| 278 | |||
| 279 | # ----------针式打印-部分格线------------------------------------------------------------------------------------------ | ||
| 280 | # 竖版-特殊-邮储银行-一本通绿卡通交易明细(客户) | ||
| 281 | # 竖版-特殊-邮储银行-账户交易明细(客户) | ||
| 282 | HEADERS_MAPPING.update( | ||
| 283 | { | ||
| 284 | '对方账号/卡号/汇票号': BASE_HEADERS_MAPPING['对方卡号/账号'], | ||
| 285 | } | ||
| 286 | ) | ||
| 287 | |||
| 288 | # -------------------------------------------------------------------------------------------------------------------- | ||
| 289 | |||
| 290 | # ('记账日期', '记账时间', '金额', '余额', '交易名称', '附言', '对方账户名', '对方卡号/账号', '对方开户行', '核对结果', '借贷', '收入', '支出') | ||
| 291 | # CLASSIFY_LIST = [ | ||
| 292 | # ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), # 横版-表格-中国银行(不规则) | ||
| 293 | # ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), # 横版-表格-农业银行-中国农业银行个人账户明细 | ||
| 294 | # ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), # 横版-表格-北京银行 | ||
| 295 | # | ||
| 296 | # # 工商银行:交易日期 账号 储种 序号 币种 钞汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道 | ||
| 297 | # ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), | ||
| 298 | # | ||
| 299 | # # 建设银行:空 摘要 交易日期 交易金额 账户余额 商户/网点号及其名称 对方账号与户名 --> 竖版-表格-建设银行 | ||
| 300 | # # 序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 --> 横版-表格-建设银行 | ||
| 301 | # ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)), | ||
| 302 | # | ||
| 303 | # # 微信:交易单号 交易时间 交易类型 收/支/其他 交易方式 金额(元) 交易对方 商户单号 | ||
| 304 | # | ||
| 305 | # # 支付宝:流水号 时间 名称/备注 收入 支出 账户余额 资金渠道 | ||
| 306 | # | ||
| 307 | # # ----------------- | ||
| 308 | # | ||
| 309 | # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 | ||
| 310 | # ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
| 311 | # | ||
| 312 | # # 农商行:交易日期 交易发生额 账户余额 对方账号 对方户名 摘要 备注 | ||
| 313 | # | ||
| 314 | # # 中信银行:交易日期 交易摘要 收入金额 支出金额 账户余额 对方户名 对方账号 对方银行 交易流水号 | ||
| 315 | # ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), | ||
| 316 | # | ||
| 317 | # # 平安电子账单:序号 交易日期 交易网点 摘要 借贷发生额(借:-贷:+) 账户余额 | ||
| 318 | # ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), | ||
| 319 | # | ||
| 320 | # # ------------------------- | ||
| 321 | # | ||
| 322 | # # 招商银行:记账日期 货币 交易金额 联机余额 冲补账 交易摘要 | ||
| 323 | # | ||
| 324 | # # 邮储银行:交易日期、交易类型 交易币种 交易金额(元) 账户余额(元) [对手方户名 对手方账户 收支类型] --> 竖版-无表格-邮储银行-账户对账单 含有对手方户名 对手方账户 | ||
| 325 | # # 交易日期、交易类型 交易金额(元) 账户余额(元) 操作柜员 --> 竖版-无表格-邮储银行 账户对账单 | ||
| 326 | # | ||
| 327 | # # 工商银行电子版:交易日期 账号 储种 序号 币种 妙汇 摘要 地区 收入/支出金额 余额 [对方户名 对方账号] 渠道 | ||
| 328 | # | ||
| 329 | # # 招商银行电子版:记账日期 货币 交易金额 联机余额 交易摘要 对手信息 | ||
| 330 | # | ||
| 331 | # # 民生银行:凭证类型 凭证号码 摘要信息 交易时间 交易金额 账户余额 现转标志 交易渠道 交易机构 对方户名 对方行名 | ||
| 332 | # # 凭证类型 凭证号码 交易时间 摘要 交易金额 账户余额 现转标志 交易渠道 交易机构 对方户名 对方行名 | ||
| 333 | # | ||
| 334 | # # 农业银行:交易日期 摘要/附言 交易金额 对方账号和户名 | ||
| 335 | # | ||
| 336 | # # 农业银行:交易日期 摘要/附言 交易金额 余额 交易地点/对方账号和户名 | ||
| 337 | # | ||
| 338 | # # 农业银行:日期、时间、[日志号]、短摘要、交易金额、本次余额、交易网点、渠道、附言 | ||
| 339 | # | ||
| 340 | # # 交通银行:交易日期 记账日期、交易地点、交易类型、借贷状态、交易金额、余额 | ||
| 341 | # | ||
| 342 | # # =================================== | ||
| 343 | # | ||
| 344 | # # 建设银行:摘要、交易日期、交易金额、账户余额、商户/网点号及其名称、对方账号、对方户名 | ||
| 345 | # # 交易日期、摘要、币种、钞汇、交易金额、帐户余额、对方账号、对方帐户名称 | ||
| 346 | # | ||
| 347 | # | ||
| 348 | # # =================================== | ||
| 349 | # | ||
| 350 | # # 邮储银行:序号、交易日期、交易渠道、摘要、交易金额、账户余额、对方账号/卡号/汇票号、原子账号、交易机构名称 | ||
| 351 | # | ||
| 352 | # # 建设银行:序号 摘要 币别 钞汇 交易日期 交易金额 账户余额 交易地点附言 对方账号与户名 | ||
| 353 | # ] | ||
| 354 | |||
| 355 | # { | ||
| 356 | # "0": "全表格-中国农业银行个人账户明细", | ||
| 357 | # "1": "全表格-中国银行", | ||
| 358 | # "2": "全表格-北京银行", | ||
| 359 | # "3": "全表格-工商银行", | ||
| 360 | # "4": "全表格-建设银行", | ||
| 361 | # "5": "部分格线-横版-中信银行账户交易明细", | ||
| 362 | # "6": "部分格线-横版-中信银行账户交易明细特殊", | ||
| 363 | # "7": "部分格线-竖版-中国农业银行", | ||
| 364 | # "8": "部分格线-竖版-中国农业银行分账户(窄页)", | ||
| 365 | # "9": "部分格线-竖版-平安电子账单" | ||
| 366 | # } | ||
| 367 | CLASSIFY_LIST = [ | ||
| 368 | ('农业银行', (1, None, 3, 5, None, 8, 7, 6, None, None, None, None, None)), | ||
| 369 | ('中国银行', (1, 2, 4, 5, 6, 9, 10, 11, 12, None, None, None, None)), | ||
| 370 | ('北京银行', (1, None, 4, 5, None, 2, 6, 7, None, None, 3, None, None)), | ||
| 371 | ('工商银行', (1, None, 9, 10, None, 7, None, None, None, None, None, None, None)), | ||
| 372 | ('建设银行', (None, None, None, None, None, 2, None, None, None, None, None, None, None)), | ||
| 373 | ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), | ||
| 374 | ('中信银行', (1, None, None, 5, None, 2, 6, 7, 8, None, None, 3, 4)), | ||
| 375 | ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
| 376 | ('农业银行', (1, None, 3, 4, None, 2, None, 5, None, None, None, None, None)), | ||
| 377 | ('平安电子账单', (2, None, 5, 6, None, 4, None, None, None, None, None, None, None)), | ||
| 378 | ] | ||
| 93 | 379 | ... | ... |
| 1 | import os | 1 | import os |
| 2 | import time | 2 | import time |
| 3 | import fitz | ||
| 4 | import signal | 3 | import signal |
| 5 | import base64 | ||
| 6 | import asyncio | 4 | import asyncio |
| 7 | import aiohttp | 5 | import aiohttp |
| 6 | import difflib | ||
| 8 | import requests | 7 | import requests |
| 8 | from datetime import datetime | ||
| 9 | from collections import Counter | ||
| 9 | from apps.doc.ocr.wb import BSWorkbook, Workbook | 10 | from apps.doc.ocr.wb import BSWorkbook, Workbook |
| 10 | from django.core.management import BaseCommand | 11 | from django.core.management import BaseCommand |
| 11 | 12 | ||
| ... | @@ -65,8 +66,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -65,8 +66,6 @@ class Command(BaseCommand, LoggerMixin): |
| 65 | return doc, business_type | 66 | return doc, business_type |
| 66 | 67 | ||
| 67 | def pdf_download(self, doc, business_type): | 68 | def pdf_download(self, doc, business_type): |
| 68 | if doc is None: | ||
| 69 | return None, None, None, None | ||
| 70 | doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id)) | 69 | doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id)) |
| 71 | os.makedirs(doc_data_path, exist_ok=True) | 70 | os.makedirs(doc_data_path, exist_ok=True) |
| 72 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 71 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
| ... | @@ -80,20 +79,96 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -80,20 +79,96 @@ class Command(BaseCommand, LoggerMixin): |
| 80 | return doc_data_path, excel_path, src_excel_path, pdf_path | 79 | return doc_data_path, excel_path, src_excel_path, pdf_path |
| 81 | 80 | ||
| 82 | @staticmethod | 81 | @staticmethod |
def append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence):
    """Index every OCR sheet of one image and copy its cells into the workbook.

    For each sheet a worksheet named ``page_<pno>_img_<img_idx>_<i>`` is
    created in ``wb`` and the sheet metadata is accumulated in place:

    * sheets that carry a card number are collected in ``bs_summary[card]``;
    * sheets without one are collected in ``unknown_summary[classify][role]``.

    Args:
        wb: Workbook-like object exposing ``create_sheet(name)``; the returned
            worksheet must expose ``cell(row=, column=, value=)``.
        sheets: Iterable of dicts. ``summary`` is a sequence laid out as
            ``[role, card, page, code, print_time, start_date, end_date]``;
            ``cells`` is a list of dicts with ``start_row``, ``start_column``
            and ``words``.
        bs_summary: Dict keyed by card number, updated in place.
        unknown_summary: Dict keyed by classify id and then by role, updated
            in place. It must be a dict because ``setdefault`` is called on it.
        pno: Page number used in the worksheet name.
        img_idx: Image index used in the worksheet name.
        classify: Classification id reported for the image.
        confidence: Classification confidence reported for the image.
    """
    summary_size = 7
    for i, sheet in enumerate(sheets):
        sheet_name = 'page_{0}_img_{1}_{2}'.format(pno, img_idx, i)

        # A missing or short summary is treated as "field not recognised"
        # instead of crashing on the subscript.
        summary = list(sheet.get('summary') or [])
        summary.extend([None] * (summary_size - len(summary)))
        role, card, page, code, print_time, start_date, end_date = summary[:summary_size]

        if card is None:
            # No card number: group by classification and account name.
            role = consts.UNKNOWN_ROLE if role is None else role
            entry = unknown_summary.setdefault(classify, {}).setdefault(role, {})
            entry['classify'] = classify
            entry['role'] = role
        else:
            entry = bs_summary.setdefault(card, {})
            entry['count'] = entry.get('count', 0) + 1
            entry.setdefault('classify', []).append(classify)
            role_list = entry.setdefault('role', [])
            role_set = entry.setdefault('role_set', set())
            if role is not None:
                role_list.append(role)
                role_set.add(role)

        entry.setdefault('sheet', []).append(sheet_name)
        entry.setdefault('confidence', []).append(confidence)

        # The lists are always created, but only recognised values are stored.
        collected = (
            ('code', None if code is None else (page, code)),
            ('print_time', print_time),
            ('start_date', start_date),
            ('end_date', end_date),
        )
        for key, value in collected:
            values = entry.setdefault(key, [])
            if value is not None:
                values.append(value)

        ws = wb.create_sheet(sheet_name)
        for cell in sheet.get('cells') or []:
            # Worksheet coordinates are the OCR coordinates shifted by one.
            ws.cell(
                row=cell.get('start_row') + 1,
                column=cell.get('start_column') + 1,
                value=cell.get('words'),
            )
| 96 | 139 | ||
def ocr_2_wb(self, res, wb, pno, img_idx, bs_summary, unknown_summary, license_summary):
    """Feed one OCR response into the workbook and the summary dicts.

    The response is expected to look like::

        {'code': 1, 'msg': 'success',
         'data': {'classify': 0, 'confidence': 0.999,
                  'sheets': [{'summary': [...], 'cells': [...]}, ...]}}

    Nothing happens when ``data`` is missing or null, when it carries no
    ``classify`` value, or when it has no sheets.

    Args:
        res: Decoded OCR response dict.
        wb: Workbook that receives one worksheet per OCR sheet.
        pno: Page number of the source image.
        img_idx: Index of the image within its page.
        bs_summary: Per-card summary dict, updated in place.
        unknown_summary: Summary dict for sheets without a card number,
            updated in place.
        license_summary: Accepted for interface compatibility; not used here.
    """
    # `res.get('data', {})` would hand back None for an explicit null value.
    data = res.get('data') or {}
    classify = data.get('classify')
    if classify is None:
        return
    sheets = data.get('sheets', [])
    if not sheets:
        return
    # A response without a confidence value is treated as fully confident.
    confidence = data.get('confidence', 1)
    self.append_bs_sheet(wb, sheets, bs_summary, unknown_summary, pno, img_idx, classify, confidence)
| 171 | |||
| 97 | # async def fetch_ocr_result(self, img_path): | 172 | # async def fetch_ocr_result(self, img_path): |
| 98 | # async with aiohttp.ClientSession( | 173 | # async with aiohttp.ClientSession( |
| 99 | # headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False) | 174 | # headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False) |
| ... | @@ -102,35 +177,170 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -102,35 +177,170 @@ class Command(BaseCommand, LoggerMixin): |
| 102 | # async with session.post(self.ocr_url, json=json_data) as response: | 177 | # async with session.post(self.ocr_url, json=json_data) as response: |
| 103 | # return await response.json() | 178 | # return await response.json() |
| 104 | # | 179 | # |
| 105 | # async def img_ocr_excel(self, wb, img_path, role_summary): | 180 | # async def img_2_ocr_2_wb(self, wb, img_path, summary): |
| 106 | # res = await self.fetch_ocr_result(img_path) | 181 | # res = await self.fetch_ocr_result(img_path) |
| 107 | # self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res)) | 182 | # self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res)) |
| 108 | # sheets_list = res.get('result').get('res') | 183 | # sheets_list = res.get('result').get('res') |
| 109 | # img_name = os.path.basename(img_path) | 184 | # img_name = os.path.basename(img_path) |
| 110 | # self.append_sheet(wb, sheets_list, img_name, role_summary) | 185 | # self.append_sheet(wb, sheets_list, img_name, summary) |
| 111 | 186 | ||
def fetch_ocr_result(self, img_path):
    """Upload one image to the OCR service and return the decoded JSON reply.

    Args:
        img_path: Path of the image file, sent as the multipart field ``img``.

    Returns:
        The value produced by ``response.json()``.

    Errors from opening the file, from the HTTP request or from JSON
    decoding are not caught here and propagate to the caller.
    """
    # The context manager closes the handle that used to be leaked per call.
    with open(img_path, 'rb') as img_file:
        response = requests.request("POST", self.ocr_url, files=[('img', img_file)])
    return response.json()
| 118 | return response.json() | 192 | return response.json() |
| 119 | 193 | ||
def img_2_ocr_2_wb(self, wb, img_info, bs_summary, unknown_summary, license_summary):
    """Run OCR on one image and, on success, store the result in ``wb``.

    Args:
        wb: Workbook that receives the OCR sheets.
        img_info: Indexable whose items 0, 1 and 2 are the image path, the
            page number and the image index.
        bs_summary: Per-card summary dict, forwarded to ``ocr_2_wb``.
        unknown_summary: Summary dict for sheets without a card number,
            forwarded to ``ocr_2_wb``.
        license_summary: Forwarded to ``ocr_2_wb`` unchanged.
    """
    img_path = img_info[0]
    res = self.fetch_ocr_result(img_path)
    message = '{0} [fetch ocr result success] [img={1}] [res={2}]'.format(
        self.log_base, img_path, res)
    self.cronjob_log.info(message)
    # Only responses whose code equals 1 are processed further.
    if res.get('code') != 1:
        return
    self.ocr_2_wb(res, wb, img_info[1], img_info[2], bs_summary, unknown_summary, license_summary)
| 125 | if not sheets_list: | 200 | |
| 201 | @staticmethod | ||
def get_most(value_list):
    """Return the most frequent item of ``value_list``.

    ``None`` is returned when ``value_list`` is falsy or yields no items.
    """
    if not value_list:
        return None
    ranking = Counter(value_list).most_common(1)
    if not ranking:
        return None
    winner, _ = ranking[0]
    return winner
| 206 | |||
| 207 | @staticmethod | ||
def date_format(date_str, format_str):
    """Parse ``date_str`` with ``format_str``.

    Args:
        date_str: Text to parse.
        format_str: ``strptime`` format string.

    Returns:
        The parsed ``datetime``, or ``None`` when the text does not match the
        format or is not a string.
    """
    try:
        return datetime.strptime(date_str, format_str)
    except (ValueError, TypeError):
        # ValueError: text does not match; TypeError: non-string input.
        return None
| 215 | |||
def get_validate_date(self, date_list):
    """Return the first date in ``date_list`` that parses, else ``None``.

    Each string is tried against every format in ``consts.DATE_FORMAT``, in
    order; the first successful parse wins.
    """
    attempts = (
        self.date_format(date_str, format_str)
        for date_str in date_list
        for format_str in consts.DATE_FORMAT
    )
    return next((parsed for parsed in attempts if isinstance(parsed, datetime)), None)
| 222 | |||
def merge_card(self, bs_summary, card_ratio=None):
    """Merge per-card summaries whose card numbers are near-duplicates.

    Cards are visited from the most to the least frequently seen. Every card
    not yet absorbed becomes a "main" card and absorbs each remaining card
    whose ``SequenceMatcher.quick_ratio()`` against it is above the
    threshold. After merging, ``classify`` and ``role`` are collapsed to
    their most common value and ``count`` is dropped.

    Args:
        bs_summary: Dict keyed by card number. Each value holds ``count``,
            the lists ``classify``, ``confidence``, ``sheet``, ``role``,
            ``code``, ``print_time``, ``start_date``, ``end_date`` and the
            set ``role_set``. The dict is consumed: entries are popped as
            they are merged.
        card_ratio: Similarity threshold; ``consts.CARD_RATIO`` when omitted.

    Returns:
        A new dict keyed by the surviving main card numbers.
    """
    if card_ratio is None:
        card_ratio = consts.CARD_RATIO
    list_keys = (
        'classify', 'confidence', 'sheet', 'role',
        'code', 'print_time', 'start_date', 'end_date',
    )
    merged_bs_summary = {}
    cards_by_count = sorted(bs_summary, key=lambda card: bs_summary[card]['count'], reverse=True)
    for main_card in cards_by_count:
        if main_card not in bs_summary:
            # Already absorbed by a more frequent, similar card.
            continue
        main = bs_summary.pop(main_card)
        del main['count']
        similar_cards = [
            card for card in bs_summary
            if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > card_ratio
        ]
        for card in similar_cards:
            other = bs_summary.pop(card)
            # Every list is merged from its own key; 'code' used to be fed
            # from the other card's 'sheet' list by mistake.
            for key in list_keys:
                main[key].extend(other[key])
            main['role_set'].update(other['role_set'])
        main['classify'] = self.get_most(main['classify'])
        main['role'] = self.get_most(main['role'])
        merged_bs_summary[main_card] = main
    return merged_bs_summary
| 250 | |||
def prune_bs_summary(self, bs_summary):
    """Finalize each card entry in place and return the same dict.

    For every entry the ``count`` key is removed and the ``classify`` and
    ``role`` lists are replaced by their most common value.
    """
    for card in bs_summary:
        entry = bs_summary[card]
        entry.pop('count')
        for key in ('classify', 'role'):
            entry[key] = self.get_most(entry[key])
    return bs_summary
| 257 | |||
| 258 | |||
def rebuild_bs_summary(self, bs_summary, unknown_summary):
    """Combine the per-card and the card-less summaries into one final dict.

    Input shapes::

        bs_summary = {
            card: {'count': int, 'classify': [], 'confidence': [], 'role': [],
                   'role_set': set(), 'code': [(page, code)], 'print_time': [],
                   'start_date': [], 'end_date': [], 'sheet': [sheet_name]}
        }
        unknown_summary = {
            classify: {
                role: {'classify': classify, 'confidence': [], 'role': role,
                       'code': [(page, code)], 'print_time': [],
                       'start_date': [], 'end_date': [], 'sheet': [sheet_name]}
            }
        }

    Steps:
        1. Reduce ``bs_summary``: nothing for no card, ``prune_bs_summary``
           for one card, ``merge_card`` for several.
        2. Fold a card-less entry into a card when both share the
           classification and the entry's role is one of the card's roles.
        3. File every remaining card-less entry under a generated key
           ``<consts.UNKNOWN_CARD>_<n>``.
        4. Collapse the date lists to one parsed date each and ``confidence``
           to its maximum; drop the helper ``role_set``.

    Both input dicts are modified in place.

    Returns:
        Dict keyed by card number (real or generated).
    """
    if not bs_summary:
        merged_bs_summary = {}
    elif len(bs_summary) == 1:
        merged_bs_summary = self.prune_bs_summary(bs_summary)
    else:
        merged_bs_summary = self.merge_card(bs_summary)

    for card_summary in merged_bs_summary.values():
        classify_summary = unknown_summary.get(card_summary['classify'], {})
        matched_roles = [role for role in classify_summary if role in card_summary['role_set']]
        for role in matched_roles:
            summary = classify_summary.pop(role)
            card_summary['sheet'].extend(summary['sheet'])
            # 'code' used to be extended with the sheet names by mistake.
            card_summary['code'].extend(summary['code'])
            card_summary['print_time'].extend(summary['print_time'])
            card_summary['start_date'].extend(summary['start_date'])
            card_summary['end_date'].extend(summary['end_date'])
            # NOTE(review): summary['confidence'] is not merged here, unlike
            # in merge_card - confirm whether that is intended.

    # Whatever could not be attributed to a card gets a placeholder card key.
    card_num = 1
    for role_dict in unknown_summary.values():
        for summary in role_dict.values():
            merged_bs_summary['{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)] = summary
            card_num += 1

    for summary in merged_bs_summary.values():
        summary.pop('role_set', None)
        summary['print_time'] = self.get_validate_date(summary['print_time'])
        summary['start_date'] = self.get_validate_date(summary['start_date'])
        summary['end_date'] = self.get_validate_date(summary['end_date'])
        summary['confidence'] = max(summary['confidence'])
    return merged_bs_summary
| 129 | 337 | ||
| 130 | # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 | 338 | # TODO 细化文件状态,不同异常状态,归还队列,重试时采取不同的处理 |
| 131 | # TODO 调用接口重试 | 339 | # TODO 调用接口重试 |
| 340 | # TODO 协程异步发送OCR请求 | ||
| 132 | # TODO 异常邮件通知 | 341 | # TODO 异常邮件通知 |
| 133 | # TODO 数据库断联问题 | 342 | # TODO 数据库断联问题 |
| 343 | # TODO 非流水证件处理,Excel模板 | ||
| 134 | def handle(self, *args, **kwargs): | 344 | def handle(self, *args, **kwargs): |
| 135 | sleep_second = int(conf.SLEEP_SECOND) | 345 | sleep_second = int(conf.SLEEP_SECOND) |
| 136 | max_sleep_second = int(conf.MAX_SLEEP_SECOND) | 346 | max_sleep_second = int(conf.MAX_SLEEP_SECOND) |
| ... | @@ -138,17 +348,19 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -138,17 +348,19 @@ class Command(BaseCommand, LoggerMixin): |
| 138 | while self.switch: | 348 | while self.switch: |
| 139 | # 1. 从队列获取文件信息 | 349 | # 1. 从队列获取文件信息 |
| 140 | doc, business_type = self.get_doc_info() | 350 | doc, business_type = self.get_doc_info() |
| 141 | try: | ||
| 142 | # 2. 从EDMS获取PDF文件 | ||
| 143 | doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type) | ||
| 144 | # 队列为空时的处理 | 351 | # 队列为空时的处理 |
| 145 | if pdf_path is None: | 352 | if doc is None: |
| 146 | time.sleep(sleep_second) | 353 | time.sleep(sleep_second) |
| 147 | sleep_second = min(max_sleep_second, sleep_second+5) | 354 | sleep_second = min(max_sleep_second, sleep_second + 5) |
| 148 | continue | 355 | continue |
| 149 | sleep_second = int(conf.SLEEP_SECOND) | 356 | sleep_second = int(conf.SLEEP_SECOND) |
| 150 | # 3.PDF文件提取图片 | 357 | |
| 358 | try: | ||
| 151 | start_time = time.time() | 359 | start_time = time.time() |
| 360 | # 2. 从EDMS获取PDF文件 | ||
| 361 | doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type) | ||
| 362 | |||
| 363 | # 3.PDF文件提取图片 | ||
| 152 | img_save_path = os.path.join(doc_data_path, 'img') | 364 | img_save_path = os.path.join(doc_data_path, 'img') |
| 153 | self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format( | 365 | self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format( |
| 154 | self.log_base, business_type, doc.id)) | 366 | self.log_base, business_type, doc.id)) |
| ... | @@ -158,28 +370,42 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -158,28 +370,42 @@ class Command(BaseCommand, LoggerMixin): |
| 158 | self.log_base, business_type, doc.id)) | 370 | self.log_base, business_type, doc.id)) |
| 159 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | 371 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) |
| 160 | 372 | ||
| 161 | # 4.图片调用算法判断是否为银行流水, 图片调用算法OCR为excel文件 | 373 | # 4.获取OCR结果并且构建excel文件 |
| 162 | role_summary = { | 374 | bs_summary = {} |
| 163 | '银行-户名': [] | 375 | license_summary = {} |
| 164 | } | 376 | unknown_summary = [] |
| 165 | # interest_keyword = Keywords.objects.filter( | 377 | interest_keyword = Keywords.objects.filter( |
| 166 | # type=KeywordsType.INTEREST.value).values_list('keyword', flat=True) | 378 | type=KeywordsType.INTEREST.value).values_list('keyword', flat=True) |
| 167 | # salary_keyword = Keywords.objects.filter( | 379 | salary_keyword = Keywords.objects.filter( |
| 168 | # type=KeywordsType.SALARY.value).values_list('keyword', flat=True) | 380 | type=KeywordsType.SALARY.value).values_list('keyword', flat=True) |
| 169 | # loan_keyword = Keywords.objects.filter(type=KeywordsType.LOAN.value).values_list('keyword', flat=True) | 381 | loan_keyword = Keywords.objects.filter( |
| 170 | # wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) | 382 | type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value]).values_list( |
| 171 | wb = Workbook() | 383 | 'keyword', flat=True) |
| 384 | wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) | ||
| 385 | |||
| 386 | # wb = Workbook() | ||
| 387 | |||
| 388 | # 4.1 获取OCR结果 | ||
| 172 | # loop = asyncio.get_event_loop() | 389 | # loop = asyncio.get_event_loop() |
| 173 | # tasks = [self.img_ocr_excel(wb, img_path, role_summary) for img_path in pdf_handler.img_path_list] | 390 | # tasks = [self.img_2_ocr_2_wb(wb, img_path, summary) for img_path in pdf_handler.img_path_list] |
| 174 | # loop.run_until_complete(asyncio.wait(tasks)) | 391 | # loop.run_until_complete(asyncio.wait(tasks)) |
| 175 | # loop.close() | 392 | # loop.close() |
| 176 | 393 | ||
| 177 | for img_path in pdf_handler.img_path_list: | 394 | for img_info in pdf_handler.img_info_list: |
| 178 | self.img_ocr_excel(wb, img_path, role_summary) | 395 | self.img_2_ocr_2_wb(wb, img_info, bs_summary, unknown_summary, license_summary) |
| 396 | |||
| 397 | self.cronjob_log.info('{0} [bs_summary={1}] [unknown_summary={2}] [license_summary={3}]'.format( | ||
| 398 | self.log_base, bs_summary, unknown_summary, license_summary)) | ||
| 179 | 399 | ||
| 180 | # 整合excel文件 | 400 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) |
| 181 | # wb.save(src_excel_path) | 401 | |
| 182 | # wb.rebuild(role_summary) | 402 | self.cronjob_log.info('{0} [merged_bs_summary={1}] [unknown_summary={2}]'.format( |
| 403 | self.log_base, merged_bs_summary, unknown_summary)) | ||
| 404 | del unknown_summary | ||
| 405 | |||
| 406 | # 4.2 重构Excel文件 | ||
| 407 | wb.save(src_excel_path) | ||
| 408 | wb.rebuild(merged_bs_summary, license_summary) | ||
| 183 | wb.save(excel_path) | 409 | wb.save(excel_path) |
| 184 | except Exception as e: | 410 | except Exception as e: |
| 185 | doc.status = DocStatus.PROCESS_FAILED.value | 411 | doc.status = DocStatus.PROCESS_FAILED.value |
| ... | @@ -194,14 +420,16 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -194,14 +420,16 @@ class Command(BaseCommand, LoggerMixin): |
| 194 | except Exception as e: | 420 | except Exception as e: |
| 195 | doc.status = DocStatus.UPLOAD_FAILED.value | 421 | doc.status = DocStatus.UPLOAD_FAILED.value |
| 196 | doc.save() | 422 | doc.save() |
| 197 | self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [err={3}]'.format( | 423 | end_time = time.time() |
| 198 | self.log_base, business_type, doc.id, e)) | 424 | speed_time = int(end_time - start_time) |
| 425 | self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] ' | ||
| 426 | '[err={4}]'.format(self.log_base, business_type, doc.id, speed_time, e)) | ||
| 199 | else: | 427 | else: |
| 200 | doc.status = DocStatus.COMPLETE.value | 428 | doc.status = DocStatus.COMPLETE.value |
| 201 | doc.save() | 429 | doc.save() |
| 202 | end_time = time.time() | 430 | end_time = time.time() |
| 203 | speed_time = int(end_time - start_time) | 431 | speed_time = int(end_time - start_time) |
| 204 | self.cronjob_log.info('{0} [doc process complete] [business_type={1}] [doc_id={2}] ' | 432 | self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] ' |
| 205 | '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time)) | 433 | '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time)) |
| 206 | 434 | ||
| 207 | self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) | 435 | self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) | ... | ... |
| ... | @@ -11,6 +11,8 @@ class DocHandler: | ... | @@ -11,6 +11,8 @@ class DocHandler: |
| 11 | return '/data/{1}/{0}/{0}.pdf'.format(doc_id, business_type) | 11 | return '/data/{1}/{0}/{0}.pdf'.format(doc_id, business_type) |
| 12 | elif file == 'img': | 12 | elif file == 'img': |
| 13 | return '/data/{1}/{0}/{0}_img.zip'.format(doc_id, business_type) | 13 | return '/data/{1}/{0}/{0}_img.zip'.format(doc_id, business_type) |
| 14 | elif file == 'src_excel': | ||
| 15 | return '/data/{1}/{0}/src.xlsx'.format(doc_id, business_type) | ||
| 14 | else: | 16 | else: |
| 15 | return '/data/{1}/{0}/{0}.xlsx'.format(doc_id, business_type) | 17 | return '/data/{1}/{0}/{0}.xlsx'.format(doc_id, business_type) |
| 16 | 18 | ||
| ... | @@ -22,6 +24,7 @@ class DocHandler: | ... | @@ -22,6 +24,7 @@ class DocHandler: |
| 22 | doc_dict['pdf_link'] = self.get_link(doc_id, business_type) | 24 | doc_dict['pdf_link'] = self.get_link(doc_id, business_type) |
| 23 | doc_dict['img_link'] = self.get_link(doc_id, business_type, file='img') | 25 | doc_dict['img_link'] = self.get_link(doc_id, business_type, file='img') |
| 24 | doc_dict['excel_link'] = self.get_link(doc_id, business_type, file='excel') | 26 | doc_dict['excel_link'] = self.get_link(doc_id, business_type, file='excel') |
| 27 | doc_dict['src_excel_link'] = self.get_link(doc_id, business_type, file='src_excel') | ||
| 25 | return list(doc_queryset) | 28 | return list(doc_queryset) |
| 26 | 29 | ||
| 27 | @staticmethod | 30 | @staticmethod | ... | ... |
| ... | @@ -13,6 +13,7 @@ class BSWorkbook(Workbook): | ... | @@ -13,6 +13,7 @@ class BSWorkbook(Workbook): |
| 13 | 13 | ||
| 14 | def __init__(self, interest_keyword, salary_keyword, loan_keyword, *args, **kwargs): | 14 | def __init__(self, interest_keyword, salary_keyword, loan_keyword, *args, **kwargs): |
| 15 | super().__init__(*args, **kwargs) | 15 | super().__init__(*args, **kwargs) |
| 16 | locale.setlocale(locale.LC_NUMERIC, 'en_US.UTF-8') | ||
| 16 | self.meta_sheet_title = '关键信息提取和展示' | 17 | self.meta_sheet_title = '关键信息提取和展示' |
| 17 | self.blank_row = (None,) | 18 | self.blank_row = (None,) |
| 18 | self.code_header = ('页数', '电子回单验证码') | 19 | self.code_header = ('页数', '电子回单验证码') |
| ... | @@ -24,26 +25,59 @@ class BSWorkbook(Workbook): | ... | @@ -24,26 +25,59 @@ class BSWorkbook(Workbook): |
| 24 | self.proof_res = ('对', '错') | 25 | self.proof_res = ('对', '错') |
| 25 | self.loan_fill = PatternFill("solid", fgColor="00FFCC00") | 26 | self.loan_fill = PatternFill("solid", fgColor="00FFCC00") |
| 26 | self.amount_fill = PatternFill("solid", fgColor="00FFFF00") | 27 | self.amount_fill = PatternFill("solid", fgColor="00FFFF00") |
| 27 | self.bd = Side(style='thin', color="000000") | 28 | # self.bd = Side(style='thin', color="000000") |
| 28 | self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd) | 29 | # self.border = Border(left=self.bd, top=self.bd, right=self.bd, bottom=self.bd) |
| 29 | self.MAX_MEAN = 31 | 30 | self.MAX_MEAN = 31 |
| 30 | 31 | ||
@staticmethod
def sheet_prune(ws, classify):
    """Rearrange a raw sheet into the fixed column layout and drop everything else.

    ``consts.FIXED_COL_AMOUNT`` empty columns are inserted on the left, the
    recognised source columns are moved into them, and every column to the
    right of the fixed area is deleted at the end.

    :param ws: worksheet that is modified in place.
    :param classify: index into ``consts.CLASSIFY_LIST``; element ``[1]`` of
        that entry supplies fallback source-column positions for fixed
        headers that were not recognised in row 1.
    :return: 2 when at least one column was placed by its row-1 header,
        otherwise 1.
    """
    ws.insert_cols(1, amount=consts.FIXED_COL_AMOUNT)
    moved_sources = set()   # source columns relocated because of their row-1 header
    filled_targets = set()  # fixed columns that already received a source column

    # Pass 1: place columns according to the keyword found in the first row.
    for src_col in range(consts.FIXED_COL_AMOUNT + 1, ws.max_column + 1):
        header_value = ws.cell(1, src_col).value
        target_col = consts.HEADERS_MAPPING.get(header_value)
        if target_col is None:
            # Not a primary header: try the debit/credit, income and outlay aliases.
            if header_value in consts.BORROW_HEADERS_SET:
                target_col = consts.BORROW_HEADER_COL
            elif header_value in consts.INCOME_HEADERS_SET:
                target_col = consts.INCOME_HEADER_COL
            elif header_value in consts.OUTLAY_HEADERS_SET:
                target_col = consts.OUTLAY_HEADER_COL
            else:
                continue
        src_letter = get_column_letter(src_col)
        ws.move_range("{0}1:{0}{1}".format(src_letter, ws.max_row), cols=target_col - src_col)
        moved_sources.add(src_col)
        filled_targets.add(target_col)

    # Pass 2: for fixed headers still missing, fall back to the per-classification positions.
    for target_col in range(1, consts.FIXED_COL_AMOUNT + 1):
        if target_col == consts.RESULT_HEADER_COL or target_col in filled_targets:
            continue
        fallback_col = consts.CLASSIFY_LIST[classify][1][target_col - 1]  # TODO: handle merged classifications
        if fallback_col is None:
            continue
        fallback_col = fallback_col + consts.FIXED_COL_AMOUNT
        if fallback_col in moved_sources:
            # NOTE(review): this abandons ALL remaining headers, not only this one;
            # confirm that `break` (rather than `continue`) is intended.
            break
        fallback_letter = get_column_letter(fallback_col)
        ws.move_range("{0}1:{0}{1}".format(fallback_letter, ws.max_row), cols=target_col - fallback_col)

    ws.delete_cols(consts.FIXED_COL_AMOUNT + 1, amount=ws.max_column)
    return 2 if moved_sources else 1
| 44 | 78 | ||
| 45 | @staticmethod | 79 | @staticmethod |
| 46 | def month_split(dti, date_list): | 80 | def month_split(dti, date_list, date_statistics): |
| 47 | month_list = [] | 81 | month_list = [] |
| 48 | idx_list = [] | 82 | idx_list = [] |
| 49 | month_pre = None | 83 | month_pre = None |
| ... | @@ -53,11 +87,13 @@ class BSWorkbook(Workbook): | ... | @@ -53,11 +87,13 @@ class BSWorkbook(Workbook): |
| 53 | if month_str != month_pre: | 87 | if month_str != month_pre: |
| 54 | month_list.append(month_str) | 88 | month_list.append(month_str) |
| 55 | if month_pre is None: | 89 | if month_pre is None: |
| 90 | if date_statistics: | ||
| 56 | date_list.append(dti[idx].date()) | 91 | date_list.append(dti[idx].date()) |
| 57 | idx = 0 | 92 | idx = 0 |
| 58 | idx_list.append(idx) | 93 | idx_list.append(idx) |
| 59 | month_pre = month_str | 94 | month_pre = month_str |
| 60 | for idx in range(len(dti)-1, -1, -1): | 95 | if date_statistics: |
| 96 | for idx in range(len(dti) - 1, -1, -1): | ||
| 61 | if isinstance(dti[idx], NaTType): | 97 | if isinstance(dti[idx], NaTType): |
| 62 | continue | 98 | continue |
| 63 | date_list.append(dti[idx].date()) | 99 | date_list.append(dti[idx].date()) |
| ... | @@ -86,8 +122,8 @@ class BSWorkbook(Workbook): | ... | @@ -86,8 +122,8 @@ class BSWorkbook(Workbook): |
| 86 | reverse_trend = -1 | 122 | reverse_trend = -1 |
| 87 | return reverse_trend | 123 | return reverse_trend |
| 88 | 124 | ||
| 89 | def sheet_split(self, ws, month_mapping, date_list, reverse_trend_list): | 125 | def sheet_split(self, ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics): |
| 90 | for date_tuple_src in ws.iter_cols(min_col=1, max_col=1, min_row=2, values_only=True): | 126 | for date_tuple_src in ws.iter_cols(min_col=1, max_col=1, min_row=min_row, values_only=True): |
| 91 | date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src] | 127 | date_tuple = [date[:10] if isinstance(date, str) else date for date in date_tuple_src] |
| 92 | dt_array, tz_parsed = tslib.array_to_datetime( | 128 | dt_array, tz_parsed = tslib.array_to_datetime( |
| 93 | np.array(date_tuple, copy=False, dtype=np.object_), | 129 | np.array(date_tuple, copy=False, dtype=np.object_), |
| ... | @@ -95,16 +131,16 @@ class BSWorkbook(Workbook): | ... | @@ -95,16 +131,16 @@ class BSWorkbook(Workbook): |
| 95 | utc=False, | 131 | utc=False, |
| 96 | dayfirst=False, | 132 | dayfirst=False, |
| 97 | yearfirst=False, | 133 | yearfirst=False, |
| 98 | require_iso8601=False, | 134 | require_iso8601=True, |
| 99 | ) | 135 | ) |
| 100 | dti = DatetimeIndex(dt_array, tz=None, name=None) | 136 | dti = DatetimeIndex(dt_array, tz=None, name=None) |
| 101 | 137 | ||
| 102 | month_list, idx_list = self.month_split(dti, date_list) | 138 | month_list, idx_list = self.month_split(dti, date_list, date_statistics) |
| 103 | 139 | ||
| 104 | if len(month_list) == 0: | 140 | if len(month_list) == 0: |
| 105 | # month_info process | 141 | # month_info process |
| 106 | month_info = month_mapping.setdefault('xxxx-xx', []) | 142 | month_info = month_mapping.setdefault('xxxx-xx', []) |
| 107 | month_info.append((ws.title, 2, ws.max_row, 0)) | 143 | month_info.append((ws.title, min_row, ws.max_row, 0)) |
| 108 | elif len(month_list) == 1: | 144 | elif len(month_list) == 1: |
| 109 | # reverse_trend_list process | 145 | # reverse_trend_list process |
| 110 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) | 146 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) |
| ... | @@ -113,14 +149,14 @@ class BSWorkbook(Workbook): | ... | @@ -113,14 +149,14 @@ class BSWorkbook(Workbook): |
| 113 | month_info = month_mapping.setdefault(month_list[0], []) | 149 | month_info = month_mapping.setdefault(month_list[0], []) |
| 114 | day_mean = np.mean(dti.day.dropna()) | 150 | day_mean = np.mean(dti.day.dropna()) |
| 115 | if len(month_info) == 0: | 151 | if len(month_info) == 0: |
| 116 | month_info.append((ws.title, 2, ws.max_row, day_mean)) | 152 | month_info.append((ws.title, min_row, ws.max_row, day_mean)) |
| 117 | else: | 153 | else: |
| 118 | for i, item in enumerate(month_info): | 154 | for i, item in enumerate(month_info): |
| 119 | if day_mean <= item[-1]: | 155 | if day_mean <= item[-1]: |
| 120 | month_info.insert(i, (ws.title, 2, ws.max_row, day_mean)) | 156 | month_info.insert(i, (ws.title, min_row, ws.max_row, day_mean)) |
| 121 | break | 157 | break |
| 122 | else: | 158 | else: |
| 123 | month_info.append((ws.title, 2, ws.max_row, day_mean)) | 159 | month_info.append((ws.title, min_row, ws.max_row, day_mean)) |
| 124 | else: | 160 | else: |
| 125 | # reverse_trend_list process | 161 | # reverse_trend_list process |
| 126 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) | 162 | reverse_trend = self.get_reverse_trend(dti.day, idx_list) |
| ... | @@ -128,34 +164,41 @@ class BSWorkbook(Workbook): | ... | @@ -128,34 +164,41 @@ class BSWorkbook(Workbook): |
| 128 | # month_info process | 164 | # month_info process |
| 129 | for i, item in enumerate(month_list[:-1]): | 165 | for i, item in enumerate(month_list[:-1]): |
| 130 | month_mapping.setdefault(item, []).append( | 166 | month_mapping.setdefault(item, []).append( |
| 131 | (ws.title, idx_list[i] + 2, idx_list[i + 1] + 1, self.MAX_MEAN)) | 167 | (ws.title, idx_list[i] + min_row, idx_list[i + 1] + min_row - 1, self.MAX_MEAN)) |
| 132 | month_mapping.setdefault(month_list[-1], []).insert( | 168 | month_mapping.setdefault(month_list[-1], []).insert( |
| 133 | 0, (ws.title, idx_list[-1] + 2, ws.max_row, 0)) | 169 | 0, (ws.title, idx_list[-1] + min_row, ws.max_row, 0)) |
| 134 | 170 | ||
def build_metadata_rows(self, classify, confidence, role, code, print_time, start_date, end_date):
    """Build the rows that make up the head of a meta sheet.

    :param classify: classification result shown in the sheet.
    :param confidence: recognition confidence shown in the sheet.
    :param role: account holder name shown in the sheet.
    :param code: iterable of rows listed under ``self.code_header``.
    :param print_time: value shown in the first date column.
    :param start_date: start of the covered period, or ``None`` if unknown.
    :param end_date: end of the covered period, or ``None`` if unknown.
    :return: list of row tuples ending with ``self.keyword_header``.
    """
    # Either bound may be None when it could not be determined; leave the
    # interval cell empty in that case instead of raising TypeError.
    if start_date is None or end_date is None:
        date_interval = None
    else:
        date_interval = (end_date - start_date).days

    metadata_rows = [
        ('流水识别置信度', confidence),
        self.blank_row,
        ('分类结果', classify),
        self.blank_row,
        ('户名', role),
        self.blank_row,
        self.code_header,
    ]
    metadata_rows.extend(code)
    metadata_rows.extend([
        self.blank_row,
        self.date_header,
        (print_time, start_date, end_date, date_interval),
        self.blank_row,
        self.keyword_header,
    ])
    return metadata_rows
| 146 | 190 | ||
def create_meta_sheet(self, card):
    """Return the meta sheet for ``card``, titled ``'<meta_sheet_title>(<card>)'``.

    If the first worksheet is still named ``'Sheet'`` it is renamed and
    reused; otherwise a new sheet is created via ``self.create_sheet``.
    """
    title = '{0}({1})'.format(self.meta_sheet_title, card)
    first_ws = self.worksheets[0]
    if first_ws.title != 'Sheet':
        return self.create_sheet(title)
    first_ws.title = title
    return first_ws
| 154 | 198 | ||
def build_meta_sheet(self, card, classify, confidence, role, code, print_time, start_date, end_date):
    """Create the meta sheet for ``card`` and fill it with the metadata rows.

    The rows come from ``self.build_metadata_rows`` and are built before the
    sheet is obtained from ``self.create_meta_sheet``.

    :return: the meta sheet that received the rows.
    """
    rows = self.build_metadata_rows(
        classify, confidence, role, code, print_time, start_date, end_date)
    meta_ws = self.create_meta_sheet(card)
    for meta_row in rows:
        meta_ws.append(meta_row)
    return meta_ws
| ... | @@ -169,55 +212,84 @@ class BSWorkbook(Workbook): | ... | @@ -169,55 +212,84 @@ class BSWorkbook(Workbook): |
| 169 | new_ws.append(consts.FIXED_HEADERS) | 212 | new_ws.append(consts.FIXED_HEADERS) |
| 170 | for part in parts: | 213 | for part in parts: |
| 171 | ws = self.get_sheet_by_name(part[0]) | 214 | ws = self.get_sheet_by_name(part[0]) |
| 172 | for row in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True): | 215 | for row_value in ws.iter_rows(min_row=part[1], max_row=part[2], values_only=True): |
| 173 | new_ws.append(row) | 216 | new_ws.append(row_value) |
| 174 | # 3.2.提取信息、高亮 | 217 | # 3.2.提取信息、高亮 |
| 175 | amount_mapping = {} | 218 | amount_mapping = {} |
| 176 | amount_fill_row = set() | 219 | amount_fill_row = set() |
| 177 | for rows in new_ws.iter_rows(): | 220 | for rows in new_ws.iter_rows(min_row=2): |
| 178 | summary_cell = rows[5] | 221 | summary_cell = rows[consts.SUMMARY_IDX] |
| 179 | date_cell = rows[0] | 222 | date_cell = rows[consts.DATE_IDX] |
| 223 | amount_cell = rows[consts.AMOUNT_IDX] | ||
| 224 | row = summary_cell.row | ||
| 180 | # 关键词1提取 | 225 | # 关键词1提取 |
| 181 | if summary_cell.value in self.interest_keyword: | 226 | if summary_cell.value in self.interest_keyword: |
| 182 | ms.append((summary_cell.value, date_cell.value, rows[2].value)) | 227 | ms.append((summary_cell.value, date_cell.value, amount_cell.value)) |
| 183 | # 关键词2提取至临时表 | 228 | # 关键词2提取至临时表 |
| 184 | elif summary_cell.value in self.salary_keyword: | 229 | elif summary_cell.value in self.salary_keyword: |
| 185 | tmp_ws.append((summary_cell.value, date_cell.value, rows[2].value)) | 230 | tmp_ws.append((summary_cell.value, date_cell.value, amount_cell.value)) |
| 186 | # 贷款关键词高亮 | 231 | # 贷款关键词高亮 |
| 187 | elif summary_cell.value in self.loan_keyword: | 232 | elif summary_cell.value in self.loan_keyword: |
| 188 | summary_cell.fill = self.loan_fill | 233 | summary_cell.fill = self.loan_fill |
| 189 | for i, cell in enumerate(rows): | 234 | |
| 190 | cell.border = self.border | 235 | # 3.3.余额转数值 |
| 191 | if (i == 2 or i == 3) and cell.row > 1: | 236 | over_cell = rows[consts.OVER_IDX] |
| 192 | try: | 237 | try: |
| 193 | # 3.3.金额、余额转数值 | 238 | if isinstance(over_cell.value, str): |
| 194 | cell.value = locale.atof(cell.value) | 239 | over_cell.value = over_cell.value.translate(consts.TRANS) |
| 195 | except Exception: | 240 | over_cell.value = locale.atof(over_cell.value) |
| 241 | except Exception as e: | ||
| 196 | continue | 242 | continue |
| 197 | else: | 243 | else: |
| 198 | cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 | 244 | over_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 |
| 199 | if i == 2: | 245 | |
| 246 | # 3.4.金额转数值 | ||
| 247 | try: | ||
| 248 | try: | ||
| 249 | if isinstance(amount_cell.value, str): | ||
| 250 | amount_cell.value = amount_cell.value.translate(consts.TRANS) | ||
| 251 | amount_cell.value = locale.atof(amount_cell.value) | ||
| 252 | except Exception as e: | ||
| 253 | try: | ||
| 254 | if isinstance(rows[consts.INCOME_IDX].value, str): | ||
| 255 | rows[consts.OUTLAY_IDX].value = rows[consts.INCOME_IDX].value.translate(consts.TRANS) | ||
| 256 | amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value) | ||
| 257 | except Exception as e: | ||
| 258 | if isinstance(rows[consts.OUTLAY_IDX].value, str): | ||
| 259 | rows[consts.OUTLAY_IDX].value = rows[consts.OUTLAY_IDX].value.translate(consts.TRANS) | ||
| 260 | amount_cell.value = locale.atof(rows[consts.OUTLAY_IDX].value) | ||
| 261 | if amount_cell.value > 0: | ||
| 262 | amount_cell.value = -amount_cell.value | ||
| 263 | except Exception as e: | ||
| 264 | continue | ||
| 265 | else: | ||
| 266 | if rows[consts.BORROW_IDX].value in consts.BORROW_OUTLAY_SET: | ||
| 267 | amount_cell.value = -amount_cell.value | ||
| 268 | amount_cell.number_format = numbers.FORMAT_NUMBER_COMMA_SEPARATED1 | ||
| 200 | same_amount_mapping = amount_mapping.get(date_cell.value, {}) | 269 | same_amount_mapping = amount_mapping.get(date_cell.value, {}) |
| 201 | fill_rows = same_amount_mapping.get(-cell.value) | 270 | fill_rows = same_amount_mapping.get(-amount_cell.value) |
| 202 | if fill_rows: | 271 | if fill_rows: |
| 203 | amount_fill_row.add(cell.row) | 272 | amount_fill_row.add(row) |
| 204 | amount_fill_row.update(fill_rows) | 273 | amount_fill_row.update(fill_rows) |
| 205 | amount_mapping.setdefault(date_cell.value, {}).setdefault( | 274 | amount_mapping.setdefault(date_cell.value, {}).setdefault( |
| 206 | cell.value, []).append(cell.row) | 275 | amount_cell.value, []).append(row) |
| 207 | # 3.4.核对结果 | 276 | |
| 208 | # TODO 借贷、开支类型银行流水,需要手动添加+-号 | 277 | # 3.5.核对结果 |
| 209 | if i == 9 and cell.row > 2: | 278 | if row > 2: |
| 210 | if is_reverse: | 279 | if is_reverse: |
| 211 | cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format( | 280 | rows[consts.RESULT_IDX].value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format( |
| 212 | cell.row - 1, cell.row, *self.proof_res) | 281 | row - 1, row, *self.proof_res) |
| 213 | else: | 282 | else: |
| 214 | cell.value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format( | 283 | rows[consts.RESULT_IDX].value = '=IF(D{0}=SUM(D{1},C{0}), "{2}", "{3}")'.format( |
| 215 | cell.row, cell.row - 1, *self.proof_res) | 284 | row, row - 1, *self.proof_res) |
| 285 | |||
| 286 | # 删除金额辅助列 | ||
| 287 | new_ws.delete_cols(consts.BORROW_HEADER_COL, amount=new_ws.max_column) | ||
| 216 | 288 | ||
| 217 | # 3.5.同一天相同进出账高亮 | 289 | # 3.6.同一天相同进出账高亮 |
| 218 | del amount_mapping | 290 | del amount_mapping |
| 219 | for row in amount_fill_row: | 291 | for row in amount_fill_row: |
| 220 | new_ws[row][2].fill = self.amount_fill | 292 | new_ws[row][consts.AMOUNT_IDX].fill = self.amount_fill |
| 221 | 293 | ||
| 222 | # 关键词2信息提取 | 294 | # 关键词2信息提取 |
| 223 | ms.append(self.blank_row) | 295 | ms.append(self.blank_row) |
| ... | @@ -226,34 +298,51 @@ class BSWorkbook(Workbook): | ... | @@ -226,34 +298,51 @@ class BSWorkbook(Workbook): |
| 226 | ms.append(row) | 298 | ms.append(row) |
| 227 | self.remove(tmp_ws) | 299 | self.remove(tmp_ws) |
| 228 | 300 | ||
| 229 | def rebuild(self, role_summary): | 301 | def bs_rebuild(self, bs_summary): |
| 230 | # (sheet_name, confidence, page, code, print_time, start_date, end_date) # TODO 表名简化,+卡号 | 302 | # bs_summary = { |
| 231 | for role, summary_list in role_summary.items(): | 303 | # '卡号': { |
| 304 | # 'classify': 0, | ||
| 305 | # 'confidence': 0.9, | ||
| 306 | # 'role': '柳雪', | ||
| 307 | # 'code': [('page', 'code')], | ||
| 308 | # 'print_time': 'datetime', | ||
| 309 | # 'start_date': 'datetime', | ||
| 310 | # 'end_date': 'datetime', | ||
| 311 | # 'sheet': ['sheet_name'] | ||
| 312 | # } | ||
| 313 | # } | ||
| 314 | for card, summary in bs_summary.items(): | ||
| 232 | # 1.原表修剪、排列、按照月份分割 | 315 | # 1.原表修剪、排列、按照月份分割 |
| 233 | reverse_trend_list = [] | 316 | start_date = summary['start_date'] |
| 234 | confidence_max = 0 | 317 | end_date = summary['end_date'] |
| 235 | code_list = [] | 318 | date_statistics = False |
| 236 | month_mapping = {} | 319 | if start_date is None or end_date is None: |
| 320 | date_statistics = True | ||
| 237 | date_list = [] | 321 | date_list = [] |
| 238 | start_date = end_date = date_interval = print_time = None | 322 | month_mapping = {} |
| 239 | for summary in summary_list: | 323 | reverse_trend_list = [] |
| 240 | sheet_name, confidence, page, code, print_time_local, start_date_local, end_date_local = summary | 324 | for sheet in summary['sheet']: |
| 241 | ws = self.get_sheet_by_name(sheet_name) | 325 | ws = self.get_sheet_by_name(sheet) |
| 242 | # 1.1.删除多余列、排列 | 326 | # 1.1.删除多余列、排列 |
| 243 | self.sheet_prune(ws) | 327 | min_row = self.sheet_prune(ws, summary['classify']) |
| 244 | # 1.2.按月份分割 | 328 | # 1.2.按月份分割 |
| 245 | self.sheet_split(ws, month_mapping, date_list, reverse_trend_list) | 329 | self.sheet_split(ws, month_mapping, reverse_trend_list, min_row, date_list, date_statistics) |
| 246 | # 1.3.元数据处理 TODO 时间与日期处理 | 330 | |
| 247 | confidence_max = max(confidence, confidence_max) | 331 | if date_statistics is True and len(date_list) > 1: |
| 248 | if code is not None: | 332 | start_date = min(date_list) if start_date is None else start_date |
| 249 | code_list.append((page, code)) | 333 | end_date = max(date_list) if end_date is None else end_date |
| 250 | 334 | ||
| 251 | if len(date_list) > 1: | ||
| 252 | start_date = min(date_list) | ||
| 253 | end_date = max(date_list) | ||
| 254 | date_interval = (end_date - start_date).days | ||
| 255 | # 2.元信息提取表 | 335 | # 2.元信息提取表 |
| 256 | ms = self.build_meta_sheet(role, confidence_max, code_list, print_time, start_date, end_date, date_interval) | 336 | bank_name = consts.CLASSIFY_LIST[summary['classify']][0] |
| 337 | base_sheet_name = '{0}_{1}'.format(bank_name, summary['role']) | ||
| 338 | ms = self.build_meta_sheet(card, | ||
| 339 | summary['classify'], | ||
| 340 | summary['confidence'], | ||
| 341 | summary['role'], | ||
| 342 | summary['code'], | ||
| 343 | summary['print_time'], | ||
| 344 | start_date, | ||
| 345 | end_date) | ||
| 257 | 346 | ||
| 258 | # 3.创建月份表、提取/高亮关键行 | 347 | # 3.创建月份表、提取/高亮关键行 |
| 259 | is_reverse = False | 348 | is_reverse = False |
| ... | @@ -261,8 +350,11 @@ class BSWorkbook(Workbook): | ... | @@ -261,8 +350,11 @@ class BSWorkbook(Workbook): |
| 261 | is_reverse = True | 350 | is_reverse = True |
| 262 | for month_list in month_mapping.values(): | 351 | for month_list in month_mapping.values(): |
| 263 | month_list.sort(key=lambda x: x[-1], reverse=True) | 352 | month_list.sort(key=lambda x: x[-1], reverse=True) |
| 264 | self.build_month_sheet(role, month_mapping, ms, is_reverse) | 353 | self.build_month_sheet(base_sheet_name, month_mapping, ms, is_reverse) |
| 354 | |||
| 355 | # 4.删除原表 | ||
| 356 | for sheet in summary['sheet']: | ||
| 357 | self.remove(self.get_sheet_by_name(sheet)) | ||
| 265 | 358 | ||
| 266 | # 删除原表 | 359 | def rebuild(self, bs_summary, license_summary): |
| 267 | for summary in summary_list: | 360 | self.bs_rebuild(bs_summary) |
| 268 | self.remove(self.get_sheet_by_name(summary[0])) | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
| ... | @@ -25,7 +25,7 @@ class PDFHandler: | ... | @@ -25,7 +25,7 @@ class PDFHandler: |
def __init__(self, path, img_dir_path):
    """Remember the PDF path and image directory and start with empty results.

    :param path: stored as ``self.path``.
    :param img_dir_path: stored as ``self.img_dir_path``.
    """
    self.path, self.img_dir_path = path, img_dir_path
    # Filled elsewhere in the class with xref values that were already written out.
    self.xref_set = set()
    # Filled elsewhere in the class with one tuple per saved image.
    self.img_info_list = []
| 30 | 30 | ||
| 31 | def get_img_save_path(self, pno, img_index=0, ext='png'): | 31 | def get_img_save_path(self, pno, img_index=0, ext='png'): |
| ... | @@ -38,7 +38,7 @@ class PDFHandler: | ... | @@ -38,7 +38,7 @@ class PDFHandler: |
| 38 | pm = page.getPixmap(matrix=trans_2, alpha=False) | 38 | pm = page.getPixmap(matrix=trans_2, alpha=False) |
| 39 | img_save_path = self.get_img_save_path(page.number) | 39 | img_save_path = self.get_img_save_path(page.number) |
| 40 | pm.writePNG(img_save_path) | 40 | pm.writePNG(img_save_path) |
| 41 | self.img_path_list.append(img_save_path) | 41 | self.img_info_list.append((img_save_path, page.number, 0)) |
| 42 | 42 | ||
| 43 | @staticmethod | 43 | @staticmethod |
| 44 | def getimage(pix): | 44 | def getimage(pix): |
| ... | @@ -88,7 +88,7 @@ class PDFHandler: | ... | @@ -88,7 +88,7 @@ class PDFHandler: |
| 88 | with open(img_save_path, "wb") as f: | 88 | with open(img_save_path, "wb") as f: |
| 89 | f.write(img_data) | 89 | f.write(img_data) |
| 90 | self.xref_set.add(xref) | 90 | self.xref_set.add(xref) |
| 91 | self.img_path_list.append(img_save_path) | 91 | self.img_info_list.append((img_save_path, pno, img_index)) |
| 92 | 92 | ||
| 93 | @staticmethod | 93 | @staticmethod |
| 94 | def split_il(il): | 94 | def split_il(il): |
| ... | @@ -179,7 +179,7 @@ class PDFHandler: | ... | @@ -179,7 +179,7 @@ class PDFHandler: |
| 179 | img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2]) | 179 | img_save_path = self.get_img_save_path(pno, img_index, im_list[0][2]) |
| 180 | new_img.save(img_save_path) | 180 | new_img.save(img_save_path) |
| 181 | page_to_png = False | 181 | page_to_png = False |
| 182 | self.img_path_list.append(img_save_path) | 182 | self.img_info_list.append((img_save_path, pno, img_index)) |
| 183 | 183 | ||
| 184 | # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片 | 184 | # 3.3 碎图分组大于2、全过滤、含特殊filter,特殊处理:整个页面保存为png图片 |
| 185 | if page_to_png: | 185 | if page_to_png: | ... | ... |
-
Please register or sign in to post a comment