pdf_tools.py
2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import cv2
import fitz
import numpy as np
# Text fragments stamped onto pages by the PDF-XChange trial watermark.  A page
# whose only text is made of these fragments is still treated as "textless".
_WATERMARK_FRAGMENTS = (
    "Click to buy NOW!",
    "PDF-XChange",
    "www.docu-track.com",
)


def _strip_watermark(text):
    """Return *text* without the known watermark fragments and outer whitespace."""
    # NOTE: str.strip(chars) treats its argument as a *set of characters*, so it
    # must not be used to remove a phrase; remove the exact substrings instead.
    for fragment in _WATERMARK_FRAGMENTS:
        text = text.replace(fragment, "")
    return text.strip()


def _decode_image(data):
    """Decode encoded image bytes into a colour OpenCV image (None on failure)."""
    return cv2.imdecode(np.frombuffer(data, dtype=np.uint8), cv2.IMREAD_COLOR)


def _render_page(page, dpi):
    """Rasterise a whole PDF page at *dpi* and return it as an OpenCV image."""
    pix = page.get_pixmap(dpi=dpi)
    return _decode_image(pix.tobytes(output="png"))


def _stitch_patches(patches):
    """Join image patches into one image, or return None if they do not fit.

    Vertical stacking (axis 0) is tried first, then horizontal (axis 1).
    """
    for axis in (0, 1):
        try:
            return np.concatenate(patches, axis=axis)
        except ValueError:
            # Shapes are incompatible along this axis; try the next one.
            continue
    return None


def pdf_to_images(pdf_path: str, dpi: int = 350):
    """Convert every page of a PDF into OpenCV images.

    For a page that carries no text (apart from a known watermark) but does
    contain embedded images, the embedded images are used directly: they are
    stitched into a single image when their shapes allow it.  According to the
    original author's notes, some PDF producers (print-to-PDF drivers,
    Ghostscript, ...) split one scanned page into several image fragments,
    which is why stitching is attempted.  Every other page is rasterised.

    Args:
        pdf_path (str): Path of the PDF file to open.
        dpi (int): Resolution used when a page has to be rasterised.
            Defaults to 350, the value that was previously hard-coded.

    Returns:
        list: One entry per page, in page order.  Each entry is itself a list
        of images (arrays as returned by ``cv2.imdecode``); it normally holds
        one image, or two when a textless page contains exactly two embedded
        images that cannot be stitched together.
    """
    images = []
    # The context manager guarantees the document is closed even on errors.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = _strip_watermark(page.get_text().replace("\n", ""))
            if text:
                # The page has real text: render it as a whole.
                images.append([_render_page(page, dpi)])
                continue

            blocks = page.get_text("dict")["blocks"]
            # Block type 1 marks an image block in the "dict" text output.
            image_blocks = [b for b in blocks if b["type"] == 1]
            if not image_blocks:
                images.append([_render_page(page, dpi)])
                continue

            patches = [_decode_image(b["image"]) for b in image_blocks]
            if any(patch is None for patch in patches):
                # At least one embedded image could not be decoded; rendering
                # the page is the only way to get its full content.
                images.append([_render_page(page, dpi)])
                continue

            stitched = _stitch_patches(patches)
            if stitched is not None:
                page_images = [stitched]
            elif len(patches) == 2:
                # Two images that cannot be joined are assumed to be two
                # separate pictures; with more than two we cannot tell.
                page_images = patches
            else:
                page_images = [_render_page(page, dpi)]
            images.append(page_images)
    return images