a220590e by 周伟奇

update ocr url

1 parent 10a4a80f
...@@ -28,16 +28,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -28,16 +28,8 @@ class Command(BaseCommand, LoggerMixin):
28 self.switch = True 28 self.switch = True
29 # 数据目录 29 # 数据目录
30 self.data_dir = conf.DATA_DIR 30 self.data_dir = conf.DATA_DIR
31 # pdf页面转图片
32 self.zoom_x = 2.0
33 self.zoom_y = 2.0
34 self.trans = fitz.Matrix(self.zoom_x, self.zoom_y).preRotate(0) # zoom factor 2 in each dimension
35 # ocr相关 31 # ocr相关
36 self.ocr_url = conf.OCR_URL 32 self.ocr_url = conf.OCR_URL
37 self.ocr_header = {
38 'X-Auth-Token': conf.OCR_TOKEN,
39 'Content-Type': 'application/json'
40 }
41 # EDMS web_service_api 33 # EDMS web_service_api
42 self.edms = EDMS(conf.EDMS_USER, conf.EDMS_PWD) 34 self.edms = EDMS(conf.EDMS_USER, conf.EDMS_PWD)
43 # 优雅退出信号:15 35 # 优雅退出信号:15
...@@ -103,12 +95,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -103,12 +95,6 @@ class Command(BaseCommand, LoggerMixin):
103 words = cell.get('words') 95 words = cell.get('words')
104 ws.cell(row=r1+1, column=c1+1, value=words) 96 ws.cell(row=r1+1, column=c1+1, value=words)
105 97
106 @staticmethod
107 def get_ocr_json(img_path):
108 with open(img_path, "rb") as f:
109 base64_data = base64.b64encode(f.read())
110 return {'imgBase64': base64_data.decode('utf-8')}
111
112 # async def fetch_ocr_result(self, img_path): 98 # async def fetch_ocr_result(self, img_path):
113 # async with aiohttp.ClientSession( 99 # async with aiohttp.ClientSession(
114 # headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False) 100 # headers=self.ocr_header, connector=aiohttp.TCPConnector(ssl=False)
...@@ -125,15 +111,20 @@ class Command(BaseCommand, LoggerMixin): ...@@ -125,15 +111,20 @@ class Command(BaseCommand, LoggerMixin):
125 # self.append_sheet(wb, sheets_list, img_name, role_summary) 111 # self.append_sheet(wb, sheets_list, img_name, role_summary)
126 112
127 def fetch_ocr_result(self, img_path): 113 def fetch_ocr_result(self, img_path):
128 json_data = self.get_ocr_json(img_path) 114 # payload = {'name': 'page_0_img_0_0'}
129 response = requests.post(self.ocr_url, json=json_data, headers=self.ocr_header) 115 files = [
116 ('img', open(img_path, 'rb'))
117 ]
118 response = requests.request("POST", self.ocr_url, files=files)
130 return response.json() 119 return response.json()
131 120
132 def img_ocr_excel(self, wb, img_path, role_summary): 121 def img_ocr_excel(self, wb, img_path, role_summary):
133 res = self.fetch_ocr_result(img_path) 122 res = self.fetch_ocr_result(img_path)
134 self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res)) 123 self.cronjob_log.info('{0} [fetch ocr result success] [img={1}] [res={2}]'.format(self.log_base, img_path, res))
135 if res.get('code') == '1': 124 if res.get('code') == 1:
136 sheets_list = res.get('result').get('res') 125 sheets_list = res.get('data')
126 if not sheets_list:
127 return
137 img_name = os.path.basename(img_path) 128 img_name = os.path.basename(img_path)
138 self.append_sheet(wb, sheets_list, img_name, role_summary) 129 self.append_sheet(wb, sheets_list, img_name, role_summary)
139 130
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!