Caoyanyi committed on
Commit
e849b9a
·
1 Parent(s): c5a8d8a

* Adjust requirements.txt

Browse files
Files changed (2) hide show
  1. app.py +194 -63
  2. requirements.txt +1 -0
app.py CHANGED
@@ -15,6 +15,56 @@ import tempfile
15
  paddleocr_available = False
16
  PaddleOCR = None
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # 动态导入PaddleOCR函数
19
  def get_paddleocr():
20
  """动态导入PaddleOCR"""
@@ -28,6 +78,8 @@ def get_paddleocr():
28
  except ImportError as e:
29
  print(f"❌ PaddleOCR动态导入失败: {e}")
30
  paddleocr_available = False
 
 
31
  return PaddleOCR
32
 
33
  try:
@@ -144,55 +196,69 @@ def load_summarizer_model():
144
 
145
  # OCR模型加载函数
146
  def load_ocr_model():
147
- """延迟加载OCR模型"""
148
  global ocr_model, models_loaded, ocr_load_error
149
 
150
  # 重置错误信息
151
  ocr_load_error = None
152
 
153
  if ocr_model is None:
154
- # 动态导入PaddleOCR
155
  _PaddleOCR = get_paddleocr()
156
- if not _PaddleOCR:
157
- error_details = "PaddleOCR未安装或导入失败"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  print(f"❌ {error_details}")
159
  models_loaded["ocr"] = False
160
  ocr_load_error = error_details
161
  return None
162
 
163
- print("Starting to load PaddleOCR model...")
164
- # 尝试多种配置组合,使用PaddleOCR 3.3.2支持的参数
165
- # 注意:PaddleOCR 3.3.2版本已弃用use_gpu参数,改用device参数
166
- configs = [
167
- # 配置1:极简配置,只指定必要参数
168
- {
169
- 'lang': 'ch',
170
- 'device': 'cpu',
171
- }
172
- ]
173
-
174
- # 存储所有错误信息
175
- all_errors = []
176
-
177
- for i, config in enumerate(configs):
178
- try:
179
- print(f"Trying PaddleOCR config {i+1}: {config}")
180
- ocr_model = _PaddleOCR(**config)
181
- models_loaded["ocr"] = True
182
- print(f"Successfully loaded PaddleOCR model with config {i+1}")
183
- return ocr_model
184
- except Exception as e:
185
- error_msg = f"Config {i+1} failed: {str(e)}"
186
- print(error_msg)
187
- all_errors.append(error_msg)
188
- # 继续尝试下一个配置
189
- continue
190
-
191
- # 所有配置都失败
192
- error_details = f"All PaddleOCR configurations failed to load. Errors: {'; '.join(all_errors)}"
193
- print(error_details)
194
  models_loaded["ocr"] = False
195
  ocr_load_error = error_details
 
196
  return ocr_model
197
 
198
  # PDF转图片函数
@@ -240,6 +306,7 @@ def health_check():
240
  "transformers_available": transformers_available,
241
  "ocr_available": {
242
  "paddleocr": paddleocr_available,
 
243
  "pymupdf": fitz_available,
244
  "opencv": cv2_available,
245
  "onnxruntime": onnx_available
@@ -253,12 +320,24 @@ def health_check():
253
  async def ocr_document(file: UploadFile = File(...)):
254
  """OCR文档解析接口,支持PDF和图片"""
255
  try:
256
- # 先尝试动态导入PaddleOCR,更新可用性状态
257
  get_paddleocr()
 
258
 
259
- # 检查OCR相关依赖是否可用
260
- if not paddleocr_available:
261
- return JSONResponse(content={"error": "PaddleOCR模块未安装,OCR功能不可用", "suggestion": "请安装PaddleOCR: pip install paddleocr"}, status_code=503)
 
 
 
 
 
 
 
 
 
 
 
262
 
263
  # 保存临时文件
264
  with tempfile.NamedTemporaryFile(suffix=".tmp", delete=False) as temp_file:
@@ -297,13 +376,17 @@ async def ocr_document(file: UploadFile = File(...)):
297
  # 确保OCR模型已加载
298
  current_ocr_model = load_ocr_model()
299
  if current_ocr_model is None:
300
- # 检查models_loaded状态,返回更详细的错误
301
- if not paddleocr_available:
302
- return JSONResponse(content={"error": "PaddleOCR模块未安装"}, status_code=503)
303
- else:
304
- # 返回更详细的错误信息
305
- error_msg = ocr_load_error or 'OCR模型加载失败'
306
- return JSONResponse(content={"error": f"OCR模型加载失败: {error_msg}"}, status_code=503)
 
 
 
 
307
 
308
  # 执行OCR识别
309
  all_results = []
@@ -313,23 +396,68 @@ async def ocr_document(file: UploadFile = File(...)):
313
  # 图像预处理
314
  preprocessed_img = preprocess_image(img)
315
 
316
- # 执行OCR
317
  try:
318
- result = current_ocr_model.ocr(preprocessed_img, cls=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  except Exception as ocr_err:
320
- # 尝试禁用角度分类
321
- try:
322
- result = current_ocr_model.ocr(preprocessed_img, cls=False)
323
- print("OCR with cls=False succeeded after cls=True failed")
324
- except Exception as ocr_err2:
325
- return JSONResponse(content={"error": f"OCR识别失败: {str(ocr_err2)}"}, status_code=500)
326
-
327
- # 提取文本结果
328
- page_text = []
329
- for line in result[0]:
330
- text = line[1][0]
331
- confidence = line[1][1]
332
- page_text.append({"text": text, "confidence": confidence})
333
 
334
  all_results.append({
335
  "page": page_num,
@@ -346,7 +474,8 @@ async def ocr_document(file: UploadFile = File(...)):
346
  "filename": file.filename,
347
  "page_count": len(all_results),
348
  "pages": all_results,
349
- "full_text": full_document_text
 
350
  })
351
 
352
  finally:
@@ -360,10 +489,12 @@ async def ocr_document(file: UploadFile = File(...)):
360
  "details": str(e),
361
  "services": {
362
  "paddleocr_available": paddleocr_available,
 
363
  "fitz_available": fitz_available,
364
  "cv2_available": cv2_available,
365
  "models_loaded": models_loaded.get("ocr", False)
366
- }
 
367
  }
368
  return JSONResponse(content=error_details, status_code=500)
369
 
 
15
  paddleocr_available = False
16
  PaddleOCR = None
17
 
18
+ # pytesseract作为备选OCR方案
19
+ pytesseract_available = False
20
+ pytesseract = None
21
+ cv2_available = False
22
+ cv2 = None
23
+
24
+ # 动态导入OpenCV
25
def get_opencv():
    """Lazily import OpenCV and cache it in the module-level ``cv2`` global.

    The import is attempted only while ``cv2`` is still None; the outcome is
    recorded in ``cv2_available``.

    Returns:
        The imported ``cv2`` module, or None if the import failed.
    """
    global cv2, cv2_available

    # Already imported on an earlier call: reuse the cached module.
    if cv2 is not None:
        return cv2

    try:
        import cv2 as _cv2
    except ImportError as exc:
        print(f"❌ OpenCV动态导入失败: {exc}")
        cv2_available = False
    else:
        cv2 = _cv2
        cv2_available = True
        print("✅ OpenCV动态导入成功")

    return cv2
38
+
39
+ # 动态导入pytesseract
40
def get_pytesseract():
    """Lazily import pytesseract, the fallback OCR engine.

    On the first call this imports the ``pytesseract`` package, caches it in
    the module-level ``pytesseract`` global and probes the tesseract
    executable with ``get_tesseract_version()``. ``pytesseract_available`` is
    set to True only when that probe succeeds. Once the package has been
    imported, ``get_opencv()`` is called as well. Later calls return the
    cached module without probing again.

    Returns:
        The imported ``pytesseract`` module, or None if the package is not
        installed.
    """
    global pytesseract, pytesseract_available

    if pytesseract is not None:
        return pytesseract

    try:
        import pytesseract as _pytesseract
    except ImportError as e:
        print(f"❌ pytesseract动态导入失败: {e}")
        pytesseract_available = False
        return pytesseract

    pytesseract = _pytesseract

    # Probe the executable. The result is a version object, so it must NOT be
    # assigned to ``tesseract_cmd`` (that would overwrite the executable path
    # and break every later tesseract invocation).
    try:
        _pytesseract.get_tesseract_version()
    except (
        _pytesseract.pytesseract.TesseractNotFoundError,
        _pytesseract.pytesseract.TesseractError,
    ):
        # A missing binary raises TesseractNotFoundError, which is not a
        # TesseractError subclass, so it has to be caught explicitly.
        print("⚠️ pytesseract库已导入,但tesseract可执行文件不可用")
        pytesseract_available = False
    except Exception as e:
        # The probe failed for another reason; the engine cannot be assumed
        # usable, so report it as unavailable.
        print(f"⚠️ pytesseract库已导入,但测试tesseract可执行文件时出错: {e}")
        pytesseract_available = False
    else:
        pytesseract_available = True
        print("✅ pytesseract动态导入成功")

    # Make sure OpenCV is imported too.
    get_opencv()
    return pytesseract
67
+
68
  # 动态导入PaddleOCR函数
69
  def get_paddleocr():
70
  """动态导入PaddleOCR"""
 
78
  except ImportError as e:
79
  print(f"❌ PaddleOCR动态导入失败: {e}")
80
  paddleocr_available = False
81
+ # 尝试备选方案
82
+ get_pytesseract()
83
  return PaddleOCR
84
 
85
  try:
 
196
 
197
  # OCR模型加载函数
198
  def load_ocr_model():
199
+ """延迟加载OCR模型,优先使用PaddleOCR,失败则使用pytesseract"""
200
  global ocr_model, models_loaded, ocr_load_error
201
 
202
  # 重置错误信息
203
  ocr_load_error = None
204
 
205
  if ocr_model is None:
206
+ # 优先尝试PaddleOCR
207
  _PaddleOCR = get_paddleocr()
208
+ if _PaddleOCR:
209
+ print("Starting to load PaddleOCR model...")
210
+ # 尝试多种配置组合,使用PaddleOCR 3.3.2支持的参数
211
+ configs = [
212
+ {
213
+ 'lang': 'ch',
214
+ 'device': 'cpu',
215
+ }
216
+ ]
217
+
218
+ all_errors = []
219
+
220
+ for i, config in enumerate(configs):
221
+ try:
222
+ print(f"Trying PaddleOCR config {i+1}: {config}")
223
+ ocr_model = _PaddleOCR(**config)
224
+ models_loaded["ocr"] = True
225
+ print(f"Successfully loaded PaddleOCR model with config {i+1}")
226
+ return ocr_model
227
+ except Exception as e:
228
+ error_msg = f"Config {i+1} failed: {str(e)}"
229
+ print(error_msg)
230
+ all_errors.append(error_msg)
231
+ continue
232
+
233
+ # 所有PaddleOCR配置都失败,尝试pytesseract
234
+ print(f"All PaddleOCR configurations failed. Trying pytesseract as fallback...")
235
+
236
+ # 尝试pytesseract作为备选方案
237
+ _pytesseract = get_pytesseract()
238
+ if _pytesseract and pytesseract_available:
239
+ print("Using pytesseract as OCR solution...")
240
+ # pytesseract不需要预加载模型,直接使用
241
+ ocr_model = {
242
+ 'type': 'pytesseract',
243
+ 'engine': _pytesseract
244
+ }
245
+ models_loaded["ocr"] = True
246
+ print("Successfully configured pytesseract OCR")
247
+ return ocr_model
248
+ elif _pytesseract:
249
+ # pytesseract库已导入,但tesseract可执行文件不可用
250
+ error_details = "pytesseract库已安装,但tesseract可执行文件不可用,OCR功能无法使用"
251
  print(f"❌ {error_details}")
252
  models_loaded["ocr"] = False
253
  ocr_load_error = error_details
254
  return None
255
 
256
+ # 所有OCR方案都失败
257
+ error_details = "所有OCR方案均不可用(PaddleOCR和pytesseract均未安装或导入失败)"
258
+ print(f"❌ {error_details}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  models_loaded["ocr"] = False
260
  ocr_load_error = error_details
261
+ return None
262
  return ocr_model
263
 
264
  # PDF转图片函数
 
306
  "transformers_available": transformers_available,
307
  "ocr_available": {
308
  "paddleocr": paddleocr_available,
309
+ "pytesseract": pytesseract_available,
310
  "pymupdf": fitz_available,
311
  "opencv": cv2_available,
312
  "onnxruntime": onnx_available
 
320
  async def ocr_document(file: UploadFile = File(...)):
321
  """OCR文档解析接口,支持PDF和图片"""
322
  try:
323
+ # 先尝试动态导入所有OCR选项,更新可用性状态
324
  get_paddleocr()
325
+ get_pytesseract()
326
 
327
+ # 检查是否有可用的OCR解决方案
328
+ if not paddleocr_available and not pytesseract_available:
329
+ return JSONResponse(content={
330
+ "error": "所有OCR方案均不可用",
331
+ "details": {
332
+ "paddleocr": "PaddleOCR模块未安装或不兼容Python 3.13",
333
+ "pytesseract": "pytesseract库已安装,但tesseract可执行文件不可用"
334
+ },
335
+ "suggestions": [
336
+ "对于Python 3.13用户:安装tesseract可执行文件后重试",
337
+ "对于Python 3.10-3.12用户:安装PaddleOCR: pip install paddleocr",
338
+ "tesseract可执行文件下载地址:https://github.com/tesseract-ocr/tesseract/wiki/Downloads"
339
+ ]
340
+ }, status_code=503)
341
 
342
  # 保存临时文件
343
  with tempfile.NamedTemporaryFile(suffix=".tmp", delete=False) as temp_file:
 
376
  # 确保OCR模型已加载
377
  current_ocr_model = load_ocr_model()
378
  if current_ocr_model is None:
379
+ # 返回详细的错误信息
380
+ error_msg = ocr_load_error or 'OCR模型加载失败'
381
+ return JSONResponse(content={
382
+ "error": "OCR模型加载失败",
383
+ "details": error_msg,
384
+ "suggestions": [
385
+ "检查Python版本是否兼容(推荐3.10-3.12用于PaddleOCR)",
386
+ "如果使用Python 3.13,确保tesseract可执行文件已正确安装",
387
+ "查看服务器日志获取更多详细信息"
388
+ ]
389
+ }, status_code=503)
390
 
391
  # 执行OCR识别
392
  all_results = []
 
396
  # 图像预处理
397
  preprocessed_img = preprocess_image(img)
398
 
399
+ # 执行OCR,根据模型类型使用不同的调用方式
400
  try:
401
+ # 检查模型类型,处理不同OCR库的差异
402
+ if isinstance(current_ocr_model, dict) and current_ocr_model['type'] == 'pytesseract': # pytesseract
403
+ # pytesseract调用方式
404
+ pytesseract_engine = current_ocr_model['engine']
405
+
406
+ # 使用PIL Image或numpy array
407
+ if isinstance(preprocessed_img, np.ndarray):
408
+ # 转换为PIL Image
409
+ from PIL import Image
410
+ img_pil = Image.fromarray(preprocessed_img)
411
+ # 执行OCR
412
+ result = pytesseract_engine.image_to_data(img_pil, output_type=pytesseract_engine.Output.DICT)
413
+
414
+ # 提取文本结果 - pytesseract格式
415
+ page_text = []
416
+ for i in range(len(result['text'])):
417
+ text = result['text'][i].strip()
418
+ if text:
419
+ confidence = float(result['conf'][i]) / 100.0 # 转换为0-1范围
420
+ page_text.append({"text": text, "confidence": confidence})
421
+ else:
422
+ # 直接使用图像
423
+ result = pytesseract_engine.image_to_data(preprocessed_img, output_type=pytesseract_engine.Output.DICT)
424
+ page_text = []
425
+ for i in range(len(result['text'])):
426
+ text = result['text'][i].strip()
427
+ if text:
428
+ confidence = float(result['conf'][i]) / 100.0
429
+ page_text.append({"text": text, "confidence": confidence})
430
+ elif hasattr(current_ocr_model, 'ocr'): # PaddleOCR
431
+ try:
432
+ result = current_ocr_model.ocr(preprocessed_img, cls=True)
433
+ except Exception as ocr_err:
434
+ # 尝试禁用角度分类
435
+ try:
436
+ result = current_ocr_model.ocr(preprocessed_img, cls=False)
437
+ print("OCR with cls=False succeeded after cls=True failed")
438
+ except Exception as ocr_err2:
439
+ return JSONResponse(content={"error": f"OCR识别失败: {str(ocr_err2)}"}, status_code=500)
440
+
441
+ # 提取文本结果 - PaddleOCR格式
442
+ page_text = []
443
+ for line in result[0]:
444
+ text = line[1][0]
445
+ confidence = line[1][1]
446
+ page_text.append({"text": text, "confidence": confidence})
447
+ elif hasattr(current_ocr_model, 'readtext'): # EasyOCR
448
+ # EasyOCR调用方式
449
+ result = current_ocr_model.readtext(preprocessed_img)
450
+
451
+ # 提取文本结果 - EasyOCR格式
452
+ page_text = []
453
+ for detection in result:
454
+ text = detection[1]
455
+ confidence = detection[2]
456
+ page_text.append({"text": text, "confidence": confidence})
457
+ else:
458
+ return JSONResponse(content={"error": "未知的OCR模型类型"}, status_code=500)
459
  except Exception as ocr_err:
460
+ return JSONResponse(content={"error": f"OCR识别失败: {str(ocr_err)}"}, status_code=500)
 
 
 
 
 
 
 
 
 
 
 
 
461
 
462
  all_results.append({
463
  "page": page_num,
 
474
  "filename": file.filename,
475
  "page_count": len(all_results),
476
  "pages": all_results,
477
+ "full_text": full_document_text,
478
+ "ocr_engine": "paddleocr" if hasattr(current_ocr_model, 'ocr') else "pytesseract"
479
  })
480
 
481
  finally:
 
489
  "details": str(e),
490
  "services": {
491
  "paddleocr_available": paddleocr_available,
492
+ "pytesseract_available": pytesseract_available,
493
  "fitz_available": fitz_available,
494
  "cv2_available": cv2_available,
495
  "models_loaded": models_loaded.get("ocr", False)
496
+ },
497
+ "suggestion": "查看服务器日志获取更多详细信息,或尝试使用兼容的Python版本"
498
  }
499
  return JSONResponse(content=error_details, status_code=500)
500
 
requirements.txt CHANGED
@@ -15,6 +15,7 @@ paddleocr
15
  pymupdf
16
  opencv-python-headless
17
  onnxruntime
 
18
  # LayoutLM相关
19
  transformers[onnx]
20
  # 图像处理和文档处理
 
15
  pymupdf
16
  opencv-python-headless
17
  onnxruntime
18
+ pytesseract
19
  # LayoutLM相关
20
  transformers[onnx]
21
  # 图像处理和文档处理