Spaces:

caoyanyi
/

ai

Running

App Files Files Community

Caoyanyi committed 3 days ago

Commit

e849b9a

1 Parent(s): c5a8d8a

* Adjust requirements.txt

Browse files

Files changed (2) hide show

app.py +194 -63
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -15,6 +15,56 @@ import tempfile
 paddleocr_available = False
 PaddleOCR = None
 # 动态导入PaddleOCR函数
 def get_paddleocr():
     """动态导入PaddleOCR"""
@@ -28,6 +78,8 @@ def get_paddleocr():
         except ImportError as e:
             print(f"❌ PaddleOCR动态导入失败: {e}")
             paddleocr_available = False
     return PaddleOCR
 try:
@@ -144,55 +196,69 @@ def load_summarizer_model():
 # OCR模型加载函数
 def load_ocr_model():
-    """延迟加载OCR模型"""
     global ocr_model, models_loaded, ocr_load_error
     # 重置错误信息
     ocr_load_error = None
     if ocr_model is None:
-        # 动态导入PaddleOCR
         _PaddleOCR = get_paddleocr()
-        if not _PaddleOCR:
-            error_details = "PaddleOCR未安装或导入失败"
             print(f"❌ {error_details}")
             models_loaded["ocr"] = False
             ocr_load_error = error_details
             return None
-        print("Starting to load PaddleOCR model...")
-        # 尝试多种配置组合，使用PaddleOCR 3.3.2支持的参数
-        # 注意：PaddleOCR 3.3.2版本已弃用use_gpu参数，改用device参数
-        configs = [
-            # 配置1：极简配置，只指定必要参数
-            {
-                'lang': 'ch',
-                'device': 'cpu',
-            }
-        ]
-        # 存储所有错误信息
-        all_errors = []
-        for i, config in enumerate(configs):
-            try:
-                print(f"Trying PaddleOCR config {i+1}: {config}")
-                ocr_model = _PaddleOCR(**config)
-                models_loaded["ocr"] = True
-                print(f"Successfully loaded PaddleOCR model with config {i+1}")
-                return ocr_model
-            except Exception as e:
-                error_msg = f"Config {i+1} failed: {str(e)}"
-                print(error_msg)
-                all_errors.append(error_msg)
-                # 继续尝试下一个配置
-                continue
-        # 所有配置都失败
-        error_details = f"All PaddleOCR configurations failed to load. Errors: {'; '.join(all_errors)}"
-        print(error_details)
         models_loaded["ocr"] = False
         ocr_load_error = error_details
     return ocr_model
 # PDF转图片函数
@@ -240,6 +306,7 @@ def health_check():
             "transformers_available": transformers_available,
             "ocr_available": {
                 "paddleocr": paddleocr_available,
                 "pymupdf": fitz_available,
                 "opencv": cv2_available,
                 "onnxruntime": onnx_available
@@ -253,12 +320,24 @@ def health_check():
 async def ocr_document(file: UploadFile = File(...)):
     """OCR文档解析接口，支持PDF和图片"""
     try:
-        # 先尝试动态导入PaddleOCR，更新可用性状态
         get_paddleocr()
-        # 检查OCR相关依赖是否可用
-        if not paddleocr_available:
-            return JSONResponse(content={"error": "PaddleOCR模块未安装，OCR功能不可用", "suggestion": "请安装PaddleOCR: pip install paddleocr"}, status_code=503)
         # 保存临时文件
         with tempfile.NamedTemporaryFile(suffix=".tmp", delete=False) as temp_file:
@@ -297,13 +376,17 @@ async def ocr_document(file: UploadFile = File(...)):
             # 确保OCR模型已加载
             current_ocr_model = load_ocr_model()
             if current_ocr_model is None:
-                # 检查models_loaded状态，返回更详细的错误
-                if not paddleocr_available:
-                    return JSONResponse(content={"error": "PaddleOCR模块未安装"}, status_code=503)
-                else:
-                    # 返回更详细的错误信息
-                    error_msg = ocr_load_error or 'OCR模型加载失败'
-                    return JSONResponse(content={"error": f"OCR模型加载失败: {error_msg}"}, status_code=503)
             # 执行OCR识别
             all_results = []
@@ -313,23 +396,68 @@ async def ocr_document(file: UploadFile = File(...)):
                 # 图像预处理
                 preprocessed_img = preprocess_image(img)
-                # 执行OCR
                 try:
-                    result = current_ocr_model.ocr(preprocessed_img, cls=True)
                 except Exception as ocr_err:
-                    # 尝试禁用角度分类
-                    try:
-                        result = current_ocr_model.ocr(preprocessed_img, cls=False)
-                        print("OCR with cls=False succeeded after cls=True failed")
-                    except Exception as ocr_err2:
-                        return JSONResponse(content={"error": f"OCR识别失败: {str(ocr_err2)}"}, status_code=500)
-                # 提取文本结果
-                page_text = []
-                for line in result[0]:
-                    text = line[1][0]
-                    confidence = line[1][1]
-                    page_text.append({"text": text, "confidence": confidence})
                 all_results.append({
                     "page": page_num,
@@ -346,7 +474,8 @@ async def ocr_document(file: UploadFile = File(...)):
                 "filename": file.filename,
                 "page_count": len(all_results),
                 "pages": all_results,
-                "full_text": full_document_text
             })
         finally:
@@ -360,10 +489,12 @@ async def ocr_document(file: UploadFile = File(...)):
             "details": str(e),
             "services": {
                 "paddleocr_available": paddleocr_available,
                 "fitz_available": fitz_available,
                 "cv2_available": cv2_available,
                 "models_loaded": models_loaded.get("ocr", False)
-            }
         }
         return JSONResponse(content=error_details, status_code=500)

 paddleocr_available = False
 PaddleOCR = None
+# pytesseract作为备选OCR方案
+pytesseract_available = False
+pytesseract = None
+cv2_available = False
+cv2 = None
+# 动态导入OpenCV
+def get_opencv():
+    """Import OpenCV on first use and cache it in the module-level ``cv2``.
+
+    Returns the cached ``cv2`` module, or ``None`` when the import failed.
+    ``cv2_available`` is updated to reflect the outcome of the attempt.
+    """
+    global cv2, cv2_available
+    # A previous call already imported the module: reuse it.
+    if cv2 is not None:
+        return cv2
+    try:
+        import cv2 as opencv_module
+    except ImportError as import_error:
+        print(f"❌ OpenCV动态导入失败: {import_error}")
+        cv2_available = False
+        return cv2
+    cv2 = opencv_module
+    cv2_available = True
+    print("✅ OpenCV动态导入成功")
+    return cv2
+# 动态导入pytesseract
+def get_pytesseract():
+    """Import pytesseract on first use as the fallback OCR engine.
+
+    On the first call the ``pytesseract`` package is imported, cached in the
+    module-level ``pytesseract`` global, and the tesseract executable is
+    probed with ``get_tesseract_version()``. ``pytesseract_available`` is set
+    to True only when that probe succeeds. Later calls return the cached
+    module without probing again.
+
+    Returns:
+        The imported ``pytesseract`` module, or ``None`` when the package
+        itself could not be imported.
+    """
+    global pytesseract, pytesseract_available
+    if pytesseract is not None:
+        return pytesseract
+    try:
+        import pytesseract as _pytesseract
+    except ImportError as e:
+        print(f"❌ pytesseract动态导入失败: {e}")
+        pytesseract_available = False
+        return pytesseract
+    pytesseract = _pytesseract
+    try:
+        # Probe only. The result is a version object, so it must never be
+        # written to ``tesseract_cmd`` (that attribute holds the path of the
+        # executable used by every later OCR call).
+        _pytesseract.get_tesseract_version()
+    except Exception as e:
+        # A missing binary raises TesseractNotFoundError, which is not a
+        # TesseractError; treat every probe failure as "engine unusable"
+        # instead of reporting the fallback as available.
+        print(f"⚠️  pytesseract库已导入，但测试tesseract可执行文件时出错: {e}")
+        pytesseract_available = False
+    else:
+        pytesseract_available = True
+        print("✅ pytesseract动态导入成功")
+    # Make sure OpenCV is imported as well.
+    get_opencv()
+    return pytesseract
 # 动态导入PaddleOCR函数
 def get_paddleocr():
     """动态导入PaddleOCR"""
         except ImportError as e:
             print(f"❌ PaddleOCR动态导入失败: {e}")
             paddleocr_available = False
+            # 尝试备选方案
+            get_pytesseract()
     return PaddleOCR
 try:
 # OCR模型加载函数
 def load_ocr_model():
+    """Load the OCR backend on first use: PaddleOCR first, pytesseract as fallback.
+
+    Returns the cached ``ocr_model`` when one is already loaded. Otherwise
+    each PaddleOCR config is tried, then pytesseract. On success the backend
+    is stored in ``ocr_model`` and ``models_loaded["ocr"]`` is set to True.
+    On failure ``models_loaded["ocr"]`` is set to False, ``ocr_load_error``
+    holds the reason (including any PaddleOCR initialisation errors) and
+    None is returned.
+
+    Returns:
+        A PaddleOCR instance, a ``{'type': 'pytesseract', 'engine': module}``
+        dict, or None when no backend could be set up.
+    """
     global ocr_model, models_loaded, ocr_load_error
+    # Clear the error recorded by a previous call.
     ocr_load_error = None
     if ocr_model is None:
+        # PaddleOCR init failures, kept so the final error can report them.
+        all_errors = []
+        # Prefer PaddleOCR.
         _PaddleOCR = get_paddleocr()
+        if _PaddleOCR:
+            print("Starting to load PaddleOCR model...")
+            # Candidate constructor configs (written against PaddleOCR 3.3.2).
+            configs = [
+                {
+                    'lang': 'ch',
+                    'device': 'cpu',
+                }
+            ]
+            for i, config in enumerate(configs):
+                try:
+                    print(f"Trying PaddleOCR config {i+1}: {config}")
+                    ocr_model = _PaddleOCR(**config)
+                    models_loaded["ocr"] = True
+                    print(f"Successfully loaded PaddleOCR model with config {i+1}")
+                    return ocr_model
+                except Exception as e:
+                    # Record the failure and move on to the next config.
+                    error_msg = f"Config {i+1} failed: {str(e)}"
+                    print(error_msg)
+                    all_errors.append(error_msg)
+            # Every PaddleOCR config failed; fall through to pytesseract.
+            print("All PaddleOCR configurations failed. Trying pytesseract as fallback...")
+        # Fallback engine.
+        _pytesseract = get_pytesseract()
+        if _pytesseract and pytesseract_available:
+            print("Using pytesseract as OCR solution...")
+            # pytesseract has no model to preload; wrap the module in a
+            # tagged dict so callers can tell it apart from PaddleOCR.
+            ocr_model = {
+                'type': 'pytesseract',
+                'engine': _pytesseract
+            }
+            models_loaded["ocr"] = True
+            print("Successfully configured pytesseract OCR")
+            return ocr_model
+        if _pytesseract:
+            # Library imported, but the tesseract executable is unusable.
+            error_details = "pytesseract库已安装，但tesseract可执行文件不可用，OCR功能无法使用"
+        else:
+            # No OCR backend could be imported at all.
+            error_details = "所有OCR方案均不可用（PaddleOCR和pytesseract均未安装或导入失败）"
+        if all_errors:
+            # Surface why PaddleOCR failed instead of silently dropping it.
+            error_details += f" PaddleOCR errors: {'; '.join(all_errors)}"
+        print(f"❌ {error_details}")
         models_loaded["ocr"] = False
         ocr_load_error = error_details
+        return None
     return ocr_model
 # PDF转图片函数
             "transformers_available": transformers_available,
             "ocr_available": {
                 "paddleocr": paddleocr_available,
+                "pytesseract": pytesseract_available,
                 "pymupdf": fitz_available,
                 "opencv": cv2_available,
                 "onnxruntime": onnx_available
 async def ocr_document(file: UploadFile = File(...)):
     """OCR文档解析接口，支持PDF和图片"""
     try:
+        # 先尝试动态导入所有OCR选项，更新可用性状态
         get_paddleocr()
+        get_pytesseract()
+        # 检查是否有可用的OCR解决方案
+        if not paddleocr_available and not pytesseract_available:
+            return JSONResponse(content={
+                "error": "所有OCR方案均不可用",
+                "details": {
+                    "paddleocr": "PaddleOCR模块未安装或不兼容Python 3.13",
+                    "pytesseract": "pytesseract库已安装，但tesseract可执行文件不可用"
+                },
+                "suggestions": [
+                    "对于Python 3.13用户：安装tesseract可执行文件后重试",
+                    "对于Python 3.10-3.12用户：安装PaddleOCR: pip install paddleocr",
+                    "tesseract可执行文件下载地址：https://github.com/tesseract-ocr/tesseract/wiki/Downloads"
+                ]
+            }, status_code=503)
         # 保存临时文件
         with tempfile.NamedTemporaryFile(suffix=".tmp", delete=False) as temp_file:
             # 确保OCR模型已加载
             current_ocr_model = load_ocr_model()
             if current_ocr_model is None:
+                # 返回详细的错误信息
+                error_msg = ocr_load_error or 'OCR模型加载失败'
+                return JSONResponse(content={
+                    "error": "OCR模型加载失败",
+                    "details": error_msg,
+                    "suggestions": [
+                        "检查Python版本是否兼容（推荐3.10-3.12用于PaddleOCR）",
+                        "如果使用Python 3.13，确保tesseract可执行文件已正确安装",
+                        "查看服务器日志获取更多详细信息"
+                    ]
+                }, status_code=503)
             # 执行OCR识别
             all_results = []
                 # 图像预处理
                 preprocessed_img = preprocess_image(img)
+                # 执行OCR，根据模型类型使用不同的调用方式
                 try:
+                    # 检查模型类型，处理不同OCR库的差异
+                    if isinstance(current_ocr_model, dict) and current_ocr_model['type'] == 'pytesseract':  # pytesseract
+                        # pytesseract调用方式
+                        pytesseract_engine = current_ocr_model['engine']
+                        # 使用PIL Image或numpy array
+                        if isinstance(preprocessed_img, np.ndarray):
+                            # 转换为PIL Image
+                            from PIL import Image
+                            img_pil = Image.fromarray(preprocessed_img)
+                            # 执行OCR
+                            result = pytesseract_engine.image_to_data(img_pil, output_type=pytesseract_engine.Output.DICT)
+                            # 提取文本结果 - pytesseract格式
+                            page_text = []
+                            for i in range(len(result['text'])):
+                                text = result['text'][i].strip()
+                                if text:
+                                    confidence = float(result['conf'][i]) / 100.0  # 转换为0-1范围
+                                    page_text.append({"text": text, "confidence": confidence})
+                        else:
+                            # 直接使用图像
+                            result = pytesseract_engine.image_to_data(preprocessed_img, output_type=pytesseract_engine.Output.DICT)
+                            page_text = []
+                            for i in range(len(result['text'])):
+                                text = result['text'][i].strip()
+                                if text:
+                                    confidence = float(result['conf'][i]) / 100.0
+                                    page_text.append({"text": text, "confidence": confidence})
+                    elif hasattr(current_ocr_model, 'ocr'):  # PaddleOCR
+                        try:
+                            result = current_ocr_model.ocr(preprocessed_img, cls=True)
+                        except Exception as ocr_err:
+                            # 尝试禁用角度分类
+                            try:
+                                result = current_ocr_model.ocr(preprocessed_img, cls=False)
+                                print("OCR with cls=False succeeded after cls=True failed")
+                            except Exception as ocr_err2:
+                                return JSONResponse(content={"error": f"OCR识别失败: {str(ocr_err2)}"}, status_code=500)
+                        # 提取文本结果 - PaddleOCR格式
+                        page_text = []
+                        for line in result[0]:
+                            text = line[1][0]
+                            confidence = line[1][1]
+                            page_text.append({"text": text, "confidence": confidence})
+                    elif hasattr(current_ocr_model, 'readtext'):  # EasyOCR
+                        # EasyOCR调用方式
+                        result = current_ocr_model.readtext(preprocessed_img)
+                        # 提取文本结果 - EasyOCR格式
+                        page_text = []
+                        for detection in result:
+                            text = detection[1]
+                            confidence = detection[2]
+                            page_text.append({"text": text, "confidence": confidence})
+                    else:
+                        return JSONResponse(content={"error": "未知的OCR模型类型"}, status_code=500)
                 except Exception as ocr_err:
+                    return JSONResponse(content={"error": f"OCR识别失败: {str(ocr_err)}"}, status_code=500)
                 all_results.append({
                     "page": page_num,
                 "filename": file.filename,
                 "page_count": len(all_results),
                 "pages": all_results,
+                "full_text": full_document_text,
+                "ocr_engine": "paddleocr" if hasattr(current_ocr_model, 'ocr') else "pytesseract"
             })
         finally:
             "details": str(e),
             "services": {
                 "paddleocr_available": paddleocr_available,
+                "pytesseract_available": pytesseract_available,
                 "fitz_available": fitz_available,
                 "cv2_available": cv2_available,
                 "models_loaded": models_loaded.get("ocr", False)
+            },
+            "suggestion": "查看服务器日志获取更多详细信息，或尝试使用兼容的Python版本"
         }
         return JSONResponse(content=error_details, status_code=500)

requirements.txt CHANGED Viewed

@@ -15,6 +15,7 @@ paddleocr
 pymupdf
 opencv-python-headless
 onnxruntime
 # LayoutLM相关
 transformers[onnx]
 # 图像处理和文档处理

 pymupdf
 opencv-python-headless
 onnxruntime
+pytesseract
 # LayoutLM相关
 transformers[onnx]
 # 图像处理和文档处理