Spaces:

caoyanyi
/

ai

Running

App Files Files Community

Caoyanyi committed 3 days ago

Commit

8dae0e0

1 Parent(s): 9363b11

* Adjust OCR model logic.

Browse files

Files changed (1) hide show

app.py +29 -18

app.py CHANGED Viewed

@@ -447,26 +447,37 @@ async def ocr_document(file: UploadFile = File(...)):
                         if isinstance(preprocessed_img, np.ndarray):
                             # 转换为PIL Image
                             from PIL import Image
-                            img_pil = Image.fromarray(preprocessed_img)
-                            # 执行OCR
-                            result = pytesseract_engine.image_to_data(img_pil, output_type=pytesseract_engine.Output.DICT)
-                            # 提取文本结果 - pytesseract格式
-                            page_text = []
-                            for i in range(len(result['text'])):
-                                text = result['text'][i].strip()
-                                if text:
-                                    confidence = float(result['conf'][i]) / 100.0  # 转换为0-1范围
-                                    page_text.append({"text": text, "confidence": confidence})
                         else:
                             # 直接使用图像
-                            result = pytesseract_engine.image_to_data(preprocessed_img, output_type=pytesseract_engine.Output.DICT)
-                            page_text = []
-                            for i in range(len(result['text'])):
-                                text = result['text'][i].strip()
-                                if text:
-                                    confidence = float(result['conf'][i]) / 100.0
-                                    page_text.append({"text": text, "confidence": confidence})
                     elif hasattr(current_ocr_model, 'ocr'):  # PaddleOCR
                         try:
                             result = current_ocr_model.ocr(preprocessed_img, cls=True)

                         if isinstance(preprocessed_img, np.ndarray):
                             # 转换为PIL Image
                             from PIL import Image
+                            # 对于pytesseract，直接使用灰度图可能效果更好，尤其是对于截图
+                            # 检查是否是截图
+                            if file_ext in ['png', 'jpg', 'jpeg'] and 'screenshot' in file.filename.lower():
+                                # 对于截图，直接使用灰度图而不是二值化
+                                gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if len(img.shape) == 3 else img
+                                img_pil = Image.fromarray(gray_img)
+                            else:
+                                img_pil = Image.fromarray(preprocessed_img)
+                            # 执行OCR，指定中文语言包
+                            result = pytesseract_engine.image_to_data(
+                                img_pil,
+                                output_type=pytesseract_engine.Output.DICT,
+                                lang='chi_sim+eng',  # 添加中文简体和英文语言包
+                                config='--psm 6'      # 假设单一文本块
+                            )
                         else:
                             # 直接使用图像
+                            result = pytesseract_engine.image_to_data(
+                                preprocessed_img,
+                                output_type=pytesseract_engine.Output.DICT,
+                                lang='chi_sim+eng',  # 添加中文简体和英文语言包
+                                config='--psm 6'      # 假设单一文本块
+                            )
+                        # 提取文本结果 - pytesseract格式
+                        page_text = []
+                        for i in range(len(result['text'])):
+                            text = result['text'][i].strip()
+                            if text:
+                                confidence = float(result['conf'][i]) / 100.0  # 转换为0-1范围
+                                page_text.append({"text": text, "confidence": confidence})
                     elif hasattr(current_ocr_model, 'ocr'):  # PaddleOCR
                         try:
                             result = current_ocr_model.ocr(preprocessed_img, cls=True)