Spaces:

caoyanyi
/

ai

Running

App Files Files Community

Caoyanyi committed about 21 hours ago

Commit

022bc99

1 Parent(s): 8dae0e0

* Merge OCR results.

Browse files

Files changed (1) hide show

app.py +28 -10

app.py CHANGED Viewed

@@ -456,28 +456,46 @@ async def ocr_document(file: UploadFile = File(...)):
                             else:
                                 img_pil = Image.fromarray(preprocessed_img)
                             # 执行OCR，指定中文语言包
-                            result = pytesseract_engine.image_to_data(
                                 img_pil,
-                                output_type=pytesseract_engine.Output.DICT,
                                 lang='chi_sim+eng',  # 添加中文简体和英文语言包
                                 config='--psm 6'      # 假设单一文本块
                             )
                         else:
                             # 直接使用图像
-                            result = pytesseract_engine.image_to_data(
                                 preprocessed_img,
-                                output_type=pytesseract_engine.Output.DICT,
                                 lang='chi_sim+eng',  # 添加中文简体和英文语言包
                                 config='--psm 6'      # 假设单一文本块
                             )
-                        # 提取文本结果 - pytesseract格式
                         page_text = []
-                        for i in range(len(result['text'])):
-                            text = result['text'][i].strip()
-                            if text:
-                                confidence = float(result['conf'][i]) / 100.0  # 转换为0-1范围
-                                page_text.append({"text": text, "confidence": confidence})
                     elif hasattr(current_ocr_model, 'ocr'):  # PaddleOCR
                         try:
                             result = current_ocr_model.ocr(preprocessed_img, cls=True)

                             else:
                                 img_pil = Image.fromarray(preprocessed_img)
                             # 执行OCR，指定中文语言包
+                            # 使用image_to_string获取完整文本，避免字符分隔问题
+                            full_text = pytesseract_engine.image_to_string(
                                 img_pil,
                                 lang='chi_sim+eng',  # 添加中文简体和英文语言包
                                 config='--psm 6'      # 假设单一文本块
                             )
+                            # 同时获取数据用于置信度信息
+                            result_data = pytesseract_engine.image_to_data(
+                                img_pil,
+                                output_type=pytesseract_engine.Output.DICT,
+                                lang='chi_sim+eng',
+                                config='--psm 6'
+                            )
                         else:
                             # 直接使用图像
+                            full_text = pytesseract_engine.image_to_string(
                                 preprocessed_img,
                                 lang='chi_sim+eng',  # 添加中文简体和英文语言包
                                 config='--psm 6'      # 假设单一文本块
                             )
+                            # 同时获取数据用于置信度信息
+                            result_data = pytesseract_engine.image_to_data(
+                                preprocessed_img,
+                                output_type=pytesseract_engine.Output.DICT,
+                                lang='chi_sim+eng',
+                                config='--psm 6'
+                            )
+                        # 处理完整文本，按行分割
                         page_text = []
+                        # 获取平均置信度
+                        valid_confidences = [float(conf) for conf in result_data['conf'] if float(conf) > 0]
+                        avg_confidence = sum(valid_confidences) / len(valid_confidences) if valid_confidences else 0.5
+                        # 按行分割文本
+                        lines = full_text.strip().split('\n')
+                        for line in lines:
+                            line = line.strip()
+                            if line:
+                                page_text.append({"text": line, "confidence": avg_confidence / 100.0})
                     elif hasattr(current_ocr_model, 'ocr'):  # PaddleOCR
                         try:
                             result = current_ocr_model.ocr(preprocessed_img, cls=True)