Caoyanyi
committed on
Commit
·
022bc99
1
Parent(s):
8dae0e0
* Merge ocr results.
Browse files
app.py
CHANGED
|
@@ -456,28 +456,46 @@ async def ocr_document(file: UploadFile = File(...)):
|
|
| 456 |
else:
|
| 457 |
img_pil = Image.fromarray(preprocessed_img)
|
| 458 |
# 执行OCR,指定中文语言包
|
| 459 |
-
|
|
|
|
| 460 |
img_pil,
|
| 461 |
-
output_type=pytesseract_engine.Output.DICT,
|
| 462 |
lang='chi_sim+eng', # 添加中文简体和英文语言包
|
| 463 |
config='--psm 6' # 假设单一文本块
|
| 464 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
else:
|
| 466 |
# 直接使用图像
|
| 467 |
-
|
| 468 |
preprocessed_img,
|
| 469 |
-
output_type=pytesseract_engine.Output.DICT,
|
| 470 |
lang='chi_sim+eng', # 添加中文简体和英文语言包
|
| 471 |
config='--psm 6' # 假设单一文本块
|
| 472 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
|
| 474 |
-
#
|
| 475 |
page_text = []
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
elif hasattr(current_ocr_model, 'ocr'): # PaddleOCR
|
| 482 |
try:
|
| 483 |
result = current_ocr_model.ocr(preprocessed_img, cls=True)
|
|
|
|
| 456 |
else:
|
| 457 |
img_pil = Image.fromarray(preprocessed_img)
|
| 458 |
# 执行OCR,指定中文语言包
|
| 459 |
+
# 使用image_to_string获取完整文本,避免字符分隔问题
|
| 460 |
+
full_text = pytesseract_engine.image_to_string(
|
| 461 |
img_pil,
|
|
|
|
| 462 |
lang='chi_sim+eng', # 添加中文简体和英文语言包
|
| 463 |
config='--psm 6' # 假设单一文本块
|
| 464 |
)
|
| 465 |
+
# 同时获取数据用于置信度信息
|
| 466 |
+
result_data = pytesseract_engine.image_to_data(
|
| 467 |
+
img_pil,
|
| 468 |
+
output_type=pytesseract_engine.Output.DICT,
|
| 469 |
+
lang='chi_sim+eng',
|
| 470 |
+
config='--psm 6'
|
| 471 |
+
)
|
| 472 |
else:
|
| 473 |
# 直接使用图像
|
| 474 |
+
full_text = pytesseract_engine.image_to_string(
|
| 475 |
preprocessed_img,
|
|
|
|
| 476 |
lang='chi_sim+eng', # 添加中文简体和英文语言包
|
| 477 |
config='--psm 6' # 假设单一文本块
|
| 478 |
)
|
| 479 |
+
# 同时获取数据用于置信度信息
|
| 480 |
+
result_data = pytesseract_engine.image_to_data(
|
| 481 |
+
preprocessed_img,
|
| 482 |
+
output_type=pytesseract_engine.Output.DICT,
|
| 483 |
+
lang='chi_sim+eng',
|
| 484 |
+
config='--psm 6'
|
| 485 |
+
)
|
| 486 |
|
| 487 |
+
# 处理完整文本,按行分割
|
| 488 |
page_text = []
|
| 489 |
+
# 获取平均置信度
|
| 490 |
+
valid_confidences = [float(conf) for conf in result_data['conf'] if float(conf) > 0]
|
| 491 |
+
avg_confidence = sum(valid_confidences) / len(valid_confidences) if valid_confidences else 0.5
|
| 492 |
+
|
| 493 |
+
# 按行分割文本
|
| 494 |
+
lines = full_text.strip().split('\n')
|
| 495 |
+
for line in lines:
|
| 496 |
+
line = line.strip()
|
| 497 |
+
if line:
|
| 498 |
+
page_text.append({"text": line, "confidence": avg_confidence / 100.0})
|
| 499 |
elif hasattr(current_ocr_model, 'ocr'): # PaddleOCR
|
| 500 |
try:
|
| 501 |
result = current_ocr_model.ocr(preprocessed_img, cls=True)
|