Caoyanyi committed on
Commit
022bc99
·
1 Parent(s): 8dae0e0

* Merge OCR results.

Browse files
Files changed (1) hide show
  1. app.py +28 -10
app.py CHANGED
@@ -456,28 +456,46 @@ async def ocr_document(file: UploadFile = File(...)):
456
  else:
457
  img_pil = Image.fromarray(preprocessed_img)
458
  # 执行OCR,指定中文语言包
459
- result = pytesseract_engine.image_to_data(
 
460
  img_pil,
461
- output_type=pytesseract_engine.Output.DICT,
462
  lang='chi_sim+eng', # 添加中文简体和英文语言包
463
  config='--psm 6' # 假设单一文本块
464
  )
 
 
 
 
 
 
 
465
  else:
466
  # 直接使用图像
467
- result = pytesseract_engine.image_to_data(
468
  preprocessed_img,
469
- output_type=pytesseract_engine.Output.DICT,
470
  lang='chi_sim+eng', # 添加中文简体和英文语言包
471
  config='--psm 6' # 假设单一文本块
472
  )
 
 
 
 
 
 
 
473
 
474
- # 提取文本结果 - pytesseract格式
475
  page_text = []
476
- for i in range(len(result['text'])):
477
- text = result['text'][i].strip()
478
- if text:
479
- confidence = float(result['conf'][i]) / 100.0 # 转换为0-1范围
480
- page_text.append({"text": text, "confidence": confidence})
 
 
 
 
 
481
  elif hasattr(current_ocr_model, 'ocr'): # PaddleOCR
482
  try:
483
  result = current_ocr_model.ocr(preprocessed_img, cls=True)
 
456
  else:
457
  img_pil = Image.fromarray(preprocessed_img)
458
  # 执行OCR,指定中文语言包
459
+ # 使用image_to_string获取完整文本,避免字符分隔问题
460
+ full_text = pytesseract_engine.image_to_string(
461
  img_pil,
 
462
  lang='chi_sim+eng', # 添加中文简体和英文语言包
463
  config='--psm 6' # 假设单一文本块
464
  )
465
+ # 同时获取数据用于置信度信息
466
+ result_data = pytesseract_engine.image_to_data(
467
+ img_pil,
468
+ output_type=pytesseract_engine.Output.DICT,
469
+ lang='chi_sim+eng',
470
+ config='--psm 6'
471
+ )
472
  else:
473
  # 直接使用图像
474
+ full_text = pytesseract_engine.image_to_string(
475
  preprocessed_img,
 
476
  lang='chi_sim+eng', # 添加中文简体和英文语言包
477
  config='--psm 6' # 假设单一文本块
478
  )
479
+ # 同时获取数据用于置信度信息
480
+ result_data = pytesseract_engine.image_to_data(
481
+ preprocessed_img,
482
+ output_type=pytesseract_engine.Output.DICT,
483
+ lang='chi_sim+eng',
484
+ config='--psm 6'
485
+ )
486
 
487
+ # 处理完整文本,按行分割
488
  page_text = []
489
+ # 获取平均置信度
490
+ valid_confidences = [float(conf) for conf in result_data['conf'] if float(conf) > 0]
491
+ avg_confidence = sum(valid_confidences) / len(valid_confidences) if valid_confidences else 0.5
492
+
493
+ # 按行分割文本
494
+ lines = full_text.strip().split('\n')
495
+ for line in lines:
496
+ line = line.strip()
497
+ if line:
498
+ page_text.append({"text": line, "confidence": avg_confidence / 100.0})
499
  elif hasattr(current_ocr_model, 'ocr'): # PaddleOCR
500
  try:
501
  result = current_ocr_model.ocr(preprocessed_img, cls=True)