Caoyanyi committed on
Commit
8dae0e0
·
1 Parent(s): 9363b11

* Adjust OCR model logic.

Browse files
Files changed (1) hide show
  1. app.py +29 -18
app.py CHANGED
@@ -447,26 +447,37 @@ async def ocr_document(file: UploadFile = File(...)):
447
  if isinstance(preprocessed_img, np.ndarray):
448
  # 转换为PIL Image
449
  from PIL import Image
450
- img_pil = Image.fromarray(preprocessed_img)
451
- # 执行OCR
452
- result = pytesseract_engine.image_to_data(img_pil, output_type=pytesseract_engine.Output.DICT)
453
-
454
- # 提取文本结果 - pytesseract格式
455
- page_text = []
456
- for i in range(len(result['text'])):
457
- text = result['text'][i].strip()
458
- if text:
459
- confidence = float(result['conf'][i]) / 100.0 # 转换为0-1范围
460
- page_text.append({"text": text, "confidence": confidence})
 
 
 
 
461
  else:
462
  # 直接使用图像
463
- result = pytesseract_engine.image_to_data(preprocessed_img, output_type=pytesseract_engine.Output.DICT)
464
- page_text = []
465
- for i in range(len(result['text'])):
466
- text = result['text'][i].strip()
467
- if text:
468
- confidence = float(result['conf'][i]) / 100.0
469
- page_text.append({"text": text, "confidence": confidence})
 
 
 
 
 
 
 
470
  elif hasattr(current_ocr_model, 'ocr'): # PaddleOCR
471
  try:
472
  result = current_ocr_model.ocr(preprocessed_img, cls=True)
 
447
  if isinstance(preprocessed_img, np.ndarray):
448
  # 转换为PIL Image
449
  from PIL import Image
450
+ # 对于pytesseract,直接使用灰度图可能效果更好,尤其是对于截图
451
+ # 检查是否是截图
452
+ if file_ext in ['png', 'jpg', 'jpeg'] and 'screenshot' in file.filename.lower():
453
+ # 对于截图,直接使用灰度图而不是二值化
454
+ gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if len(img.shape) == 3 else img
455
+ img_pil = Image.fromarray(gray_img)
456
+ else:
457
+ img_pil = Image.fromarray(preprocessed_img)
458
+ # 执行OCR,指定中文语言包
459
+ result = pytesseract_engine.image_to_data(
460
+ img_pil,
461
+ output_type=pytesseract_engine.Output.DICT,
462
+ lang='chi_sim+eng', # 添加中文简体和英文语言包
463
+ config='--psm 6' # 假设单一文本块
464
+ )
465
  else:
466
  # 直接使用图像
467
+ result = pytesseract_engine.image_to_data(
468
+ preprocessed_img,
469
+ output_type=pytesseract_engine.Output.DICT,
470
+ lang='chi_sim+eng', # 添加中文简体和英文语言包
471
+ config='--psm 6' # 假设单一文本块
472
+ )
473
+
474
+ # 提取文本结果 - pytesseract格式
475
+ page_text = []
476
+ for i in range(len(result['text'])):
477
+ text = result['text'][i].strip()
478
+ if text:
479
+ confidence = float(result['conf'][i]) / 100.0 # 转换为0-1范围
480
+ page_text.append({"text": text, "confidence": confidence})
481
  elif hasattr(current_ocr_model, 'ocr'): # PaddleOCR
482
  try:
483
  result = current_ocr_model.ocr(preprocessed_img, cls=True)