IFMedTechdemo's picture
Create app.py
ccaeb92 verified
import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
import spaces
# Model configuration
MODEL_PATH = "PaddlePaddle/PaddleOCR-VL"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Task prompts
PROMPTS = {
"OCR": "OCR:",
"Table Recognition": "Table Recognition:",
"Formula Recognition": "Formula Recognition:",
"Chart Recognition": "Chart Recognition:",
}
# Load model and processor
print(f"Loading model on {DEVICE}...")
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
trust_remote_code=True,
torch_dtype=torch.bfloat16
).to(DEVICE).eval()
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
print("Model loaded successfully!")
@spaces.GPU
def process_image(image, task):
"""
Process an image with PaddleOCR-VL model.
Args:
image: PIL Image or path to image
task: Task type (OCR, Table Recognition, etc.)
Returns:
str: Recognition result
"""
if image is None:
return "Please upload an image first."
# Convert to PIL Image if needed
if not isinstance(image, Image.Image):
image = Image.open(image)
image = image.convert("RGB")
# Get prompt for the task
prompt = PROMPTS.get(task, PROMPTS["OCR"])
# Prepare messages
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": prompt},
]
}
]
# Process with model
inputs = processor.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt"
).to(DEVICE)
# Generate output
with torch.no_grad():
outputs = model.generate(**inputs, max_new_tokens=1024)
# Decode and return result
result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
return result
# Create Gradio interface
demo = gr.Interface(
fn=process_image,
inputs=[
gr.Image(type="pil", label="Upload Image"),
gr.Radio(
choices=list(PROMPTS.keys()),
value="OCR",
label="Task Type"
)
],
outputs=gr.Textbox(label="Result", lines=10),
title="PaddleOCR-VL: Multilingual Document Parsing",
description="Upload an image and select a task. This model supports OCR in 109 languages, table recognition, formula recognition, and chart recognition.",
examples=[
["example.png", "OCR"],
] if False else None, # Add examples if you upload sample images
)
if __name__ == "__main__":
demo.launch()