import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel

model_name_or_path = "lyttt/VLV_captioner"

# VLV_captioner ships custom modeling code on the Hub, so trust_remote_code
# must be enabled for AutoModel to load it.
model = AutoModel.from_pretrained(
    model_name_or_path,
    revision="master",
    trust_remote_code=True,
    low_cpu_mem_usage=False,
)
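
# Optional (untested assumption): as a standard nn.Module the loaded model
# should accept .to() device placement, but whether the custom generation
# path keeps its inputs on the same device is not verified here.
# if torch.cuda.is_available():
#     model = model.to("cuda")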

def greet(image):
    # Gradio's "image" input delivers an RGB numpy array, or None when the
    # user submits without uploading anything.
    if image is None:
        return ""

    # Float arrays (values in [0, 1]) are rescaled to 8-bit before the
    # conversion to a PIL image.
    if image.dtype != np.uint8:
        image = (np.clip(image, 0, 1) * 255).astype(np.uint8)
    image = Image.fromarray(image, mode="RGB")

    # Run the captioner without tracking gradients; 300 is presumably the
    # maximum generation length expected by the custom call signature.
    with torch.no_grad():
        outputs = model([image], 300).generated_text[0]

    def drop_incomplete_tail(text):
        # If generation stopped mid-sentence, drop the trailing fragment so
        # the caption ends on a complete sentence.
        sentences = text.split(".")
        complete_sentences = [s.strip() for s in sentences if s.strip()]
        if not text.strip().endswith("."):
            complete_sentences = complete_sentences[:-1]
        return ". ".join(complete_sentences) + ("." if complete_sentences else "")

    return drop_incomplete_tail(outputs)

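# Quick sanity check without the UI (hypothetical local image path):
# print(greet(np.asarray(Image.open("example.jpg").convert("RGB"))))
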
demo = gr.Interface(fn=greet, inputs="image", outputs="text")
demo.launch()
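
# Note: when running outside a hosted Space, demo.launch(share=True) also
# serves the interface through a temporary public gradio.live URL.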