Spaces:

CS4NLP
/

vqa_demo

Runtime error

App Files Files Community

MinxuanQin committed on Jul 19, 2023

Commit

6cb5353

1 Parent(s): 502b0e8

first trial with blip

Browse files

Files changed (2) hide show

app.py +34 -0
model_loader.py +203 -0

app.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import sys
+sys.path.append(".")
+import streamlit as st
+import pandas as pd
+from vqa_demo.model_loader import *
+# Streamlit demo script: pick a VQA model and a validation image, ask a
+# question about the image, and display the model's answer.
+# load dataset
+ds = load_dataset("test")
+# define selector
+model_name = st.sidebar.selectbox(
+    "Select a model: ",
+    ('vilt', 'git', 'blip', 'vbert')
+)
+# The maximum accepted by number_input is itself selectable, so it has to be
+# the last valid index (len(ds) - 1); len(ds) would index past the end.
+image_selector_unspecific = st.number_input(
+    "Select an image id: ",
+    0, len(ds) - 1
+)
+# select and display
+sample = ds[image_selector_unspecific]
+image = sample['image']
+st.image(image)
+# inference
+question = st.text_input(f"Ask the model a question related to the image: \n"
+                               f"(e.g. \"{sample['question']}\")")
+# Only run the (expensive) model once the user has actually typed a question.
+if question:
+    args = load_model(model_name) # TODO: cache
+    answer = get_answer(args, image, question, model_name)
+    # Show the predicted answer itself rather than the literal text "answer".
+    st.write(answer)

model_loader.py ADDED Viewed

	@@ -0,0 +1,203 @@

+from huggingface_hub import hf_hub_download
+from PIL import Image
+import torch
+from datasets import load_dataset, get_dataset_split_names
+import numpy as np
+import requests
+from transformers import ViltProcessor, ViltForQuestionAnswering
+from transformers import AutoProcessor, AutoModelForCausalLM
+from transformers import BlipProcessor, BlipForQuestionAnswering
+from nltk.corpus import wordnet
+import os
+import requests
+from tqdm import tqdm
+import timm
+# VLMO: modify in vlmo/config.py: set test_only -> True
+from datasets import load_dataset, get_dataset_split_names
+import torch
+import torchvision
+from torchvision.models import resnet50
+import torchvision.transforms as transforms
+from transformers import VisualBertForMultipleChoice, VisualBertForQuestionAnswering, BertTokenizerFast, AutoTokenizer, ViltForQuestionAnswering
+from PIL import Image
+from nltk.corpus import wordnet
+import time
+# Use the first CUDA GPU when torch reports one as available, otherwise the CPU.
+if torch.cuda.is_available():
+    device = torch.device("cuda:0")
+else:
+    device = torch.device("cpu")
+# URL of the answer list that get_answer() indexes for the "vbert" model.
+VQA_URL = "https://dl.fbaipublicfiles.com/pythia/data/answers_vqa.txt"
+# load processor and model
+def load_model(name):
+    """Load the pretrained processor/tokenizer and model for a demo model name.
+
+    Args:
+        name: one of "vilt", "git", "blip" or "vbert".
+
+    Returns:
+        A ``(processor, model)`` tuple. For "vbert" the first element is a
+        plain BERT tokenizer rather than a multimodal processor.
+
+    Raises:
+        ValueError: if ``name`` is not one of the supported model names.
+    """
+    if name == "vilt":
+        # processor and weights intentionally come from different checkpoints
+        return (
+            ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa"),
+            ViltForQuestionAnswering.from_pretrained("CARETS/vilt_neg_model"),
+        )
+    if name == "git":
+        return (
+            AutoProcessor.from_pretrained("microsoft/git-base-vqav2"),
+            AutoModelForCausalLM.from_pretrained("microsoft/git-base-vqav2"),
+        )
+    if name == "blip":
+        return (
+            BlipProcessor.from_pretrained('Salesforce/blip-vqa-base'),
+            BlipForQuestionAnswering.from_pretrained('Salesforce/blip-vqa-base'),
+        )
+    if name == "vbert":
+        return (
+            AutoTokenizer.from_pretrained("bert-base-uncased"),
+            VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa"),
+        )
+    raise ValueError("invalid model name: ", name)
+def load_dataset(type):
+    """Load the train or validation split of the HuggingFaceM4/VQAv2 dataset.
+
+    Args:
+        type: "train" for the training split, or "test" for the validation
+            split (which this demo uses as its test set).
+
+    Returns:
+        Whatever ``datasets.load_dataset`` returns for the chosen split,
+        loaded in non-streaming mode.
+
+    Raises:
+        ValueError: if ``type`` is neither "train" nor "test".
+    """
+    if type == "train":
+        split = "train"
+    elif type == "test":
+        split = "validation"
+    else:
+        raise ValueError("invalid dataset: ", type)
+    # This function shadows the ``load_dataset`` name imported from the
+    # ``datasets`` package at module level, so the library function must be
+    # reached through the package itself; calling the bare name would re-enter
+    # this wrapper with unexpected keyword arguments and fail.
+    import datasets
+    return datasets.load_dataset("HuggingFaceM4/VQAv2", split=split, streaming=False)
+def tokenize_function(examples, processor):
+    """Run the processor on an example's image/question and keep its answer.
+
+    Args:
+        examples: mapping with 'image', 'question' and
+            'multiple_choice_answer' entries.
+        processor: callable accepting ``images``, ``text`` and
+            ``return_tensors`` keyword arguments.
+
+    Returns:
+        A dict with the processor output under 'inputs' and the reference
+        answer under 'outputs'.
+    """
+    return {
+        'inputs': processor(
+            images=examples['image'],
+            text=examples['question'],
+            return_tensors="pt",
+        ),
+        'outputs': examples['multiple_choice_answer'],
+    }
+def label_count_list(labels):
+    """Count how many times each distinct label occurs.
+
+    Args:
+        labels: iterable of hashable labels (e.g. a list of answer strings).
+
+    Returns:
+        A plain dict mapping each distinct label to its number of occurrences;
+        an empty input yields an empty dict.
+    """
+    # Counter tallies in a single pass instead of re-scanning the whole
+    # sequence once per distinct label.
+    from collections import Counter
+    return dict(Counter(labels))
+def get_item(image, question, tokenizer, image_model, model_name):
+    """Build text and visual inputs for one image/question pair.
+
+    Args:
+        image: image forwarded to ``get_img_feats``.
+        question: question text passed to ``tokenizer``.
+        tokenizer: callable returning a mapping of tensors that supports
+            ``update``.
+        image_model: vision backbone forwarded to ``get_img_feats``.
+        model_name: backbone name forwarded to ``get_img_feats`` as ``name``.
+
+    Returns:
+        A ``(visual_inputs, inputs)`` tuple: the dict holding only the visual
+        tensors, and the tokenizer output updated with those same tensors.
+    """
+    # No padding/truncation is requested; the question is encoded as-is.
+    encoded = tokenizer(question, return_tensors='pt')
+    features = get_img_feats(image, image_model=image_model, name=model_name)
+    # Drop dims 2 and 3 of the feature map, then add a new leading dimension.
+    visual_embeds = features.squeeze(2, 3).unsqueeze(0)
+    # Type ids and attention mask cover every leading dim of the embeddings.
+    leading_shape = visual_embeds.shape[:-1]
+    visual_inputs = {
+        "visual_embeds": visual_embeds,
+        "visual_token_type_ids": torch.ones(leading_shape, dtype=torch.long),
+        "visual_attention_mask": torch.ones(leading_shape, dtype=torch.float),
+    }
+    encoded.update(visual_inputs)
+    return visual_inputs, encoded
+def get_img_feats(image, image_model, new_size=None, name='resnet50'):
+    """Extract image features with a vision backbone.
+
+    Args:
+        image: input image; it is passed through torchvision's ``ToTensor``,
+            so presumably a PIL image (confirm against callers).
+        image_model: backbone module. For "resnet50" its last child module is
+            dropped before the forward pass; for "vitb16" its
+            ``forward_features`` method is used.
+        new_size: if given, the image is first resized to
+            ``(new_size, new_size)`` with Lanczos interpolation.
+        name: backbone type, either "resnet50" or "vitb16".
+
+    Returns:
+        The backbone output for a batch containing the single image.
+
+    Raises:
+        ValueError: if ``name`` is not a supported backbone type.
+    """
+    # Reject unknown backbones up front; otherwise neither feature branch
+    # below would run and there would be nothing to return.
+    if name not in ("resnet50", "vitb16"):
+        raise ValueError(f"unsupported image model name: {name!r}")
+    if name == "resnet50":
+        # keep every child module except the last one
+        image_model = torch.nn.Sequential(*list(image_model.children())[:-1])
+    # apply transforms when necessary
+    if new_size is not None:
+        resize = transforms.Resize((new_size, new_size), interpolation=transforms.InterpolationMode.LANCZOS)
+        image = resize(image)
+    preprocess = transforms.Compose([
+        transforms.ToTensor(),  # convert the image to a tensor
+        # per-channel normalisation (the commonly used ImageNet statistics)
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    ])
+    # get features for a batch of one
+    batch = preprocess(image).unsqueeze(0)
+    if name == "resnet50":
+        return image_model(batch)
+    return image_model.forward_features(batch)
+def get_data(query, delim=","):
+    """Load data from a local file path or from a URL.
+
+    Args:
+        query: path of an existing file, or otherwise a URL to fetch.
+        delim: unused; kept for backward compatibility.
+
+    Returns:
+        For a file: the result of evaluating its contents as a Python
+        expression. For a URL: the parsed JSON body when the response is
+        JSON; otherwise the evaluated body text; otherwise the body text
+        split into a list of lines.
+    """
+    assert isinstance(query, str)
+    if os.path.isfile(query):
+        with open(query) as f:
+            # SECURITY: eval() executes arbitrary code found in the file; only
+            # use with trusted files (ast.literal_eval is a safer option when
+            # the data is a plain literal).
+            return eval(f.read())
+    req = requests.get(query)
+    try:
+        try:
+            # JSON parsing is a method of the response object, not of the
+            # requests module.
+            return req.json()
+        except ValueError:
+            # not JSON: fall back to the raw text
+            text = req.content.decode()
+        try:
+            # SECURITY: eval() on downloaded content executes arbitrary code;
+            # only use with trusted URLs.
+            return eval(text)
+        except Exception:
+            # not a Python literal either: treat as one entry per line
+            return text.split("\n")
+    finally:
+        # release the connection on every path, including errors
+        req.close()
+def err_msg():
+    """Print a retry hint and return the error marker string."""
+    marker = "[ERROR]"
+    print("Load error, try again")
+    return marker
+def get_answer(model_loader_args, img, question, model_name):
+    """Answer a question about an image with the selected model.
+
+    Args:
+        model_loader_args: ``(processor, model)`` pair as returned by
+            ``load_model`` for the same ``model_name``.
+        img: the image to ask about.
+        question: the question text.
+        model_name: one of "vilt", "git", "vbert" or "blip".
+
+    Returns:
+        The predicted answer string; "[ERROR]" (via ``err_msg``) when input
+        preparation fails; or "Invalid model name" for an unknown model.
+    """
+    processor, model = model_loader_args[0], model_loader_args[1]
+    if model_name == "vilt":
+        try:
+            encoding = processor(images=img, text=question, return_tensors="pt")
+        except Exception:
+            return err_msg()
+        else:
+            outputs = model(**encoding)
+            logits = outputs.logits
+            # classification head: map the best-scoring index to its label
+            idx = logits.argmax(-1).item()
+            pred = model.config.id2label[idx]
+    elif model_name == "git":
+        try:
+            pixel_values = processor(images=img, return_tensors="pt").pixel_values
+            # the prompt is the CLS token followed by the question tokens
+            input_ids = processor(text=question, add_special_tokens=False).input_ids
+            input_ids = [processor.tokenizer.cls_token_id] + input_ids
+            input_ids = torch.tensor(input_ids).unsqueeze(0)
+        except Exception:
+            return err_msg()
+        else:
+            generate_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
+            output = processor.batch_decode(generate_ids, skip_special_tokens=True)
+            output = output[0]
+            # keep only the text after the last '?' of the decoded output
+            pred = output.split('?')[-1]
+            pred = pred.strip()
+    elif model_name == "vbert":
+        try:
+            # fetch the answer list inside the guarded section so a download
+            # failure is reported like any other load error
+            vqa_answers = get_data(VQA_URL)
+            with torch.no_grad():
+                # load question and image (processor = tokenizer); get_item
+                # needs an image backbone plus the backbone *type* name that
+                # get_img_feats understands, not the VQA model name
+                _, inputs = get_item(img, question, processor, _vbert_image_model(), "resnet50")
+                outputs = model(**inputs)
+        except Exception:
+            return err_msg()
+        else:
+            # index of the best-scoring answer (original note: "from 3129")
+            answer_idx = torch.argmax(outputs.logits, dim=1).item()
+            pred = vqa_answers[answer_idx]
+    elif model_name == "blip":
+        try:
+            pixel_values = processor(images=img, return_tensors="pt").pixel_values
+            # prepend the CLS token text by hand since special tokens are disabled
+            blip_ques = processor.tokenizer.cls_token + question
+            batch_input_ids = processor(text=blip_ques, add_special_tokens=False).input_ids
+            batch_input_ids = torch.tensor(batch_input_ids).unsqueeze(0)
+            generate_ids = model.generate(pixel_values=pixel_values, input_ids=batch_input_ids, max_length=50)
+            blip_output = processor.batch_decode(generate_ids, skip_special_tokens=True)
+        except Exception:
+            return err_msg()
+        else:
+            pred = blip_output[0]
+    else:
+        return "Invalid model name"
+    return pred
+def _vbert_image_model():
+    """Return a ResNet-50 in eval mode, created once and reused across calls."""
+    # NOTE(review): assumes ImageNet ResNet-50 features are the intended
+    # visual embeddings for the "vbert" model -- confirm with the authors.
+    cached = getattr(_vbert_image_model, "_model", None)
+    if cached is None:
+        cached = resnet50(weights="DEFAULT")
+        cached.eval()
+        _vbert_image_model._model = cached
+    return cached