import gradio as gr
import spaces
import torch
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification
from datasets import load_dataset

# Load the dataset (shipped with precomputed CLIP embeddings) and build a FAISS index over it
dataset = load_dataset("not-lain/embedded-pokemon", split="train")
dataset = dataset.add_faiss_index("embeddings")

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
model = AutoModelForZeroShotImageClassification.from_pretrained(
    "openai/clip-vit-large-patch14", device_map=device
)


@spaces.GPU  # request a GPU for the duration of each call (Hugging Face Spaces ZeroGPU)
def search(query, k: int = 4):
    """Embed the query image and return the k most similar dataset entries."""
    # `query` is the uploaded image passed in by Gradio (not a string)
    pixel_values = processor(images=query, return_tensors="pt")["pixel_values"]  # embed new image
    pixel_values = pixel_values.to(device)
    img_emb = model.get_image_features(pixel_values)[0]  # batch of 1, keep the single embedding
    img_emb = img_emb.cpu().detach().numpy()  # datasets expects NumPy arrays, not tensors

    # Compare the query embedding against the indexed dataset embeddings
    scores, retrieved_examples = dataset.get_nearest_examples(
        "embeddings",
        img_emb,
        k=k,  # keep only the top-k results
    )

    # Return (image, caption) pairs for the gallery
    out = []
    for i in range(k):
        out.append([retrieved_examples["image"][i], retrieved_examples["text"][i]])
    return out


demo = gr.Interface(
    search,
    inputs="image",
    outputs=["gallery"],
    examples=["./charmander.jpg"],
)
demo.launch(debug=True)
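
# ---------------------------------------------------------------------------
# Note: the app above assumes "not-lain/embedded-pokemon" already ships with an
# "embeddings" column. Below is a minimal sketch of how such a column could be
# precomputed with the same CLIP model; the source dataset name and batch size
# here are placeholder assumptions, not part of the app. Uncomment and run
# once, separately, to build and publish your own embedded dataset:
#
# def embed(batch):
#     """Compute CLIP image embeddings for a batch of dataset images."""
#     pixel_values = processor(images=batch["image"], return_tensors="pt")["pixel_values"].to(device)
#     with torch.no_grad():
#         batch["embeddings"] = model.get_image_features(pixel_values).cpu().numpy()
#     return batch
#
# raw = load_dataset("not-lain/pokemon", split="train")  # placeholder source dataset
# raw = raw.map(embed, batched=True, batch_size=16)
# raw.push_to_hub("embedded-pokemon")  # publish the dataset with the new column
# ---------------------------------------------------------------------------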