|
import faiss |
|
import gradio as gr |
|
import numpy as np |
|
import pandas as pd |
|
import torch.nn.functional as F |
|
from sentence_transformers import SentenceTransformer |
|
|
|
# Number of embedding components kept per vector; the query encoder truncates
# to this width and the FAISS index is created with the same dimensionality.
DIM = 768

# Sentence-embedding model used to encode user queries.
model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1.5",
    trust_remote_code=True,
)
print("Model loaded successfully")
|
|
|
# Paper metadata table; malformed CSV lines are skipped instead of aborting.
papers_df = pd.read_csv(
    "data/cvpr2024_papers_with_details.csv",
    index_col=None,
    on_bad_lines="skip",
)
# Keep only the papers that have both a summary and a PDF path.
papers_df = papers_df.dropna(subset=["summary", "pdf_path"])
print("Data loaded successfully")
|
|
|
# Precomputed embedding matrix, read from disk.
# NOTE(review): rows are presumably aligned with the filtered papers_df
# (search results are mapped back with positional iloc) -- confirm against
# the script that produced this file.
embeddings = np.load("data/embeddings.npy")

# Flat L2 index over DIM-wide vectors, populated with every stored embedding.
index = faiss.IndexFlatL2(DIM)
index.add(embeddings)
print("Index loaded successfully")
|
|
|
|
|
def encode_query(query):
    """Embed a single query string for nearest-neighbour search.

    The query is encoded by the module-level ``model``, layer-normalised over
    its feature dimension, truncated to the first ``DIM`` components and then
    L2-normalised per row.

    Args:
        query: The text to embed.

    Returns:
        A torch tensor with one row and at most ``DIM`` columns.
    """
    # NOTE(review): nomic-embed models are documented to expect a task prefix
    # (e.g. "search_query: "); none is added here -- confirm this matches how
    # the stored document embeddings were produced.
    encoded = model.encode([query], convert_to_tensor=True)

    # Normalise across the full feature width before cutting it down to DIM.
    feature_width = encoded.shape[1]
    layer_normed = F.layer_norm(encoded, normalized_shape=(feature_width,))
    truncated = layer_normed[:, :DIM]

    # Unit-length rows.
    return F.normalize(truncated, p=2, dim=1)
|
|
|
def search_nearest_papers(query, k=5):
    """Return the papers whose embeddings are closest to ``query``.

    Args:
        query: Free-text search query.
        k: Number of results requested. Any numeric value is accepted and
            truncated to an integer, because UI sliders may deliver floats
            while FAISS requires an integer ``k``.

    Returns:
        A DataFrame with the columns ``"Title"`` and ``"arXiv_link"``, in the
        order returned by the index search. It may hold fewer than ``k`` rows
        when the index contains fewer than ``k`` vectors.
    """
    k = int(k)

    query_embeddings = encode_query(query)
    # FAISS works on host-side float32 NumPy arrays; a tensor that lives on an
    # accelerator cannot be converted implicitly, so move it to the CPU first.
    if hasattr(query_embeddings, "detach"):
        query_embeddings = query_embeddings.detach().cpu().numpy()
    query_vectors = np.ascontiguousarray(query_embeddings, dtype=np.float32)

    _, neighbours = index.search(query_vectors, k)

    # FAISS pads missing neighbours with -1; iloc would silently map that to
    # the last row of the table, so drop those slots.
    hits = neighbours[0]
    hits = hits[hits >= 0]
    return papers_df.iloc[hits][["Title", "arXiv_link"]]
|
|
|
# Gradio UI: a text box for the query and a slider for the number of results;
# the matches are rendered as a table.
demo = gr.Interface(
    fn=search_nearest_papers,
    inputs=[
        "text",
        # step=1 keeps the slider on whole numbers, since the value is used
        # as the neighbour count k.
        gr.Slider(1, 10, value=5, step=1),
    ],
    outputs=gr.Dataframe(
        headers=["Title", "PDF"],
    ),
    title="CVPR 2024 Paper Search",
    description="Semantic search over CVPR 2024 paper summary. This app was made using the data available on https://github.com/harpreetsahota204/CVPR-2024-Papers.",
)
|
|
|
# Start the web UI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()
|
|