erermeev-d committed
Commit d4852d9 · 0 parent(s) · Initial commit

Files changed:
- .gitattributes +1 -0
- Dockerfile +18 -0
- Makefile +3 -0
- README.md +42 -0
- app/__init__.py +0 -0
- app/database.py +35 -0
- app/main.py +60 -0
- app/recommendations.py +19 -0
- exp/__init__.py +0 -0
- exp/deepwalk.py +80 -0
- exp/evaluate.py +81 -0
- exp/gnn.py +284 -0
- exp/prepare_db.py +33 -0
- exp/prepare_embeddings.sh +52 -0
- exp/prepare_index.py +20 -0
- exp/process_raw_data.py +116 -0
- exp/requirements.txt +7 -0
- exp/requirements_gpu.txt +8 -0
- exp/sbert.py +39 -0
- exp/utils.py +69 -0
- requirements.txt +4 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
```
data/* filter=lfs diff=lfs merge=lfs -text
```
Dockerfile
ADDED
@@ -0,0 +1,18 @@
```
FROM python:3.10.13

COPY requirements.txt .
RUN pip3 install -r requirements.txt

RUN mkdir /data
RUN wget https://storage.yandexcloud.net/eremeev-d-bucket-main/1722760266.tar -O data.tar
RUN tar -xf data.tar -C /data
RUN rm data.tar

RUN mkdir /app
COPY app app

EXPOSE 8501

ENV PYTHONPATH .

ENTRYPOINT ["streamlit", "run", "app/main.py", "--server.port=8501"]
```
Makefile
ADDED
@@ -0,0 +1,3 @@
```
run-app:
	docker build -t graph-rec-app .
	docker run --rm -p 8501:8501 graph-rec-app
```
README.md
ADDED
@@ -0,0 +1,42 @@
---
title: A simple Graph-based Recommender System
emoji: 📚
colorFrom: purple
colorTo: yellow
sdk: docker
app_port: 8501
---
# A simple Graph-based Recommender System

### What is it?

This app is a simple graph-based recommender system that searches for items and recommends similar ones. It can be applied to any dataset; for demonstration purposes, we use the (filtered) [Goodreads](https://mengtingwan.github.io/data/goodreads#datasets) dataset.

### Where can I try this app?

The app is currently deployed at Hugging Face Spaces ([link](https://huggingface.co/spaces/eremeev-d/graph-rec)). You may need to wait a minute or two for the app to start.

### How to use it?

Simply enter a keyword (e.g., "Brave") into the search bar and press the "Search" button. The app will display relevant books along with their short descriptions.

For each book, you can click "Recommend Similar Items" to see other books you might enjoy if you liked the selected one.

### How to reproduce the embeddings computation?

First, install the requirements from `exp/requirements.txt` (or `exp/requirements_gpu.txt` for GPU).

Then, download the raw data from the [Goodreads website](https://mengtingwan.github.io/data/goodreads#datasets). You will need the following files: `book_id_map.csv`, `goodreads_books.json`, `goodreads_interactions.csv` and `user_id_map.csv`. You can download these files manually or use this [Kaggle dataset](https://www.kaggle.com/datasets/eremeevd/graph-rec-goodreads).

Finally, run the following command at the root of the repo:
```
sh exp/prepare_embeddings.sh INPUT_DIRECTORY SAVE_DIRECTORY
```
where `INPUT_DIRECTORY` is the path to the directory with the raw data (e.g. `/kaggle/input/graph-rec-goodreads/goodreads-books`) and `SAVE_DIRECTORY` is the path to the directory where results will be saved (e.g. `/kaggle/working/embeddings`). To use the obtained embeddings, copy the resulting `index.faiss` and `items.db` files to `app/data`.

To run on GPU, use:
```
sh exp/prepare_embeddings.sh INPUT_DIRECTORY SAVE_DIRECTORY cuda
```

For further information, refer to the `exp` directory in this repo.
app/__init__.py
ADDED
File without changes
app/database.py
ADDED
@@ -0,0 +1,35 @@
```python
import io
import sqlite3

import numpy as np


class ItemDatabase:
    def __init__(self, db_path):
        # Convert the "embedding" column back into a numpy array on read.
        sqlite3.register_converter("embedding", self._blob_to_numpy_array)
        self._db_path = db_path

    @staticmethod
    def _blob_to_numpy_array(blob):
        out = io.BytesIO(blob)
        out.seek(0)
        return np.load(out)

    def _connect(self):
        return sqlite3.connect(
            self._db_path, detect_types=sqlite3.PARSE_DECLTYPES)

    def search_items(self, query, n_items=10):
        with self._connect() as conn:
            c = conn.cursor()
            # Bind the search string as a parameter instead of formatting it
            # into the SQL text, to avoid SQL injection.
            c.execute(
                "select item_id from items where title like ?",
                (f"%{query}%",))
            rows = c.fetchmany(n_items)
            return [row[0] for row in rows]

    def get_item(self, item_id):
        with self._connect() as conn:
            c = conn.cursor()
            c.row_factory = sqlite3.Row
            c.execute("select * from items where item_id = ?", (item_id,))
            return c.fetchone()
```
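For reference, a minimal usage sketch of `ItemDatabase` (assuming an `items.db` produced by `exp/prepare_db.py`; the path below is illustrative):

```python
# Minimal sketch: search by title, then fetch full rows. Path is illustrative.
from app.database import ItemDatabase

db = ItemDatabase(db_path="/data/items.db")

for item_id in db.search_items("Brave", n_items=5):
    item = db.get_item(item_id)
    # Rows behave like dicts thanks to sqlite3.Row;
    # item["embedding"] comes back as a (1, dim) numpy array.
    print(item["item_id"], item["title"])
```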
app/main.py
ADDED
@@ -0,0 +1,60 @@
```python
import streamlit as st

from app.database import ItemDatabase
from app.recommendations import RecommenderSystem


def show_item(item_id):
    item = st.session_state["db"].get_item(item_id)
    title = item["title"]
    with st.container(border=True):
        st.write(f"**{title}**")
        st.write(item["description"])
        if st.button("Recommend similar items", key=item["item_id"]):
            st.session_state["recommendation_query"] = item["item_id"]
            st.session_state["search_query"] = None  # reset
            st.rerun()


def main():
    st.title("Graph-based RecSys")

    # Cache the database and the recommender in the session state
    # so they are created once per user session.
    if "db" not in st.session_state:
        st.session_state["db"] = ItemDatabase(
            db_path="/data/items.db")
    if "recsys" not in st.session_state:
        st.session_state["recsys"] = RecommenderSystem(
            faiss_index_path="/data/index.faiss",
            db_path="/data/items.db")

    if "search_query" not in st.session_state:
        st.session_state["search_query"] = None
    if "recommendation_query" not in st.session_state:
        st.session_state["recommendation_query"] = None

    search_query = st.text_input("Enter item name", st.session_state["search_query"])

    if st.button("Search"):
        st.session_state["search_query"] = search_query
        st.session_state["recommendation_query"] = None  # reset

    if st.session_state["recommendation_query"] is not None:
        query = st.session_state["recommendation_query"]
        base_item_title = st.session_state["db"].get_item(query)["title"]
        st.subheader(f'Recommendation Results for "{base_item_title}"')
        results = st.session_state["recsys"].recommend_items(query)
        for item_id in results:
            show_item(item_id)

    elif st.session_state["search_query"] is not None:
        query = st.session_state["search_query"]
        st.subheader(f'Search Results for "{query}"')
        results = st.session_state["db"].search_items(query)
        for item_id in results:
            show_item(item_id)


if __name__ == "__main__":
    main()
```
app/recommendations.py
ADDED
@@ -0,0 +1,19 @@
```python
import itertools

import faiss

from app.database import ItemDatabase


class RecommenderSystem:
    def __init__(self, faiss_index_path, db_path):
        self._index = faiss.read_index(faiss_index_path)
        self._db = ItemDatabase(db_path)

    def recommend_items(self, query, n_items=10):
        # The stored embedding has shape (1, dim), as written by
        # exp/prepare_db.py, which is what faiss expects for a single query.
        query_embedding = self._db.get_item(query)["embedding"]
        # Retrieve one extra neighbor, since the query item
        # is usually its own nearest neighbor.
        _, results = self._index.search(query_embedding, k=n_items + 1)
        results = filter(lambda item: item != query, results[0])
        return itertools.islice(results, n_items)
```
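A minimal sketch of using the recommender in isolation (paths are illustrative and assume the artifacts produced by `exp/prepare_embeddings.sh`):

```python
# Sketch: load the index + database and ask for neighbors of one item.
from app.recommendations import RecommenderSystem

recsys = RecommenderSystem(
    faiss_index_path="/data/index.faiss",
    db_path="/data/items.db")

# recommend_items returns an iterator of item ids;
# the query item itself is filtered out of the results.
for item_id in recsys.recommend_items(query=42, n_items=5):
    print(item_id)
```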
exp/__init__.py
ADDED
File without changes
exp/deepwalk.py
ADDED
@@ -0,0 +1,80 @@
```python
import argparse

import numpy as np
import dgl
import torch
import wandb
from tqdm.auto import tqdm

from utils import prepare_graphs, extract_item_embeddings, normalize_embeddings


def prepare_deepwalk_embeddings(
    items_path,
    ratings_path,
    embeddings_savepath,
    emb_dim,
    window_size,
    batch_size,
    lr,
    num_epochs,
    device,
    wandb_name,
    use_wandb
):
    ### Prepare graph
    bipartite_graph, graph = prepare_graphs(items_path, ratings_path)
    bipartite_graph = bipartite_graph.to(device)
    graph = graph.to(device)

    ### Run DeepWalk
    if use_wandb:
        wandb.init(project="graph-recs-deepwalk", name=wandb_name)

    model = dgl.nn.DeepWalk(graph, emb_dim=emb_dim, window_size=window_size)
    model = model.to(device)
    dataloader = torch.utils.data.DataLoader(
        torch.arange(graph.num_nodes()),
        batch_size=batch_size,
        shuffle=True,
        collate_fn=model.sample)

    optimizer = torch.optim.SparseAdam(model.parameters(), lr=lr)
    for epoch in range(num_epochs):
        for batch_walk in tqdm(dataloader):
            loss = model(batch_walk)
            if use_wandb:
                wandb.log({"loss": loss.item()})
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    if use_wandb:
        wandb.finish()

    node_embeddings = model.node_embed.weight.detach().to(device)

    ### Extract & save item embeddings
    item_embeddings = extract_item_embeddings(node_embeddings, bipartite_graph, graph)
    item_embeddings = normalize_embeddings(item_embeddings)
    np.save(embeddings_savepath, item_embeddings)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Prepare DeepWalk embeddings.")
    parser.add_argument("--items_path", type=str, required=True, help="Path to the items file.")
    parser.add_argument("--ratings_path", type=str, required=True, help="Path to the ratings file.")
    parser.add_argument("--embeddings_savepath", type=str, required=True, help="Path to the file where embeddings will be saved.")
    parser.add_argument("--emb_dim", type=int, default=384, help="Dimensionality of the embeddings.")
    parser.add_argument("--window_size", type=int, default=4, help="Window size for the DeepWalk algorithm.")
    parser.add_argument("--batch_size", type=int, default=512, help="Batch size for training.")
    parser.add_argument("--lr", type=float, default=1e-2, help="Learning rate for training.")
    parser.add_argument("--num_epochs", type=int, default=2, help="Number of epochs for training.")
    parser.add_argument("--device", type=str, default="cpu", help="Device to use for training (cpu or cuda).")
    parser.add_argument("--wandb_name", type=str, help="Name for WandB run.")
    parser.add_argument("--no_wandb", action="store_false", dest="use_wandb", help="Disable WandB logging")
    args = parser.parse_args()

    prepare_deepwalk_embeddings(**vars(args))
```
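As background for the hyperparameters above: `dgl.nn.DeepWalk` trains node embeddings $\Phi$ with a skip-gram objective over sampled random walks, approximately

$$\max_{\Phi} \sum_{t} \sum_{\substack{-w \le j \le w \\ j \ne 0}} \log p\big(v_{t+j} \mid \Phi(v_t)\big),$$

where $w$ is `window_size` and, in practice, the softmax is approximated with negative sampling.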
exp/evaluate.py
ADDED
@@ -0,0 +1,81 @@
```python
import argparse
import json

import pandas as pd
import numpy as np

from app.recommendations import RecommenderSystem


def precision_at_k(recommended_items, relevant_items, k):
    recommended_at_k = set(recommended_items[:k])
    relevant_set = set(relevant_items)
    return len(recommended_at_k & relevant_set) / k


def evaluate_recsys(
    metrics_savepath,
    val_ratings_path,
    faiss_index_path,
    db_path,
    n_recommend_items,
):
    recsys = RecommenderSystem(
        faiss_index_path=faiss_index_path,
        db_path=db_path)

    val_ratings = pd.read_csv(val_ratings_path)
    grouped_items = val_ratings.groupby("user_id")["item_id"].apply(list).reset_index()
    grouped_items = grouped_items["item_id"].tolist()

    metric_arrays = {
        "precision@1": [],
        "precision@3": [],
        "precision@10": []
    }

    for item_group in grouped_items:
        if len(item_group) == 1:
            continue

        ### Precision@k is computed for each (user, item) rating pair.
        ### We first aggregate it over all items of a user,
        ### and then aggregate over all users.
        user_metric_arrays = dict()
        for metric in metric_arrays.keys():
            user_metric_arrays[metric] = []

        for item in item_group:
            recommend_items = list(recsys.recommend_items(item, n_recommend_items))
            relevant_items = set(item_group) - {item}

            user_metric_arrays["precision@1"].append(
                precision_at_k(recommend_items, relevant_items, k=1))
            user_metric_arrays["precision@3"].append(
                precision_at_k(recommend_items, relevant_items, k=3))
            user_metric_arrays["precision@10"].append(
                precision_at_k(recommend_items, relevant_items, k=10))

        for metric in metric_arrays.keys():
            user_metric = np.mean(user_metric_arrays[metric])
            metric_arrays[metric].append(user_metric)

    metrics = dict()
    for metric, array in metric_arrays.items():
        metrics[metric] = np.mean(array)

    with open(metrics_savepath, "w") as f:
        json.dump(metrics, f)
    print(f"Saved metrics to {metrics_savepath}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate a recommendation system.")
    parser.add_argument("--metrics_savepath", required=True, type=str, help="Path to save the evaluation metrics.")
    parser.add_argument("--val_ratings_path", required=True, type=str, help="Path to the csv file with validation ratings.")
    parser.add_argument("--faiss_index_path", required=True, type=str, help="Path to the FAISS index.")
    parser.add_argument("--db_path", required=True, type=str, help="Path to the database file.")
    parser.add_argument("--n_recommend_items", type=int, default=10, help="Number of items to recommend.")
    args = parser.parse_args()
    evaluate_recsys(**vars(args))
```
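The metric above is standard precision@k. For a single query item,

$$\text{precision@}k = \frac{|\,\text{top-}k \text{ recommendations} \cap \text{relevant items}\,|}{k},$$

where the relevant items are the other items rated by the same validation user; as in the code, it is averaged first over a user's items and then over all users.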
exp/gnn.py
ADDED
@@ -0,0 +1,284 @@
```python
import argparse

import numpy as np
import dgl
import torch
import wandb
from tqdm.auto import tqdm

from utils import prepare_graphs, normalize_embeddings, LRSchedule


class GNNLayer(torch.nn.Module):
    def __init__(self, hidden_dim, aggregator_type, skip_connection, bidirectional):
        super().__init__()
        self._skip_connection = skip_connection
        self._bidirectional = bidirectional

        self._norm = torch.nn.LayerNorm(hidden_dim)
        self._conv = dgl.nn.SAGEConv(hidden_dim, hidden_dim, aggregator_type)
        self._activation = torch.nn.ReLU()

        if bidirectional:
            self._norm_rev = torch.nn.LayerNorm(hidden_dim)
            self._conv_rev = dgl.nn.SAGEConv(hidden_dim, hidden_dim, aggregator_type)
            self._activation_rev = torch.nn.ReLU()

    def forward(self, graph, x):
        y = self._activation(self._conv(graph, self._norm(x)))
        if self._bidirectional:
            y = y + self._activation_rev(self._conv_rev(dgl.reverse(graph), self._norm_rev(x)))
        if self._skip_connection:
            return x + y
        else:
            return y


class GNNModel(torch.nn.Module):
    def __init__(
        self,
        bipartite_graph,
        text_embeddings,
        deepwalk_embeddings,
        num_layers,
        hidden_dim,
        aggregator_type,
        skip_connection,
        bidirectional,
        num_traversals,
        termination_prob,
        num_random_walks,
        num_neighbor,
    ):
        super().__init__()

        self._bipartite_graph = bipartite_graph
        self._text_embeddings = text_embeddings
        self._deepwalk_embeddings = deepwalk_embeddings

        self._sampler = dgl.sampling.PinSAGESampler(
            bipartite_graph, "Item", "User", num_traversals,
            termination_prob, num_random_walks, num_neighbor)

        self._text_encoder = torch.nn.Linear(text_embeddings.shape[-1], hidden_dim)
        self._deepwalk_encoder = torch.nn.Linear(deepwalk_embeddings.shape[-1], hidden_dim)

        self._layers = torch.nn.ModuleList()
        for _ in range(num_layers):
            self._layers.append(GNNLayer(
                hidden_dim, aggregator_type, skip_connection, bidirectional))

    def _sample_subgraph(self, frontier_ids):
        num_layers = len(self._layers)
        device = self._bipartite_graph.device

        subgraph = dgl.graph(([], []), num_nodes=self._bipartite_graph.num_nodes("Item")).to(device)
        prev_ids = set()

        for _ in range(num_layers):
            frontier_ids = torch.tensor(frontier_ids, dtype=torch.int64).to(device)
            new_edges = self._sampler(frontier_ids).edges()
            subgraph.add_edges(*new_edges)
            prev_ids |= set(frontier_ids.cpu().tolist())
            frontier_ids = set(dgl.compact_graphs(subgraph).ndata[dgl.NID].cpu().tolist())
            frontier_ids = list(frontier_ids - prev_ids)

        return subgraph

    def forward(self, ids):
        ### Sample subgraph
        sampled_subgraph = self._sample_subgraph(ids)
        sampled_subgraph = dgl.compact_graphs(sampled_subgraph, always_preserve=ids)

        ### Encode text & DeepWalk embeddings
        text_embeddings = self._text_embeddings[
            sampled_subgraph.ndata[dgl.NID]]
        deepwalk_embeddings = self._deepwalk_embeddings[
            sampled_subgraph.ndata[dgl.NID]]
        features = self._text_encoder(text_embeddings) \
            + self._deepwalk_encoder(deepwalk_embeddings)

        ### GNN goes brr...
        for layer in self._layers:
            features = layer(sampled_subgraph, features)

        ### Select features for the initial ids
        # TODO: write it more efficiently?
        matches = sampled_subgraph.ndata[dgl.NID].unsqueeze(0) == ids.unsqueeze(1)
        ids_in_subgraph = matches.nonzero(as_tuple=True)[1]
        features = features[ids_in_subgraph]

        ### Normalize and return
        features = features / torch.linalg.norm(features, dim=1, keepdim=True)
        return features


### Based on https://arxiv.org/pdf/2205.03169
def nt_xent_loss(sim, temperature):
    sim = sim / temperature
    n = sim.shape[0] // 2  # n = |user_batch|

    alignment_loss = -torch.mean(sim[torch.arange(n), torch.arange(n) + n])

    mask = torch.diag(torch.ones(2 * n, dtype=torch.bool)).to(sim.device)
    sim = torch.where(mask, -torch.inf, sim)
    sim = sim[:n, :]
    distribution_loss = torch.mean(torch.logsumexp(sim, dim=1))

    loss = alignment_loss + distribution_loss
    return loss


def sample_item_batch(user_batch, bipartite_graph):
    sampled_edges = dgl.sampling.sample_neighbors(
        bipartite_graph, {"User": user_batch}, fanout=2
    ).edges(etype="ItemUser")
    item_batch = sampled_edges[0]
    item_batch = item_batch[torch.argsort(sampled_edges[1])]
    item_batch = item_batch.reshape(-1, 2)
    item_batch = item_batch.T
    return item_batch


def prepare_gnn_embeddings(
    # Paths
    items_path,
    ratings_path,
    text_embeddings_path,
    deepwalk_embeddings_path,
    embeddings_savepath,
    # Learning hyperparameters
    temperature,
    batch_size,
    lr,
    num_epochs,
    # Model hyperparameters
    num_layers,
    hidden_dim,
    aggregator_type,
    skip_connection,
    bidirectional,
    num_traversals,
    termination_prob,
    num_random_walks,
    num_neighbor,
    # Misc
    device,
    wandb_name,
    use_wandb,
):
    ### Prepare graph
    bipartite_graph, _ = prepare_graphs(items_path, ratings_path)
    bipartite_graph = bipartite_graph.to(device)

    ### Init wandb
    if use_wandb:
        wandb.init(project="graph-rec-gnn", name=wandb_name)

    ### Prepare model
    text_embeddings = torch.tensor(np.load(text_embeddings_path)).to(device)
    deepwalk_embeddings = torch.tensor(np.load(deepwalk_embeddings_path)).to(device)
    model = GNNModel(
        bipartite_graph=bipartite_graph,
        text_embeddings=text_embeddings,
        deepwalk_embeddings=deepwalk_embeddings,
        num_layers=num_layers,
        hidden_dim=hidden_dim,
        aggregator_type=aggregator_type,
        skip_connection=skip_connection,
        bidirectional=bidirectional,
        num_traversals=num_traversals,
        termination_prob=termination_prob,
        num_random_walks=num_random_walks,
        num_neighbor=num_neighbor
    )
    model = model.to(device)

    ### Prepare dataloader
    all_users = torch.arange(bipartite_graph.num_nodes("User")).to(device)
    all_users = all_users[bipartite_graph.in_degrees(all_users, etype="ItemUser") > 1]  # We need to sample 2 items per user
    dataloader = torch.utils.data.DataLoader(
        all_users, batch_size=batch_size, shuffle=True, drop_last=True)

    ### Prepare optimizer & LR scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    total_steps = num_epochs * len(dataloader)
    lr_schedule = LRSchedule(
        total_steps=total_steps,
        warmup_steps=int(0.1 * total_steps),
        final_factor=0.1)  # TODO: move to args
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_schedule)

    ### Train loop
    model.train()
    for epoch in range(num_epochs):
        for user_batch in tqdm(dataloader):
            item_batch = sample_item_batch(user_batch, bipartite_graph)  # (2, |user_batch|)
            item_batch = item_batch.reshape(-1)  # (2 * |user_batch|)
            features = model(item_batch)  # (2 * |user_batch|, hidden_dim)
            sim = features @ features.T  # (2 * |user_batch|, 2 * |user_batch|)
            loss = nt_xent_loss(sim, temperature)
            if use_wandb:
                wandb.log({"loss": loss.item()})
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

    if use_wandb:
        wandb.finish()

    ### Process full dataset
    model.eval()
    with torch.no_grad():
        # The model outputs hidden_dim-sized vectors, so size the buffer accordingly.
        item_embeddings = torch.zeros(bipartite_graph.num_nodes("Item"), hidden_dim).to(device)
        for items_batch in tqdm(torch.utils.data.DataLoader(
            torch.arange(bipartite_graph.num_nodes("Item")),
            batch_size=batch_size,
            shuffle=True
        )):
            item_embeddings[items_batch] = model(items_batch.to(device))

    ### Extract & save item embeddings
    item_embeddings = normalize_embeddings(item_embeddings.cpu().numpy())
    np.save(embeddings_savepath, item_embeddings)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Prepare GNN Embeddings")

    # Paths
    parser.add_argument("--items_path", type=str, required=True, help="Path to the items file")
    parser.add_argument("--ratings_path", type=str, required=True, help="Path to the ratings file")
    parser.add_argument("--text_embeddings_path", type=str, required=True, help="Path to the text embeddings file")
    parser.add_argument("--deepwalk_embeddings_path", type=str, required=True, help="Path to the deepwalk embeddings file")
    parser.add_argument("--embeddings_savepath", type=str, required=True, help="Path to the file where gnn embeddings will be saved")

    # Learning hyperparameters
    parser.add_argument("--temperature", type=float, default=0.1, help="Temperature for NT-Xent loss")
    parser.add_argument("--batch_size", type=int, default=512, help="Batch size for training")
    parser.add_argument("--lr", type=float, default=0.01, help="Learning rate")
    parser.add_argument("--num_epochs", type=int, default=4, help="Number of epochs")

    # Model hyperparameters
    parser.add_argument("--num_layers", type=int, default=2, help="Number of layers in the model")
    parser.add_argument("--hidden_dim", type=int, default=384, help="Hidden dimension size")
    parser.add_argument("--aggregator_type", type=str, default="mean", help="Type of aggregator in SAGEConv")
    parser.add_argument("--no_skip_connection", action="store_false", dest="skip_connection", help="Disable skip connections")
    parser.add_argument("--no_bidirectional", action="store_false", dest="bidirectional", help="Do not use reversed edges in convolution")
    parser.add_argument("--num_traversals", type=int, default=4, help="Number of traversals in PinSAGE-like sampler")
    parser.add_argument("--termination_prob", type=float, default=0.5, help="Termination probability in PinSAGE-like sampler")
    parser.add_argument("--num_random_walks", type=int, default=200, help="Number of random walks in PinSAGE-like sampler")
    parser.add_argument("--num_neighbor", type=int, default=3, help="Number of neighbors in PinSAGE-like sampler")

    # Misc
    parser.add_argument("--device", type=str, default="cpu", help="Device to run the model on (cpu or cuda)")
    parser.add_argument("--wandb_name", type=str, help="WandB run name")
    parser.add_argument("--no_wandb", action="store_false", dest="use_wandb", help="Disable WandB logging")

    args = parser.parse_args()

    prepare_gnn_embeddings(**vars(args))
```
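For reference, `nt_xent_loss` implements the temperature-scaled contrastive loss from the linked paper: with $n$ positive pairs $(i,\, i+n)$ and a similarity matrix $s$ already divided by the temperature $\tau$,

$$\mathcal{L} = -\frac{1}{n} \sum_{i=1}^{n} s_{i,\,i+n} + \frac{1}{n} \sum_{i=1}^{n} \log \sum_{v \ne i} \exp(s_{i,v}),$$

an alignment term that pulls the two items sampled for the same user together, plus a log-partition term over every other entry in the row (only the self-similarity on the diagonal is masked out).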
exp/prepare_db.py
ADDED
@@ -0,0 +1,33 @@
```python
import argparse
import sqlite3
import io

import pandas as pd
import numpy as np


def convert_numpy_array_to_blob(array):
    # Serialize a numpy array into bytes for storage in SQLite.
    stream = io.BytesIO()
    np.save(stream, array)
    stream.seek(0)
    return sqlite3.Binary(stream.read())


def prepare_items_db(items_path, embeddings_path, db_path):
    items = pd.read_csv(items_path)
    embeddings = np.load(embeddings_path)
    # Each row gets its embedding as a (1, dim) array.
    items["embedding"] = np.split(embeddings, embeddings.shape[0])

    sqlite3.register_adapter(np.ndarray, convert_numpy_array_to_blob)
    with sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES) as conn:
        items.to_sql("items", conn, if_exists="replace", index=False, dtype={"embedding": "embedding"})


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Prepare items database from a CSV file.")
    parser.add_argument("--items_path", required=True, type=str, help="Path to the CSV file containing items.")
    parser.add_argument("--embeddings_path", required=True, type=str, help="Path to the .npy file containing item embeddings.")
    parser.add_argument("--db_path", required=True, type=str, help="Path to the SQLite database file.")

    args = parser.parse_args()
    prepare_items_db(**vars(args))
```
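This adapter and the converter registered in `app/database.py` form a matching pair. A self-contained sketch of the round trip (function names here are illustrative):

```python
# Round trip: numpy array -> SQLite blob -> numpy array.
import io
import sqlite3

import numpy as np


def array_to_blob(array):
    stream = io.BytesIO()
    np.save(stream, array)
    return sqlite3.Binary(stream.getvalue())


def blob_to_array(blob):
    return np.load(io.BytesIO(blob))


sqlite3.register_adapter(np.ndarray, array_to_blob)
sqlite3.register_converter("embedding", blob_to_array)

conn = sqlite3.connect(":memory:", detect_types=sqlite3.PARSE_DECLTYPES)
conn.execute("create table t (e embedding)")
conn.execute("insert into t values (?)", (np.arange(3.0).reshape(1, 3),))
(restored,) = conn.execute("select e from t").fetchone()
assert np.allclose(restored, [[0.0, 1.0, 2.0]])
```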
exp/prepare_embeddings.sh
ADDED
@@ -0,0 +1,52 @@
```bash
#!/bin/bash
set -e

input_directory="$1"
save_directory="$2"
device="${3:-cpu}"

echo "Running on $device."

PYTHONPATH=. python exp/process_raw_data.py \
    --input_directory "$input_directory" \
    --save_directory "$save_directory" \
    --create_train_val_split

PYTHONPATH=. python exp/deepwalk.py \
    --items_path "$save_directory/items.csv" \
    --ratings_path "$save_directory/train_ratings.csv" \
    --embeddings_savepath "$save_directory/deepwalk_embeddings.npy" \
    --device "$device" \
    --no_wandb

PYTHONPATH=. python exp/sbert.py \
    --items_path "$save_directory/items.csv" \
    --embeddings_savepath "$save_directory/text_embeddings.npy" \
    --device "$device"

PYTHONPATH=. python exp/gnn.py \
    --items_path "$save_directory/items.csv" \
    --ratings_path "$save_directory/train_ratings.csv" \
    --text_embeddings_path "$save_directory/text_embeddings.npy" \
    --deepwalk_embeddings_path "$save_directory/deepwalk_embeddings.npy" \
    --embeddings_savepath "$save_directory/embeddings.npy" \
    --device "$device" \
    --no_wandb

PYTHONPATH=. python exp/prepare_index.py \
    --embeddings_path "$save_directory/embeddings.npy" \
    --save_path "$save_directory/index.faiss"

PYTHONPATH=. python exp/prepare_db.py \
    --items_path "$save_directory/items.csv" \
    --embeddings_path "$save_directory/embeddings.npy" \
    --db_path "$save_directory/items.db"

PYTHONPATH=. python exp/evaluate.py \
    --metrics_savepath "$save_directory/metrics.json" \
    --val_ratings_path "$save_directory/val_ratings.csv" \
    --faiss_index_path "$save_directory/index.faiss" \
    --db_path "$save_directory/items.db"

echo "Evaluation metrics:"
cat "$save_directory/metrics.json"
```
exp/prepare_index.py
ADDED
@@ -0,0 +1,20 @@
```python
import argparse

import faiss
import numpy as np


def build_index(embeddings_path, save_path, n_neighbors):
    embeddings = np.load(embeddings_path)
    # The second argument of IndexHNSWFlat is the HNSW M parameter
    # (links per node); wire the CLI flag through instead of hardcoding it.
    index = faiss.IndexHNSWFlat(embeddings.shape[-1], n_neighbors)
    index.add(embeddings)
    faiss.write_index(index, save_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build an HNSW index from embeddings.")
    parser.add_argument("--embeddings_path", required=True, type=str, help="Path to the embeddings file.")
    parser.add_argument("--save_path", type=str, required=True, help="Path to save the built index.")
    parser.add_argument("--n_neighbors", type=int, default=32, help="Number of neighbors (HNSW M) for the index.")
    args = parser.parse_args()
    build_index(**vars(args))
```
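A quick sketch of querying the saved index (the path is illustrative):

```python
# Sketch: load the HNSW index and search with a single (1, dim) query vector.
import faiss
import numpy as np

index = faiss.read_index("index.faiss")  # illustrative path
query = np.random.rand(1, index.d).astype("float32")
distances, neighbor_ids = index.search(query, k=10)
print(neighbor_ids[0])  # item_ids, since vectors were added in item_id order
```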
exp/process_raw_data.py
ADDED
@@ -0,0 +1,116 @@
```python
import argparse
import os
import json

import pandas as pd
import numpy as np


def book_filter(book, ratings_count_threshold=10_000):
    try:
        if book["ratings_count"] == "":
            return False
        if int(book["ratings_count"]) < ratings_count_threshold:
            return False

        if book["description"] == "":
            return False

        if book["title"] == "":
            return False

        if book["title_without_series"] == "":
            return False

        possible_lang_codes = {"eng", "en-GB", "en-US"}
        if not book["language_code"] in possible_lang_codes:
            return False

        return True
    except Exception:
        return False


def process_raw_data_goodreads(input_directory, save_directory, positive_rating_threshold=4.0):
    os.makedirs(save_directory, exist_ok=True)

    ### Process items
    columns = [
        "book_id",
        "description",
        "title_without_series",
    ]
    numeric_columns = [
        "book_id",
    ]

    items = []
    with open(os.path.join(input_directory, "goodreads_books.json"), "r") as f:
        for line in f:
            item = json.loads(line)
            if book_filter(item):
                items.append([item[col] for col in columns])
    items = pd.DataFrame(items, columns=columns)
    for col in numeric_columns:
        items[col] = pd.to_numeric(items[col])
    items["item_id"] = items.index
    items["title"] = items["title_without_series"]
    items.drop("title_without_series", axis=1, inplace=True)
    items.to_csv(os.path.join(save_directory, "items.csv"), index=False)

    ### Process ratings
    ratings = pd.read_csv(os.path.join(input_directory, "goodreads_interactions.csv"))

    book_id_map = pd.read_csv(os.path.join(input_directory, "book_id_map.csv"))
    csv_to_usual_map = dict(zip(book_id_map["book_id_csv"], book_id_map["book_id"]))
    usual_to_csv_map = dict(zip(book_id_map["book_id"], book_id_map["book_id_csv"]))

    book_ids = items["book_id"].unique()
    book_ids_csv = set([usual_to_csv_map[book_id] for book_id in book_ids])
    ratings = ratings[ratings["rating"] >= positive_rating_threshold]
    ratings = ratings[ratings["book_id"].isin(book_ids_csv)]

    book_to_item_id_map = dict(zip(items["book_id"], items["item_id"]))
    ratings["item_id"] = ratings["book_id"].map(csv_to_usual_map).map(book_to_item_id_map)

    user_ids = list(ratings["user_id"].unique())
    user_ids_map = dict(zip(user_ids, range(len(user_ids))))
    ratings["user_id"] = ratings["user_id"].map(user_ids_map)

    ratings.to_csv(os.path.join(save_directory, "ratings.csv"), index=False)


def create_train_val_split(ratings_path, train_savepath, val_savepath, seed=42):
    ratings = pd.read_csv(ratings_path)
    user_ids = ratings["user_id"].unique()

    rng = np.random.default_rng(seed=seed)
    train_size = int(len(user_ids) * 0.9)
    train_indices = rng.choice(user_ids, size=train_size, replace=False)

    train_data = ratings.loc[ratings["user_id"].isin(train_indices)]
    val_data = ratings.loc[~ratings["user_id"].isin(train_indices)]

    print(f"Train size: {len(train_data)}.")
    print(f"Validation size: {len(val_data)}.")

    train_data.to_csv(train_savepath, index=False)
    val_data.to_csv(val_savepath, index=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process raw data.")
    parser.add_argument("--input_directory", required=True, type=str, help="Directory containing the raw data.")
    parser.add_argument("--save_directory", required=True, type=str, help="Directory where processed data will be saved.")
    parser.add_argument("--create_train_val_split", action="store_true", help="Flag to indicate whether to create a train-validation split.")
    args = parser.parse_args()

    print("Processing raw data...")
    process_raw_data_goodreads(args.input_directory, args.save_directory)
    if args.create_train_val_split:
        create_train_val_split(
            os.path.join(args.save_directory, "ratings.csv"),
            os.path.join(args.save_directory, "train_ratings.csv"),
            os.path.join(args.save_directory, "val_ratings.csv")
        )
    print("The raw data has been successfully processed.")
```
exp/requirements.txt
ADDED
@@ -0,0 +1,7 @@
```
-r ../requirements.txt  # install base requirements of app
dgl==2.1.0
torch==2.1.2
wandb==0.17.0
tqdm==4.66.4
pydantic==2.5.3
sentence_transformers==3.0.1
```
exp/requirements_gpu.txt
ADDED
@@ -0,0 +1,8 @@
```
-f https://data.dgl.ai/wheels/cu121/repo.html
-r ../requirements.txt  # install base requirements of app
dgl==2.1.0
torch==2.1.0
wandb==0.17.0
tqdm==4.66.4
pydantic==2.5.3
sentence_transformers==3.0.1
```
exp/sbert.py
ADDED
@@ -0,0 +1,39 @@
```python
import argparse

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

from utils import normalize_embeddings


def prepare_sbert_embeddings(
    items_path,
    embeddings_savepath,
    model_name,
    batch_size,
    device
):
    # Sort by item_id so that row i of the embedding matrix
    # corresponds to the item with item_id == i.
    items = pd.read_csv(items_path).sort_values("item_id")
    sentences = items["description"].values
    model = SentenceTransformer(model_name).to(device)
    embeddings = []
    for start_index in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[start_index:start_index + batch_size]
        embeddings.extend(model.encode(batch))
    embeddings = normalize_embeddings(np.array(embeddings))
    np.save(embeddings_savepath, embeddings)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Prepare SBERT embeddings.")
    parser.add_argument("--items_path", type=str, required=True, help="Path to the items file.")
    parser.add_argument("--embeddings_savepath", type=str, required=True, help="Path to save the embeddings.")
    parser.add_argument("--model_name", type=str, default="sentence-transformers/all-MiniLM-L6-v2", help="Name of the SBERT model to use.")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size.")
    parser.add_argument("--device", type=str, default="cpu", help="Device to use (cpu or cuda).")
    args = parser.parse_args()

    prepare_sbert_embeddings(**vars(args))
```
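A quick sanity check on the saved artifact (the path is illustrative): `normalize_embeddings` makes every nonzero row unit-norm, so cosine similarity between items reduces to a dot product:

```python
# Sketch: verify row-wise L2 normalization of the saved embeddings.
import numpy as np

embeddings = np.load("text_embeddings.npy")  # illustrative path
norms = np.linalg.norm(embeddings, axis=1)
assert np.allclose(norms[norms > 0], 1.0)

# With unit-norm rows, cosine similarity is just a dot product:
sim = embeddings[0] @ embeddings[1]
```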
exp/utils.py
ADDED
@@ -0,0 +1,69 @@
```python
import numpy as np
import pandas as pd
import dgl
import torch


def normalize_embeddings(embeddings):
    embeddings_norm = np.linalg.norm(embeddings, axis=1)
    nonzero_embeddings = embeddings_norm > 0.0
    embeddings[nonzero_embeddings] /= embeddings_norm[nonzero_embeddings, None]
    return embeddings


def prepare_graphs(items_path, ratings_path):
    items = pd.read_csv(items_path)
    ratings = pd.read_csv(ratings_path)

    n_users = np.max(ratings["user_id"].unique()) + 1
    item_ids = torch.tensor(sorted(items["item_id"].unique()))

    edges = torch.tensor(ratings["user_id"]), torch.tensor(ratings["item_id"])
    reverse_edges = (edges[1], edges[0])

    bipartite_graph = dgl.heterograph(
        data_dict={
            ("User", "UserItem", "Item"): edges,
            ("Item", "ItemUser", "User"): reverse_edges
        },
        num_nodes_dict={
            "User": n_users,
            "Item": len(item_ids)
        }
    )
    graph = dgl.to_homogeneous(bipartite_graph)
    graph = dgl.add_self_loop(graph)
    return bipartite_graph, graph


def extract_item_embeddings(node_embeddings, bipartite_graph, graph):
    item_ntype = bipartite_graph.ntypes.index("Item")
    item_mask = graph.ndata[dgl.NTYPE] == item_ntype
    item_embeddings = node_embeddings[item_mask]
    original_ids = graph.ndata[dgl.NID][item_mask]
    item_embeddings = item_embeddings[torch.argsort(original_ids)]
    return item_embeddings.cpu().numpy()


class LRSchedule:
    def __init__(self, total_steps, warmup_steps, final_factor):
        self._total_steps = total_steps
        self._warmup_steps = warmup_steps
        self._final_factor = final_factor

    def __call__(self, step):
        if step >= self._total_steps:
            return self._final_factor

        if self._warmup_steps > 0:
            warmup_factor = step / self._warmup_steps
        else:
            warmup_factor = 1.0

        steps_after_warmup = step - self._warmup_steps
        total_steps_after_warmup = self._total_steps - self._warmup_steps
        after_warmup_factor = 1 \
            - (1 - self._final_factor) * (steps_after_warmup / total_steps_after_warmup)

        factor = min(warmup_factor, after_warmup_factor)
        return min(max(factor, 0), 1)
```
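`LRSchedule` is a linear warmup followed by linear decay, expressed as a multiplicative factor on the base learning rate:

$$\text{factor}(t) = \min\left(\frac{t}{t_{\text{warmup}}},\ 1 - (1 - f_{\text{final}}) \cdot \frac{t - t_{\text{warmup}}}{t_{\text{total}} - t_{\text{warmup}}}\right),$$

clamped to $[0, 1]$, with $\text{factor}(t) = f_{\text{final}}$ once $t \ge t_{\text{total}}$.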
requirements.txt
ADDED
@@ -0,0 +1,4 @@
```
streamlit==1.35.0
pandas==2.2.2
numpy==1.26.4
faiss-cpu==1.8.0
```