erermeev-d committed
Commit c746c39 · 1 Parent(s): a4da241

Refactored experiments code
exp/gnn/__init__.py ADDED
File without changes
exp/gnn/loss.py ADDED
@@ -0,0 +1,17 @@
+ import torch
+
+
+ ### Based on https://arxiv.org/pdf/2205.03169
+ def nt_xent_loss(sim, temperature):
+     sim = sim / temperature
+     n = sim.shape[0] // 2  # n = |user_batch|
+
+     # Alignment: pull each positive pair (i, i+n) together
+     alignment_loss = -torch.mean(sim[torch.arange(n), torch.arange(n) + n])
+
+     # Uniformity: log-sum-exp over all similarities, self-similarity masked out
+     mask = torch.diag(torch.ones(2 * n, dtype=torch.bool)).to(sim.device)
+     sim = torch.where(mask, -torch.inf, sim)
+     sim = sim[:n, :]
+     distribution_loss = torch.mean(torch.logsumexp(sim, dim=1))
+
+     loss = alignment_loss + distribution_loss
+     return loss
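
Note on the loss above: with temperature τ and similarity matrix s, it computes -mean_i s(i, i+n)/τ + mean_i log Σ_{j≠i} exp(s(i, j)/τ), i.e. an alignment term over the positive pairs (i, i+n) plus a log-sum-exp uniformity term. A minimal sanity-check sketch, assuming rows 0..n-1 and n..2n-1 hold two L2-normalized views of the same n users (shapes and temperature are illustrative, not from the commit):

import torch
from exp.gnn.loss import nt_xent_loss

n, dim = 4, 8
views = torch.nn.functional.normalize(torch.randn(2 * n, dim), dim=1)
sim = views @ views.T  # (2n, 2n) cosine similarities; (i, i+n) are positives
loss = nt_xent_loss(sim, temperature=0.1)
print(loss.item())  # scalar; decreases as paired rows align
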
exp/gnn/model.py ADDED
@@ -0,0 +1,110 @@
+ import torch
+ import dgl
+
+
+ class GNNLayer(torch.nn.Module):
+     def __init__(self, hidden_dim, aggregator_type, skip_connection, bidirectional):
+         super().__init__()
+         self._skip_connection = skip_connection
+         self._bidirectional = bidirectional
+
+         self._conv = dgl.nn.SAGEConv(hidden_dim, hidden_dim, aggregator_type)
+         self._activation = torch.nn.ReLU()
+
+         if bidirectional:
+             self._conv_rev = dgl.nn.SAGEConv(hidden_dim, hidden_dim, aggregator_type)
+             self._activation_rev = torch.nn.ReLU()
+
+     def forward(self, graph, x):
+         edge_weights = graph.edata["weights"]
+
+         y = self._activation(self._conv(graph, x, edge_weights))
+         if self._bidirectional:
+             # Aggregate along reversed edges as well, with a separate convolution
+             reversed_graph = dgl.reverse(graph, copy_edata=True)
+             edge_weights = reversed_graph.edata["weights"]
+             y = y + self._activation_rev(self._conv_rev(reversed_graph, x, edge_weights))
+
+         if self._skip_connection:
+             return x + y
+         else:
+             return y
+
+
+ class GNNModel(torch.nn.Module):
+     def __init__(
+         self,
+         bipartite_graph,
+         text_embeddings,
+         num_layers,
+         hidden_dim,
+         aggregator_type,
+         skip_connection,
+         bidirectional,
+         num_traversals,
+         termination_prob,
+         num_random_walks,
+         num_neighbor,
+     ):
+         super().__init__()
+
+         self._bipartite_graph = bipartite_graph
+         self._text_embeddings = text_embeddings
+
+         # PinSAGE-style sampler: Item -> User -> Item random walks
+         self._sampler = dgl.sampling.PinSAGESampler(
+             bipartite_graph, "Item", "User", num_traversals,
+             termination_prob, num_random_walks, num_neighbor)
+
+         self._text_encoder = torch.nn.Linear(text_embeddings.shape[-1], hidden_dim)
+
+         self._layers = torch.nn.ModuleList()
+         for _ in range(num_layers):
+             self._layers.append(GNNLayer(
+                 hidden_dim, aggregator_type, skip_connection, bidirectional))
+
+     def _sample_subgraph(self, frontier_ids):
+         num_layers = len(self._layers)
+         device = self._bipartite_graph.device
+
+         subgraph = dgl.graph(([], []), num_nodes=self._bipartite_graph.num_nodes("Item")).to(device)
+         prev_ids = set()
+         weights = []
+
+         # Expand the frontier once per GNN layer, reusing the PinSAGE sampler
+         for _ in range(num_layers):
+             frontier_ids = torch.tensor(frontier_ids, dtype=torch.int64).to(device)
+             new_sample = self._sampler(frontier_ids)
+             new_weights = new_sample.edata["weights"]
+             new_edges = new_sample.edges()
+
+             subgraph.add_edges(*new_edges)
+             weights.append(new_weights)
+
+             prev_ids |= set(frontier_ids.cpu().tolist())
+             frontier_ids = set(dgl.compact_graphs(subgraph).ndata[dgl.NID].cpu().tolist())
+             frontier_ids = list(frontier_ids - prev_ids)
+
+         subgraph.edata["weights"] = torch.cat(weights, dim=0).to(torch.float32)
+         return subgraph
+
+     def forward(self, ids):
+         ### Sample subgraph
+         sampled_subgraph = self._sample_subgraph(ids)
+         sampled_subgraph = dgl.compact_graphs(sampled_subgraph, always_preserve=ids)
+
+         ### Encode text embeddings
+         text_embeddings = self._text_embeddings[
+             sampled_subgraph.ndata[dgl.NID]]
+         features = self._text_encoder(text_embeddings)
+
+         ### GNN goes brr...
+         for layer in self._layers:
+             features = layer(sampled_subgraph, features)
+
+         ### Select features for initial ids
+         # TODO: write it more efficiently?
+         matches = sampled_subgraph.ndata[dgl.NID].unsqueeze(0) == ids.unsqueeze(1)
+         ids_in_subgraph = matches.nonzero(as_tuple=True)[1]
+         features = features[ids_in_subgraph]
+
+         ### Normalize and return
+         features = features / torch.linalg.norm(features, dim=1, keepdim=True)
+         return features
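
A minimal construction sketch for GNNModel (hypothetical graph and hyperparameters; in the training script the heterograph comes from prepare_graphs in exp/gnn/utils.py):

import torch
import dgl
from exp.gnn.model import GNNModel

# Toy bipartite graph: 3 users, 4 items, both edge directions as in prepare_graphs
users = torch.tensor([0, 0, 1, 2, 2])
items = torch.tensor([0, 1, 1, 2, 3])
graph = dgl.heterograph({
    ("User", "UserItem", "Item"): (users, items),
    ("Item", "ItemUser", "User"): (items, users),
}, num_nodes_dict={"User": 3, "Item": 4})

model = GNNModel(
    bipartite_graph=graph,
    text_embeddings=torch.randn(4, 16),  # one text vector per item (illustrative)
    num_layers=2, hidden_dim=32, aggregator_type="mean",
    skip_connection=True, bidirectional=True,
    num_traversals=3, termination_prob=0.5,
    num_random_walks=10, num_neighbor=3,
)
item_vectors = model(torch.tensor([0, 2]))  # (2, 32); rows come out L2-normalized
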
exp/{gnn.py → gnn/train.py} RENAMED
@@ -9,159 +9,14 @@ import torch
  import wandb
  from tqdm.auto import tqdm
 
- from exp.utils import prepare_graphs, normalize_embeddings, LRSchedule
+ from exp.utils import normalize_embeddings
  from exp.prepare_recsys import prepare_recsys
  from exp.evaluate import evaluate_recsys
+ from exp.gnn.model import GNNModel
+ from exp.gnn.loss import nt_xent_loss
+ from exp.gnn.utils import (
+     prepare_graphs, LRSchedule,
+     sample_item_batch, inference_model)
-
-
- class GNNLayer(torch.nn.Module):
-     def __init__(self, hidden_dim, aggregator_type, skip_connection, bidirectional):
-         super().__init__()
-         self._skip_connection = skip_connection
-         self._bidirectional = bidirectional
-
-         self._conv = dgl.nn.SAGEConv(hidden_dim, hidden_dim, aggregator_type)
-         self._activation = torch.nn.ReLU()
-
-         if bidirectional:
-             self._conv_rev = dgl.nn.SAGEConv(hidden_dim, hidden_dim, aggregator_type)
-             self._activation_rev = torch.nn.ReLU()
-
-     def forward(self, graph, x):
-         edge_weights = graph.edata["weights"]
-
-         y = self._activation(self._conv(graph, x, edge_weights))
-         if self._bidirectional:
-             reversed_graph = dgl.reverse(graph, copy_edata=True)
-             edge_weights = reversed_graph.edata["weights"]
-             y = y + self._activation_rev(self._conv_rev(reversed_graph, x, edge_weights))
-
-         if self._skip_connection:
-             return x + y
-         else:
-             return y
-
-
- class GNNModel(torch.nn.Module):
-     def __init__(
-         self,
-         bipartite_graph,
-         text_embeddings,
-         num_layers,
-         hidden_dim,
-         aggregator_type,
-         skip_connection,
-         bidirectional,
-         num_traversals,
-         termination_prob,
-         num_random_walks,
-         num_neighbor,
-     ):
-         super().__init__()
-
-         self._bipartite_graph = bipartite_graph
-         self._text_embeddings = text_embeddings
-
-         self._sampler = dgl.sampling.PinSAGESampler(
-             bipartite_graph, "Item", "User", num_traversals,
-             termination_prob, num_random_walks, num_neighbor)
-
-         self._text_encoder = torch.nn.Linear(text_embeddings.shape[-1], hidden_dim)
-
-         self._layers = torch.nn.ModuleList()
-         for _ in range(num_layers):
-             self._layers.append(GNNLayer(
-                 hidden_dim, aggregator_type, skip_connection, bidirectional))
-
-     def _sample_subgraph(self, frontier_ids):
-         num_layers = len(self._layers)
-         device = self._bipartite_graph.device
-
-         subgraph = dgl.graph(([], []), num_nodes=self._bipartite_graph.num_nodes("Item")).to(device)
-         prev_ids = set()
-         weights = []
-
-         for _ in range(num_layers):
-             frontier_ids = torch.tensor(frontier_ids, dtype=torch.int64).to(device)
-             new_sample = self._sampler(frontier_ids)
-             new_weights = new_sample.edata["weights"]
-             new_edges = new_sample.edges()
-
-             subgraph.add_edges(*new_edges)
-             weights.append(new_weights)
-
-             prev_ids |= set(frontier_ids.cpu().tolist())
-             frontier_ids = set(dgl.compact_graphs(subgraph).ndata[dgl.NID].cpu().tolist())
-             frontier_ids = list(frontier_ids - prev_ids)
-
-         subgraph.edata["weights"] = torch.cat(weights, dim=0).to(torch.float32)
-         return subgraph
-
-     def forward(self, ids):
-         ### Sample subgraph
-         sampled_subgraph = self._sample_subgraph(ids)
-         sampled_subgraph = dgl.compact_graphs(sampled_subgraph, always_preserve=ids)
-
-         ### Encode text embeddings
-         text_embeddings = self._text_embeddings[
-             sampled_subgraph.ndata[dgl.NID]]
-         features = self._text_encoder(text_embeddings)
-
-         ### GNN goes brr...
-         for layer in self._layers:
-             features = layer(sampled_subgraph, features)
-
-         ### Select features for initial ids
-         # TODO: write it more efficiently?
-         matches = sampled_subgraph.ndata[dgl.NID].unsqueeze(0) == ids.unsqueeze(1)
-         ids_in_subgraph = matches.nonzero(as_tuple=True)[1]
-         features = features[ids_in_subgraph]
-
-         ### Normalize and return
-         features = features / torch.linalg.norm(features, dim=1, keepdim=True)
-         return features
-
-
- ### Based on https://arxiv.org/pdf/2205.03169
- def nt_xent_loss(sim, temperature):
-     sim = sim / temperature
-     n = sim.shape[0] // 2  # n = |user_batch|
-
-     alignment_loss = -torch.mean(sim[torch.arange(n), torch.arange(n) + n])
-
-     mask = torch.diag(torch.ones(2 * n, dtype=torch.bool)).to(sim.device)
-     sim = torch.where(mask, -torch.inf, sim)
-     sim = sim[:n, :]
-     distribution_loss = torch.mean(torch.logsumexp(sim, dim=1))
-
-     loss = alignment_loss + distribution_loss
-     return loss
-
-
- def sample_item_batch(user_batch, bipartite_graph):
-     sampled_edges = dgl.sampling.sample_neighbors(
-         bipartite_graph, {"User": user_batch}, fanout=2
-     ).edges(etype="ItemUser")
-     item_batch = sampled_edges[0]
-     item_batch = item_batch[torch.argsort(sampled_edges[1])]
-     item_batch = item_batch.reshape(-1, 2)
-     item_batch = item_batch.T
-     return item_batch
-
-
- @torch.no_grad()
- def inference_model(model, bipartite_graph, batch_size, hidden_dim, device):
-     model.eval()
-     item_embeddings = torch.zeros(bipartite_graph.num_nodes("Item"), hidden_dim).to(device)
-     for items_batch in tqdm(torch.utils.data.DataLoader(
-         torch.arange(bipartite_graph.num_nodes("Item")),
-         batch_size=batch_size,
-         shuffle=True
-     )):
-         item_embeddings[items_batch] = model(items_batch.to(device))
-
-     item_embeddings = normalize_embeddings(item_embeddings.cpu().numpy())
-     return item_embeddings
 
 
  def prepare_gnn_embeddings(
@@ -228,9 +83,9 @@ def prepare_gnn_embeddings(
      lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda _: 1.0)
 
      ### Train loop
-     model.train()
      for epoch in range(num_epochs):
          ### Train
+         model.train()
          for user_batch in tqdm(dataloader):
              item_batch = sample_item_batch(user_batch, bipartite_graph)  # (2, |user_batch|)
              item_batch = item_batch.reshape(-1)  # (2 * |user_batch|)
@@ -258,8 +113,6 @@ def prepare_gnn_embeddings(
          print(f"Epoch {epoch + 1} / {num_epochs}. {metrics}")
          if use_wandb:
              wandb.log(metrics)
-
-
 
      if use_wandb:
          wandb.finish()
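
Pieced together from the hunks above, one training step looks roughly like this (a hedged reconstruction: the similarity computation and optimizer wiring are not shown in the diff and are assumed):

features = model(item_batch.to(device))  # (2n, hidden_dim), L2-normalized rows
sim = features @ features.T              # (2n, 2n); positives sit at (i, i+n)
loss = nt_xent_loss(sim, temperature)
optimizer.zero_grad()
loss.backward()
optimizer.step()
lr_scheduler.step()
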
exp/gnn/utils.py ADDED
@@ -0,0 +1,82 @@
+ import torch
+ import dgl
+ import pandas as pd
+ import numpy as np
+ from tqdm.auto import tqdm
+
+ from exp.utils import normalize_embeddings
+
+
+ class LRSchedule:
+     # Linear warmup, then linear decay towards final_factor
+     def __init__(self, total_steps, warmup_steps, final_factor):
+         self._total_steps = total_steps
+         self._warmup_steps = warmup_steps
+         self._final_factor = final_factor
+
+     def __call__(self, step):
+         if step >= self._total_steps:
+             return self._final_factor
+
+         if self._warmup_steps > 0:
+             warmup_factor = step / self._warmup_steps
+         else:
+             warmup_factor = 1.0
+
+         steps_after_warmup = step - self._warmup_steps
+         total_steps_after_warmup = self._total_steps - self._warmup_steps
+         after_warmup_factor = 1 \
+             - (1 - self._final_factor) * (steps_after_warmup / total_steps_after_warmup)
+
+         factor = min(warmup_factor, after_warmup_factor)
+         return min(max(factor, 0), 1)
+
+
+ def prepare_graphs(items_path, ratings_path):
+     items = pd.read_csv(items_path)
+     ratings = pd.read_csv(ratings_path)
+
+     n_users = np.max(ratings["user_id"].unique()) + 1
+     item_ids = torch.tensor(sorted(items["item_id"].unique()))
+
+     edges = torch.tensor(ratings["user_id"]), torch.tensor(ratings["item_id"])
+     reverse_edges = (edges[1], edges[0])
+
+     bipartite_graph = dgl.heterograph(
+         data_dict={
+             ("User", "UserItem", "Item"): edges,
+             ("Item", "ItemUser", "User"): reverse_edges
+         },
+         num_nodes_dict={
+             "User": n_users,
+             "Item": len(item_ids)
+         }
+     )
+     graph = dgl.to_homogeneous(bipartite_graph)
+     graph = dgl.add_self_loop(graph)
+     return bipartite_graph, graph
+
+
+ def sample_item_batch(user_batch, bipartite_graph):
+     # Sample two interacted items per user; the two rows of the result are the two "views"
+     sampled_edges = dgl.sampling.sample_neighbors(
+         bipartite_graph, {"User": user_batch}, fanout=2
+     ).edges(etype="ItemUser")
+     item_batch = sampled_edges[0]
+     item_batch = item_batch[torch.argsort(sampled_edges[1])]
+     item_batch = item_batch.reshape(-1, 2)
+     item_batch = item_batch.T
+     return item_batch
+
+
+ @torch.no_grad()
+ def inference_model(model, bipartite_graph, batch_size, hidden_dim, device):
+     model.eval()
+     item_embeddings = torch.zeros(bipartite_graph.num_nodes("Item"), hidden_dim).to(device)
+     for items_batch in tqdm(torch.utils.data.DataLoader(
+         torch.arange(bipartite_graph.num_nodes("Item")),
+         batch_size=batch_size,
+         shuffle=True
+     )):
+         item_embeddings[items_batch] = model(items_batch.to(device))
+
+     item_embeddings = normalize_embeddings(item_embeddings.cpu().numpy())
+     return item_embeddings
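
Since LRSchedule is a plain callable returning a multiplicative factor, it plugs directly into torch.optim.lr_scheduler.LambdaLR (illustrative numbers; the train script currently wires LambdaLR with a constant lambda _: 1.0):

import torch
from exp.gnn.utils import LRSchedule

params = [torch.nn.Parameter(torch.zeros(1))]  # placeholder parameters
optimizer = torch.optim.Adam(params, lr=1e-3)
schedule = LRSchedule(total_steps=1000, warmup_steps=100, final_factor=0.1)
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, schedule)

for step in range(1000):
    optimizer.step()
    lr_scheduler.step()  # lr = 1e-3 * factor: warms up over 100 steps, then decays to 1e-4
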
exp/{prepare_embeddings.sh → pipeline.sh} RENAMED
@@ -17,7 +17,7 @@ PYTHONPATH=. python exp/sbert.py \
      --embeddings_savepath "$save_directory/text_embeddings.npy" \
      --device $device
 
- PYTHONPATH=. python exp/gnn.py \
+ PYTHONPATH=. python exp/gnn/train.py \
      --items_path "$save_directory/items.csv" \
      --train_ratings_path "$save_directory/train_ratings.csv" \
      --val_ratings_path "$save_directory/val_ratings.csv" \
exp/utils.py CHANGED
@@ -1,69 +1,8 @@
  import numpy as np
- import pandas as pd
- import dgl
- import torch
 
 
  def normalize_embeddings(embeddings):
      embeddings_norm = np.linalg.norm(embeddings, axis=1)
      nonzero_embeddings = embeddings_norm > 0.0
      embeddings[nonzero_embeddings] /= embeddings_norm[nonzero_embeddings, None]
-     return embeddings
-
-
- def prepare_graphs(items_path, ratings_path):
-     items = pd.read_csv(items_path)
-     ratings = pd.read_csv(ratings_path)
-
-     n_users = np.max(ratings["user_id"].unique()) + 1
-     item_ids = torch.tensor(sorted(items["item_id"].unique()))
-
-     edges = torch.tensor(ratings["user_id"]), torch.tensor(ratings["item_id"])
-     reverse_edges = (edges[1], edges[0])
-
-     bipartite_graph = dgl.heterograph(
-         data_dict={
-             ("User", "UserItem", "Item"): edges,
-             ("Item", "ItemUser", "User"): reverse_edges
-         },
-         num_nodes_dict={
-             "User": n_users,
-             "Item": len(item_ids)
-         }
-     )
-     graph = dgl.to_homogeneous(bipartite_graph)
-     graph = dgl.add_self_loop(graph)
-     return bipartite_graph, graph
-
-
- def extract_item_embeddings(node_embeddings, bipartite_graph, graph):
-     item_ntype = bipartite_graph.ntypes.index("Item")
-     item_mask = graph.ndata[dgl.NTYPE] == item_ntype
-     item_embeddings = node_embeddings[item_mask]
-     original_ids = graph.ndata[dgl.NID][item_mask]
-     item_embeddings = item_embeddings[torch.argsort(original_ids)]
-     return item_embeddings.cpu().numpy()
-
-
- class LRSchedule:
-     def __init__(self, total_steps, warmup_steps, final_factor):
-         self._total_steps = total_steps
-         self._warmup_steps = warmup_steps
-         self._final_factor = final_factor
-
-     def __call__(self, step):
-         if step >= self._total_steps:
-             return self._final_factor
-
-         if self._warmup_steps > 0:
-             warmup_factor = step / self._warmup_steps
-         else:
-             warmup_factor = 1.0
-
-         steps_after_warmup = step - self._warmup_steps
-         total_steps_after_warmup = self._total_steps - self._warmup_steps
-         after_warmup_factor = 1 \
-             - (1 - self._final_factor) * (steps_after_warmup / total_steps_after_warmup)
-
-         factor = min(warmup_factor, after_warmup_factor)
-         return min(max(factor, 0), 1)
+     return embeddings
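
normalize_embeddings works in place on a NumPy array and skips all-zero rows rather than dividing by zero; a toy check (values illustrative):

import numpy as np
from exp.utils import normalize_embeddings

emb = np.array([[3.0, 4.0], [0.0, 0.0]])
out = normalize_embeddings(emb)
# out[0] -> [0.6, 0.8] (unit norm); out[1] stays [0.0, 0.0]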