import pandas as pd
import numpy as np
from DLM_emb_model import MolEmbDLM
from transformers import AutoTokenizer
import torch
import selfies as sf

MODEL_DIR = "Kiria-Nozan/ApexOracle"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = MolEmbDLM.from_pretrained(MODEL_DIR)
model.eval()

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load CSV data
df = pd.read_csv("temp_data/polymers_lit_scraped.csv")

# Extract all unique monomer SMILES across the monomer columns
monomer_columns = ["monomer A", "monomer B", "monomer C", "monomer D", "monomer E", "monomer F"]
all_monomers = set()
for col in monomer_columns:
    if col in df.columns:
        monomers = df[col].dropna().unique()
        all_monomers.update(monomers)

print(f"Total unique monomers: {len(all_monomers)}")

# Convert SMILES to SELFIES and keep only monomers that encode successfully
monomer_selfies = {}
valid_monomers = []
for smiles in all_monomers:
    try:
        selfies = sf.encoder(smiles)
        monomer_selfies[smiles] = selfies
        valid_monomers.append((smiles, selfies))
    except Exception as e:
        print(f"Error converting {smiles} to SELFIES: {e}")

print(f"Valid monomers for embedding: {len(valid_monomers)}")

# Generate embeddings for all monomers
monomer_embeddings = {}
for smiles, selfies in valid_monomers:
    # Prepare input similar to example.py: space-separate the SELFIES tokens for the tokenizer
    batch = tokenizer(
        selfies.replace('][', '] ['),
        padding=False,
        truncation=False,
        return_tensors="pt",
    )
    batch = {k: v.to(device) for k, v in batch.items()}

    with torch.no_grad():
        embeddings = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )

    # Store the embedding for this monomer (first entry of the model output;
    # note this indexes the output rather than average-pooling over the sequence)
    monomer_embeddings[smiles] = embeddings[0][0].cpu().numpy()

print(f"Generated embeddings for {len(monomer_embeddings)} monomers")
print(f"Embedding shape: {list(monomer_embeddings.values())[0].shape}")

# Save results (dict of SMILES -> embedding array; np.save pickles the dict)
np.save("temp_data/monomer_embeddings.npy", monomer_embeddings)
print("Embeddings saved to temp_data/monomer_embeddings.npy")
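
# Usage note: because a Python dict was saved, downstream code must load it with
# allow_pickle=True and unwrap the 0-d object array with .item(). A minimal sketch
# (the variable names below are illustrative, not part of this script):
#
#   loaded = np.load("temp_data/monomer_embeddings.npy", allow_pickle=True).item()
#   some_smiles = next(iter(loaded))          # any SMILES key present in the dict
#   vec = loaded[some_smiles]                 # numpy array embedding for that monomer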