import os
import time

import torch
import torch.nn as nn
from flask import Flask, render_template, request, send_file
from rdkit import Chem
from transformers import AutoModel, AutoTokenizer

from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
# DIRECTORIES
bio_model_dir = "/app/modelsBioembedSmall"
cvn_model_dir = "/app/models_folder"
UPLOAD_FOLDER = "/app/Samples"
UF = "/tmp/"  # scratch directory for generated SMILES output
os.makedirs(bio_model_dir, exist_ok=True)
os.makedirs(cvn_model_dir, exist_ok=True)
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
# ENV VARIABLES (point temp and Hugging Face cache dirs at writable paths)
os.environ["TMPDIR"] = bio_model_dir
os.environ["TEMP"] = bio_model_dir
os.environ["TMP"] = bio_model_dir
os.environ['NUMBA_CACHE_DIR'] = '/app/numba_cache'
os.environ['TRANSFORMERS_CACHE'] = '/app/hf_cache'
# ESM2 MODEL AND TOKENIZER
try:
print("Loading ESM2 model...")
    model_name = "facebook/esm2_t6_8M_UR50D"  # 320-dim ESM2 variant; weights are read from bio_model_dir below
tokenizer = AutoTokenizer.from_pretrained(bio_model_dir)
model = AutoModel.from_pretrained(bio_model_dir)
model.eval()
print("ESM2 model loaded.")
except Exception as e:
print(f"Error loading ESM2 model: {e}")
model = None
tokenizer = None
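# If bio_model_dir has not been populated yet, the ESM2 weights could be fetched
# once from the Hub and saved there (illustrative sketch, not executed here;
# assumes network access and the standard transformers layout that
# from_pretrained() above expects):
#
#   if not os.path.exists(os.path.join(bio_model_dir, "config.json")):
#       AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D").save_pretrained(bio_model_dir)
#       AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D").save_pretrained(bio_model_dir)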
# linear transformation to map 320D embeddings to 1024D
class EmbeddingTransformer(nn.Module):
def __init__(self, input_dim, output_dim):
super(EmbeddingTransformer, self).__init__()
self.linear = nn.Linear(input_dim, output_dim)
def forward(self, x):
return self.linear(x)
transformer = EmbeddingTransformer(input_dim=320, output_dim=1024)
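# NOTE: transformer is created with randomly initialised weights on every startup.
# If a trained checkpoint for this projection exists, it could be restored along
# these lines (the path below is hypothetical, not part of this repo):
#
#   ckpt = os.path.join(cvn_model_dir, "embedding_transformer.pt")  # hypothetical path
#   if os.path.exists(ckpt):
#       transformer.load_state_dict(torch.load(ckpt, map_location="cpu"))
#   transformer.eval()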
# HELPER: GENERATE PROTEIN EMBEDDINGS
def generate_bio_embeddings(sequence):
"""
Generate protein sequence embeddings using ESM2 model.
Maps the 320-dimensional embedding to 1024 dimensions.
"""
if model is None or tokenizer is None:
print("Model or tokenizer not loaded.")
return None
    if not sequence:
        print("Sequence is empty.")
        return None
try:
inputs = tokenizer(sequence, return_tensors="pt", add_special_tokens=True)
with torch.no_grad():
outputs = model(**inputs)
embeddings = outputs.last_hidden_state
mean_embedding = embeddings.mean(dim=1).squeeze()
transformed_embedding = transformer(mean_embedding)
transformed_embedding = transformed_embedding.detach().numpy()
return transformed_embedding.reshape(1, -1)
except Exception as e:
print(f"Embedding Error: {e}")
return None
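# Example (illustrative): for a valid amino-acid string the helper returns a
# (1, 1024) NumPy array, or None on failure:
#
#   emb = generate_bio_embeddings("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ")
#   print(None if emb is None else emb.shape)  # (1, 1024)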
# HELPER: GENERATE SMILES CONDITIONED ON THE PROTEIN EMBEDDING
def generate_smiles(sequence, n_samples=100):
start_time = time.time()
protein_embedding = generate_bio_embeddings(sequence)
if protein_embedding is None:
return None, "Embedding generation failed!"
    # Build the conditional RNN generator (CPU-only) and sample molecular graphs
    # conditioned on the protein embedding.
    cvn_model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
    samples = cvn_model.sample(n_samples, c=protein_embedding[0], output_type='graph')
valid_samples = [sample for sample in samples if sample is not None]
smiles_list = [
Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
]
if not smiles_list:
return None, "No valid SMILES generated!"
filename = os.path.join(UF, "SMILES_GENERATED.txt")
with open(filename, "w") as file:
file.write("\n".join(smiles_list))
elapsed_time = time.time() - start_time
return filename, elapsed_time
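# Example (illustrative): generate_smiles returns (file_path, elapsed_seconds) on
# success and (None, error_message) on failure:
#
#   path, info = generate_smiles("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ", n_samples=10)
#   if path is None:
#       print("Generation failed:", info)
#   else:
#       print(f"Wrote {path} in {info:.1f}s")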
app = Flask(__name__)
@app.route("/", methods=["GET", "POST"])
def index():
if request.method == "POST":
        sequence = request.form.get("sequence", "").strip()
if not sequence:
return render_template("index.html", message="Please enter a valid sequence.")
file_path, result = generate_smiles(sequence)
if file_path is None:
return render_template("index.html", message=f"Error: {result}")
return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
return render_template("index.html")
@app.route("/download")
def download_file():
file_path = os.path.join(UF, "SMILES_GENERATED.txt")
return send_file(file_path, as_attachment=True)
if __name__ == "__main__":
    # 0.0.0.0:7860 is the host/port Hugging Face Spaces expects.
    app.run(host="0.0.0.0", port=7860)