File size: 4,396 Bytes
2e996cb
 
 
 
 
 
 
 
b6545dd
 
 
f22f24a
 
b6545dd
 
f22f24a
 
2e996cb
 
a16c4b0
2e996cb
 
 
 
 
f22f24a
2e996cb
 
 
b6545dd
 
63a2973
2e996cb
f22f24a
2e996cb
b6545dd
f22f24a
b6545dd
a49f427
 
b6545dd
 
2e996cb
b6545dd
 
 
2e996cb
f22f24a
b6545dd
 
 
 
 
 
 
 
 
 
f22f24a
2e996cb
b6545dd
 
 
 
 
 
 
 
 
 
2e996cb
b6545dd
2e996cb
f22f24a
b6545dd
 
f22f24a
b6545dd
 
 
f22f24a
 
b6545dd
f22f24a
b6545dd
 
f22f24a
b6545dd
 
 
 
2e996cb
 
 
 
b6545dd
f22f24a
2e996cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a16c4b0
2e996cb
 
 
 
 
 
f22f24a
2e996cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a16c4b0
2e996cb
 
f22f24a
2e996cb
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os
import time
import requests
import numpy as np
from flask import Flask, render_template, request, send_file
from rdkit import Chem
from transformers import AutoModelForMaskedLM, AutoTokenizer
from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
from transformers import AutoModel, AutoTokenizer
import torch
import re
import torch.nn as nn



# DIRECTORIES
bio_model_dir = "/app/modelsBioembedSmall"  # local ESM2 weights + tokenizer files
cvn_model_dir = "/app/models_folder"  # CVanilla RNN builder (SMILES generator) weights
UPLOAD_FOLDER = "/app/Samples"  # sample uploads directory
UF="/tmp/"  # writable location for generated output files

# Ensure all working directories exist (no-op if already present).
os.makedirs(bio_model_dir, exist_ok=True)
os.makedirs(cvn_model_dir, exist_ok=True)
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# ENV VARIABLES
# Point temp/cache locations at container-writable paths.
# NOTE(review): these are set after the imports above; libraries that read
# e.g. TRANSFORMERS_CACHE at import time may not pick them up — confirm.
os.environ["TMPDIR"] = bio_model_dir
os.environ["TEMP"] = bio_model_dir
os.environ["TMP"] = bio_model_dir
os.environ['NUMBA_CACHE_DIR'] = '/app/numba_cache'
os.environ['TRANSFORMERS_CACHE'] = '/app/hf_cache'


# ESM2 MODEL AND TOKENIZER
# Load the protein language model from the local directory; on any failure,
# fall back to None so the app still starts (downstream code checks for None).
try:
    print("Loading ESM2 model...")
    model_name = "facebook/esm2_t6_8M_UR50D"  # Smaller model with 320-dim embedding

    # NOTE(review): `model_name` is never used — weights are read from
    # `bio_model_dir` instead; confirm that directory holds this checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(bio_model_dir)
    model = AutoModel.from_pretrained(bio_model_dir)
    model.eval()  # inference mode (disables dropout, etc.)
    print("ESM2 model loaded.")
except Exception as e:
    print(f"Error loading ESM2 model: {e}")
    model = None
    tokenizer = None

# Linear projection lifting the 320-dim ESM2 embedding into the 1024-dim
# conditioning space consumed by the SMILES generator.
class EmbeddingTransformer(nn.Module):
    """Single linear layer mapping ``input_dim`` vectors to ``output_dim``."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        # NOTE(review): weights are randomly initialised and never trained or
        # loaded from a checkpoint — confirm this is intentional.
        return self.linear(x)


transformer = EmbeddingTransformer(input_dim=320, output_dim=1024)

# UDF TO GENERATE EMBEDDINGS
def generate_bio_embeddings(sequence):
    """Embed a protein sequence with ESM2 and project it to 1024 dims.

    Returns a ``(1, 1024)`` numpy array, or ``None`` when the model is
    unavailable, the sequence is empty, or embedding fails.
    """
    if model is None or tokenizer is None:
        print("Model or tokenizer not loaded.")
        return None

    if not sequence:
        print("Sequence is empty after cleaning.")
        return None

    try:
        # Tokenize with special tokens so the model sees BOS/EOS markers.
        encoded = tokenizer(sequence, return_tensors="pt", add_special_tokens=True)

        # Inference only — no gradient tracking needed.
        with torch.no_grad():
            output = model(**encoded)

        # Mean-pool per-token hidden states into a single 320-dim vector.
        pooled = output.last_hidden_state.mean(dim=1).squeeze()

        # Project 320 -> 1024 and hand back a batched numpy row vector.
        projected = transformer(pooled).detach().numpy()
        return projected.reshape(1, -1)

    except Exception as e:
        print(f"Embedding Error: {e}")
        return None


# UDF FOR SMILES GENERATION
def generate_smiles(sequence, n_samples=100):
    """Generate candidate SMILES strings conditioned on a protein sequence.

    Parameters:
        sequence: protein sequence used to condition generation.
        n_samples: number of molecular graphs to sample.

    Returns:
        ``(filepath, elapsed_seconds)`` on success, or
        ``(None, error_message)`` on failure.
    """
    start_time = time.time()

    protein_embedding = generate_bio_embeddings(sequence)
    if protein_embedding is None:
        return None, "Embedding generation failed!"

    # Use a dedicated name so the module-level ESM2 `model` is not shadowed.
    builder = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
    samples = builder.sample(n_samples, c=protein_embedding[0], output_type='graph')
    valid_samples = [sample for sample in samples if sample is not None]

    # Convert sampled graphs to molecules, keeping only those that sanitize.
    smiles_list = [
        Chem.MolToSmiles(mol)
        for mol in get_mol_from_graph_list(valid_samples, sanitize=True)
        if mol is not None
    ]

    if not smiles_list:
        return None, "No valid SMILES generated!"

    filename = os.path.join(UF, "SMILES_GENERATED.txt")
    with open(filename, "w") as file:
        file.write("\n".join(smiles_list))

    elapsed_time = time.time() - start_time
    return filename, elapsed_time


app = Flask(__name__)

@app.route("/", methods=["GET", "POST"])
def index():
    """Render the home page; on POST, run SMILES generation for the sequence."""
    if request.method == "POST":
        # .get() avoids a 400 (BadRequestKeyError) when the form field is
        # missing; an absent field is treated the same as an empty one.
        sequence = request.form.get("sequence", "").strip()
        if not sequence:
            return render_template("index.html", message="Please enter a valid sequence.")

        file_path, result = generate_smiles(sequence)
        if file_path is None:
            # `result` carries the error message on failure.
            return render_template("index.html", message=f"Error: {result}")

        # `result` carries the elapsed time on success.
        return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)

    return render_template("index.html")

@app.route("/download")
def download_file():
    """Serve the most recently generated SMILES file as a download."""
    file_path = os.path.join(UF, "SMILES_GENERATED.txt")
    # Guard against a request made before any generation has succeeded;
    # previously send_file raised and the route returned a 500.
    if not os.path.exists(file_path):
        return "No generated SMILES file available.", 404
    return send_file(file_path, as_attachment=True)


if __name__ == "__main__":
    # Bind to all interfaces so the containerised app is reachable externally.
    app.run(host="0.0.0.0", port=7860)