Bhanushray committed on
Commit
f22f24a
·
verified ·
1 Parent(s): a16c4b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -1040
app.py CHANGED
@@ -6,15 +6,15 @@ from flask import Flask, render_template, request, send_file
6
  from rdkit import Chem
7
  from transformers import AutoModelForMaskedLM, AutoTokenizer
8
  from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
9
-
10
  from transformers import AutoModel, AutoTokenizer
11
  import torch
12
- import numpy as np
13
  import re
 
 
14
 
15
 
16
- # 🚀 Define Directories for Railway
17
- bio_model_dir = "/app/modelsBioembedSmall" # Persistent model storage
18
  cvn_model_dir = "/app/models_folder"
19
  UPLOAD_FOLDER = "/app/Samples"
20
  UF="/tmp/"
@@ -23,7 +23,7 @@ os.makedirs(bio_model_dir, exist_ok=True)
23
  os.makedirs(cvn_model_dir, exist_ok=True)
24
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
25
 
26
- # ✅ Environment Variables for Temp Directory
27
  os.environ["TMPDIR"] = bio_model_dir
28
  os.environ["TEMP"] = bio_model_dir
29
  os.environ["TMP"] = bio_model_dir
@@ -31,81 +31,10 @@ os.environ['NUMBA_CACHE_DIR'] = '/app/numba_cache'
31
  os.environ['TRANSFORMERS_CACHE'] = '/app/hf_cache'
32
 
33
 
34
- # 🔗 Dropbox Links for Model Files
35
- DROPBOX_LINKS = {
36
- "pytorch_model.bin": "https://www.dropbox.com/scl/fi/b41t8c6ji7j6uk5y2jj8g/pytorch_model.bin?rlkey=kuuwkid36ugml560c4a465ilr&st=t60bfemx&dl=1",
37
- "config.json": "https://www.dropbox.com/scl/fi/js6czj3kfc4a5kshfkzie/config.json?rlkey=5oysq4ecilnan5tviuqe86v93&st=75zpce8h&dl=1",
38
- "tokenizer_config.json": "https://www.dropbox.com/scl/fi/x11poym6mueoxod7xb6f1/tokenizer_config.json?rlkey=s51pik2rkmqp1fu99qj9qaria&st=z9kkcxp7&dl=1",
39
- "vocab.txt": "https://www.dropbox.com/scl/fi/v6e2gn10ck4lpx4iv9kpe/vocab.txt?rlkey=dcu29g5ns4wtqdv0pkks0ehx1&st=qt187rhq&dl=1",
40
- "special_tokens_map.json": "https://www.dropbox.com/scl/fi/t3lvmp5x28d1zjac3j7ec/special_tokens_map.json?rlkey=z2xbompa54iu4y9qgb5bvmfc9&st=zrxlpjdt&dl=1"
41
- }
42
-
43
- # # 📥 Function to Download Model Files
44
- # def download_model_files():
45
- # for filename, url in DROPBOX_LINKS.items():
46
- # file_path = os.path.join(bio_model_dir, filename)
47
- # if not os.path.exists(file_path): # Avoid re-downloading
48
- # print(f"Downloading {filename}...")
49
- # response = requests.get(url, stream=True)
50
- # if response.status_code == 200:
51
- # with open(file_path, "wb") as f:
52
- # for chunk in response.iter_content(chunk_size=1024):
53
- # f.write(chunk)
54
- # print(f"Downloaded: {filename}")
55
- # else:
56
- # print(f"Failed to download {filename}")
57
- def download_model_files():
58
- for filename, url in DROPBOX_LINKS.items():
59
- file_path = os.path.join(bio_model_dir, filename)
60
-
61
- print(f"Downloading {filename} (forcing overwrite)...")
62
- response = requests.get(url, stream=True)
63
- if response.status_code == 200:
64
- with open(file_path, "wb") as f:
65
- for chunk in response.iter_content(chunk_size=1024):
66
- f.write(chunk)
67
- print(f"Downloaded: {filename}")
68
- else:
69
- print(f"Failed to download {filename}")
70
-
71
- # # 📥 Download models before starting
72
- # download_model_files()
73
-
74
- # # ✅ Load ProtTrans-BERT-BFD Model
75
- # print("Loading ProtTrans-BERT-BFD model...")
76
- # model = AutoModelForMaskedLM.from_pretrained(bio_model_dir)
77
- # tokenizer = AutoTokenizer.from_pretrained(bio_model_dir)
78
- ##
79
- ### ✅ Load Bio-Embedding Model
80
- ##try:
81
- ## print("Loading ProtTrans-BERT-BFD model...")
82
- ## embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
83
- ##except Exception as e:
84
- ## print(f"Error loading ProtTrans-BERT-BFD model: {e}")
85
- ## embedder = None
86
- ##
87
- ### 🧬 Generate Bio-Embeddings
88
- ##def generate_bio_embeddings(sequence):
89
- ## if embedder is None:
90
- ## return None
91
- ## try:
92
- ## embedding_protein = embedder.embed(sequence)
93
- ## embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
94
- ## return np.array(embedding_per_protein).reshape(1, -1)
95
- ## except Exception as e:
96
- ## print(f"Embedding Error: {e}")
97
- ## return None
98
- import torch
99
- from transformers import AutoTokenizer, AutoModel
100
- import re
101
- import numpy as np
102
- import torch.nn as nn
103
-
104
- # Load ESM2 model and tokenizer
105
  try:
106
  print("Loading ESM2 model...")
107
- # Using a smaller model that is only ~200 MB
108
- model_name = "facebook/esm2_t6_8M_UR50D" # Smaller model with 320-dim embeddings
109
 
110
  tokenizer = AutoTokenizer.from_pretrained(bio_model_dir)
111
  model = AutoModel.from_pretrained(bio_model_dir)
@@ -116,7 +45,7 @@ except Exception as e:
116
  model = None
117
  tokenizer = None
118
 
119
- # Define a linear transformation to map 320D embeddings to 1024D
120
  class EmbeddingTransformer(nn.Module):
121
  def __init__(self, input_dim, output_dim):
122
  super(EmbeddingTransformer, self).__init__()
@@ -125,17 +54,9 @@ class EmbeddingTransformer(nn.Module):
125
  def forward(self, x):
126
  return self.linear(x)
127
 
128
- # Initialize the transformation layer
129
  transformer = EmbeddingTransformer(input_dim=320, output_dim=1024)
130
 
131
- # Function to clean protein sequence
132
- def clean_sequence(seq):
133
- """
134
- Clean the protein sequence by removing non-standard characters
135
- and converting to uppercase.
136
- """
137
- return re.sub(r'[^ACDEFGHIKLMNPQRSTVWY]', '', seq.upper())
138
- # Function to generate embeddings from a protein sequence
139
  def generate_bio_embeddings(sequence):
140
  """
141
  Generate protein sequence embeddings using ESM2 model.
@@ -145,30 +66,27 @@ def generate_bio_embeddings(sequence):
145
  print("Model or tokenizer not loaded.")
146
  return None
147
 
148
- #sequence = clean_sequence(sequence)
149
  if not sequence:
150
  print("Sequence is empty after cleaning.")
151
  return None
152
 
153
  try:
154
- # Tokenize the sequence for ESM2 model
155
  inputs = tokenizer(sequence, return_tensors="pt", add_special_tokens=True)
156
 
157
- # Pass the tokenized input through the ESM2 model to get embeddings
158
  with torch.no_grad():
159
  outputs = model(**inputs)
160
 
161
- # Extract the last hidden state (embeddings) and average across the sequence length
162
- embeddings = outputs.last_hidden_state # shape: (batch_size, seq_len, 320)
163
- mean_embedding = embeddings.mean(dim=1).squeeze() # shape: (320,)
164
 
165
- # Map the 320-dimensional embedding to a 1024-dimensional space using the transformer
166
  transformed_embedding = transformer(mean_embedding)
167
 
168
- # Detach the tensor from the computation graph and convert to numpy
169
  transformed_embedding = transformed_embedding.detach().numpy()
170
 
171
- # Return the transformed embedding as a 2D numpy array (1, 1024)
172
  return transformed_embedding.reshape(1, -1)
173
 
174
  except Exception as e:
@@ -176,7 +94,7 @@ def generate_bio_embeddings(sequence):
176
  return None
177
 
178
 
179
- # 🔬 Generate SMILES from Protein Sequence
180
  def generate_smiles(sequence, n_samples=100):
181
  start_time = time.time()
182
 
@@ -202,7 +120,7 @@ def generate_smiles(sequence, n_samples=100):
202
  elapsed_time = time.time() - start_time
203
  return filename, elapsed_time
204
 
205
- # 🌐 Flask Web App
206
  app = Flask(__name__)
207
 
208
  @app.route("/", methods=["GET", "POST"])
@@ -225,949 +143,9 @@ def download_file():
225
  file_path = os.path.join(UF, "SMILES_GENERATED.txt")
226
  return send_file(file_path, as_attachment=True)
227
 
228
- # 🚀 Run the Flask App on Railway
229
  if __name__ == "__main__":
230
  app.run(host="0.0.0.0", port=7860)
231
 
232
 
233
 
234
-
235
-
236
-
237
-
238
-
239
-
240
-
241
-
242
- # import os
243
- # import time
244
- # import requests
245
- # import numpy as np
246
- # import subprocess
247
- # from flask import Flask, render_template, request, send_file
248
- # from rdkit import Chem
249
- # from transformers import AutoModel
250
- # from bio_embeddings.embed import ProtTransBertBFDEmbedder
251
- # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
252
-
253
- # # DROPBOX LINKS FOR MODEL FILES
254
- # DROPBOX_LINKS = {
255
- # "pytorch_model.bin": "https://www.dropbox.com/scl/fi/b41t8c6ji7j6uk5y2jj8g/pytorch_model.bin?rlkey=kuuwkid36ugml560c4a465ilr&st=t60bfemx&dl=1",
256
- # "config.json": "https://www.dropbox.com/scl/fi/js6czj3kfc4a5kshfkzie/config.json?rlkey=5oysq4ecilnan5tviuqe86v93&st=75zpce8h&dl=1",
257
- # "tokenizer_config.json": "https://www.dropbox.com/scl/fi/x11poym6mueoxod7xb6f1/tokenizer_config.json?rlkey=s51pik2rkmqp1fu99qj9qaria&st=z9kkcxp7&dl=1",
258
- # "vocab.txt": "https://www.dropbox.com/scl/fi/v6e2gn10ck4lpx4iv9kpe/vocab.txt?rlkey=dcu29g5ns4wtqdv0pkks0ehx1&st=qt187rhq&dl=1",
259
- # "special_tokens_map.json": "https://www.dropbox.com/scl/fi/t3lvmp5x28d1zjac3j7ec/special_tokens_map.json?rlkey=z2xbompa54iu4y9qgb5bvmfc9&st=zrxlpjdt&dl=1"
260
- # }
261
-
262
- # # LOCAL DIRECTORIES
263
- # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed")
264
- # cvn_model_dir = os.path.join(os.getcwd(), "models_folder")
265
- # UPLOAD_FOLDER = "Samples"
266
-
267
- # os.makedirs(bio_model_dir, exist_ok=True)
268
- # os.makedirs(cvn_model_dir, exist_ok=True)
269
- # os.makedirs(UPLOAD_FOLDER, exist_ok=True)
270
-
271
- # os.environ["TMPDIR"] = bio_model_dir
272
- # os.environ["TEMP"] = bio_model_dir
273
- # os.environ["TMP"] = bio_model_dir
274
-
275
- # # FUNCTION TO DOWNLOAD FILES FROM DROPBOX
276
- # for file_name, url in DROPBOX_LINKS.items():
277
- # file_path = os.path.join(bio_model_dir, file_name)
278
- # if not os.path.exists(file_path):
279
- # print(f"Downloading {file_name} from Dropbox...")
280
- # subprocess.run(["wget", "-O", file_path, url], check=True)
281
- # print(f"{file_name} downloaded!")
282
-
283
- # # BIO-EMBEDDING MODEL LOADING
284
- # try:
285
- # embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
286
- # except Exception as e:
287
- # print(f"Error loading ProtTrans-BERT-BFD model: {e}")
288
- # embedder = None
289
-
290
- # def generate_bio_embeddings(sequence):
291
- # if embedder is None:
292
- # return None
293
- # try:
294
- # embedding_protein = embedder.embed(sequence)
295
- # embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
296
- # return np.array(embedding_per_protein).reshape(1, -1)
297
- # except Exception as e:
298
- # print(f"Embedding Error: {e}")
299
- # return None
300
-
301
- # def generate_smiles(sequence, n_samples=100):
302
- # start_time = time.time()
303
- # protein_embedding = generate_bio_embeddings(sequence)
304
- # if protein_embedding is None:
305
- # return None, "Embedding generation failed!"
306
-
307
- # model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
308
- # samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
309
- # valid_samples = [sample for sample in samples if sample is not None]
310
-
311
- # smiles_list = [
312
- # Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
313
- # ]
314
-
315
- # if not smiles_list:
316
- # return None, "No valid SMILES generated!"
317
-
318
- # filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
319
- # with open(filename, "w") as file:
320
- # file.write("\n".join(smiles_list))
321
-
322
- # elapsed_time = time.time() - start_time
323
- # return filename, elapsed_time
324
-
325
- # app = Flask(__name__)
326
-
327
- # @app.route("/", methods=["GET", "POST"])
328
- # def index():
329
- # if request.method == "POST":
330
- # sequence = request.form["sequence"].strip()
331
- # if not sequence:
332
- # return render_template("index.html", message="Please enter a valid sequence.")
333
-
334
- # file_path, result = generate_smiles(sequence)
335
- # if file_path is None:
336
- # return render_template("index.html", message=f"Error: {result}")
337
-
338
- # return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
339
-
340
- # return render_template("index.html")
341
-
342
- # @app.route("/download")
343
- # def download_file():
344
- # file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
345
- # return send_file(file_path, as_attachment=True)
346
-
347
- # if __name__ == "__main__":
348
- # app.run(host="0.0.0.0", port=8000, debug=True)
349
-
350
-
351
-
352
- # import os
353
- # import time
354
- # import numpy as np
355
- # from flask import Flask, render_template, request, send_file
356
- # from rdkit import Chem
357
- # from transformers import AutoModel
358
- # from bio_embeddings.embed import ProtTransBertBFDEmbedder
359
- # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
360
-
361
- # # # DIRECTORIES
362
- # # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed") # For bio-embeddings
363
- # # cvn_model_dir = os.path.join(os.getcwd(), "models_folder") # For CVanilla_RNN_Builder
364
- # #bio_model_dir = os.getenv("BIO_MODEL_DIR", "modelsBioembed")
365
- # bio_model_dir = "/app/modelsBioembed"
366
- # cvn_model_dir = os.getenv("CVN_MODEL_DIR", "models_folder")
367
-
368
-
369
- # os.makedirs(bio_model_dir, exist_ok=True)
370
- # os.makedirs(cvn_model_dir, exist_ok=True)
371
-
372
- # os.environ["TMPDIR"] = bio_model_dir
373
- # os.environ["TEMP"] = bio_model_dir
374
- # os.environ["TMP"] = bio_model_dir
375
-
376
- # UPLOAD_FOLDER = "Samples"
377
- # os.makedirs(UPLOAD_FOLDER, exist_ok=True)
378
-
379
- # app = Flask(__name__)
380
-
381
- # # model_path = os.path.join(bio_model_dir, "pytorch_model.bin")
382
- # # if not os.path.exists(model_path):
383
- # # print("Downloading ProtTrans-BERT-BFD model...")
384
- # # AutoModel.from_pretrained("Rostlab/prot_bert_bfd", low_cpu_mem_usage=True).save_pretrained(bio_model_dir)
385
-
386
-
387
- # # BIO-EMBEDDING MODEL LOADING
388
- # try:
389
- # print("Loading Model")
390
- # embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
391
- # except Exception as e:
392
- # print(f"Error loading ProtTrans-BERT-BFD model: {e}")
393
- # embedder = None
394
-
395
- # def generate_bio_embeddings(sequence):
396
- # """Generate bio-embeddings for a given protein sequence."""
397
- # if embedder is None:
398
- # return None
399
- # try:
400
- # embedding_protein = embedder.embed(sequence)
401
- # embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
402
- # return np.array(embedding_per_protein).reshape(1, -1) # Reshape for model compatibility
403
- # except Exception as e:
404
- # print(f"Embedding Error: {e}")
405
- # return None
406
-
407
- # def generate_smiles(sequence, n_samples=100):
408
- # """Generate SMILES from a protein sequence."""
409
- # start_time = time.time()
410
-
411
- # protein_embedding = generate_bio_embeddings(sequence)
412
- # if protein_embedding is None:
413
- # return None, "Embedding generation failed!"
414
-
415
- # # TRAINED CVanilla_RNN_Builder MODEL LOADING
416
- # model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
417
-
418
- # # MOLECULAR GRAPH GENERATION
419
- # samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
420
- # valid_samples = [sample for sample in samples if sample is not None]
421
-
422
- # # CONVERSION TO SMILES
423
- # smiles_list = [
424
- # Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
425
- # ]
426
-
427
- # if not smiles_list:
428
- # return None, "No valid SMILES generated!"
429
-
430
- # # SAVING TO FILE
431
- # filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
432
- # with open(filename, "w") as file:
433
- # file.write("\n".join(smiles_list))
434
-
435
- # elapsed_time = time.time() - start_time
436
- # return filename, elapsed_time
437
-
438
- # @app.route("/", methods=["GET", "POST"])
439
- # def index():
440
- # if request.method == "POST":
441
- # sequence = request.form["sequence"].strip()
442
- # if not sequence:
443
- # return render_template("index.html", message="Please enter a valid sequence.")
444
-
445
- # file_path, result = generate_smiles(sequence)
446
- # if file_path is None:
447
- # return render_template("index.html", message=f"Error: {result}")
448
-
449
- # return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
450
-
451
- # return render_template("index.html")
452
-
453
- # @app.route("/download")
454
- # def download_file():
455
- # file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
456
- # return send_file(file_path, as_attachment=True)
457
-
458
- # if __name__ == "__main__":
459
- # app.run(host="0.0.0.0", port=8000)
460
- #MAIN
461
-
462
-
463
-
464
-
465
- # import os
466
- # import time
467
- # import requests
468
- # import numpy as np
469
- # from flask import Flask, render_template, request, send_file
470
- # from rdkit import Chem
471
- # from transformers import AutoModel
472
- # from bio_embeddings.embed import ProtTransBertBFDEmbedder
473
- # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
474
-
475
- # # HUGGING FACE MODEL REPO (Replace with your actual Hugging Face username)
476
- # MODEL_BASE_URL = "https://huggingface.co/Bhanushray/protein-smiles-model/tree/main"
477
-
478
- # # REQUIRED MODEL FILES
479
- # MODEL_FILES = [
480
- # "pytorch_model.bin",
481
- # "config.json",
482
- # "tokenizer_config.json",
483
- # "vocab.txt",
484
- # "special_tokens_map.json"
485
- # ]
486
-
487
- # # DIRECTORIES
488
- # bio_model_dir = os.getenv("BIO_MODEL_DIR", "modelsBioembed")
489
- # cvn_model_dir = os.getenv("CVN_MODEL_DIR", "models_folder")
490
-
491
- # # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed") # For bio-embeddings
492
- # # cvn_model_dir = os.path.join(os.getcwd(), "models_folder") # For CVanilla_RNN_Builder
493
-
494
- # os.makedirs(bio_model_dir, exist_ok=True)
495
- # os.makedirs(cvn_model_dir, exist_ok=True)
496
-
497
- # os.environ["TMPDIR"] = bio_model_dir
498
- # os.environ["TEMP"] = bio_model_dir
499
- # os.environ["TMP"] = bio_model_dir
500
-
501
- # UPLOAD_FOLDER = "Samples"
502
- # os.makedirs(UPLOAD_FOLDER, exist_ok=True)
503
-
504
- # app = Flask(__name__)
505
-
506
- # # DOWNLOAD MODEL FILES IF MISSING
507
- # for file_name in MODEL_FILES:
508
- # file_path = os.path.join(bio_model_dir, file_name)
509
-
510
- # if not os.path.exists(file_path):
511
- # print(f"Downloading {file_name} ...")
512
- # response = requests.get(MODEL_BASE_URL + file_name, stream=True)
513
- # with open(file_path, "wb") as f:
514
- # for chunk in response.iter_content(chunk_size=1024):
515
- # f.write(chunk)
516
- # print(f"{file_name} downloaded!")
517
-
518
- # # BIO-EMBEDDING MODEL LOADING
519
- # try:
520
- # embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
521
- # except Exception as e:
522
- # print(f"Error loading ProtTrans-BERT-BFD model: {e}")
523
- # embedder = None
524
-
525
- # def generate_bio_embeddings(sequence):
526
- # """Generate bio-embeddings for a given protein sequence."""
527
- # if embedder is None:
528
- # return None
529
- # try:
530
- # embedding_protein = embedder.embed(sequence)
531
- # embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
532
- # return np.array(embedding_per_protein).reshape(1, -1) # Reshape for model compatibility
533
- # except Exception as e:
534
- # print(f"Embedding Error: {e}")
535
- # return None
536
-
537
- # def generate_smiles(sequence, n_samples=100):
538
- # """Generate SMILES from a protein sequence."""
539
- # start_time = time.time()
540
-
541
- # protein_embedding = generate_bio_embeddings(sequence)
542
- # if protein_embedding is None:
543
- # return None, "Embedding generation failed!"
544
-
545
- # # LOAD TRAINED CVanilla_RNN_Builder MODEL
546
- # model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
547
-
548
- # # MOLECULAR GRAPH GENERATION
549
- # samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
550
- # valid_samples = [sample for sample in samples if sample is not None]
551
-
552
- # # CONVERT TO SMILES
553
- # smiles_list = [
554
- # Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
555
- # ]
556
-
557
- # if not smiles_list:
558
- # return None, "No valid SMILES generated!"
559
-
560
- # # SAVE TO FILE
561
- # filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
562
- # with open(filename, "w") as file:
563
- # file.write("\n".join(smiles_list))
564
-
565
- # elapsed_time = time.time() - start_time
566
- # return filename, elapsed_time
567
-
568
- # @app.route("/", methods=["GET", "POST"])
569
- # def index():
570
- # if request.method == "POST":
571
- # sequence = request.form["sequence"].strip()
572
- # if not sequence:
573
- # return render_template("index.html", message="Please enter a valid sequence.")
574
-
575
- # file_path, result = generate_smiles(sequence)
576
- # if file_path is None:
577
- # return render_template("index.html", message=f"Error: {result}")
578
-
579
- # return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
580
-
581
- # return render_template("index.html")
582
-
583
- # @app.route("/download")
584
- # def download_file():
585
- # file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
586
- # return send_file(file_path, as_attachment=True)
587
-
588
- # if __name__ == "__main__":
589
- # app.run(host="0.0.0.0", port=8000, debug=True)
590
-
591
-
592
- # import os
593
- # import time
594
- # import numpy as np
595
- # from flask import Flask, render_template, request, send_file
596
- # from rdkit import Chem
597
- # from transformers import AutoModel
598
- # from bio_embeddings.embed import ProtTransBertBFDEmbedder
599
- # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
600
-
601
- # # DIRECTORIES
602
- # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed") # For bio-embeddings
603
- # cvn_model_dir = os.path.join(os.getcwd(), "models_folder") # For CVanilla_RNN_Builder
604
-
605
- # os.makedirs(bio_model_dir, exist_ok=True)
606
- # os.makedirs(cvn_model_dir, exist_ok=True)
607
-
608
- # os.environ["TMPDIR"] = bio_model_dir
609
- # os.environ["TEMP"] = bio_model_dir
610
- # os.environ["TMP"] = bio_model_dir
611
-
612
- # UPLOAD_FOLDER = "Samples"
613
- # os.makedirs(UPLOAD_FOLDER, exist_ok=True)
614
-
615
- # app = Flask(__name__)
616
-
617
- # model_path = os.path.join(bio_model_dir, "pytorch_model.bin")
618
- # if not os.path.exists(model_path):
619
- # print("Downloading ProtTrans-BERT-BFD model...")
620
- # AutoModel.from_pretrained("Rostlab/prot_bert_bfd", low_cpu_mem_usage=True).save_pretrained(bio_model_dir)
621
-
622
-
623
- # # BIO-EMBEDDING MODEL LOADING
624
- # try:
625
- # embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
626
- # except Exception as e:
627
- # print(f"Error loading ProtTrans-BERT-BFD model: {e}")
628
- # embedder = None
629
-
630
- # def generate_bio_embeddings(sequence):
631
- # """Generate bio-embeddings for a given protein sequence."""
632
- # if embedder is None:
633
- # return None
634
- # try:
635
- # embedding_protein = embedder.embed(sequence)
636
- # embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
637
- # return np.array(embedding_per_protein).reshape(1, -1) # Reshape for model compatibility
638
- # except Exception as e:
639
- # print(f"Embedding Error: {e}")
640
- # return None
641
-
642
- # def generate_smiles(sequence, n_samples=100):
643
- # """Generate SMILES from a protein sequence."""
644
- # start_time = time.time()
645
-
646
- # protein_embedding = generate_bio_embeddings(sequence)
647
- # if protein_embedding is None:
648
- # return None, "Embedding generation failed!"
649
-
650
- # # TRAINED CVanilla_RNN_Builder MODEL LOADING
651
- # model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
652
-
653
- # # MOLECULAR GRAPH GENERATION
654
- # samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
655
- # valid_samples = [sample for sample in samples if sample is not None]
656
-
657
- # # CONVERSION TO SMILES
658
- # smiles_list = [
659
- # Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
660
- # ]
661
-
662
- # if not smiles_list:
663
- # return None, "No valid SMILES generated!"
664
-
665
- # # SAVING TO FILE
666
- # filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
667
- # with open(filename, "w") as file:
668
- # file.write("\n".join(smiles_list))
669
-
670
- # elapsed_time = time.time() - start_time
671
- # return filename, elapsed_time
672
-
673
- # @app.route("/", methods=["GET", "POST"])
674
- # def index():
675
- # if request.method == "POST":
676
- # sequence = request.form["sequence"].strip()
677
- # if not sequence:
678
- # return render_template("index.html", message="Please enter a valid sequence.")
679
-
680
- # file_path, result = generate_smiles(sequence)
681
- # if file_path is None:
682
- # return render_template("index.html", message=f"Error: {result}")
683
-
684
- # return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
685
-
686
- # return render_template("index.html")
687
-
688
- # @app.route("/download")
689
- # def download_file():
690
- # file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
691
- # return send_file(file_path, as_attachment=True)
692
-
693
- # if __name__ == "__main__":
694
- # app.run(host="0.0.0.0", port=8000,debug=True)
695
-
696
-
697
-
698
-
699
-
700
-
701
-
702
-
703
-
704
-
705
-
706
-
707
-
708
- # import os
709
- # import time
710
- # import numpy as np
711
- # from flask import Flask, render_template, request, send_file
712
- # from rdkit import Chem
713
- # from transformers import AutoModel
714
- # from bio_embeddings.embed import ProtTransBertBFDEmbedder
715
- # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
716
- # from huggingface_hub import hf_hub_download # Import for direct file download
717
-
718
- # # Define directories for different models
719
- # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed") # For bio-embeddings
720
- # cvn_model_dir = os.path.join(os.getcwd(), "models_folder") # For CVanilla_RNN_Builder
721
-
722
- # # Ensure directories exist
723
- # os.makedirs(bio_model_dir, exist_ok=True)
724
- # os.makedirs(cvn_model_dir, exist_ok=True)
725
-
726
- # UPLOAD_FOLDER = "Samples"
727
- # os.makedirs(UPLOAD_FOLDER, exist_ok=True)
728
-
729
- # app = Flask(__name__)
730
-
731
- # # Download only the required pytorch_model.bin file
732
- # model_filename = "pytorch_model.bin"
733
- # model_path = os.path.join(bio_model_dir, model_filename)
734
- # if not os.path.exists(model_path):
735
- # print("Downloading pytorch_model.bin from Hugging Face...")
736
- # hf_hub_download(repo_id="Rostlab/prot_bert_bfd", filename=model_filename, local_dir=bio_model_dir)
737
-
738
- # # Load bio-embedding model once
739
- # embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
740
-
741
- # def generate_bio_embeddings(sequence):
742
- # """Generate bio-embeddings for a given protein sequence."""
743
- # try:
744
- # embedding_protein = embedder.embed(sequence)
745
- # embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
746
- # return np.array(embedding_per_protein).reshape(1, -1)
747
- # except Exception as e:
748
- # print(f"Embedding Error: {e}")
749
- # return None
750
-
751
- # def generate_smiles(sequence, n_samples=100):
752
- # """Generate SMILES from a protein sequence."""
753
- # start_time = time.time()
754
-
755
- # protein_embedding = generate_bio_embeddings(sequence)
756
- # if protein_embedding is None:
757
- # return None, "Embedding generation failed!"
758
-
759
- # model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
760
- # samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
761
- # valid_samples = [sample for sample in samples if sample is not None]
762
-
763
- # smiles_list = [
764
- # Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
765
- # ]
766
-
767
- # if not smiles_list:
768
- # return None, "No valid SMILES generated!"
769
-
770
- # filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
771
- # with open(filename, "w") as file:
772
- # file.write("\n".join(smiles_list))
773
-
774
- # elapsed_time = time.time() - start_time
775
- # return filename, elapsed_time
776
-
777
- # @app.route("/", methods=["GET", "POST"])
778
- # def index():
779
- # if request.method == "POST":
780
- # sequence = request.form["sequence"].strip()
781
- # if not sequence:
782
- # return render_template("index.html", message="Please enter a valid sequence.")
783
-
784
- # file_path, result = generate_smiles(sequence)
785
- # if file_path is None:
786
- # return render_template("index.html", message=f"Error: {result}")
787
-
788
- # return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
789
-
790
- # return render_template("index.html")
791
-
792
- # @app.route("/download")
793
- # def download_file():
794
- # file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
795
- # return send_file(file_path, as_attachment=True)
796
-
797
- # if __name__ == "__main__":
798
- # app.run(host="0.0.0.0", port=8000, debug=True)
799
-
800
-
801
-
802
-
803
-
804
- # import os
805
- # import time
806
- # import requests
807
- # import numpy as np
808
- # import gdown # NEW: For Google Drive downloads
809
- # from flask import Flask, render_template, request, send_file
810
- # from rdkit import Chem
811
- # from transformers import AutoModel
812
- # from bio_embeddings.embed import ProtTransBertBFDEmbedder
813
- # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
814
-
815
- # # REPLACE WITH YOUR GOOGLE DRIVE FILE IDs
816
- # GDRIVE_FILE_IDS = {
817
- # "pytorch_model.bin": "11g7bAXYNxlPsnwC8_qsUIZITAjG85JXb", # Replace with actual ID
818
- # "config.json": "1ZfuhTnEuKAI1Z92m1QnDTOEQYNe9y24E",
819
- # "tokenizer_config.json": "1r4ncUsWBNQZVKp4zw97DLTf0AgRUiuFc",
820
- # "vocab.txt": "1G1UQIGMHvCC3OokCG1tl-cTxjIVqw04w",
821
- # "special_tokens_map.json": "1pINnV2P1eBmaC7X0A52UhjrmlJgzxqbl"
822
- # }
823
-
824
- # # LOCAL DIRECTORIES
825
- # bio_model_dir = os.path.join(os.getcwd(), "modelsBioembed") # For bio-embeddings
826
- # cvn_model_dir = os.path.join(os.getcwd(), "models_folder") # For CVanilla_RNN_Builder
827
-
828
- # os.makedirs(bio_model_dir, exist_ok=True)
829
- # os.makedirs(cvn_model_dir, exist_ok=True)
830
-
831
- # os.environ["TMPDIR"] = bio_model_dir
832
- # os.environ["TEMP"] = bio_model_dir
833
- # os.environ["TMP"] = bio_model_dir
834
-
835
- # UPLOAD_FOLDER = "Samples"
836
- # os.makedirs(UPLOAD_FOLDER, exist_ok=True)
837
-
838
- # app = Flask(__name__)
839
-
840
- # # DOWNLOAD MODEL FILES IF MISSING
841
- # for file_name, file_id in GDRIVE_FILE_IDS.items():
842
- # file_path = os.path.join(bio_model_dir, file_name)
843
-
844
- # if not os.path.exists(file_path):
845
- # print(f"Downloading {file_name} from Google Drive...")
846
- # gdown.download(f"https://drive.google.com/uc?id={file_id}", file_path, quiet=False)
847
- # print(f"{file_name} downloaded!")
848
-
849
- # # BIO-EMBEDDING MODEL LOADING
850
- # try:
851
- # embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
852
- # except Exception as e:
853
- # print(f"Error loading ProtTrans-BERT-BFD model: {e}")
854
- # embedder = None
855
-
856
- # def generate_bio_embeddings(sequence):
857
- # """Generate bio-embeddings for a given protein sequence."""
858
- # if embedder is None:
859
- # return None
860
- # try:
861
- # embedding_protein = embedder.embed(sequence)
862
- # embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
863
- # return np.array(embedding_per_protein).reshape(1, -1) # Reshape for model compatibility
864
- # except Exception as e:
865
- # print(f"Embedding Error: {e}")
866
- # return None
867
-
868
- # def generate_smiles(sequence, n_samples=100):
869
- # """Generate SMILES from a protein sequence."""
870
- # start_time = time.time()
871
-
872
- # protein_embedding = generate_bio_embeddings(sequence)
873
- # if protein_embedding is None:
874
- # return None, "Embedding generation failed!"
875
-
876
- # # LOAD TRAINED CVanilla_RNN_Builder MODEL
877
- # model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
878
-
879
- # # MOLECULAR GRAPH GENERATION
880
- # samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
881
- # valid_samples = [sample for sample in samples if sample is not None]
882
-
883
- # # CONVERT TO SMILES
884
- # smiles_list = [
885
- # Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
886
- # ]
887
-
888
- # if not smiles_list:
889
- # return None, "No valid SMILES generated!"
890
-
891
- # # SAVE TO FILE
892
- # filename = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
893
- # with open(filename, "w") as file:
894
- # file.write("\n".join(smiles_list))
895
-
896
- # elapsed_time = time.time() - start_time
897
- # return filename, elapsed_time
898
-
899
- # @app.route("/", methods=["GET", "POST"])
900
- # def index():
901
- # if request.method == "POST":
902
- # sequence = request.form["sequence"].strip()
903
- # if not sequence:
904
- # return render_template("index.html", message="Please enter a valid sequence.")
905
-
906
- # file_path, result = generate_smiles(sequence)
907
- # if file_path is None:
908
- # return render_template("index.html", message=f"Error: {result}")
909
-
910
- # return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
911
-
912
- # return render_template("index.html")
913
-
914
- # @app.route("/download")
915
- # def download_file():
916
- # file_path = os.path.join(UPLOAD_FOLDER, "SMILES_GENERATED.txt")
917
- # return send_file(file_path, as_attachment=True)
918
-
919
- # if __name__ == "__main__":
920
- # app.run(host="0.0.0.0", port=8000, debug=True)
921
-
922
-
923
-
924
- # import os
925
- # import time
926
- # import gdown
927
- # import numpy as np
928
- # from flask import Flask, render_template, request, send_file
929
- # from rdkit import Chem
930
- # from bio_embeddings.embed import ProtTransBertBFDEmbedder
931
- # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
932
-
933
- # # DIRECTORIES
934
- # bio_model_dir = "/app/modelsBioembed"
935
- # cvn_model_dir = os.getenv("CVN_MODEL_DIR", "models_folder")
936
- # upload_folder = "Samples"
937
-
938
- # # Create directories if they don't exist
939
- # os.makedirs(bio_model_dir, exist_ok=True)
940
- # os.makedirs(cvn_model_dir, exist_ok=True)
941
- # os.makedirs(upload_folder, exist_ok=True)
942
-
943
- # # Google Drive file IDs for the model files
944
- # MODEL_FILES = {
945
- # "pytorch_model.bin": "1Z9XWk-kP5yrBRdBF_mQPQsM8drqQXafJ",
946
- # "config.json": "1adE428T5ZWeosoLsBeX7sVnn6m4VvVgL",
947
- # "tokenizer_config.json": "1USvLAZ3dM4TzVSRLjINk2_W989k1HDQ0",
948
- # "vocab.txt": "1tsdesfbr61UyLShV0ojvsXOp6VJ9Exrt",
949
- # "special_tokens_map.json": "1ChCwdz0NH8ODasqscGwCS9mY7urhQte2",
950
- # }
951
-
952
- # # Function to download missing files from Google Drive
953
- # def download_model_files():
954
- # for filename, file_id in MODEL_FILES.items():
955
- # file_path = os.path.join(bio_model_dir, filename)
956
- # if not os.path.exists(file_path):
957
- # print(f"Downloading {filename} from Google Drive...")
958
- # gdown.download(f"https://drive.google.com/uc?id={file_id}", file_path, quiet=False)
959
-
960
- # # Download required model files
961
- # download_model_files()
962
- # print("All model files are ready!")
963
-
964
- # # Load the ProtTrans-BERT-BFD Model
965
- # try:
966
- # embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
967
- # print("ProtTrans-BERT-BFD model loaded successfully!")
968
- # except Exception as e:
969
- # print(f"Error loading model: {e}")
970
- # embedder = None
971
-
972
- # # Function to generate protein embeddings
973
- # def generate_bio_embeddings(sequence):
974
- # if embedder is None:
975
- # return None
976
- # try:
977
- # embedding_protein = embedder.embed(sequence)
978
- # embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
979
- # return np.array(embedding_per_protein).reshape(1, -1)
980
- # except Exception as e:
981
- # print(f"Embedding Error: {e}")
982
- # return None
983
-
984
- # # Function to generate SMILES from a protein sequence
985
- # def generate_smiles(sequence, n_samples=100):
986
- # start_time = time.time()
987
-
988
- # protein_embedding = generate_bio_embeddings(sequence)
989
- # if protein_embedding is None:
990
- # return None, "Embedding generation failed!"
991
-
992
- # # Load the trained CVanilla_RNN_Builder model
993
- # model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
994
-
995
- # # Generate molecular graphs
996
- # samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
997
- # valid_samples = [sample for sample in samples if sample is not None]
998
-
999
- # # Convert to SMILES format
1000
- # smiles_list = [
1001
- # Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
1002
- # ]
1003
-
1004
- # if not smiles_list:
1005
- # return None, "No valid SMILES generated!"
1006
-
1007
- # # Save SMILES to a file
1008
- # filename = os.path.join(upload_folder, "SMILES_GENERATED.txt")
1009
- # with open(filename, "w") as file:
1010
- # file.write("\n".join(smiles_list))
1011
-
1012
- # elapsed_time = time.time() - start_time
1013
- # return filename, elapsed_time
1014
-
1015
- # # Initialize Flask App
1016
- # app = Flask(__name__)
1017
-
1018
- # @app.route("/", methods=["GET", "POST"])
1019
- # def index():
1020
- # if request.method == "POST":
1021
- # sequence = request.form["sequence"].strip()
1022
- # if not sequence:
1023
- # return render_template("index.html", message="Please enter a valid sequence.")
1024
-
1025
- # file_path, result = generate_smiles(sequence)
1026
- # if file_path is None:
1027
- # return render_template("index.html", message=f"Error: {result}")
1028
-
1029
- # return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
1030
-
1031
- # return render_template("index.html")
1032
-
1033
- # @app.route("/download")
1034
- # def download_file():
1035
- # file_path = os.path.join(upload_folder, "SMILES_GENERATED.txt")
1036
- # return send_file(file_path, as_attachment=True)
1037
-
1038
- # if __name__ == "__main__":
1039
- # app.run(host="0.0.0.0", port=8000)
1040
-
1041
-
1042
-
1043
- # import os
1044
- # import time
1045
- # import requests
1046
- # from flask import Flask, render_template, request, send_file
1047
- # from rdkit import Chem
1048
- # from bio_embeddings.embed import ProtTransBertBFDEmbedder
1049
- # from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
1050
-
1051
- # # DIRECTORIES
1052
- # bio_model_dir = "/app/modelsBioembed"
1053
- # cvn_model_dir = os.getenv("CVN_MODEL_DIR", "models_folder")
1054
- # upload_folder = "Samples"
1055
-
1056
- # # Create directories if they don't exist
1057
- # os.makedirs(bio_model_dir, exist_ok=True)
1058
- # os.makedirs(cvn_model_dir, exist_ok=True)
1059
- # os.makedirs(upload_folder, exist_ok=True)
1060
-
1061
- # # Google Drive file IDs for the model files
1062
- # MODEL_FILES = {
1063
- # "pytorch_model.bin": "1Z9XWk-kP5yrBRdBF_mQPQsM8drqQXafJ",
1064
- # "config.json": "1adE428T5ZWeosoLsBeX7sVnn6m4VvVgL",
1065
- # "tokenizer_config.json": "1USvLAZ3dM4TzVSRLjINk2_W989k1HDQ0",
1066
- # "vocab.txt": "1tsdesfbr61UyLShV0ojvsXOp6VJ9Exrt",
1067
- # "special_tokens_map.json": "1ChCwdz0NH8ODasqscGwCS9mY7urhQte2",
1068
- # }
1069
-
1070
- # # Function to download a file from Google Drive
1071
- # def download_file_from_google_drive(file_id, destination):
1072
- # URL = f"https://drive.google.com/uc?export=download&id={file_id}"
1073
- # session = requests.Session()
1074
- # response = session.get(URL, stream=True)
1075
-
1076
- # # Check if the request was successful
1077
- # if response.status_code == 200:
1078
- # with open(destination, "wb") as f:
1079
- # for chunk in response.iter_content(chunk_size=128):
1080
- # f.write(chunk)
1081
- # print(f"Downloaded {destination}")
1082
- # else:
1083
- # print(f"Failed to download {destination}")
1084
-
1085
- # # Function to download missing files from Google Drive
1086
- # def download_model_files():
1087
- # for filename, file_id in MODEL_FILES.items():
1088
- # file_path = os.path.join(bio_model_dir, filename)
1089
- # if not os.path.exists(file_path):
1090
- # print(f"Downloading {filename} from Google Drive...")
1091
- # download_file_from_google_drive(file_id, file_path)
1092
-
1093
- # # Download required model files
1094
- # download_model_files()
1095
- # print("All model files are ready!")
1096
-
1097
- # # Load the ProtTrans-BERT-BFD Model
1098
- # try:
1099
- # embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
1100
- # print("ProtTrans-BERT-BFD model loaded successfully!")
1101
- # except Exception as e:
1102
- # print(f"Error loading model: {e}")
1103
- # embedder = None
1104
-
1105
- # # Function to generate protein embeddings
1106
- # def generate_bio_embeddings(sequence):
1107
- # if embedder is None:
1108
- # return None
1109
- # try:
1110
- # embedding_protein = embedder.embed(sequence)
1111
- # embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
1112
- # return np.array(embedding_per_protein).reshape(1, -1)
1113
- # except Exception as e:
1114
- # print(f"Embedding Error: {e}")
1115
- # return None
1116
-
1117
- # # Function to generate SMILES from a protein sequence
1118
- # def generate_smiles(sequence, n_samples=100):
1119
- # start_time = time.time()
1120
-
1121
- # protein_embedding = generate_bio_embeddings(sequence)
1122
- # if protein_embedding is None:
1123
- # return None, "Embedding generation failed!"
1124
-
1125
- # # Load the trained CVanilla_RNN_Builder model
1126
- # model = CVanilla_RNN_Builder(cvn_model_dir, gpu_id=None)
1127
-
1128
- # # Generate molecular graphs
1129
- # samples = model.sample(n_samples, c=protein_embedding[0], output_type='graph')
1130
- # valid_samples = [sample for sample in samples if sample is not None]
1131
-
1132
- # # Convert to SMILES format
1133
- # smiles_list = [
1134
- # Chem.MolToSmiles(mol) for mol in get_mol_from_graph_list(valid_samples, sanitize=True) if mol is not None
1135
- # ]
1136
-
1137
- # if not smiles_list:
1138
- # return None, "No valid SMILES generated!"
1139
-
1140
- # # Save SMILES to a file
1141
- # filename = os.path.join(upload_folder, "SMILES_GENERATED.txt")
1142
- # with open(filename, "w") as file:
1143
- # file.write("\n".join(smiles_list))
1144
-
1145
- # elapsed_time = time.time() - start_time
1146
- # return filename, elapsed_time
1147
-
1148
- # # Initialize Flask App
1149
- # app = Flask(__name__)
1150
-
1151
- # @app.route("/", methods=["GET", "POST"])
1152
- # def index():
1153
- # if request.method == "POST":
1154
- # sequence = request.form["sequence"].strip()
1155
- # if not sequence:
1156
- # return render_template("index.html", message="Please enter a valid sequence.")
1157
-
1158
- # file_path, result = generate_smiles(sequence)
1159
- # if file_path is None:
1160
- # return render_template("index.html", message=f"Error: {result}")
1161
-
1162
- # return render_template("index.html", message="SMILES generated successfully!", file_path=file_path, time_taken=result)
1163
-
1164
- # return render_template("index.html")
1165
-
1166
- # @app.route("/download")
1167
- # def download_file():
1168
- # file_path = os.path.join(upload_folder, "SMILES_GENERATED.txt")
1169
- # return send_file(file_path, as_attachment=True)
1170
-
1171
- # if __name__ == "__main__":
1172
- # app.run(host="0.0.0.0", port=8000)
1173
-
 
6
  from rdkit import Chem
7
  from transformers import AutoModelForMaskedLM, AutoTokenizer
8
  from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
 
9
  from transformers import AutoModel, AutoTokenizer
10
  import torch
 
11
  import re
12
+ import torch.nn as nn
13
+
14
 
15
 
16
+ # DIRECTORIES
17
+ bio_model_dir = "/app/modelsBioembedSmall"
18
  cvn_model_dir = "/app/models_folder"
19
  UPLOAD_FOLDER = "/app/Samples"
20
  UF="/tmp/"
 
23
  os.makedirs(cvn_model_dir, exist_ok=True)
24
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
25
 
26
+ # ENV VARIABLES
27
  os.environ["TMPDIR"] = bio_model_dir
28
  os.environ["TEMP"] = bio_model_dir
29
  os.environ["TMP"] = bio_model_dir
 
31
  os.environ['TRANSFORMERS_CACHE'] = '/app/hf_cache'
32
 
33
 
34
+ # ESM2 MODEL AND TOKENIZER
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  try:
36
  print("Loading ESM2 model...")
37
+ model_name = "facebook/esm2_t6_8M_UR50D" # Smaller model with 320-dim embedding
 
38
 
39
  tokenizer = AutoTokenizer.from_pretrained(bio_model_dir)
40
  model = AutoModel.from_pretrained(bio_model_dir)
 
45
  model = None
46
  tokenizer = None
47
 
48
+ # linear transformation to map 320D embeddings to 1024D
49
  class EmbeddingTransformer(nn.Module):
50
  def __init__(self, input_dim, output_dim):
51
  super(EmbeddingTransformer, self).__init__()
 
54
  def forward(self, x):
55
  return self.linear(x)
56
 
 
57
  transformer = EmbeddingTransformer(input_dim=320, output_dim=1024)
58
 
59
+ # UDF TO GENERATE EMBEDDINGS
 
 
 
 
 
 
 
60
  def generate_bio_embeddings(sequence):
61
  """
62
  Generate protein sequence embeddings using ESM2 model.
 
66
  print("Model or tokenizer not loaded.")
67
  return None
68
 
 
69
  if not sequence:
70
  print("Sequence is empty after cleaning.")
71
  return None
72
 
73
  try:
74
+
75
  inputs = tokenizer(sequence, return_tensors="pt", add_special_tokens=True)
76
 
77
+
78
  with torch.no_grad():
79
  outputs = model(**inputs)
80
 
81
+ embeddings = outputs.last_hidden_state
82
+ mean_embedding = embeddings.mean(dim=1).squeeze()
 
83
 
84
+
85
  transformed_embedding = transformer(mean_embedding)
86
 
87
+
88
  transformed_embedding = transformed_embedding.detach().numpy()
89
 
 
90
  return transformed_embedding.reshape(1, -1)
91
 
92
  except Exception as e:
 
94
  return None
95
 
96
 
97
+ # UDF FOR SMILES GENERATION
98
  def generate_smiles(sequence, n_samples=100):
99
  start_time = time.time()
100
 
 
120
  elapsed_time = time.time() - start_time
121
  return filename, elapsed_time
122
 
123
+
124
  app = Flask(__name__)
125
 
126
  @app.route("/", methods=["GET", "POST"])
 
143
  file_path = os.path.join(UF, "SMILES_GENERATED.txt")
144
  return send_file(file_path, as_attachment=True)
145
 
146
+
147
  if __name__ == "__main__":
148
  app.run(host="0.0.0.0", port=7860)
149
 
150
 
151