Bhanushray committed · Commit b6545dd · verified · 1 Parent(s): dbf6fec

Upload 9 files

Dockerfile CHANGED
@@ -1,91 +1,29 @@
 FROM python:3.8
 
-# Install system dependencies
+# Install required system dependencies
 RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*
 
-# Set environment variables
-ENV HF_HOME=/app/cache/huggingface
-ENV NUMBA_DISABLE_CACHING=1
-
-# Set working directory
+# Set the working directory inside the container
 WORKDIR /app
 
-# Pre-create writable cache directories
-# REMOVED /app/cache/numba from here
-RUN mkdir -p /app/cache/huggingface /app/modelsBioembed /app/models_folder /app/Samples
-
-# Copy requirements and install
+# Copy the requirements file into the container
 COPY requirements.txt .
+
+# Install dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy app code
+# Create necessary directories (but don't download models here!)
+RUN mkdir -p /app/modelsBioembedSmall /app/models_folder /app/Samples /app/numba_cache /app/hf_cache
+
+# Copy the entire project to the container
 COPY . .
 
-# Expose app port
+# Expose the port for Flask
 EXPOSE 7860
 
-# Run the app
+# Run the app with Gunicorn
 CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
 
-# FROM python:3.8
-
-# # Install required system dependencies
-# RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*
-
-# # Set environment variables for cache
-# ENV TRANSFORMERS_CACHE=/app/cache/huggingface
-# ENV NUMBA_CACHE_DIR=/app/cache/numba
-
-# # Set the working directory inside the container
-# WORKDIR /app
-
-# # Create necessary directories
-# RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples \
-#     && mkdir -p $TRANSFORMERS_CACHE $NUMBA_CACHE_DIR
-
-# # Copy the requirements file into the container
-# COPY requirements.txt .
-
-# # Install dependencies
-# RUN pip install --no-cache-dir -r requirements.txt
-
-# # Copy the entire project to the container
-# COPY . .
-
-# # Expose the port for Flask
-# EXPOSE 7860
-
-# # Run the app with Gunicorn
-# CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
-
-
-
-# FROM python:3.8
-
-# # Install required system dependencies
-# RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*
-
-# # Set the working directory inside the container
-# WORKDIR /app
-
-# # Copy the requirements file into the container
-# COPY requirements.txt .
-
-# # Install dependencies
-# RUN pip install --no-cache-dir -r requirements.txt
-
-# # Create necessary directories (but don't download models here!)
-# RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples
-
-# # Copy the entire project to the container
-# COPY . .
-
-# # Expose the port for Flask
-# EXPOSE 7860
-
-# # Run the app with Gunicorn
-# CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
-
 
 # # Use Python 3.8 as the base image
 # FROM python:3.8
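Note on the Dockerfile change: the ENV-based cache configuration (HF_HOME, NUMBA_DISABLE_CACHING) is gone; instead the image pre-creates /app/numba_cache and /app/hf_cache, and app.py now points the libraries at them via os.environ. A minimal sketch of the ordering this relies on, assuming (as the commit appears to) that the default ~/.cache location is not writable in the deployed container:

import os

# Select the pre-created cache directories BEFORE importing transformers
# (and anything numba-backed): the libraries pick these paths up when they
# initialize, so setting them later can be too late.
os.environ["NUMBA_CACHE_DIR"] = "/app/numba_cache"    # created by RUN mkdir above
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"    # newer transformers releases prefer HF_HOME

import transformers  # safe: the cache location is already fixed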
app.py CHANGED
@@ -8,8 +8,14 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
 from bio_embeddings.embed import ProtTransBertBFDEmbedder
 from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
 
+from transformers import AutoModel, AutoTokenizer
+import torch
+import numpy as np
+import re
+
+
 # 🚀 Define Directories for Railway
-bio_model_dir = "/app/modelsBioembed"  # Persistent model storage
+bio_model_dir = "/app/modelsBioembedSmall"  # Persistent model storage
 cvn_model_dir = "/app/models_folder"
 UPLOAD_FOLDER = "/app/Samples"
 
@@ -21,6 +27,8 @@ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.environ["TMPDIR"] = bio_model_dir
 os.environ["TEMP"] = bio_model_dir
 os.environ["TMP"] = bio_model_dir
+os.environ['NUMBA_CACHE_DIR'] = '/app/numba_cache'
+os.environ['TRANSFORMERS_CACHE'] = '/app/hf_cache'
 
 
 # 🔗 Dropbox Links for Model Files
@@ -32,62 +40,142 @@ DROPBOX_LINKS = {
     "special_tokens_map.json": "https://www.dropbox.com/scl/fi/t3lvmp5x28d1zjac3j7ec/special_tokens_map.json?rlkey=z2xbompa54iu4y9qgb5bvmfc9&st=zrxlpjdt&dl=1"
 }
 
-# 📥 Function to Download Model Files
-def download_model_files():
-    for filename, url in DROPBOX_LINKS.items():
-        file_path = os.path.join(bio_model_dir, filename)
-        if not os.path.exists(file_path):  # Avoid re-downloading
-            print(f"Downloading {filename}...")
-            response = requests.get(url, stream=True)
-            if response.status_code == 200:
-                with open(file_path, "wb") as f:
-                    for chunk in response.iter_content(chunk_size=1024):
-                        f.write(chunk)
-                print(f"Downloaded: {filename}")
-            else:
-                print(f"Failed to download {filename}")
+# # 📥 Function to Download Model Files
 # def download_model_files():
 #     for filename, url in DROPBOX_LINKS.items():
 #         file_path = os.path.join(bio_model_dir, filename)
+#         if not os.path.exists(file_path):  # Avoid re-downloading
+#             print(f"Downloading {filename}...")
+#             response = requests.get(url, stream=True)
+#             if response.status_code == 200:
+#                 with open(file_path, "wb") as f:
+#                     for chunk in response.iter_content(chunk_size=1024):
+#                         f.write(chunk)
+#                 print(f"Downloaded: {filename}")
+#             else:
+#                 print(f"Failed to download {filename}")
+def download_model_files():
+    for filename, url in DROPBOX_LINKS.items():
+        file_path = os.path.join(bio_model_dir, filename)
 
-#         print(f"Downloading {filename} (forcing overwrite)...")
-#         response = requests.get(url, stream=True)
-#         if response.status_code == 200:
-#             with open(file_path, "wb") as f:
-#                 for chunk in response.iter_content(chunk_size=1024):
-#                     f.write(chunk)
-#             print(f"Downloaded: {filename}")
-#         else:
-#             print(f"Failed to download {filename}")
-
-# 📥 Download models before starting
-download_model_files()
+        print(f"Downloading {filename} (forcing overwrite)...")
+        response = requests.get(url, stream=True)
+        if response.status_code == 200:
+            with open(file_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=1024):
+                    f.write(chunk)
+            print(f"Downloaded: {filename}")
+        else:
+            print(f"Failed to download {filename}")
+
+# # 📥 Download models before starting
+# download_model_files()
 
 # # ✅ Load ProtTrans-BERT-BFD Model
 # print("Loading ProtTrans-BERT-BFD model...")
 # model = AutoModelForMaskedLM.from_pretrained(bio_model_dir)
 # tokenizer = AutoTokenizer.from_pretrained(bio_model_dir)
+##
+### ✅ Load Bio-Embedding Model
+##try:
+##    print("Loading ProtTrans-BERT-BFD model...")
+##    embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
+##except Exception as e:
+##    print(f"Error loading ProtTrans-BERT-BFD model: {e}")
+##    embedder = None
+##
+### 🧬 Generate Bio-Embeddings
+##def generate_bio_embeddings(sequence):
+##    if embedder is None:
+##        return None
+##    try:
+##        embedding_protein = embedder.embed(sequence)
+##        embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
+##        return np.array(embedding_per_protein).reshape(1, -1)
+##    except Exception as e:
+##        print(f"Embedding Error: {e}")
+##        return None
+import torch
+from transformers import AutoTokenizer, AutoModel
+import re
+import numpy as np
+import torch.nn as nn
 
-# Load Bio-Embedding Model
+# Load ESM2 model and tokenizer
 try:
-    print("Loading ProtTrans-BERT-BFD model...")
-    embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
+    print("Loading ESM2 model...")
+    # Using a smaller model that is only ~200 MB
+    model_name = "facebook/esm2_t6_8M_UR50D"  # Smaller model with 320-dim embeddings
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModel.from_pretrained(model_name)
+    model.eval()
+    print("ESM2 model loaded.")
 except Exception as e:
-    print(f"Error loading ProtTrans-BERT-BFD model: {e}")
-    embedder = None
-
-# 🧬 Generate Bio-Embeddings
+    print(f"Error loading ESM2 model: {e}")
+    model = None
+    tokenizer = None
+
+# Define a linear transformation to map 320D embeddings to 1024D
+class EmbeddingTransformer(nn.Module):
+    def __init__(self, input_dim, output_dim):
+        super(EmbeddingTransformer, self).__init__()
+        self.linear = nn.Linear(input_dim, output_dim)
+
+    def forward(self, x):
+        return self.linear(x)
+
+# Initialize the transformation layer
+transformer = EmbeddingTransformer(input_dim=320, output_dim=1024)
+
+# Function to clean protein sequence
+def clean_sequence(seq):
+    """
+    Clean the protein sequence by removing non-standard characters
+    and converting to uppercase.
+    """
+    return re.sub(r'[^ACDEFGHIKLMNPQRSTVWY]', '', seq.upper())
+# Function to generate embeddings from a protein sequence
 def generate_bio_embeddings(sequence):
-    if embedder is None:
+    """
+    Generate protein sequence embeddings using the ESM2 model.
+    Maps the 320-dimensional embedding to 1024 dimensions.
+    """
+    if model is None or tokenizer is None:
+        print("Model or tokenizer not loaded.")
        return None
+
+    # sequence = clean_sequence(sequence)
+    if not sequence:
+        print("Sequence is empty after cleaning.")
+        return None
+
     try:
-        embedding_protein = embedder.embed(sequence)
-        embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
-        return np.array(embedding_per_protein).reshape(1, -1)
+        # Tokenize the sequence for the ESM2 model
+        inputs = tokenizer(sequence, return_tensors="pt", add_special_tokens=True)
+
+        # Pass the tokenized input through the ESM2 model to get embeddings
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        # Extract the last hidden state (embeddings) and average across the sequence length
+        embeddings = outputs.last_hidden_state  # shape: (batch_size, seq_len, 320)
+        mean_embedding = embeddings.mean(dim=1).squeeze()  # shape: (320,)
+
+        # Map the 320-dimensional embedding to a 1024-dimensional space
+        transformed_embedding = transformer(mean_embedding)
+
+        # Detach the tensor from the computation graph and convert to numpy
+        transformed_embedding = transformed_embedding.detach().numpy()
+
+        # Return the transformed embedding as a 2D numpy array (1, 1024)
+        return transformed_embedding.reshape(1, -1)
+
     except Exception as e:
         print(f"Embedding Error: {e}")
         return None
 
+
 # 🔬 Generate SMILES from Protein Sequence
 def generate_smiles(sequence, n_samples=100):
     start_time = time.time()
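Two caveats about the new embedding path above, both visible in the diff itself. First, EmbeddingTransformer is a freshly initialized nn.Linear, so the 320-to-1024 mapping is an untrained random projection; it exists only to match the 1024-dimensional input the downstream pipeline expects, and its weights differ on every process start unless a seed is set before the layer is constructed. Second, the clean_sequence call inside generate_bio_embeddings is commented out, so the "empty after cleaning" branch only triggers for inputs that were already empty. A minimal usage sketch under the commit's own setup (the test peptide is arbitrary):

# Assumes app.py has already set up `model`, `tokenizer`, and `transformer` as above.
emb = generate_bio_embeddings("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ")  # arbitrary test peptide
if emb is not None:
    print(emb.shape)  # (1, 1024): mean-pooled 320-dim ESM2 embedding, projected to 1024
    print(emb.dtype)  # float32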
modelsBioembedSmall/config.json ADDED
@@ -0,0 +1,30 @@
+{
+  "_name_or_path": "/tmp/facebook/esm2_t6_8M_UR50D",
+  "architectures": [
+    "EsmForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "classifier_dropout": null,
+  "emb_layer_norm_before": false,
+  "esmfold_config": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 320,
+  "initializer_range": 0.02,
+  "intermediate_size": 1280,
+  "is_folding_model": false,
+  "layer_norm_eps": 1e-05,
+  "mask_token_id": 32,
+  "max_position_embeddings": 1026,
+  "model_type": "esm",
+  "num_attention_heads": 20,
+  "num_hidden_layers": 6,
+  "pad_token_id": 1,
+  "position_embedding_type": "rotary",
+  "token_dropout": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.0.dev0",
+  "use_cache": true,
+  "vocab_list": null,
+  "vocab_size": 33
+}
modelsBioembedSmall/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24c5fa474c48f3b754b86efe752d5f189d2bcd88190fa2270fc92b2ef3034189
+size 31384292
modelsBioembedSmall/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9edcf393212f3a26684cd68ca8095ec43c2c341ee0fcc3ba7a4d3a47c5dc138f
+size 31406877
modelsBioembedSmall/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "<cls>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}
modelsBioembedSmall/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbb600f469925d93572cb238a475914a2d0846fb8a7814a98028a7a2e21a05a3
+size 30256864
modelsBioembedSmall/tokenizer_config.json ADDED
@@ -0,0 +1,4 @@
+{
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "EsmTokenizer"
+}
modelsBioembedSmall/vocab.txt ADDED
@@ -0,0 +1,33 @@
+<cls>
+<pad>
+<eos>
+<unk>
+L
+A
+G
+V
+S
+E
+R
+T
+I
+D
+P
+K
+Q
+N
+F
+Y
+M
+H
+W
+C
+X
+B
+U
+Z
+O
+.
+-
+<null_1>
+<mask>
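The seven files added under modelsBioembedSmall/ are a local copy of the facebook/esm2_t6_8M_UR50D checkpoint: the config (hidden size 320, 6 layers, 33-token vocabulary), the weights in safetensors, PyTorch, and TensorFlow form (each committed as a Git LFS pointer), and the tokenizer files. The committed app.py still fetches the model from the Hub by name; a sketch of loading from this directory instead, which would avoid network access at startup (local_files_only is standard transformers API, and the rest mirrors the commit):

from transformers import AutoModel, AutoTokenizer

local_dir = "/app/modelsBioembedSmall"  # the directory committed above

# Read the checkpoint from disk instead of contacting the Hub.
tokenizer = AutoTokenizer.from_pretrained(local_dir, local_files_only=True)
model = AutoModel.from_pretrained(local_dir, local_files_only=True)
model.eval()

print(model.config.hidden_size)        # 320, matching config.json
print(model.config.num_hidden_layers)  # 6
print(tokenizer.vocab_size)            # 33, matching vocab.txt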