Bhanushray committed · Commit b6545dd · verified · 1 Parent(s): dbf6fec

Upload 9 files

Dockerfile CHANGED
@@ -1,91 +1,29 @@
 FROM python:3.8
 
-# Install system dependencies
+# Install required system dependencies
 RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*
 
-# Set environment variables
-ENV HF_HOME=/app/cache/huggingface
-ENV NUMBA_DISABLE_CACHING=1
-
-# Set working directory
+# Set the working directory inside the container
 WORKDIR /app
 
-# Pre-create writable cache directories
-# REMOVED /app/cache/numba from here
-RUN mkdir -p /app/cache/huggingface /app/modelsBioembed /app/models_folder /app/Samples
-
-# Copy requirements and install
+# Copy the requirements file into the container
 COPY requirements.txt .
+
+# Install dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy app code
+# Create necessary directories (but don't download models here!)
+RUN mkdir -p /app/modelsBioembedSmall /app/models_folder /app/Samples /app/numba_cache /app/hf_cache
+
+# Copy the entire project to the container
 COPY . .
 
-# Expose app port
+# Expose the port for Flask
 EXPOSE 7860
 
-# Run the app
+# Run the app with Gunicorn
 CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
 
-# FROM python:3.8
-
-# # Install required system dependencies
-# RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*
-
-# # Set environment variables for cache
-# ENV TRANSFORMERS_CACHE=/app/cache/huggingface
-# ENV NUMBA_CACHE_DIR=/app/cache/numba
-
-# # Set the working directory inside the container
-# WORKDIR /app
-
-# # Create necessary directories
-# RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples \
-#     && mkdir -p $TRANSFORMERS_CACHE $NUMBA_CACHE_DIR
-
-# # Copy the requirements file into the container
-# COPY requirements.txt .
-
-# # Install dependencies
-# RUN pip install --no-cache-dir -r requirements.txt
-
-# # Copy the entire project to the container
-# COPY . .
-
-# # Expose the port for Flask
-# EXPOSE 7860
-
-# # Run the app with Gunicorn
-# CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
-
-
-
-# FROM python:3.8
-
-# # Install required system dependencies
-# RUN apt-get update && apt-get install -y libopenblas-dev git curl && rm -rf /var/lib/apt/lists/*
-
-# # Set the working directory inside the container
-# WORKDIR /app
-
-# # Copy the requirements file into the container
-# COPY requirements.txt .
-
-# # Install dependencies
-# RUN pip install --no-cache-dir -r requirements.txt
-
-# # Create necessary directories (but don't download models here!)
-# RUN mkdir -p /app/modelsBioembed /app/models_folder /app/Samples
-
-# # Copy the entire project to the container
-# COPY . .
-
-# # Expose the port for Flask
-# EXPOSE 7860
-
-# # Run the app with Gunicorn
-# CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
-
 
 # # Use Python 3.8 as the base image
 # FROM python:3.8
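Note on the Dockerfile change: the ENV-based cache configuration (HF_HOME, NUMBA_DISABLE_CACHING) is gone; instead the image pre-creates /app/numba_cache and /app/hf_cache, and app.py now points the libraries at them via os.environ. A minimal sketch of the ordering this relies on, assuming (as the commit appears to) that the default ~/.cache location is not writable in the deployed container:

import os

# Select the pre-created cache directories BEFORE importing transformers
# (and anything numba-backed): the libraries pick these paths up when they
# initialize, so setting them later can be too late.
os.environ["NUMBA_CACHE_DIR"] = "/app/numba_cache"    # created by RUN mkdir above
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"    # newer transformers releases prefer HF_HOME

import transformers  # safe: the cache location is already fixed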
app.py CHANGED
@@ -8,8 +8,14 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
 from bio_embeddings.embed import ProtTransBertBFDEmbedder
 from modelstrc import CVanilla_RNN_Builder, get_mol_from_graph_list
 
+from transformers import AutoModel, AutoTokenizer
+import torch
+import numpy as np
+import re
+
+
 # 🚀 Define Directories for Railway
-bio_model_dir = "/app/modelsBioembed"  # Persistent model storage
+bio_model_dir = "/app/modelsBioembedSmall"  # Persistent model storage
 cvn_model_dir = "/app/models_folder"
 UPLOAD_FOLDER = "/app/Samples"
 
@@ -21,6 +27,8 @@ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.environ["TMPDIR"] = bio_model_dir
 os.environ["TEMP"] = bio_model_dir
 os.environ["TMP"] = bio_model_dir
+os.environ['NUMBA_CACHE_DIR'] = '/app/numba_cache'
+os.environ['TRANSFORMERS_CACHE'] = '/app/hf_cache'
 
 
 # 🔗 Dropbox Links for Model Files
@@ -32,62 +40,142 @@ DROPBOX_LINKS = {
     "special_tokens_map.json": "https://www.dropbox.com/scl/fi/t3lvmp5x28d1zjac3j7ec/special_tokens_map.json?rlkey=z2xbompa54iu4y9qgb5bvmfc9&st=zrxlpjdt&dl=1"
 }
 
-# 📥 Function to Download Model Files
-def download_model_files():
-    for filename, url in DROPBOX_LINKS.items():
-        file_path = os.path.join(bio_model_dir, filename)
-        if not os.path.exists(file_path):  # Avoid re-downloading
-            print(f"Downloading {filename}...")
-            response = requests.get(url, stream=True)
-            if response.status_code == 200:
-                with open(file_path, "wb") as f:
-                    for chunk in response.iter_content(chunk_size=1024):
-                        f.write(chunk)
-                print(f"Downloaded: {filename}")
-            else:
-                print(f"Failed to download {filename}")
+# # 📥 Function to Download Model Files
 # def download_model_files():
 #     for filename, url in DROPBOX_LINKS.items():
 #         file_path = os.path.join(bio_model_dir, filename)
+#         if not os.path.exists(file_path):  # Avoid re-downloading
+#             print(f"Downloading {filename}...")
+#             response = requests.get(url, stream=True)
+#             if response.status_code == 200:
+#                 with open(file_path, "wb") as f:
+#                     for chunk in response.iter_content(chunk_size=1024):
+#                         f.write(chunk)
+#                 print(f"Downloaded: {filename}")
+#             else:
+#                 print(f"Failed to download {filename}")
+def download_model_files():
+    for filename, url in DROPBOX_LINKS.items():
+        file_path = os.path.join(bio_model_dir, filename)
 
-#         print(f"Downloading {filename} (forcing overwrite)...")
-#         response = requests.get(url, stream=True)
-#         if response.status_code == 200:
-#             with open(file_path, "wb") as f:
-#                 for chunk in response.iter_content(chunk_size=1024):
-#                     f.write(chunk)
-#             print(f"Downloaded: {filename}")
-#         else:
-#             print(f"Failed to download {filename}")
-
-# 📥 Download models before starting
-download_model_files()
+        print(f"Downloading {filename} (forcing overwrite)...")
+        response = requests.get(url, stream=True)
+        if response.status_code == 200:
+            with open(file_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=1024):
+                    f.write(chunk)
+            print(f"Downloaded: {filename}")
+        else:
+            print(f"Failed to download {filename}")
+
+# # 📥 Download models before starting
+# download_model_files()
 
 # # ✅ Load ProtTrans-BERT-BFD Model
 # print("Loading ProtTrans-BERT-BFD model...")
 # model = AutoModelForMaskedLM.from_pretrained(bio_model_dir)
 # tokenizer = AutoTokenizer.from_pretrained(bio_model_dir)
+##
+### ✅ Load Bio-Embedding Model
+##try:
+##    print("Loading ProtTrans-BERT-BFD model...")
+##    embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
+##except Exception as e:
+##    print(f"Error loading ProtTrans-BERT-BFD model: {e}")
+##    embedder = None
+##
+### 🧬 Generate Bio-Embeddings
+##def generate_bio_embeddings(sequence):
+##    if embedder is None:
+##        return None
+##    try:
+##        embedding_protein = embedder.embed(sequence)
+##        embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
+##        return np.array(embedding_per_protein).reshape(1, -1)
+##    except Exception as e:
+##        print(f"Embedding Error: {e}")
+##        return None
+import torch
+from transformers import AutoTokenizer, AutoModel
+import re
+import numpy as np
+import torch.nn as nn
 
-# Load Bio-Embedding Model
+# Load ESM2 model and tokenizer
 try:
-    print("Loading ProtTrans-BERT-BFD model...")
-    embedder = ProtTransBertBFDEmbedder(model_directory=bio_model_dir)
+    print("Loading ESM2 model...")
+    # Using a smaller model that is only ~200 MB
+    model_name = "facebook/esm2_t6_8M_UR50D"  # Smaller model with 320-dim embeddings
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModel.from_pretrained(model_name)
+    model.eval()
+    print("ESM2 model loaded.")
 except Exception as e:
-    print(f"Error loading ProtTrans-BERT-BFD model: {e}")
-    embedder = None
-
-# 🧬 Generate Bio-Embeddings
+    print(f"Error loading ESM2 model: {e}")
+    model = None
+    tokenizer = None
+
+# Define a linear transformation to map 320D embeddings to 1024D
+class EmbeddingTransformer(nn.Module):
+    def __init__(self, input_dim, output_dim):
+        super(EmbeddingTransformer, self).__init__()
+        self.linear = nn.Linear(input_dim, output_dim)
+
+    def forward(self, x):
+        return self.linear(x)
+
+# Initialize the transformation layer
+transformer = EmbeddingTransformer(input_dim=320, output_dim=1024)
+
+# Function to clean protein sequence
+def clean_sequence(seq):
+    """
+    Clean the protein sequence by removing non-standard characters
+    and converting to uppercase.
+    """
+    return re.sub(r'[^ACDEFGHIKLMNPQRSTVWY]', '', seq.upper())
+# Function to generate embeddings from a protein sequence
 def generate_bio_embeddings(sequence):
-    if embedder is None:
+    """
+    Generate protein sequence embeddings using the ESM2 model.
+    Maps the 320-dimensional embedding to 1024 dimensions.
+    """
+    if model is None or tokenizer is None:
+        print("Model or tokenizer not loaded.")
        return None
+
+    # sequence = clean_sequence(sequence)
+    if not sequence:
+        print("Sequence is empty after cleaning.")
+        return None
+
     try:
-        embedding_protein = embedder.embed(sequence)
-        embedding_per_protein = embedder.reduce_per_protein(embedding_protein)
-        return np.array(embedding_per_protein).reshape(1, -1)
+        # Tokenize the sequence for the ESM2 model
+        inputs = tokenizer(sequence, return_tensors="pt", add_special_tokens=True)
+
+        # Pass the tokenized input through the ESM2 model to get embeddings
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        # Extract the last hidden state (embeddings) and average across the sequence length
+        embeddings = outputs.last_hidden_state  # shape: (batch_size, seq_len, 320)
+        mean_embedding = embeddings.mean(dim=1).squeeze()  # shape: (320,)
+
+        # Map the 320-dimensional embedding to a 1024-dimensional space
+        transformed_embedding = transformer(mean_embedding)
+
+        # Detach the tensor from the computation graph and convert to numpy
+        transformed_embedding = transformed_embedding.detach().numpy()
+
+        # Return the transformed embedding as a 2D numpy array (1, 1024)
+        return transformed_embedding.reshape(1, -1)
+
     except Exception as e:
         print(f"Embedding Error: {e}")
         return None
 
+
 # 🔬 Generate SMILES from Protein Sequence
 def generate_smiles(sequence, n_samples=100):
     start_time = time.time()
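Two caveats about the new embedding path above, both visible in the diff itself. First, EmbeddingTransformer is a freshly initialized nn.Linear, so the 320-to-1024 mapping is an untrained random projection; it exists only to match the 1024-dimensional input the downstream pipeline expects, and its weights differ on every process start unless a seed is set before the layer is constructed. Second, the clean_sequence call inside generate_bio_embeddings is commented out, so the "empty after cleaning" branch only triggers for inputs that were already empty. A minimal usage sketch under the commit's own setup (the test peptide is arbitrary):

# Assumes app.py has already set up `model`, `tokenizer`, and `transformer` as above.
emb = generate_bio_embeddings("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ")  # arbitrary test peptide
if emb is not None:
    print(emb.shape)  # (1, 1024): mean-pooled 320-dim ESM2 embedding, projected to 1024
    print(emb.dtype)  # float32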
modelsBioembedSmall/config.json ADDED
@@ -0,0 +1,30 @@
+{
+  "_name_or_path": "/tmp/facebook/esm2_t6_8M_UR50D",
+  "architectures": [
+    "EsmForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "classifier_dropout": null,
+  "emb_layer_norm_before": false,
+  "esmfold_config": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 320,
+  "initializer_range": 0.02,
+  "intermediate_size": 1280,
+  "is_folding_model": false,
+  "layer_norm_eps": 1e-05,
+  "mask_token_id": 32,
+  "max_position_embeddings": 1026,
+  "model_type": "esm",
+  "num_attention_heads": 20,
+  "num_hidden_layers": 6,
+  "pad_token_id": 1,
+  "position_embedding_type": "rotary",
+  "token_dropout": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.0.dev0",
+  "use_cache": true,
+  "vocab_list": null,
+  "vocab_size": 33
+}
modelsBioembedSmall/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24c5fa474c48f3b754b86efe752d5f189d2bcd88190fa2270fc92b2ef3034189
+size 31384292
modelsBioembedSmall/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9edcf393212f3a26684cd68ca8095ec43c2c341ee0fcc3ba7a4d3a47c5dc138f
+size 31406877
modelsBioembedSmall/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "<cls>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}
modelsBioembedSmall/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbb600f469925d93572cb238a475914a2d0846fb8a7814a98028a7a2e21a05a3
+size 30256864
modelsBioembedSmall/tokenizer_config.json ADDED
@@ -0,0 +1,4 @@
+{
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "EsmTokenizer"
+}
modelsBioembedSmall/vocab.txt ADDED
@@ -0,0 +1,33 @@
+<cls>
+<pad>
+<eos>
+<unk>
+L
+A
+G
+V
+S
+E
+R
+T
+I
+D
+P
+K
+Q
+N
+F
+Y
+M
+H
+W
+C
+X
+B
+U
+Z
+O
+.
+-
+<null_1>
+<mask>
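The seven files added under modelsBioembedSmall/ are a local copy of the facebook/esm2_t6_8M_UR50D checkpoint: the config (hidden size 320, 6 layers, 33-token vocabulary), the weights in safetensors, PyTorch, and TensorFlow form (each committed as a Git LFS pointer), and the tokenizer files. The committed app.py still fetches the model from the Hub by name; a sketch of loading from this directory instead, which would avoid network access at startup (local_files_only is standard transformers API, and the rest mirrors the commit):

from transformers import AutoModel, AutoTokenizer

local_dir = "/app/modelsBioembedSmall"  # the directory committed above

# Read the checkpoint from disk instead of contacting the Hub.
tokenizer = AutoTokenizer.from_pretrained(local_dir, local_files_only=True)
model = AutoModel.from_pretrained(local_dir, local_files_only=True)
model.eval()

print(model.config.hidden_size)        # 320, matching config.json
print(model.config.num_hidden_layers)  # 6
print(tokenizer.vocab_size)            # 33, matching vocab.txt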