# syntax=docker/dockerfile:1
# CPU-only embedding service for thenlper/gte-large (FastAPI + sentence-transformers).
# Model weights are baked into the image at /cache/preload so startup needs no network.
FROM python:3.10-slim

# All caches share /cache; single-threaded BLAS because parallelism comes from
# uvicorn workers, not intra-op threads.
# NOTE(review): original had PYTHONOPTIMIZE=2, which strips docstrings and is
# known to break libraries that introspect them; lowered to 1 (drops asserts only).
ENV HF_HOME=/cache \
    TRANSFORMERS_CACHE=/cache \
    SENTENCE_TRANSFORMERS_HOME=/cache \
    XDG_CACHE_HOME=/cache \
    PYTHONUNBUFFERED=1 \
    PYTHONOPTIMIZE=1 \
    TOKENIZERS_PARALLELISM=false \
    OMP_NUM_THREADS=1 \
    MKL_NUM_THREADS=1 \
    NUMEXPR_NUM_THREADS=1 \
    INTRA_THREADS=1 \
    WORKERS=8

# Toolchain for any sdist-only wheels. NOTE(review): likely removable on
# x86_64 (all deps ship wheels); kept to preserve behavior on other arches.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

# CPU-only torch from its dedicated index first, then app deps.
# NOTE(review): only torch is pinned — pin the rest for reproducible builds.
# "uvicorn[standard]" quoted so sh never glob-expands the brackets.
RUN pip install --no-cache-dir "torch==2.3.1+cpu" --index-url https://download.pytorch.org/whl/cpu \
    && pip install --no-cache-dir fastapi "uvicorn[standard]" sentence-transformers numpy orjson

WORKDIR /app

# Pre-download weights at build time, BEFORE the app source is written, so
# editing app.py below does not invalidate this multi-GB layer.
RUN mkdir -p /cache \
    && python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('thenlper/gte-large').save('/cache/preload')"

# App source via BuildKit heredoc (quoted delimiter: no variable expansion).
COPY <<'PYEOF' /app/app.py
import os, torch, numpy as np
from fastapi import FastAPI
from fastapi.responses import ORJSONResponse
from pydantic import BaseModel
from typing import List
from sentence_transformers import SentenceTransformer

# Pin torch to one intra-op thread per worker; ignore failures (interop can
# only be set once per process).
try:
    torch.set_num_threads(int(os.getenv("INTRA_THREADS","1")))
    torch.set_num_interop_threads(1)
except Exception:
    pass

app = FastAPI(title="gte-large-embed", default_response_class=ORJSONResponse)
device = "cuda" if torch.cuda.is_available() else "cpu"
# Prefer the baked-in snapshot; fall back to a hub download if it is missing.
model_path = "/cache/preload" if os.path.isdir("/cache/preload") else "thenlper/gte-large"
model = SentenceTransformer(model_path, device=device)

class EmbedIn(BaseModel):
    texts: List[str]
    normalize: bool = True
    batch_size: int = 128

@app.on_event("startup")
def warmup():
    # Force weight load / kernel warm-up before the first real request.
    with torch.inference_mode():
        _ = model.encode(["warmup","ready"], normalize_embeddings=True, batch_size=2, convert_to_numpy=True, show_progress_bar=False)

@app.get("/health")
def health():
    return {"ok": True, "device": device, "threads": {"intra": torch.get_num_threads()}}

@app.post("/embed")
def embed(inp: EmbedIn):
    with torch.inference_mode():
        vecs = model.encode(inp.texts, normalize_embeddings=inp.normalize, batch_size=inp.batch_size, convert_to_numpy=True, show_progress_bar=False)
    return {"embeddings": vecs.tolist(), "dim": int(vecs.shape[1]), "n": int(vecs.shape[0])}
PYEOF

# Least-privilege runtime user (replaces the original chmod -R 777 + root run).
# Stable numeric UID so runtimes enforcing runAsNonRoot can verify it.
RUN useradd --system --uid 10001 --home /app app \
    && chown -R app /app /cache
USER app

EXPOSE 7860

# Cheap in-container probe via stdlib (slim has no curl).
HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:7860/health', timeout=4)" || exit 1

# Shell form is required to expand ${WORKERS}; exec makes uvicorn PID 1 so it
# receives SIGTERM from docker stop.
# NOTE(review): each worker loads its own copy of the model — confirm memory
# budget before keeping WORKERS=8.
CMD ["/bin/sh","-c","exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers ${WORKERS}"]