Spaces:

seanpedrickcase
/

document_redaction

Running

document_redaction / docker-compose_vllm.yml

Sync: Changed to pi agent to improve interrupt function. Changes to open text redaction - earlier checks for Comprehend connection, now a markdown preview shows first redaction results

5a1ee57 about 10 hours ago

raw

history blame contribute delete

5.94 kB

	# Pick which vLLM stack runs via COMPOSE_PROFILES (or --profile). Set VLLM_OPENAI_MODEL in .env
	# to match the served model (required for 27b; 9b defaults below if omitted):
	# COMPOSE_PROFILES=vllm-9b -> QuantTrio/Qwen3.5-9B-AWQ (default VLLM_OPENAI_MODEL)
	# COMPOSE_PROFILES=vllm-27b -> set VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ
	# App uses http://vllm-inference:8000 (shared network alias on both vLLM services).
	# Example CLI commands (append --build to either command below to rebuild the app image):

	# Recommended for systems with 16 GB of VRAM:
	# docker compose -f docker-compose_vllm.yml --profile vllm-9b up -d

	# Recommended for systems with 24 GB of VRAM:
	# docker compose -f docker-compose_vllm.yml --profile vllm-27b up -d
	#
	# Optional Docker-only settings for redaction-app-vllm: config/docker_app_config.env
	# (see config/docker_app_config.env.example). Loaded at container start; the
	# service environment: block overrides values from that file.

	# Extension fragment (x- prefix) holding the app's env_file settings.
	# It is merged into the redaction-app-vllm service with `<<: *redaction-app-env`.
	x-redaction-app-env: &redaction-app-env
	  env_file:
	    # Long-form entry: `required: false` marks config/docker_app_config.env
	    # as optional.
	    - path: config/docker_app_config.env
	      required: false

	services:
	  # vLLM OpenAI-compatible server for QuantTrio/Qwen3.5-9B-AWQ.
	  # Only started when the vllm-9b profile is active.
	  vllm-server-qwen35-9b:
	    profiles: ["vllm-9b"]
	    image: vllm/vllm-openai:latest
	    shm_size: '8gb'
	    # Literal block scalar: one server flag per line.
	    command: |
	      --model QuantTrio/Qwen3.5-9B-AWQ
	      --gpu-memory-utilization 0.926
	      --tensor-parallel-size 1
	      --max-num-seqs 1
	      --reasoning-parser qwen3
	      --max-model-len 16384
	      --max-num-batched-tokens 2048
	      --speculative-config '{"method":"mtp","num_speculative_tokens":3}'

	    deploy:
	      resources:
	        reservations:
	          devices:
	            - driver: nvidia
	              count: all
	              capabilities: [gpu]
	    # Healthy once the models endpoint answers on the container's port 8000.
	    # NOTE(review): the 1200s start_period presumably covers model download
	    # and load time - confirm.
	    healthcheck:
	      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
	      interval: 30s
	      timeout: 15s
	      retries: 8
	      start_period: 1200s
	    ports:
	      - "8000:8000"
	    volumes:
	      - hf-model-cache:/root/.cache/huggingface
	    networks:
	      redaction-net-vllm:
	        # Same alias as the 27B service, so the app URL is profile-independent.
	        aliases:
	          - vllm-inference

	  # vLLM OpenAI-compatible server for QuantTrio/Qwen3.5-27B-AWQ.
	  # Only started when the vllm-27b profile is active.
	  vllm-server-qwen35-27b:
	    profiles: ["vllm-27b"]
	    image: vllm/vllm-openai:latest
	    shm_size: '16gb'
	    # Literal block scalar: one server flag per line.
	    command: |
	      --model QuantTrio/Qwen3.5-27B-AWQ
	      --gpu-memory-utilization 0.94
	      --tensor-parallel-size 1
	      --max-num-seqs 2
	      --reasoning-parser qwen3
	      --max-model-len 16384
	      --max-num-batched-tokens 4096
	      --enforce-eager
	      --kv-cache-dtype fp8
	      --enable-chunked-prefill
	      --enable-prefix-caching

	    deploy:
	      resources:
	        reservations:
	          devices:
	            - driver: nvidia
	              count: all
	              capabilities: [gpu]
	    # Healthy once the models endpoint answers on the container's port 8000.
	    healthcheck:
	      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
	      interval: 30s
	      timeout: 15s
	      retries: 8
	      start_period: 1200s
	    # Host port 8001 (the 9B service publishes host port 8000); the container
	    # port is 8000 in both services.
	    ports:
	      - "8001:8000"
	    volumes:
	      - hf-model-cache:/root/.cache/huggingface
	    networks:
	      redaction-net-vllm:
	        # Same alias as the 9B service, so the app URL is profile-independent.
	        aliases:
	          - vllm-inference

	  # Redaction app. Listed under both profiles so it starts alongside
	  # whichever vLLM server is selected.
	  redaction-app-vllm:
	    # Pulls in the optional env_file entry from x-redaction-app-env.
	    <<: *redaction-app-env
	    profiles: ["vllm-9b", "vllm-27b"]
	    image: redaction-app-main
	    build:
	      context: .  # Look in the current folder
	      dockerfile: Dockerfile  # Use this file
	      target: gradio  # Use the 'gradio' stage from your Dockerfile
	      args:  # Pass your build-time variables here!
	        - TORCH_GPU_ENABLED=False
	        - INSTALL_VLM=False
	        - PADDLE_GPU_ENABLED=True
	        - INSTALL_PADDLEOCR=True
	    shm_size: '8gb'
	    # Waits for a healthy vLLM server. Both dependencies are marked
	    # `required: false` because only one of them is enabled per profile.
	    depends_on:
	      vllm-server-qwen35-9b:
	        condition: service_healthy
	        required: false
	      vllm-server-qwen35-27b:
	        condition: service_healthy
	        required: false
	    environment:
	      - FLAGS_fraction_of_gpu_memory_to_use=0.05
	      - RUN_FASTAPI=True
	      - APP_MODE=fastapi
	      - SHOW_PADDLE_MODEL_OPTIONS=True
	      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
	      - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
	      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
	      - SHOW_HYBRID_MODELS=True
	      - SHOW_DIFFICULT_OCR_EXAMPLES=True
	      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
	      - SHOW_SUMMARISATION=True
	      - SHOW_AWS_API_KEYS=True
	      - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
	      - DEFAULT_LOCAL_OCR_MODEL=paddle
	      - DEFAULT_PII_DETECTION_MODEL=Local
	      - CUSTOM_VLM_BACKEND=inference_vlm
	      - MAX_WORKERS=12
	      - TESSERACT_MAX_WORKERS=8
	      - PADDLE_MAX_WORKERS=1  # Keep this to 1 to avoid VRAM overflow or errors
	      - LOAD_PADDLE_AT_STARTUP=False
	      # Targets the vllm-inference alias shared by both vLLM services.
	      - INFERENCE_SERVER_API_URL=http://vllm-inference:8000
	      # Both model names come from VLLM_OPENAI_MODEL, defaulting to the 9B model.
	      - DEFAULT_INFERENCE_SERVER_VLM_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ}  # Change this to QuantTrio/Qwen3.5-27B-AWQ if running that model
	      - DEFAULT_INFERENCE_SERVER_PII_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ}  # Change this to QuantTrio/Qwen3.5-27B-AWQ if running that model
	      - EFFICIENT_OCR=True
	      - SHOW_CUSTOM_VLM_ENTITIES=True
	      - SESSION_OUTPUT_FOLDER=True
	      - SAVE_PAGE_OCR_VISUALISATIONS=False
	      - HYBRID_OCR_CONFIDENCE_THRESHOLD=97
	      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
	      - PREPROCESS_LOCAL_OCR_IMAGES=False
	      - INFERENCE_SERVER_DISABLE_THINKING=True
	      - MAX_NEW_TOKENS=8192
	      - SAVE_EXAMPLE_HYBRID_IMAGES=False
	      - SAVE_VLM_INPUT_IMAGES=False
	      - VLM_MAX_DPI=200.0
	      - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
	      - REPORT_VLM_OUTPUTS_TO_GUI=True
	      - REPORT_LLM_OUTPUTS_TO_GUI=True
	      - ADD_VLM_BOUNDING_BOX_RULES=False
	      - VLM_DEFAULT_STREAM=False

	    deploy:
	      resources:
	        reservations:
	          devices:
	            - driver: nvidia
	              count: all
	              capabilities: [gpu]
	    ports:
	      - "7860:7860"
	    networks:
	      - redaction-net-vllm

	# Bridge network joined by the vLLM services and the redaction app.
	networks:
	  redaction-net-vllm:
	    driver: bridge

	# Named volume mounted at /root/.cache/huggingface in both vLLM services.
	volumes:
	  # Explicit empty mapping: no volume options are set.
	  hf-model-cache: {}