Sync: Changed to pi agent to improve interrupt function. Changes to open text redaction - earlier checks for Comprehend connection, now a markdown preview shows first redaction results
# Pick which vLLM stack runs via COMPOSE_PROFILES (or --profile). Set VLLM_OPENAI_MODEL in .env
# to match the served model (required for 27b; 9b defaults below if omitted):
#   COMPOSE_PROFILES=vllm-9b  -> QuantTrio/Qwen3.5-9B-AWQ (default VLLM_OPENAI_MODEL)
#   COMPOSE_PROFILES=vllm-27b -> set VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ
# App uses http://vllm-inference:8000 (shared network alias on both vLLM services).
# Example CLI commands (add --build to the commands below if you want to rebuild the app images).
# Recommended for 16 GB VRAM systems:
#   docker compose -f docker-compose_vllm.yml --profile vllm-9b up -d
# Recommended for 24 GB VRAM systems:
#   docker compose -f docker-compose_vllm.yml --profile vllm-27b up -d
#
# Optional Docker-only settings for redaction-app-vllm: config/docker_app_config.env
# (see config/docker_app_config.env.example). Loaded at container start; the
# service environment: block overrides values from that file.
# Extension field holding the optional env_file settings; it is merged into the
# redaction-app-vllm service with `<<: *redaction-app-env`.
x-redaction-app-env: &redaction-app-env
  env_file:
    # Long env_file syntax: `required: false` marks this file as optional.
    - path: config/docker_app_config.env
      required: false
services:
  # vLLM OpenAI-compatible server for the 9B model. Only started with the
  # vllm-9b profile; published on host port 8000.
  vllm-server-qwen35-9b:
    profiles: ["vllm-9b"]
    image: vllm/vllm-openai:latest
    shm_size: '8gb'
    # Server arguments, one option per line.
    command: |
      --model QuantTrio/Qwen3.5-9B-AWQ
      --gpu-memory-utilization 0.926
      --tensor-parallel-size 1
      --max-num-seqs 1
      --reasoning-parser qwen3
      --max-model-len 16384
      --max-num-batched-tokens 2048
      --speculative-config '{"method":"mtp","num_speculative_tokens":3}'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Healthy once the models endpoint answers. The app service waits on this
    # through depends_on (condition: service_healthy).
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # 20-minute grace period before failed checks count against retries.
      start_period: 1200s
    ports:
      - "8000:8000"
    volumes:
      - hf-model-cache:/root/.cache/huggingface
    networks:
      redaction-net-vllm:
        # Shared alias so the app can use the same URL for either vLLM service.
        aliases:
          - vllm-inference

  # vLLM OpenAI-compatible server for the 27B model. Only started with the
  # vllm-27b profile; published on host port 8001 (container port 8000).
  vllm-server-qwen35-27b:
    profiles: ["vllm-27b"]
    image: vllm/vllm-openai:latest
    shm_size: '16gb'
    # Server arguments, one option per line.
    command: |
      --model QuantTrio/Qwen3.5-27B-AWQ
      --gpu-memory-utilization 0.94
      --tensor-parallel-size 1
      --max-num-seqs 2
      --reasoning-parser qwen3
      --max-model-len 16384
      --max-num-batched-tokens 4096
      --enforce-eager
      --kv-cache-dtype fp8
      --enable-chunked-prefill
      --enable-prefix-caching
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Same health check as the 9B service.
    healthcheck:
      test: ["CMD-SHELL", "curl -fsS http://localhost:8000/v1/models >/dev/null || exit 1"]
      interval: 30s
      timeout: 15s
      retries: 8
      # 20-minute grace period before failed checks count against retries.
      start_period: 1200s
    ports:
      - "8001:8000"
    volumes:
      - hf-model-cache:/root/.cache/huggingface
    networks:
      redaction-net-vllm:
        # Shared alias so the app can use the same URL for either vLLM service.
        aliases:
          - vllm-inference

  # Redaction app. Enabled for both profiles; it talks to whichever vLLM
  # service is running through the shared `vllm-inference` alias.
  redaction-app-vllm:
    # Pulls in the optional env_file defined under x-redaction-app-env.
    <<: *redaction-app-env
    profiles: ["vllm-9b", "vllm-27b"]
    image: redaction-app-main
    build:
      context: .  # Look in the current folder
      dockerfile: Dockerfile  # Use this file
      target: gradio  # Use the 'gradio' stage from your Dockerfile
      args:  # Pass your build-time variables here!
        - TORCH_GPU_ENABLED=False
        - INSTALL_VLM=False
        - PADDLE_GPU_ENABLED=True
        - INSTALL_PADDLEOCR=True
    shm_size: '8gb'
    # Both vLLM services are listed, but only the one in the active profile
    # exists at runtime; `required: false` makes each dependency optional.
    depends_on:
      vllm-server-qwen35-9b:
        condition: service_healthy
        required: false
      vllm-server-qwen35-27b:
        condition: service_healthy
        required: false
    # Values here override the same keys loaded from the optional env_file.
    environment:
      - FLAGS_fraction_of_gpu_memory_to_use=0.05
      - RUN_FASTAPI=True
      - APP_MODE=fastapi
      - SHOW_PADDLE_MODEL_OPTIONS=True
      - SHOW_LOCAL_OCR_MODEL_OPTIONS=True
      - SHOW_INFERENCE_SERVER_PII_OPTIONS=True
      - SHOW_INFERENCE_SERVER_VLM_OPTIONS=True
      - SHOW_HYBRID_MODELS=True
      - SHOW_DIFFICULT_OCR_EXAMPLES=True
      - SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True
      - SHOW_SUMMARISATION=True
      - SHOW_AWS_API_KEYS=True
      - DEFAULT_TEXT_EXTRACTION_MODEL=Local OCR model - PDFs without selectable text
      - DEFAULT_LOCAL_OCR_MODEL=paddle
      - DEFAULT_PII_DETECTION_MODEL=Local
      - CUSTOM_VLM_BACKEND=inference_vlm
      - MAX_WORKERS=12
      - TESSERACT_MAX_WORKERS=8
      - PADDLE_MAX_WORKERS=1  # Keep this to 1 to avoid VRAM overflow or errors
      - LOAD_PADDLE_AT_STARTUP=False
      - INFERENCE_SERVER_API_URL=http://vllm-inference:8000
      # Both model names must match the model served by vLLM. They default to
      # the 9B model; when running the vllm-27b profile, set
      # VLLM_OPENAI_MODEL=QuantTrio/Qwen3.5-27B-AWQ in .env rather than editing
      # these lines.
      - DEFAULT_INFERENCE_SERVER_VLM_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ}
      - DEFAULT_INFERENCE_SERVER_PII_MODEL=${VLLM_OPENAI_MODEL:-QuantTrio/Qwen3.5-9B-AWQ}
      - EFFICIENT_OCR=True
      - SHOW_CUSTOM_VLM_ENTITIES=True
      - SESSION_OUTPUT_FOLDER=True
      - SAVE_PAGE_OCR_VISUALISATIONS=False
      - HYBRID_OCR_CONFIDENCE_THRESHOLD=97
      - INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True
      - PREPROCESS_LOCAL_OCR_IMAGES=False
      - INFERENCE_SERVER_DISABLE_THINKING=True
      - MAX_NEW_TOKENS=8192
      - SAVE_EXAMPLE_HYBRID_IMAGES=False
      - SAVE_VLM_INPUT_IMAGES=False
      - VLM_MAX_DPI=200.0
      - DEFAULT_NEW_BATCH_CHAR_COUNT=1250
      - REPORT_VLM_OUTPUTS_TO_GUI=True
      - REPORT_LLM_OUTPUTS_TO_GUI=True
      - ADD_VLM_BOUNDING_BOX_RULES=False
      - VLM_DEFAULT_STREAM=False
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ports:
      - "7860:7860"
    networks:
      - redaction-net-vllm
# Bridge network shared by the app and the vLLM services.
networks:
  redaction-net-vllm:
    driver: bridge
# Named volume mounted at /root/.cache/huggingface in both vLLM services.
volumes:
  hf-model-cache: