# syntax=docker/dockerfile:1
# ===========================================================
# HuggingFace Spaces — Qwen2.5-7B-Instruct vLLM inference server
# SDK: Docker | Hardware: T4 small (~$0.40/hr, pause when idle)
#
# Exposes OpenAI-compatible API at port 7860:
#   POST /v1/chat/completions
#   GET  /v1/models
#
# Deploy to a SEPARATE HF Space (not the chatbot Space):
#   huggingface-cli repo create wanderlust-llm --type space --space-sdk docker
#
# Space secrets (Settings → Repository secrets):
#   HF_TOKEN — read token (optional, Qwen2.5 is public)
#
# After deploy, set in chatbot Space:
#   LLM_SELF_HOSTED_URL = https://<your-username>-wanderlust-llm.hf.space
# ===========================================================

FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

# Build-time only: keeps apt non-interactive without leaking into the runtime env.
ARG DEBIAN_FRONTEND=noninteractive

# The virtualenv is put first on PATH so that `python` and `pip` always refer
# to the same Python 3.11 interpreter.
ENV PYTHONUNBUFFERED=1 \
    HF_HOME=/app/.cache/huggingface \
    VIRTUAL_ENV=/opt/venv \
    PATH=/opt/venv/bin:$PATH

# System deps.
# python3-pip is intentionally NOT installed: on Ubuntu 22.04 it belongs to the
# distro's default python3 (3.10), so packages installed through it would not be
# importable from python3.11. pip comes from the 3.11 virtualenv instead.
# HF Spaces runs the container as UID 1000, so that user must own /app (which
# holds the HF cache) and the virtualenv.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        git \
        python3.11 \
        python3.11-dev \
        python3.11-venv \
        wget \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3.11 /usr/bin/python \
    && useradd --create-home --uid 1000 user \
    && mkdir -p /app /opt/venv \
    && chown user:user /app /opt/venv

WORKDIR /app

# Everything below runs unprivileged, so files created from here on (venv,
# model cache) are writable by the runtime user.
USER user

# vLLM — handles model sharding, KV-cache, continuous batching
# Pin to a stable release; upgrade manually when needed
# NOTE(review): huggingface_hub, fastapi and uvicorn are unpinned; consider
# pinning them for reproducible builds.
RUN python3.11 -m venv "$VIRTUAL_ENV" \
    && "$VIRTUAL_ENV/bin/python" -m pip install --no-cache-dir \
        vllm==0.4.3 \
        huggingface_hub \
        fastapi \
        "uvicorn[standard]"

# Copy the thin wrapper that adds the HF Spaces health check + startup script
# NOTE(review): CMD below launches vLLM directly and never references
# llm_server/ — confirm whether the wrapper is supposed to be the entrypoint.
COPY --chown=user:user llm_server/ ./llm_server/

# Pre-download model weights at build time so the first request is fast.
# HF_TOKEN is read from a BuildKit secret mount (HF Spaces exposes Space secrets
# this way at build time), so the token is never stored in an image layer, in
# the image's ENV, or in `docker history`. mode=0444 lets the non-root build
# user read it. The secret is optional: when it is absent the token is empty
# and the download proceeds anonymously.
# The download is best-effort on purpose: a failure does not fail the build.
RUN --mount=type=secret,id=HF_TOKEN,mode=0444 \
    HF_TOKEN="$(cat /run/secrets/HF_TOKEN 2>/dev/null || true)" \
    python -c "\
from huggingface_hub import snapshot_download; \
import os; \
token = os.getenv('HF_TOKEN') or None; \
snapshot_download('Qwen/Qwen2.5-7B-Instruct', token=token, ignore_patterns=['*.msgpack','*.h5']); \
print('Model downloaded.')" \
    || echo "⚠️ Model download failed — will retry at runtime"

# HF Spaces requires port 7860
EXPOSE 7860

# Launch vLLM OpenAI-compatible server
#   --gpu-memory-utilization 0.90 : leave 10% headroom
#   --max-model-len 4096          : cap context to save VRAM (T4 small = 16GB)
#   --dtype half                  : fp16 (T4 has no bf16 support)
# No --chat-template is passed; the server is expected to use the chat template
# shipped with the model's tokenizer.
# NOTE(review): a ~7.6B-parameter model in fp16 is roughly 15 GB of weights,
# which likely exceeds 90% of a 16 GB T4 before any KV-cache is allocated.
# Verify this actually loads; if not, switch to a quantized checkpoint
# (e.g. an AWQ variant) or larger hardware.
CMD ["python", "-m", "vllm.entrypoints.openai.api_server", \
     "--model", "Qwen/Qwen2.5-7B-Instruct", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--dtype", "half", \
     "--max-model-len", "4096", \
     "--gpu-memory-utilization", "0.90", \
     "--served-model-name", "qwen2.5-7b"]