# ===========================================================
# HuggingFace Spaces — Qwen2.5-7B-Instruct vLLM inference server
# SDK: Docker  |  Hardware: T4 small (~$0.40/hr, pause when idle)
#
# Exposes OpenAI-compatible API at port 7860:
#   POST /v1/chat/completions
#   GET  /v1/models
#
# Deploy to a SEPARATE HF Space (not the chatbot Space):
#   huggingface-cli repo create wanderlust-llm --type space --space-sdk docker
#
# Optional Space secrets (Settings → Repository secrets):
#   HF_TOKEN  — read token (optional, Qwen2.5 is public)
#
# After deploy, set in chatbot Space:
#   LLM_SELF_HOSTED_URL = https://<your-hf-username>-wanderlust-llm.hf.space
# ===========================================================

# CUDA 12.1 + cuDNN 8 runtime image on Ubuntu 22.04.
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

# Build-time only: keeps apt-get from prompting during `docker build`.
# Declared as ARG (not ENV) so it is visible to RUN steps in this stage
# without leaking into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment:
#   PYTHONUNBUFFERED — flush Python stdout/stderr immediately so logs show up live
#   HF_HOME          — Hugging Face cache root (model weights are stored under it)
ENV PYTHONUNBUFFERED=1 \
    HF_HOME=/app/.cache/huggingface

WORKDIR /app

# System deps + a dedicated Python 3.11 virtualenv.
#
# Why a venv: on Ubuntu 22.04 the distro `pip` (python3-pip) belongs to the
# default interpreter (python3.10), while `python` is linked to python3.11
# below. Installing with that `pip` would put vLLM into the 3.10
# site-packages, where `python -m vllm...` cannot import it. Creating the venv
# from python3.11 and putting it first on PATH guarantees that `python` and
# `pip` always refer to the same interpreter. The venv ships its own pip, so
# python3-pip is no longer needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    git \
    python3.11 \
    python3.11-dev \
    python3.11-venv \
    wget \
    && rm -rf /var/lib/apt/lists/* \
    && ln -s /usr/bin/python3.11 /usr/bin/python \
    && python3.11 -m venv /opt/venv

# Activate the venv for every later RUN step and for the container's CMD.
ENV VIRTUAL_ENV=/opt/venv \
    PATH=/opt/venv/bin:$PATH

# vLLM — handles model sharding, KV-cache, continuous batching
# Pin to a stable release; upgrade manually when needed.
# `python -m pip` ties the install to the interpreter that will run the server.
# The extras spec is quoted so the shell never treats `[standard]` as a glob.
# NOTE(review): huggingface_hub / fastapi / uvicorn are unpinned — consider
# pinning them once known-good versions for this vLLM release are confirmed.
RUN python -m pip install --no-cache-dir \
    vllm==0.4.3 \
    huggingface_hub \
    fastapi \
    "uvicorn[standard]"

# Run as an unprivileged user with UID 1000 — the UID Hugging Face Spaces uses
# for Docker containers. Creating it before the download means the HF cache
# under /app is owned by (and writable for) the same user that serves requests,
# so a runtime re-download or cache update does not hit permission errors.
RUN useradd --create-home --uid 1000 user \
    && chown -R user:user /app
USER user
ENV HOME=/home/user

# Pre-download model weights at build time so the first request is fast.
#
# The token is read from a BuildKit secret instead of ARG/ENV, so it is never
# written into an image layer, the image config, or `docker history`.
#   - HF Spaces: define a Space secret named HF_TOKEN.
#   - Local:     HF_TOKEN=hf_xxx docker build --secret id=HF_TOKEN,env=HF_TOKEN .
# The secret is optional (required=false): without it the download is
# anonymous, which is enough for public models. mode=0444 lets the non-root
# build user read the mounted file. Requires BuildKit.
# A failed download is deliberately non-fatal: the build continues and the
# weights are fetched when the server starts instead.
RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=false \
    HF_TOKEN="$(cat /run/secrets/HF_TOKEN 2>/dev/null || true)" \
    python -c "\
from huggingface_hub import snapshot_download; \
import os; \
token = os.getenv('HF_TOKEN') or None; \
snapshot_download('Qwen/Qwen2.5-7B-Instruct', token=token, ignore_patterns=['*.msgpack','*.h5']); \
print('Model downloaded.')" || echo "⚠️  Model download failed — will retry at runtime"

# Copy the thin wrapper that adds the HF Spaces health check + startup script.
# Kept AFTER the model download on purpose: editing files in llm_server/ then
# only rebuilds this small layer instead of invalidating the multi-GB weights
# layer above.
COPY --chown=user:user llm_server/ ./llm_server/

# HF Spaces requires port 7860
EXPOSE 7860

# Container health probe against the vLLM API server's /health route.
# The long start period covers model loading (and a possible runtime download
# of the weights) before failed probes start counting.
# Uses the curl binary installed with the system packages.
HEALTHCHECK --interval=30s --timeout=5s --start-period=10m --retries=3 \
    CMD curl -fsS http://localhost:7860/health || exit 1

# Launch vLLM OpenAI-compatible server
# --gpu-memory-utilization 0.90  : leave 10% headroom
# --max-model-len 4096           : cap context to save VRAM (T4 small = 16GB)
# --dtype half                   : fp16 (the T4 has no bfloat16 support)
# --served-model-name qwen2.5-7b : model id clients pass in the "model" field
# The chat template ships with the model's tokenizer config, so no
# --chat-template flag is needed.
# NOTE(review): ~7.6B parameters at 2 bytes each is roughly 15 GB of fp16
# weights — confirm this really fits a 16 GB T4 at 0.90 utilisation with room
# for the KV cache; if not, switch to a quantized checkpoint.
CMD ["python", "-m", "vllm.entrypoints.openai.api_server", \
     "--model", "Qwen/Qwen2.5-7B-Instruct", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--dtype", "half", \
     "--max-model-len", "4096", \
     "--gpu-memory-utilization", "0.90", \
     "--served-model-name", "qwen2.5-7b"]