diff --git a/docker/Dockerfile.lmcache b/docker/Dockerfile.lmcache new file mode 100644 index 0000000..5ebef07 --- /dev/null +++ b/docker/Dockerfile.lmcache @@ -0,0 +1,18 @@ +ARG LMCACHE_VERSION=latest +FROM lmcache/vllm-openai:${LMCACHE_VERSION} + +COPY . /opt/contextpilot +WORKDIR /opt/contextpilot + +RUN /opt/venv/bin/python3 -m ensurepip && \ + /opt/venv/bin/python3 -m pip install --no-cache-dir . && \ + /opt/venv/bin/python3 contextpilot/install_hook.py + +ENV CONTEXTPILOT_INDEX_URL=http://localhost:8765 +EXPOSE 8000 8765 + +COPY docker/entrypoint-vllm.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] +CMD ["Qwen/Qwen2.5-7B-Instruct", "--enable-prefix-caching", "--kv-transfer-config", "{\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}"] diff --git a/docs/getting_started/docker.md b/docs/getting_started/docker.md index f48d0c3..289359b 100644 --- a/docs/getting_started/docker.md +++ b/docs/getting_started/docker.md @@ -40,8 +40,9 @@ Single container with both the engine and ContextPilot server. ### Build ```bash -docker build -t contextpilot-sglang -f docker/Dockerfile.sglang . -docker build -t contextpilot-vllm -f docker/Dockerfile.vllm . +docker build -t contextpilot-sglang -f docker/Dockerfile.sglang . +docker build -t contextpilot-vllm -f docker/Dockerfile.vllm . +docker build -t contextpilot-lmcache -f docker/Dockerfile.lmcache . ``` Pin a specific engine version: @@ -75,6 +76,40 @@ docker run --gpus all --ipc=host \ Everything after the image name is passed to the engine. Defaults are `Qwen/Qwen3.5-2B` for both images. +**vLLM + LMCache (KV cache CPU offloading):** + +[LMCache](https://github.com/LMCache/LMCache) offloads KV cache to CPU/disk so evicted prefixes can be restored without recomputation. ContextPilot works with LMCache out of the box — the `BlockPool` hook is unaffected. + +```bash +docker build -t contextpilot-lmcache -f docker/Dockerfile.lmcache . 
+``` + +Pin a specific LMCache version: + +```bash +docker build -t contextpilot-lmcache -f docker/Dockerfile.lmcache --build-arg LMCACHE_VERSION=v0.3.0 . +``` + +Run: + +```bash +docker run --gpus all --ipc=host \ + -p 8000:8000 -p 8765:8765 \ + -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN \ + contextpilot-lmcache +``` + +Override the model or LMCache config: + +```bash +docker run --gpus all --ipc=host \ + -p 8000:8000 -p 8765:8765 \ + -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN \ + contextpilot-lmcache \ + Qwen/Qwen3-4B --enable-prefix-caching \ + --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}' +``` + ## GPU Selection ```bash diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 3365b08..3a2a33d 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -175,6 +175,19 @@ python -m vllm.entrypoints.openai.api_server \ --enable-prefix-caching ``` +**vLLM + LMCache (optional KV cache CPU offloading):** + +[LMCache](https://github.com/LMCache/LMCache) offloads evicted KV cache to CPU/disk so prefixes can be restored without recomputation. Just install it and add the `--kv-transfer-config` flag — ContextPilot works with LMCache out of the box. + +```bash +pip install lmcache +python -m vllm.entrypoints.openai.api_server \ + --model Qwen/Qwen3-4B \ + --port 30000 \ + --enable-prefix-caching \ + --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}' +``` + > **Note:** For eviction sync, prefix with `CONTEXTPILOT_INDEX_URL=http://localhost:8765`. This lets the inference engine notify ContextPilot when KV cache entries are evicted. ## Step 2: Start ContextPilot