From 67d4f454f23d22773e40a71a28a987d313c137b1 Mon Sep 17 00:00:00 2001 From: tsuiusi Date: Tue, 10 Mar 2026 15:04:54 +0100 Subject: [PATCH 01/10] lmcache docker --- docker/Dockerfile.lmcache | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 docker/Dockerfile.lmcache diff --git a/docker/Dockerfile.lmcache b/docker/Dockerfile.lmcache new file mode 100644 index 0000000..63146de --- /dev/null +++ b/docker/Dockerfile.lmcache @@ -0,0 +1,17 @@ +ARG LMCACHE_VERSION=latest +FROM lmcache/vllm-openai:${LMCACHE_VERSION} + +COPY . /opt/contextpilot +WORKDIR /opt/contextpilot + +RUN pip install --no-cache-dir . && \ + python3 -m contextpilot.install_hook + +ENV CONTEXTPILOT_INDEX_URL=http://localhost:8765 +EXPOSE 8000 8765 + +COPY docker/entrypoint-vllm.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] +CMD ["Qwen/Qwen2.5-7B-Instruct", "--enable-prefix-caching", "--kv-transfer-config", "{\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}"] From 2e4d57dd6b52d766c8cbab0134fd5d228e1d50ec Mon Sep 17 00:00:00 2001 From: tsuiusi Date: Tue, 10 Mar 2026 15:24:05 +0100 Subject: [PATCH 02/10] lmcache support --- docker/Dockerfile.lmcache | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.lmcache b/docker/Dockerfile.lmcache index 63146de..0038b76 100644 --- a/docker/Dockerfile.lmcache +++ b/docker/Dockerfile.lmcache @@ -4,7 +4,7 @@ FROM lmcache/vllm-openai:${LMCACHE_VERSION} COPY . /opt/contextpilot WORKDIR /opt/contextpilot -RUN pip install --no-cache-dir . && \ +RUN pip install --no-cache-dir --break-system-packages . && \ python3 -m contextpilot.install_hook ENV CONTEXTPILOT_INDEX_URL=http://localhost:8765 From 266a130185b4b15cf6d87baef9e71445ddd5843b Mon Sep 17 00:00:00 2001 From: tsuiusi Date: Tue, 10 Mar 2026 15:59:08 +0100 Subject: [PATCH 03/10] fix --- docker/Dockerfile.lmcache | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.lmcache b/docker/Dockerfile.lmcache index 0038b76..0bc04b4 100644 --- a/docker/Dockerfile.lmcache +++ b/docker/Dockerfile.lmcache @@ -4,8 +4,8 @@ FROM lmcache/vllm-openai:${LMCACHE_VERSION} COPY . /opt/contextpilot WORKDIR /opt/contextpilot -RUN pip install --no-cache-dir --break-system-packages . && \ - python3 -m contextpilot.install_hook +RUN /opt/venv/bin/pip install --no-cache-dir . && \ + /opt/venv/bin/python3 -m contextpilot.install_hook ENV CONTEXTPILOT_INDEX_URL=http://localhost:8765 EXPOSE 8000 8765 From dab83ec98dc7148e03f5989b961fd024759a393c Mon Sep 17 00:00:00 2001 From: tsuiusi Date: Tue, 10 Mar 2026 16:05:28 +0100 Subject: [PATCH 04/10] Use venv python3 -m pip in LMCache Dockerfile Co-Authored-By: Claude Opus 4.6 --- docker/Dockerfile.lmcache | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.lmcache b/docker/Dockerfile.lmcache index 0bc04b4..bdfe428 100644 --- a/docker/Dockerfile.lmcache +++ b/docker/Dockerfile.lmcache @@ -4,7 +4,7 @@ FROM lmcache/vllm-openai:${LMCACHE_VERSION} COPY . /opt/contextpilot WORKDIR /opt/contextpilot -RUN /opt/venv/bin/pip install --no-cache-dir . && \ +RUN /opt/venv/bin/python3 -m pip install --no-cache-dir . && \ /opt/venv/bin/python3 -m contextpilot.install_hook ENV CONTEXTPILOT_INDEX_URL=http://localhost:8765 From 88ca651a8276d95c09f273338e75350be6f939e8 Mon Sep 17 00:00:00 2001 From: tsuiusi Date: Tue, 10 Mar 2026 16:19:22 +0100 Subject: [PATCH 05/10] Use system pip + direct script for LMCache Dockerfile The lmcache/vllm-openai image has no pip in its venv. System pip works but python3 -m contextpilot.install_hook triggers the full package __init__.py import chain. Run install_hook.py directly as a script to avoid this. Co-Authored-By: Claude Opus 4.6 --- docker/Dockerfile.lmcache | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.lmcache b/docker/Dockerfile.lmcache index bdfe428..5a513e5 100644 --- a/docker/Dockerfile.lmcache +++ b/docker/Dockerfile.lmcache @@ -4,8 +4,8 @@ FROM lmcache/vllm-openai:${LMCACHE_VERSION} COPY . /opt/contextpilot WORKDIR /opt/contextpilot -RUN /opt/venv/bin/python3 -m pip install --no-cache-dir . && \ - /opt/venv/bin/python3 -m contextpilot.install_hook +RUN pip install --no-cache-dir --break-system-packages . && \ + python3 contextpilot/install_hook.py ENV CONTEXTPILOT_INDEX_URL=http://localhost:8765 EXPOSE 8000 8765 From e07c323ce7f1bfe01185f1be9ed713468190ff2f Mon Sep 17 00:00:00 2001 From: tsuiusi Date: Tue, 10 Mar 2026 20:30:20 +0100 Subject: [PATCH 06/10] Install contextpilot into venv, not system Python The lmcache image runs python3 from /opt/venv but had no pip there. Use ensurepip to bootstrap pip into the venv, then install everything into the same environment that runs at runtime. Co-Authored-By: Claude Opus 4.6 --- docker/Dockerfile.lmcache | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.lmcache b/docker/Dockerfile.lmcache index 5a513e5..5ebef07 100644 --- a/docker/Dockerfile.lmcache +++ b/docker/Dockerfile.lmcache @@ -4,8 +4,9 @@ FROM lmcache/vllm-openai:${LMCACHE_VERSION} COPY . /opt/contextpilot WORKDIR /opt/contextpilot -RUN pip install --no-cache-dir --break-system-packages . && \ - python3 contextpilot/install_hook.py +RUN /opt/venv/bin/python3 -m ensurepip && \ + /opt/venv/bin/python3 -m pip install --no-cache-dir . && \ + /opt/venv/bin/python3 contextpilot/install_hook.py ENV CONTEXTPILOT_INDEX_URL=http://localhost:8765 EXPOSE 8000 8765 From 2421feccf87f577c0da2b09d748b8a12f260d2b9 Mon Sep 17 00:00:00 2001 From: tsuiusi Date: Tue, 10 Mar 2026 20:50:59 +0100 Subject: [PATCH 07/10] docs --- docs/getting_started/docker.md | 39 ++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/docs/getting_started/docker.md b/docs/getting_started/docker.md index f48d0c3..289359b 100644 --- a/docs/getting_started/docker.md +++ b/docs/getting_started/docker.md @@ -40,8 +40,9 @@ Single container with both the engine and ContextPilot server. ### Build ```bash -docker build -t contextpilot-sglang -f docker/Dockerfile.sglang . -docker build -t contextpilot-vllm -f docker/Dockerfile.vllm . +docker build -t contextpilot-sglang -f docker/Dockerfile.sglang . +docker build -t contextpilot-vllm -f docker/Dockerfile.vllm . +docker build -t contextpilot-lmcache -f docker/Dockerfile.lmcache . ``` Pin a specific engine version: @@ -75,6 +76,40 @@ docker run --gpus all --ipc=host \ Everything after the image name is passed to the engine. Defaults are `Qwen/Qwen3.5-2B` for both images. +**vLLM + LMCache (KV cache CPU offloading):** + +[LMCache](https://github.com/LMCache/LMCache) offloads KV cache to CPU/disk so evicted prefixes can be restored without recomputation. ContextPilot works with LMCache out of the box — the `BlockPool` hook is unaffected. + +```bash +docker build -t contextpilot-lmcache -f docker/Dockerfile.lmcache . +``` + +Pin a specific LMCache version: + +```bash +docker build -t contextpilot-lmcache -f docker/Dockerfile.lmcache --build-arg LMCACHE_VERSION=latest . +``` + +Run: + +```bash +docker run --gpus all --ipc=host \ + -p 8000:8000 -p 8765:8765 \ + -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN \ + contextpilot-lmcache +``` + +Override the model or LMCache config: + +```bash +docker run --gpus all --ipc=host \ + -p 8000:8000 -p 8765:8765 \ + -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN \ + contextpilot-lmcache \ + Qwen/Qwen3-4B --enable-prefix-caching \ + --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}' +``` + ## GPU Selection ```bash From df76f6c41edae01310f132df360602ff9321be74 Mon Sep 17 00:00:00 2001 From: tsuiusi Date: Tue, 10 Mar 2026 23:25:54 +0100 Subject: [PATCH 08/10] docs --- docs/getting_started/quickstart.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 3365b08..3a2a33d 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -175,6 +175,19 @@ python -m vllm.entrypoints.openai.api_server \ --enable-prefix-caching ``` +**vLLM + LMCache (optional KV cache CPU offloading):** + +[LMCache](https://github.com/LMCache/LMCache) offloads evicted KV cache to CPU/disk so prefixes can be restored without recomputation. Just install it and add the `--kv-transfer-config` flag — ContextPilot works with LMCache out of the box. + +```bash +pip install lmcache +python -m vllm.entrypoints.openai.api_server \ + --model Qwen/Qwen3-4B \ + --port 30000 \ + --enable-prefix-caching \ + --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}' +``` + > **Note:** For eviction sync, prefix with `CONTEXTPILOT_INDEX_URL=http://localhost:8765`. This lets the inference engine notify ContextPilot when KV cache entries are evicted. ## Step 2: Start ContextPilot From 1b698cdb53f73a390eb3fb4eebf92b4b7e46b917 Mon Sep 17 00:00:00 2001 From: tsuiusi Date: Tue, 10 Mar 2026 23:29:14 +0100 Subject: [PATCH 09/10] docs --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index 69c8521..0f15843 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,22 @@ xcode-select --install # one-time: provides clang++ to compile the native hoo More [detailed installation instructions](https://efficientcontext.github.io/contextpilot-docs/getting_started/installation) are available in the docs. +--- + +### LMCache (Optional KV Cache CPU Offloading) + +[LMCache](https://github.com/LMCache/LMCache) offloads evicted KV cache to CPU/disk so prefixes can be restored without recomputation. ContextPilot works with LMCache out of the box — just install it and add one flag: + +```bash +pip install lmcache +vllm serve Qwen/Qwen3-4B --enable-prefix-caching \ + --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}' +``` + +Docker images are also available — see the [Quick Start](https://efficientcontext.github.io/contextpilot-docs/getting_started/quickstart) and [Docker guide](https://efficientcontext.github.io/contextpilot-docs/getting_started/docker) for details. + +--- + Docker images are also available for both all-in-one and standalone deployment. See the [Docker guide](https://efficientcontext.github.io/contextpilot-docs/getting_started/docker). ## Getting Started From 4e2bab88d4f580b7711a4486836d1511453e070a Mon Sep 17 00:00:00 2001 From: tsuiusi Date: Tue, 10 Mar 2026 23:37:08 +0100 Subject: [PATCH 10/10] removed unnecessary docs --- README.md | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/README.md b/README.md index 0f15843..69c8521 100644 --- a/README.md +++ b/README.md @@ -131,22 +131,6 @@ xcode-select --install # one-time: provides clang++ to compile the native hoo More [detailed installation instructions](https://efficientcontext.github.io/contextpilot-docs/getting_started/installation) are available in the docs. ---- - -### LMCache (Optional KV Cache CPU Offloading) - -[LMCache](https://github.com/LMCache/LMCache) offloads evicted KV cache to CPU/disk so prefixes can be restored without recomputation. ContextPilot works with LMCache out of the box — just install it and add one flag: - -```bash -pip install lmcache -vllm serve Qwen/Qwen3-4B --enable-prefix-caching \ - --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}' -``` - -Docker images are also available — see the [Quick Start](https://efficientcontext.github.io/contextpilot-docs/getting_started/quickstart) and [Docker guide](https://efficientcontext.github.io/contextpilot-docs/getting_started/docker) for details. - ---- - Docker images are also available for both all-in-one and standalone deployment. See the [Docker guide](https://efficientcontext.github.io/contextpilot-docs/getting_started/docker). ## Getting Started