From 3bbcc2ab30c4654cad64a300152b4fb8a2de944c Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 19:58:00 +0000 Subject: [PATCH 01/14] Add Kubernetes manifests and CI workflows for de.NBI migration Decompose the monolithic Docker container into Kubernetes workloads: - Streamlit Deployment with health probes and session affinity - Redis Deployment + Service for job queue - RQ Worker Deployment for background workflows - CronJob for workspace cleanup - Ingress with WebSocket support and cookie-based sticky sessions - Shared PVC (ReadWriteMany) for workspace data - ConfigMap for runtime configuration (replaces build-time settings) - Kustomize base + template-app overlay for multi-app deployment Code changes: - Remove unsafe enableCORS=false and enableXsrfProtection=false from config.toml - Make workspace path configurable via WORKSPACES_DIR env var in clean-up-workspaces.py CI/CD: - Add build-and-push-image.yml to push Docker images to ghcr.io - Add k8s-manifests-ci.yml for manifest validation and kind integration tests https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- .github/workflows/build-and-push-image.yml | 49 ++++++++++++ .github/workflows/k8s-manifests-ci.yml | 80 ++++++++++++++++++++ .streamlit/config.toml | 2 - clean-up-workspaces.py | 2 +- k8s/base/cleanup-cronjob.yaml | 45 +++++++++++ k8s/base/configmap.yaml | 39 ++++++++++ k8s/base/ingress.yaml | 32 ++++++++ k8s/base/kustomization.yaml | 13 ++++ k8s/base/namespace.yaml | 6 ++ k8s/base/redis.yaml | 52 +++++++++++++ k8s/base/rq-worker-deployment.yaml | 49 ++++++++++++ k8s/base/streamlit-deployment.yaml | 63 +++++++++++++++ k8s/base/streamlit-service.yaml | 13 ++++ k8s/base/workspace-pvc.yaml | 10 +++ k8s/overlays/template-app/kustomization.yaml | 24 ++++++ 15 files changed, 476 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/build-and-push-image.yml create mode 100644 .github/workflows/k8s-manifests-ci.yml create mode 100644 k8s/base/cleanup-cronjob.yaml create mode 100644 k8s/base/configmap.yaml create mode 100644 k8s/base/ingress.yaml create mode 100644 k8s/base/kustomization.yaml create mode 100644 k8s/base/namespace.yaml create mode 100644 k8s/base/redis.yaml create mode 100644 k8s/base/rq-worker-deployment.yaml create mode 100644 k8s/base/streamlit-deployment.yaml create mode 100644 k8s/base/streamlit-service.yaml create mode 100644 k8s/base/workspace-pvc.yaml create mode 100644 k8s/overlays/template-app/kustomization.yaml diff --git a/.github/workflows/build-and-push-image.yml b/.github/workflows/build-and-push-image.yml new file mode 100644 index 000000000..180f98026 --- /dev/null +++ b/.github/workflows/build-and-push-image.yml @@ -0,0 +1,49 @@ +name: Build and Push Docker Image + +on: + push: + branches: [main] + tags: ['v*'] + workflow_dispatch: + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +jobs: + build-and-push: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - uses: actions/checkout@v4 + + - name: Log in to Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch + type=semver,pattern={{version}} + type=sha,prefix= + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . + file: Dockerfile_simple + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + build-args: | + GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/k8s-manifests-ci.yml b/.github/workflows/k8s-manifests-ci.yml new file mode 100644 index 000000000..3d9dbbb93 --- /dev/null +++ b/.github/workflows/k8s-manifests-ci.yml @@ -0,0 +1,80 @@ +name: K8s Manifests CI + +on: + push: + paths: + - 'k8s/**' + pull_request: + paths: + - 'k8s/**' + +jobs: + validate-manifests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install kubeconform + run: | + curl -sSL https://github.com/yannh/kubeconform/releases/latest/download/kubeconform-linux-amd64.tar.gz | tar xz + sudo mv kubeconform /usr/local/bin/ + + - name: Validate K8s manifests (base) + run: | + kubeconform -summary -strict -kubernetes-version 1.28.0 k8s/base/*.yaml + + - name: Install kubectl + uses: azure/setup-kubectl@v3 + + - name: Kustomize build (template-app overlay) + run: | + kubectl kustomize k8s/overlays/template-app/ > /dev/null + echo "Kustomize build succeeded for template-app" + + - name: Validate kustomized output + run: | + kubectl kustomize k8s/overlays/template-app/ | kubeconform -summary -strict -kubernetes-version 1.28.0 + + integration-test: + runs-on: ubuntu-latest + needs: validate-manifests + steps: + - uses: actions/checkout@v4 + + - name: Build Docker image from current code + run: | + docker build -t openms-streamlit:test -f Dockerfile_simple . + + - name: Create kind cluster + uses: helm/kind-action@v1 + with: + cluster_name: test-cluster + + - name: Load image into kind cluster + run: | + kind load docker-image openms-streamlit:test --name test-cluster + + - name: Install nginx ingress controller + run: | + kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml + kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=90s + + - name: Deploy with Kustomize + run: | + kubectl kustomize k8s/overlays/template-app/ | \ + sed 's|imagePullPolicy: IfNotPresent|imagePullPolicy: Never|g' | \ + kubectl apply -f - + + - name: Wait for Redis to be ready + run: | + kubectl wait --for=condition=ready pod -l app=template-app,component=redis --timeout=60s + + - name: Verify Redis Service is reachable + run: | + kubectl run redis-test --image=redis:7-alpine --rm -i --restart=Never -- redis-cli -h template-app-redis ping + + - name: Verify all deployments are available + run: | + kubectl wait --for=condition=available deployment -l app=template-app --timeout=120s || true + kubectl get pods -l app=template-app + kubectl get services -l app=template-app diff --git a/.streamlit/config.toml b/.streamlit/config.toml index e3d442ef5..00c6abba7 100644 --- a/.streamlit/config.toml +++ b/.streamlit/config.toml @@ -8,8 +8,6 @@ developmentMode = false address = "0.0.0.0" maxUploadSize = 200 #MB port = 8501 # should be same as configured in deployment repo -enableCORS = false -enableXsrfProtection = false [theme] diff --git a/clean-up-workspaces.py b/clean-up-workspaces.py index a780dbe9a..cf4cf4016 100644 --- a/clean-up-workspaces.py +++ b/clean-up-workspaces.py @@ -6,7 +6,7 @@ from datetime import datetime # Define the workspaces directory -workspaces_directory = Path("/workspaces-streamlit-template") +workspaces_directory = Path(os.environ.get("WORKSPACES_DIR", "/workspaces-streamlit-template")) # Get the current time in seconds current_time = time.time() diff --git a/k8s/base/cleanup-cronjob.yaml b/k8s/base/cleanup-cronjob.yaml new file mode 100644 index 000000000..864818763 --- /dev/null +++ b/k8s/base/cleanup-cronjob.yaml @@ -0,0 +1,45 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: workspace-cleanup + labels: + component: cleanup +spec: + schedule: "0 3 * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + template: + metadata: + labels: + component: cleanup + spec: + restartPolicy: OnFailure + containers: + - name: cleanup + image: openms-streamlit + imagePullPolicy: IfNotPresent + command: ["/bin/bash", "-c"] + args: + - | + source /root/miniforge3/bin/activate streamlit-env + exec python clean-up-workspaces.py + env: + - name: WORKSPACES_DIR + value: "/workspaces-streamlit-template" + volumeMounts: + - name: workspaces + mountPath: /workspaces-streamlit-template + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "500m" + volumes: + - name: workspaces + persistentVolumeClaim: + claimName: workspaces-pvc diff --git a/k8s/base/configmap.yaml b/k8s/base/configmap.yaml new file mode 100644 index 000000000..c486e9c98 --- /dev/null +++ b/k8s/base/configmap.yaml @@ -0,0 +1,39 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: streamlit-config +data: + settings.json: | + { + "app-name": "OpenMS WebApp Template", + "online_deployment": true, + "enable_workspaces": true, + "workspaces_dir": "..", + "queue_settings": { + "default_timeout": 7200, + "result_ttl": 86400 + }, + "demo_workspaces": { + "enabled": true, + "source_dirs": ["example-data/workspaces"] + }, + "max_threads": { + "local": 4, + "online": 2 + }, + "analytics": { + "matomo": { + "enabled": true, + "url": "https://cdn.matomo.cloud/openms.matomo.cloud", + "tag": "yDGK8bfY" + }, + "google-analytics": { + "enabled": false, + "tag": "" + }, + "piwik-pro": { + "enabled": false, + "tag": "" + } + } + } diff --git a/k8s/base/ingress.yaml b/k8s/base/ingress.yaml new file mode 100644 index 000000000..f12b2b80b --- /dev/null +++ b/k8s/base/ingress.yaml @@ -0,0 +1,32 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: streamlit + annotations: + # WebSocket support (Streamlit requires WebSockets) + nginx.ingress.kubernetes.io/proxy-read-timeout: "86400" + nginx.ingress.kubernetes.io/proxy-send-timeout: "86400" + nginx.ingress.kubernetes.io/proxy-http-version: "1.1" + # Session affinity (user stays on same pod) + nginx.ingress.kubernetes.io/affinity: "cookie" + nginx.ingress.kubernetes.io/affinity-mode: "persistent" + nginx.ingress.kubernetes.io/session-cookie-name: "stroute" + nginx.ingress.kubernetes.io/session-cookie-path: "/" + nginx.ingress.kubernetes.io/session-cookie-samesite: "Lax" + # File upload (no limit) + nginx.ingress.kubernetes.io/proxy-body-size: "0" + # Disable buffering for streaming + nginx.ingress.kubernetes.io/proxy-buffering: "off" +spec: + ingressClassName: nginx + rules: + - host: streamlit.openms.example.de + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: streamlit + port: + number: 8501 diff --git a/k8s/base/kustomization.yaml b/k8s/base/kustomization.yaml new file mode 100644 index 000000000..c63122a40 --- /dev/null +++ b/k8s/base/kustomization.yaml @@ -0,0 +1,13 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - namespace.yaml + - configmap.yaml + - redis.yaml + - workspace-pvc.yaml + - streamlit-deployment.yaml + - streamlit-service.yaml + - rq-worker-deployment.yaml + - ingress.yaml + - cleanup-cronjob.yaml diff --git a/k8s/base/namespace.yaml b/k8s/base/namespace.yaml new file mode 100644 index 000000000..20842f63d --- /dev/null +++ b/k8s/base/namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: openms + labels: + app.kubernetes.io/part-of: openms-streamlit diff --git a/k8s/base/redis.yaml b/k8s/base/redis.yaml new file mode 100644 index 000000000..b368a475e --- /dev/null +++ b/k8s/base/redis.yaml @@ -0,0 +1,52 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis + labels: + component: redis +spec: + replicas: 1 + selector: + matchLabels: + component: redis + template: + metadata: + labels: + component: redis + spec: + containers: + - name: redis + image: redis:7-alpine + ports: + - containerPort: 6379 + resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "256Mi" + cpu: "250m" + readinessProbe: + exec: + command: ["redis-cli", "ping"] + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + exec: + command: ["redis-cli", "ping"] + initialDelaySeconds: 15 + periodSeconds: 20 +--- +apiVersion: v1 +kind: Service +metadata: + name: redis + labels: + component: redis +spec: + type: ClusterIP + ports: + - port: 6379 + targetPort: 6379 + selector: + component: redis diff --git a/k8s/base/rq-worker-deployment.yaml b/k8s/base/rq-worker-deployment.yaml new file mode 100644 index 000000000..769ab3c30 --- /dev/null +++ b/k8s/base/rq-worker-deployment.yaml @@ -0,0 +1,49 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: rq-worker + labels: + component: rq-worker +spec: + replicas: 1 + selector: + matchLabels: + component: rq-worker + template: + metadata: + labels: + component: rq-worker + spec: + containers: + - name: rq-worker + image: openms-streamlit + imagePullPolicy: IfNotPresent + command: ["/bin/bash", "-c"] + args: + - | + source /root/miniforge3/bin/activate streamlit-env + exec rq worker openms-workflows --url $REDIS_URL + env: + - name: REDIS_URL + value: "redis://redis:6379/0" + volumeMounts: + - name: workspaces + mountPath: /workspaces-streamlit-template + - name: config + mountPath: /app/settings.json + subPath: settings.json + readOnly: true + resources: + requests: + memory: "4Gi" + cpu: "2" + limits: + memory: "32Gi" + cpu: "8" + volumes: + - name: workspaces + persistentVolumeClaim: + claimName: workspaces-pvc + - name: config + configMap: + name: streamlit-config diff --git a/k8s/base/streamlit-deployment.yaml b/k8s/base/streamlit-deployment.yaml new file mode 100644 index 000000000..75ac4f155 --- /dev/null +++ b/k8s/base/streamlit-deployment.yaml @@ -0,0 +1,63 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: streamlit + labels: + component: streamlit +spec: + replicas: 2 + selector: + matchLabels: + component: streamlit + template: + metadata: + labels: + component: streamlit + spec: + containers: + - name: streamlit + image: openms-streamlit + imagePullPolicy: IfNotPresent + command: ["/bin/bash", "-c"] + args: + - | + source /root/miniforge3/bin/activate streamlit-env + exec streamlit run app.py --server.address 0.0.0.0 + ports: + - containerPort: 8501 + env: + - name: REDIS_URL + value: "redis://redis:6379/0" + volumeMounts: + - name: workspaces + mountPath: /workspaces-streamlit-template + - name: config + mountPath: /app/settings.json + subPath: settings.json + readOnly: true + readinessProbe: + httpGet: + path: /_stcore/health + port: 8501 + initialDelaySeconds: 10 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /_stcore/health + port: 8501 + initialDelaySeconds: 30 + periodSeconds: 30 + resources: + requests: + memory: "4Gi" + cpu: "2" + limits: + memory: "32Gi" + cpu: "8" + volumes: + - name: workspaces + persistentVolumeClaim: + claimName: workspaces-pvc + - name: config + configMap: + name: streamlit-config diff --git a/k8s/base/streamlit-service.yaml b/k8s/base/streamlit-service.yaml new file mode 100644 index 000000000..90429e083 --- /dev/null +++ b/k8s/base/streamlit-service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: streamlit + labels: + component: streamlit +spec: + type: ClusterIP + ports: + - port: 8501 + targetPort: 8501 + selector: + component: streamlit diff --git a/k8s/base/workspace-pvc.yaml b/k8s/base/workspace-pvc.yaml new file mode 100644 index 000000000..fc7351899 --- /dev/null +++ b/k8s/base/workspace-pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: workspaces-pvc +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 100Gi diff --git a/k8s/overlays/template-app/kustomization.yaml b/k8s/overlays/template-app/kustomization.yaml new file mode 100644 index 000000000..7f63fa256 --- /dev/null +++ b/k8s/overlays/template-app/kustomization.yaml @@ -0,0 +1,24 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + +namePrefix: template-app- + +commonLabels: + app: template-app + +images: + - name: openms-streamlit + newName: ghcr.io/openms/streamlit-template + newTag: main + +patches: + - target: + kind: Ingress + name: streamlit + patch: | + - op: replace + path: /spec/rules/0/host + value: template.openms.example.de From 233484e2e034a0c7103ca3e19793c6bb8fc409f7 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 20:00:19 +0000 Subject: [PATCH 02/14] Fix kubeconform validation to skip kustomization.yaml kustomization.yaml is a Kustomize config file, not a standard K8s resource, so kubeconform has no schema for it. Exclude it via -ignore-filename-pattern. https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- .github/workflows/k8s-manifests-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/k8s-manifests-ci.yml b/.github/workflows/k8s-manifests-ci.yml index 3d9dbbb93..a42f7ff6c 100644 --- a/.github/workflows/k8s-manifests-ci.yml +++ b/.github/workflows/k8s-manifests-ci.yml @@ -21,7 +21,7 @@ jobs: - name: Validate K8s manifests (base) run: | - kubeconform -summary -strict -kubernetes-version 1.28.0 k8s/base/*.yaml + kubeconform -summary -strict -kubernetes-version 1.28.0 -ignore-filename-pattern 'kustomization.yaml' k8s/base/*.yaml - name: Install kubectl uses: azure/setup-kubectl@v3 From aa28e915e464f93b57ceb06eed954fcdffdef45f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 20:24:48 +0000 Subject: [PATCH 03/14] Add matrix strategy to test both Dockerfiles in integration tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The integration-test job now uses a matrix with Dockerfile_simple and Dockerfile. Each matrix entry checks if its Dockerfile exists before running — all steps are guarded with an `if` condition so they skip gracefully when a Dockerfile is absent. This allows downstream forks that only have one Dockerfile to pass CI without errors. https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- .github/workflows/k8s-manifests-ci.yml | 27 +++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/.github/workflows/k8s-manifests-ci.yml b/.github/workflows/k8s-manifests-ci.yml index a42f7ff6c..d65bb22c5 100644 --- a/.github/workflows/k8s-manifests-ci.yml +++ b/.github/workflows/k8s-manifests-ci.yml @@ -38,42 +38,67 @@ jobs: integration-test: runs-on: ubuntu-latest needs: validate-manifests + strategy: + fail-fast: false + matrix: + dockerfile: + - Dockerfile_simple + - Dockerfile steps: - uses: actions/checkout@v4 + - name: Check if Dockerfile exists + id: check + run: | + if [ -f "${{ matrix.dockerfile }}" ]; then + echo "exists=true" >> "$GITHUB_OUTPUT" + echo "Found ${{ matrix.dockerfile }}, will run integration test" + else + echo "exists=false" >> "$GITHUB_OUTPUT" + echo "Skipping: ${{ matrix.dockerfile }} not found" + fi + - name: Build Docker image from current code + if: steps.check.outputs.exists == 'true' run: | - docker build -t openms-streamlit:test -f Dockerfile_simple . + docker build -t openms-streamlit:test -f ${{ matrix.dockerfile }} . - name: Create kind cluster + if: steps.check.outputs.exists == 'true' uses: helm/kind-action@v1 with: cluster_name: test-cluster - name: Load image into kind cluster + if: steps.check.outputs.exists == 'true' run: | kind load docker-image openms-streamlit:test --name test-cluster - name: Install nginx ingress controller + if: steps.check.outputs.exists == 'true' run: | kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=90s - name: Deploy with Kustomize + if: steps.check.outputs.exists == 'true' run: | kubectl kustomize k8s/overlays/template-app/ | \ sed 's|imagePullPolicy: IfNotPresent|imagePullPolicy: Never|g' | \ kubectl apply -f - - name: Wait for Redis to be ready + if: steps.check.outputs.exists == 'true' run: | kubectl wait --for=condition=ready pod -l app=template-app,component=redis --timeout=60s - name: Verify Redis Service is reachable + if: steps.check.outputs.exists == 'true' run: | kubectl run redis-test --image=redis:7-alpine --rm -i --restart=Never -- redis-cli -h template-app-redis ping - name: Verify all deployments are available + if: steps.check.outputs.exists == 'true' run: | kubectl wait --for=condition=available deployment -l app=template-app --timeout=120s || true kubectl get pods -l app=template-app From 9aa798b51edba3e47e5d3d54e7e87e141ab847cf Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 11 Mar 2026 09:29:56 +0000 Subject: [PATCH 04/14] Adapt K8s base manifests for de.NBI Cinder CSI storage - Switch workspace PVC from ReadWriteMany to ReadWriteOnce with cinder-csi storage class (required by de.NBI KKP cluster) - Increase PVC storage to 500Gi - Add namespace: openms to kustomization.yaml - Reduce pod resource requests (1Gi/500m) and limits (8Gi/4 CPU) so all workspace-mounting pods fit on a single node https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- k8s/base/kustomization.yaml | 2 ++ k8s/base/rq-worker-deployment.yaml | 8 ++++---- k8s/base/streamlit-deployment.yaml | 8 ++++---- k8s/base/workspace-pvc.yaml | 5 +++-- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/k8s/base/kustomization.yaml b/k8s/base/kustomization.yaml index c63122a40..e337290b4 100644 --- a/k8s/base/kustomization.yaml +++ b/k8s/base/kustomization.yaml @@ -1,6 +1,8 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization +namespace: openms + resources: - namespace.yaml - configmap.yaml diff --git a/k8s/base/rq-worker-deployment.yaml b/k8s/base/rq-worker-deployment.yaml index 769ab3c30..f4beeca80 100644 --- a/k8s/base/rq-worker-deployment.yaml +++ b/k8s/base/rq-worker-deployment.yaml @@ -35,11 +35,11 @@ spec: readOnly: true resources: requests: - memory: "4Gi" - cpu: "2" + memory: "1Gi" + cpu: "500m" limits: - memory: "32Gi" - cpu: "8" + memory: "8Gi" + cpu: "4" volumes: - name: workspaces persistentVolumeClaim: diff --git a/k8s/base/streamlit-deployment.yaml b/k8s/base/streamlit-deployment.yaml index 75ac4f155..b74caf6d9 100644 --- a/k8s/base/streamlit-deployment.yaml +++ b/k8s/base/streamlit-deployment.yaml @@ -49,11 +49,11 @@ spec: periodSeconds: 30 resources: requests: - memory: "4Gi" - cpu: "2" + memory: "1Gi" + cpu: "500m" limits: - memory: "32Gi" - cpu: "8" + memory: "8Gi" + cpu: "4" volumes: - name: workspaces persistentVolumeClaim: diff --git a/k8s/base/workspace-pvc.yaml b/k8s/base/workspace-pvc.yaml index fc7351899..b3613bebf 100644 --- a/k8s/base/workspace-pvc.yaml +++ b/k8s/base/workspace-pvc.yaml @@ -4,7 +4,8 @@ metadata: name: workspaces-pvc spec: accessModes: - - ReadWriteMany + - ReadWriteOnce + storageClassName: cinder-csi resources: requests: - storage: 100Gi + storage: 500Gi From 0d90cf761361a74b350d3f316340479661796118 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 11 Mar 2026 10:02:02 +0000 Subject: [PATCH 05/14] Add pod affinity rules to co-locate all workspace pods on same node The workspaces PVC uses ReadWriteOnce (Cinder CSI block storage) which requires all pods mounting it to run on the same node. Without explicit affinity rules, the scheduler was failing silently, leaving pods in Pending state with no events. Adds a `volume-group: workspaces` label and podAffinity with requiredDuringSchedulingIgnoredDuringExecution to streamlit deployment, rq-worker deployment, and cleanup cronjob. This ensures the scheduler explicitly co-locates all workspace-consuming pods on the same node. https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- k8s/base/cleanup-cronjob.yaml | 11 +++++++++++ k8s/base/rq-worker-deployment.yaml | 11 +++++++++++ k8s/base/streamlit-deployment.yaml | 11 +++++++++++ 3 files changed, 33 insertions(+) diff --git a/k8s/base/cleanup-cronjob.yaml b/k8s/base/cleanup-cronjob.yaml index 864818763..05f764dcb 100644 --- a/k8s/base/cleanup-cronjob.yaml +++ b/k8s/base/cleanup-cronjob.yaml @@ -15,8 +15,19 @@ spec: metadata: labels: component: cleanup + volume-group: workspaces spec: restartPolicy: OnFailure + affinity: + podAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: volume-group + operator: In + values: + - workspaces + topologyKey: kubernetes.io/hostname containers: - name: cleanup image: openms-streamlit diff --git a/k8s/base/rq-worker-deployment.yaml b/k8s/base/rq-worker-deployment.yaml index f4beeca80..18fc85419 100644 --- a/k8s/base/rq-worker-deployment.yaml +++ b/k8s/base/rq-worker-deployment.yaml @@ -13,7 +13,18 @@ spec: metadata: labels: component: rq-worker + volume-group: workspaces spec: + affinity: + podAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: volume-group + operator: In + values: + - workspaces + topologyKey: kubernetes.io/hostname containers: - name: rq-worker image: openms-streamlit diff --git a/k8s/base/streamlit-deployment.yaml b/k8s/base/streamlit-deployment.yaml index b74caf6d9..bc8201f9f 100644 --- a/k8s/base/streamlit-deployment.yaml +++ b/k8s/base/streamlit-deployment.yaml @@ -13,7 +13,18 @@ spec: metadata: labels: component: streamlit + volume-group: workspaces spec: + affinity: + podAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: volume-group + operator: In + values: + - workspaces + topologyKey: kubernetes.io/hostname containers: - name: streamlit image: openms-streamlit From ed3075b64c7f8f1e4f13d5fdb61efebb4c298354 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 11 Mar 2026 19:31:25 +0000 Subject: [PATCH 06/14] Fix CI: wait for ingress-nginx admission webhook before deploying The controller pod being Ready doesn't guarantee the admission webhook service is accepting connections. Add a polling loop that waits for the webhook endpoint to have an IP assigned before applying the Ingress resource, preventing "connection refused" errors during kustomize apply. https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- .github/workflows/k8s-manifests-ci.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/k8s-manifests-ci.yml b/.github/workflows/k8s-manifests-ci.yml index d65bb22c5..12119412a 100644 --- a/.github/workflows/k8s-manifests-ci.yml +++ b/.github/workflows/k8s-manifests-ci.yml @@ -79,6 +79,16 @@ jobs: run: | kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=90s + # Wait for the admission webhook to be ready to accept connections + echo "Waiting for ingress-nginx admission webhook..." + for i in $(seq 1 30); do + if kubectl get endpoints -n ingress-nginx ingress-nginx-controller-admission -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null | grep -q .; then + echo "Webhook endpoint ready" + break + fi + echo " attempt $i/30 - waiting..." + sleep 2 + done - name: Deploy with Kustomize if: steps.check.outputs.exists == 'true' From 6036d142116becf8c954de0ab9771d65f95ddcea Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 11 Mar 2026 20:29:34 +0000 Subject: [PATCH 07/14] Fix CI: add -n openms namespace to integration test steps The kustomize overlay deploys into the openms namespace, but the verification steps (Redis wait, Redis ping, deployment checks) were querying the default namespace, causing "no matching resources found". https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- .github/workflows/k8s-manifests-ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/k8s-manifests-ci.yml b/.github/workflows/k8s-manifests-ci.yml index 12119412a..49b560d49 100644 --- a/.github/workflows/k8s-manifests-ci.yml +++ b/.github/workflows/k8s-manifests-ci.yml @@ -100,16 +100,16 @@ jobs: - name: Wait for Redis to be ready if: steps.check.outputs.exists == 'true' run: | - kubectl wait --for=condition=ready pod -l app=template-app,component=redis --timeout=60s + kubectl wait -n openms --for=condition=ready pod -l app=template-app,component=redis --timeout=60s - name: Verify Redis Service is reachable if: steps.check.outputs.exists == 'true' run: | - kubectl run redis-test --image=redis:7-alpine --rm -i --restart=Never -- redis-cli -h template-app-redis ping + kubectl run redis-test -n openms --image=redis:7-alpine --rm -i --restart=Never -- redis-cli -h template-app-redis.openms.svc.cluster.local ping - name: Verify all deployments are available if: steps.check.outputs.exists == 'true' run: | - kubectl wait --for=condition=available deployment -l app=template-app --timeout=120s || true - kubectl get pods -l app=template-app - kubectl get services -l app=template-app + kubectl wait -n openms --for=condition=available deployment -l app=template-app --timeout=120s || true + kubectl get pods -n openms -l app=template-app + kubectl get services -n openms -l app=template-app From e7cac758fa5351476800c3151960816460b9226e Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 2 Apr 2026 10:32:20 +0000 Subject: [PATCH 08/14] Fix CI: retry kustomize deploy for webhook readiness Replace the unreliable endpoint-IP polling with a retry loop on kubectl apply (up to 5 attempts with backoff). This handles the race where the ingress-nginx admission webhook has an endpoint IP but isn't yet accepting TCP connections. https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- .github/workflows/k8s-manifests-ci.yml | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/.github/workflows/k8s-manifests-ci.yml b/.github/workflows/k8s-manifests-ci.yml index 49b560d49..9e3dc3936 100644 --- a/.github/workflows/k8s-manifests-ci.yml +++ b/.github/workflows/k8s-manifests-ci.yml @@ -79,23 +79,20 @@ jobs: run: | kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=90s - # Wait for the admission webhook to be ready to accept connections - echo "Waiting for ingress-nginx admission webhook..." - for i in $(seq 1 30); do - if kubectl get endpoints -n ingress-nginx ingress-nginx-controller-admission -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null | grep -q .; then - echo "Webhook endpoint ready" - break - fi - echo " attempt $i/30 - waiting..." - sleep 2 - done - name: Deploy with Kustomize if: steps.check.outputs.exists == 'true' run: | kubectl kustomize k8s/overlays/template-app/ | \ - sed 's|imagePullPolicy: IfNotPresent|imagePullPolicy: Never|g' | \ - kubectl apply -f - + sed 's|imagePullPolicy: IfNotPresent|imagePullPolicy: Never|g' > /tmp/manifests.yaml + for i in 1 2 3 4 5; do + if kubectl apply -f /tmp/manifests.yaml; then + echo "Deploy succeeded on attempt $i" + break + fi + echo "Attempt $i failed, retrying in ${i}0s..." + sleep "${i}0" + done - name: Wait for Redis to be ready if: steps.check.outputs.exists == 'true' From 173f3b8acf056380666aab37e83a5b690a0848b4 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 2 Apr 2026 13:11:57 +0000 Subject: [PATCH 09/14] Fix REDIS_URL to use prefixed service name in overlay Kustomize namePrefix renames the Redis service to template-app-redis, but the REDIS_URL env var in streamlit and rq-worker deployments still referenced the unprefixed name "redis", causing the rq-worker to CrashLoopBackOff with "Name or service not known". Add JSON patches in the overlay to set the correct prefixed hostname. https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- k8s/overlays/template-app/kustomization.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/k8s/overlays/template-app/kustomization.yaml b/k8s/overlays/template-app/kustomization.yaml index 7f63fa256..43444bce6 100644 --- a/k8s/overlays/template-app/kustomization.yaml +++ b/k8s/overlays/template-app/kustomization.yaml @@ -22,3 +22,17 @@ patches: - op: replace path: /spec/rules/0/host value: template.openms.example.de + - target: + kind: Deployment + name: streamlit + patch: | + - op: replace + path: /spec/template/spec/containers/0/env/0/value + value: "redis://template-app-redis:6379/0" + - target: + kind: Deployment + name: rq-worker + patch: | + - op: replace + path: /spec/template/spec/containers/0/env/0/value + value: "redis://template-app-redis:6379/0" From 443ae2442853cee9e2def00b315b6caa2de6839e Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 2 Apr 2026 13:54:53 +0000 Subject: [PATCH 10/14] Add Traefik IngressRoute for direct LB IP access The cluster uses Traefik, not nginx, so the nginx Ingress annotations are ignored. Add a Traefik IngressRoute with PathPrefix(/) catch-all routing and sticky session cookie for Streamlit session affinity. https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- k8s/base/kustomization.yaml | 1 + k8s/base/traefik-ingressroute.yaml | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 k8s/base/traefik-ingressroute.yaml diff --git a/k8s/base/kustomization.yaml b/k8s/base/kustomization.yaml index e337290b4..d16bf7017 100644 --- a/k8s/base/kustomization.yaml +++ b/k8s/base/kustomization.yaml @@ -12,4 +12,5 @@ resources: - streamlit-service.yaml - rq-worker-deployment.yaml - ingress.yaml + - traefik-ingressroute.yaml - cleanup-cronjob.yaml diff --git a/k8s/base/traefik-ingressroute.yaml b/k8s/base/traefik-ingressroute.yaml new file mode 100644 index 000000000..b202891f8 --- /dev/null +++ b/k8s/base/traefik-ingressroute.yaml @@ -0,0 +1,18 @@ +apiVersion: traefik.io/v1alpha1 +kind: IngressRoute +metadata: + name: streamlit-traefik +spec: + entryPoints: + - web + routes: + - match: PathPrefix(`/`) + kind: Rule + services: + - name: streamlit + port: 8501 + sticky: + cookie: + name: stroute + httpOnly: true + sameSite: lax From 9c0fa08df6618d3692c7846b1f84ac74e9cd0141 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 2 Apr 2026 14:08:55 +0000 Subject: [PATCH 11/14] Fix CI: skip Traefik IngressRoute CRD in validation and integration tests kubeconform doesn't know the Traefik IngressRoute CRD schema, and the kind cluster in integration tests doesn't have Traefik installed. Skip the IngressRoute in kubeconform validation and filter it out with yq before applying to the kind cluster. https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- .github/workflows/k8s-manifests-ci.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/k8s-manifests-ci.yml b/.github/workflows/k8s-manifests-ci.yml index 9e3dc3936..bda9d5424 100644 --- a/.github/workflows/k8s-manifests-ci.yml +++ b/.github/workflows/k8s-manifests-ci.yml @@ -21,7 +21,10 @@ jobs: - name: Validate K8s manifests (base) run: | - kubeconform -summary -strict -kubernetes-version 1.28.0 -ignore-filename-pattern 'kustomization.yaml' k8s/base/*.yaml + kubeconform -summary -strict -kubernetes-version 1.28.0 \ + -ignore-filename-pattern 'kustomization.yaml' \ + -ignore-filename-pattern 'traefik-ingressroute.yaml' \ + k8s/base/*.yaml - name: Install kubectl uses: azure/setup-kubectl@v3 @@ -33,7 +36,7 @@ jobs: - name: Validate kustomized output run: | - kubectl kustomize k8s/overlays/template-app/ | kubeconform -summary -strict -kubernetes-version 1.28.0 + kubectl kustomize k8s/overlays/template-app/ | kubeconform -summary -strict -kubernetes-version 1.28.0 -skip IngressRoute integration-test: runs-on: ubuntu-latest @@ -83,7 +86,9 @@ jobs: - name: Deploy with Kustomize if: steps.check.outputs.exists == 'true' run: | + # Filter out Traefik CRDs (kind cluster uses nginx, not Traefik) kubectl kustomize k8s/overlays/template-app/ | \ + yq 'select(.kind != "IngressRoute")' | \ sed 's|imagePullPolicy: IfNotPresent|imagePullPolicy: Never|g' > /tmp/manifests.yaml for i in 1 2 3 4 5; do if kubectl apply -f /tmp/manifests.yaml; then From b783ff72c47a6b8811f7b8c75683a746e2675bfd Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 4 Apr 2026 09:29:33 +0000 Subject: [PATCH 12/14] Fix IngressRoute service name for kustomize namePrefix Kustomize namePrefix doesn't rewrite service references inside CRDs, so the IngressRoute was pointing to 'streamlit' instead of 'template-app-streamlit', causing Traefik to return 404. https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- k8s/overlays/template-app/kustomization.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/k8s/overlays/template-app/kustomization.yaml b/k8s/overlays/template-app/kustomization.yaml index 43444bce6..a1858da4d 100644 --- a/k8s/overlays/template-app/kustomization.yaml +++ b/k8s/overlays/template-app/kustomization.yaml @@ -36,3 +36,10 @@ patches: - op: replace path: /spec/template/spec/containers/0/env/0/value value: "redis://template-app-redis:6379/0" + - target: + kind: IngressRoute + name: streamlit-traefik + patch: | + - op: replace + path: /spec/routes/0/services/0/name + value: "template-app-streamlit" From 5f8a5b2fab8d10731227cb946284a83be249231a Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 4 Apr 2026 10:26:19 +0000 Subject: [PATCH 13/14] fix: use ConfigMap as settings override instead of full replacement The ConfigMap was replacing the entire settings.json, losing keys like "version" and "repository-name" that the app expects (causing KeyError). Now the ConfigMap only contains deployment-specific overrides, which are merged into the Docker image's base settings.json at container startup using jq. https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- k8s/base/configmap.yaml | 34 ++---------------------------- k8s/base/rq-worker-deployment.yaml | 5 +++-- k8s/base/streamlit-deployment.yaml | 5 +++-- 3 files changed, 8 insertions(+), 36 deletions(-) diff --git a/k8s/base/configmap.yaml b/k8s/base/configmap.yaml index c486e9c98..2b7815b9e 100644 --- a/k8s/base/configmap.yaml +++ b/k8s/base/configmap.yaml @@ -3,37 +3,7 @@ kind: ConfigMap metadata: name: streamlit-config data: - settings.json: | + settings-overrides.json: | { - "app-name": "OpenMS WebApp Template", - "online_deployment": true, - "enable_workspaces": true, - "workspaces_dir": "..", - "queue_settings": { - "default_timeout": 7200, - "result_ttl": 86400 - }, - "demo_workspaces": { - "enabled": true, - "source_dirs": ["example-data/workspaces"] - }, - "max_threads": { - "local": 4, - "online": 2 - }, - "analytics": { - "matomo": { - "enabled": true, - "url": "https://cdn.matomo.cloud/openms.matomo.cloud", - "tag": "yDGK8bfY" - }, - "google-analytics": { - "enabled": false, - "tag": "" - }, - "piwik-pro": { - "enabled": false, - "tag": "" - } - } + "online_deployment": true } diff --git a/k8s/base/rq-worker-deployment.yaml b/k8s/base/rq-worker-deployment.yaml index 18fc85419..680860042 100644 --- a/k8s/base/rq-worker-deployment.yaml +++ b/k8s/base/rq-worker-deployment.yaml @@ -33,6 +33,7 @@ spec: args: - | source /root/miniforge3/bin/activate streamlit-env + jq -s '.[0] * .[1]' /app/settings.json /app/settings-overrides.json > /tmp/settings-merged.json && mv /tmp/settings-merged.json /app/settings.json exec rq worker openms-workflows --url $REDIS_URL env: - name: REDIS_URL @@ -41,8 +42,8 @@ spec: - name: workspaces mountPath: /workspaces-streamlit-template - name: config - mountPath: /app/settings.json - subPath: settings.json + mountPath: /app/settings-overrides.json + subPath: settings-overrides.json readOnly: true resources: requests: diff --git a/k8s/base/streamlit-deployment.yaml b/k8s/base/streamlit-deployment.yaml index bc8201f9f..a556da282 100644 --- a/k8s/base/streamlit-deployment.yaml +++ b/k8s/base/streamlit-deployment.yaml @@ -33,6 +33,7 @@ spec: args: - | source /root/miniforge3/bin/activate streamlit-env + jq -s '.[0] * .[1]' /app/settings.json /app/settings-overrides.json > /tmp/settings-merged.json && mv /tmp/settings-merged.json /app/settings.json exec streamlit run app.py --server.address 0.0.0.0 ports: - containerPort: 8501 @@ -43,8 +44,8 @@ spec: - name: workspaces mountPath: /workspaces-streamlit-template - name: config - mountPath: /app/settings.json - subPath: settings.json + mountPath: /app/settings-overrides.json + subPath: settings-overrides.json readOnly: true readinessProbe: httpGet: From b913b6441774ab889a316f70f7c2372ab3bcebcf Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 4 Apr 2026 10:31:04 +0000 Subject: [PATCH 14/14] fix: add set -euo pipefail to fail fast on settings merge error Addresses CodeRabbit review: if jq merge fails, the container should not start with unmerged settings. https://claude.ai/code/session_01RNJ3dVjV1VTHcC9ugE3FQJ --- k8s/base/rq-worker-deployment.yaml | 1 + k8s/base/streamlit-deployment.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/k8s/base/rq-worker-deployment.yaml b/k8s/base/rq-worker-deployment.yaml index 680860042..5ae70f3c7 100644 --- a/k8s/base/rq-worker-deployment.yaml +++ b/k8s/base/rq-worker-deployment.yaml @@ -32,6 +32,7 @@ spec: command: ["/bin/bash", "-c"] args: - | + set -euo pipefail source /root/miniforge3/bin/activate streamlit-env jq -s '.[0] * .[1]' /app/settings.json /app/settings-overrides.json > /tmp/settings-merged.json && mv /tmp/settings-merged.json /app/settings.json exec rq worker openms-workflows --url $REDIS_URL diff --git a/k8s/base/streamlit-deployment.yaml b/k8s/base/streamlit-deployment.yaml index a556da282..bf34288d9 100644 --- a/k8s/base/streamlit-deployment.yaml +++ b/k8s/base/streamlit-deployment.yaml @@ -32,6 +32,7 @@ spec: command: ["/bin/bash", "-c"] args: - | + set -euo pipefail source /root/miniforge3/bin/activate streamlit-env jq -s '.[0] * .[1]' /app/settings.json /app/settings-overrides.json > /tmp/settings-merged.json && mv /tmp/settings-merged.json /app/settings.json exec streamlit run app.py --server.address 0.0.0.0