From 04ef8163c29e810fae0f408bba05c67c90c529f1 Mon Sep 17 00:00:00 2001 From: urismiley Date: Thu, 5 Mar 2026 13:09:50 -0500 Subject: [PATCH 01/16] feat: add local telemetry playground and monitoring docs - Restructured telemetry playground into organized k8s/, dashboards/, scripts/ layout - Split monolithic observability-stack.yaml into per-component files - Added two Grafana dashboards: Gateway (ops, latency, errors, connections) and Internals (PostgreSQL, indexes, infrastructure) - Added deploy.sh and teardown.sh scripts for one-command setup/teardown - Added Grafana dashboard provisioning via ConfigMap - Added traffic generators with read/write split (primary vs replicas) - Added OTEL_TRACING_ENABLED=true to sidecar injector for trace collection - Added monitoring docs (overview.md, metrics.md) with architecture diagrams - Added README with Mermaid architecture diagram Signed-off-by: urismiley --- .../preview/monitoring/metrics.md | 465 +++++++ .../preview/monitoring/overview.md | 284 ++++ .../telemetry/local/README.md | 127 ++ .../telemetry/local/dashboards/gateway.json | 1051 +++++++++++++++ .../telemetry/local/dashboards/internals.json | 1166 +++++++++++++++++ .../local/k8s/documentdb/cluster.yaml | 29 + .../k8s/documentdb/collector-bridge.yaml | 15 + .../local/k8s/observability/grafana.yaml | 105 ++ .../local/k8s/observability/loki.yaml | 74 ++ .../local/k8s/observability/namespace.yaml | 4 + .../k8s/observability/otel-collector.yaml | 175 +++ .../local/k8s/observability/prometheus.yaml | 59 + .../local/k8s/observability/tempo.yaml | 76 ++ .../local/k8s/traffic/traffic-generator.yaml | 269 ++++ .../telemetry/local/scripts/deploy.sh | 46 + .../telemetry/local/scripts/setup-kind.sh | 73 ++ .../telemetry/local/scripts/teardown.sh | 20 + mkdocs.yml | 3 + .../internal/lifecycle/lifecycle.go | 16 + 19 files changed, 4057 insertions(+) create mode 100644 docs/operator-public-documentation/preview/monitoring/metrics.md create mode 100644 
docs/operator-public-documentation/preview/monitoring/overview.md create mode 100644 documentdb-playground/telemetry/local/README.md create mode 100644 documentdb-playground/telemetry/local/dashboards/gateway.json create mode 100644 documentdb-playground/telemetry/local/dashboards/internals.json create mode 100644 documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml create mode 100644 documentdb-playground/telemetry/local/k8s/documentdb/collector-bridge.yaml create mode 100644 documentdb-playground/telemetry/local/k8s/observability/grafana.yaml create mode 100644 documentdb-playground/telemetry/local/k8s/observability/loki.yaml create mode 100644 documentdb-playground/telemetry/local/k8s/observability/namespace.yaml create mode 100644 documentdb-playground/telemetry/local/k8s/observability/otel-collector.yaml create mode 100644 documentdb-playground/telemetry/local/k8s/observability/prometheus.yaml create mode 100644 documentdb-playground/telemetry/local/k8s/observability/tempo.yaml create mode 100644 documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml create mode 100755 documentdb-playground/telemetry/local/scripts/deploy.sh create mode 100755 documentdb-playground/telemetry/local/scripts/setup-kind.sh create mode 100755 documentdb-playground/telemetry/local/scripts/teardown.sh diff --git a/docs/operator-public-documentation/preview/monitoring/metrics.md b/docs/operator-public-documentation/preview/monitoring/metrics.md new file mode 100644 index 00000000..9159bf83 --- /dev/null +++ b/docs/operator-public-documentation/preview/monitoring/metrics.md @@ -0,0 +1,465 @@ +# Metrics Reference + +This page documents the key metrics available when monitoring a DocumentDB cluster, organized by source. Each section includes the metric name, description, labels, and example PromQL queries. + +## Container Resource Metrics + +These metrics are collected via the kubelet/cAdvisor interface (or the OpenTelemetry `kubeletstats` receiver). 
They cover CPU, memory, network, and filesystem for the **postgres** and **documentdb-gateway** containers in each DocumentDB pod. + +### CPU + +| Metric | Type | Description | +|--------|------|-------------| +| `container_cpu_usage_seconds_total` | Counter | Cumulative CPU time consumed in seconds | +| `container_spec_cpu_quota` | Gauge | CPU quota (microseconds per `cpu_period`) | +| `container_spec_cpu_period` | Gauge | CPU CFS scheduling period (microseconds) | + +**Common labels:** `namespace`, `pod`, `container`, `node` + +#### Example Queries + +CPU usage rate per container over 5 minutes: + +```promql +rate(container_cpu_usage_seconds_total{ + container=~"postgres|documentdb-gateway", + pod=~".*documentdb.*" +}[5m]) +``` + +CPU utilization as a percentage of limit: + +```promql +(rate(container_cpu_usage_seconds_total{ + container="postgres", + pod=~".*documentdb.*" +}[5m]) +/ on(pod, container) +(container_spec_cpu_quota{ + container="postgres", + pod=~".*documentdb.*" +} +/ container_spec_cpu_period{ + container="postgres", + pod=~".*documentdb.*" +})) * 100 +``` + +Compare gateway vs. 
postgres CPU across all pods: + +```promql +sum by (container) ( + rate(container_cpu_usage_seconds_total{ + container=~"postgres|documentdb-gateway", + pod=~".*documentdb.*" + }[5m]) +) +``` + +### Memory + +| Metric | Type | Description | +|--------|------|-------------| +| `container_memory_working_set_bytes` | Gauge | Current working set memory (bytes) | +| `container_memory_rss` | Gauge | Resident set size (bytes) | +| `container_memory_cache` | Gauge | Page cache memory (bytes) | +| `container_spec_memory_limit_bytes` | Gauge | Memory limit (bytes) | + +**Common labels:** `namespace`, `pod`, `container`, `node` + +#### Example Queries + +Memory usage in MiB per container: + +```promql +container_memory_working_set_bytes{ + container=~"postgres|documentdb-gateway", + pod=~".*documentdb.*" +} / 1024 / 1024 +``` + +Memory utilization as a percentage of limit: + +```promql +(container_memory_working_set_bytes{ + container=~"postgres|documentdb-gateway", + pod=~".*documentdb.*" +} +/ container_spec_memory_limit_bytes{ + container=~"postgres|documentdb-gateway", + pod=~".*documentdb.*" +}) * 100 +``` + +Top 5 pods by memory usage: + +```promql +topk(5, + sum by (pod) ( + container_memory_working_set_bytes{ + container=~"postgres|documentdb-gateway", + pod=~".*documentdb.*" + } + ) +) +``` + +### Network + +| Metric | Type | Description | +|--------|------|-------------| +| `container_network_receive_bytes_total` | Counter | Bytes received | +| `container_network_transmit_bytes_total` | Counter | Bytes transmitted | + +**Common labels:** `namespace`, `pod`, `interface` + +#### Example Queries + +Network throughput (bytes/sec) per pod: + +```promql +sum by (pod) ( + rate(container_network_receive_bytes_total{ + pod=~".*documentdb.*" + }[5m]) + + rate(container_network_transmit_bytes_total{ + pod=~".*documentdb.*" + }[5m]) +) +``` + +### Filesystem + +| Metric | Type | Description | +|--------|------|-------------| +| `container_fs_usage_bytes` | Gauge | Filesystem 
usage (bytes) | +| `container_fs_reads_bytes_total` | Counter | Filesystem read bytes | +| `container_fs_writes_bytes_total` | Counter | Filesystem write bytes | + +**Common labels:** `namespace`, `pod`, `container`, `device` + +#### Example Queries + +Disk I/O rate for the postgres container: + +```promql +rate(container_fs_writes_bytes_total{ + container="postgres", + pod=~".*documentdb.*" +}[5m]) +``` + +## Gateway Metrics + +The DocumentDB Gateway exports application-level metrics via OTLP (OpenTelemetry Protocol) push. The gateway sidecar injector automatically sets `OTEL_EXPORTER_OTLP_ENDPOINT` and `OTEL_RESOURCE_ATTRIBUTES` (with `service.instance.id` set to the pod name) on each gateway container, so metrics are exported without manual configuration. + +Metrics are exported to an OpenTelemetry Collector, which converts them to Prometheus format via the `prometheus` exporter. + +### Operations + +| Metric | Type | Description | +|--------|------|-------------| +| `db_client_operations_total` | Counter | Total MongoDB operations processed | +| `db_client_operation_duration_seconds_total` | Counter | Cumulative operation duration | + +**Common labels:** `db_operation_name` (e.g., `Find`, `Insert`, `Update`, `Aggregate`, `Delete`), `db_namespace`, `db_system_name`, `service_instance_id` (pod name), `error_type` (set on failed operations) + +#### Example Queries + +Operations per second by command type: + +```promql +sum by (db_operation_name) ( + rate(db_client_operations_total[1m]) +) +``` + +Average latency per operation (milliseconds): + +```promql +sum by (db_operation_name) ( + rate(db_client_operation_duration_seconds_total{db_operation_phase=""}[1m]) +) / sum by (db_operation_name) ( + rate(db_client_operations_total[1m]) +) * 1000 +``` + +Error rate as a percentage: + +```promql +sum(rate(db_client_operations_total{error_type!=""}[1m])) +/ sum(rate(db_client_operations_total[1m])) * 100 +``` + +Operations per second for a specific instance: + +```promql 
+sum by (db_operation_name) ( + rate(db_client_operations_total{ + service_instance_id="documentdb-preview-1" + }[1m]) +) +``` + +### Client Connections + +| Metric | Type | Description | +|--------|------|-------------| +| `gateway_client_connections_active` | Gauge | Current active client connections | +| `gateway_client_connections_total` | Counter | Cumulative client connections accepted | + +**Common labels:** `service_instance_id` (pod name) + +#### Example Queries + +Active connections per instance: + +```promql +gateway_client_connections_active +``` + +Connection rate: + +```promql +rate(gateway_client_connections_total[1m]) +``` + +### Connection Pool + +| Metric | Type | Description | +|--------|------|-------------| +| `db_client_connection_active` | Gauge | Active backend (PG) pool connections | +| `db_client_connection_idle` | Gauge | Idle backend pool connections | +| `db_client_connection_max` | Gauge | Maximum pool size | +| `db_client_connection_waiting` | Gauge | Requests waiting for a pool connection | + +**Common labels:** `service_instance_id` (pod name) + +#### Example Queries + +Pool utilization: + +```promql +db_client_connection_active +/ db_client_connection_max * 100 +``` + +### Request/Response Size + +| Metric | Type | Description | +|--------|------|-------------| +| `db_client_request_size_bytes_total` | Counter | Cumulative request payload size | +| `db_client_response_size_bytes_total` | Counter | Cumulative response payload size | + +**Common labels:** `service_instance_id` (pod name) + +#### Example Queries + +Average request throughput (bytes/sec): + +```promql +sum(rate(db_client_request_size_bytes_total[1m])) +``` + +### Operation Phases + +| Metric | Type | Description | +|--------|------|-------------| +| `db_client_operation_duration_seconds_total` | Counter | Duration broken down by phase | + +**Key `db_operation_phase` values:** `pg_query`, `cursor_iteration`, `bson_serialization`, `command_parsing` + +#### Example 
Queries + +Time spent in each phase per second: + +```promql +sum by (db_operation_phase) ( + rate(db_client_operation_duration_seconds_total{ + db_operation_phase!="" + }[1m]) +) +``` + +## Operator Metrics (controller-runtime) + +The DocumentDB operator binary exposes standard controller-runtime metrics on its metrics endpoint. These track reconciliation performance and work queue health. + +### Reconciliation + +| Metric | Type | Description | +|--------|------|-------------| +| `controller_runtime_reconcile_total` | Counter | Total reconciliations | +| `controller_runtime_reconcile_errors_total` | Counter | Total reconciliation errors | +| `controller_runtime_reconcile_time_seconds` | Histogram | Time spent in reconciliation | + +**Common labels:** `controller` (e.g., `documentdb-controller`, `backup`, `scheduledbackup`, `certificate-controller`, `persistentvolume`), `result` (`success`, `error`, `requeue`, `requeue_after`) + +#### Example Queries + +Reconciliation error rate by controller: + +```promql +sum by (controller) ( + rate(controller_runtime_reconcile_errors_total[5m]) +) +``` + +P95 reconciliation latency for the DocumentDB controller: + +```promql +histogram_quantile(0.95, + sum by (le) ( + rate(controller_runtime_reconcile_time_seconds_bucket{ + controller="documentdb-controller" + }[5m]) + ) +) +``` + +Reconciliation throughput (reconciles/sec): + +```promql +sum by (controller) ( + rate(controller_runtime_reconcile_total[5m]) +) +``` + +### Work Queue + +| Metric | Type | Description | +|--------|------|-------------| +| `workqueue_depth` | Gauge | Current number of items in the queue | +| `workqueue_adds_total` | Counter | Total items added | +| `workqueue_queue_duration_seconds` | Histogram | Time items spend in queue before processing | +| `workqueue_work_duration_seconds` | Histogram | Time spent processing items | +| `workqueue_retries_total` | Counter | Total retries | + +**Common labels:** `name` (queue name, maps to controller name) + 
+#### Example Queries + +Work queue depth by controller: + +```promql +workqueue_depth{name=~"documentdb-controller|backup|scheduledbackup|certificate-controller"} +``` + +Average time items spend waiting in queue: + +```promql +rate(workqueue_queue_duration_seconds_sum{name="documentdb-controller"}[5m]) +/ rate(workqueue_queue_duration_seconds_count{name="documentdb-controller"}[5m]) +``` + +## CNPG / PostgreSQL Metrics + +CloudNative-PG exposes PostgreSQL-level metrics from each managed pod. These are available when CNPG monitoring is enabled. For the full list, see the [CloudNative-PG monitoring docs](https://cloudnative-pg.io/documentation/current/monitoring/). + +Additionally, the OpenTelemetry Collector's `postgresql` receiver collects metrics directly from PostgreSQL via SQL queries. + +### Replication + +| Metric | Type | Description | +|--------|------|-------------| +| `cnpg_pg_replication_lag` | Gauge | Replication lag in seconds (CNPG) | +| `postgresql_replication_data_delay` | Gauge | Replication data delay (OTel PG receiver) | + +#### Example Queries + +Replication lag per pod: + +```promql +cnpg_pg_replication_lag{pod=~".*documentdb.*"} +``` + +### Connections + +| Metric | Type | Description | +|--------|------|-------------| +| `cnpg_pg_stat_activity_count` | Gauge | Active backend connections by state (CNPG) | +| `postgresql_backends` | Gauge | Number of backends (OTel PG receiver) | +| `postgresql_connection_max` | Gauge | Maximum connections (OTel PG receiver) | + +#### Example Queries + +Active connections by state: + +```promql +sum by (state) ( + cnpg_pg_stat_activity_count{pod=~".*documentdb.*"} +) +``` + +Backend utilization: + +```promql +postgresql_backends / postgresql_connection_max * 100 +``` + +### Storage + +| Metric | Type | Description | +|--------|------|-------------| +| `cnpg_pg_database_size_bytes` | Gauge | Total database size (CNPG) | +| `postgresql_db_size_bytes` | Gauge | Database size (OTel PG receiver) | +| 
`postgresql_wal_age_seconds` | Gauge | WAL age (OTel PG receiver) | + +#### Example Queries + +Database size in GiB: + +```promql +postgresql_db_size_bytes / 1024 / 1024 / 1024 +``` + +### Operations + +| Metric | Type | Description | +|--------|------|-------------| +| `postgresql_commits_total` | Counter | Total committed transactions | +| `postgresql_rollbacks_total` | Counter | Total rolled-back transactions | +| `postgresql_operations_total` | Counter | Row operations (labels: `operation`) | + +#### Example Queries + +Transaction rate: + +```promql +rate(postgresql_commits_total[1m]) +``` + +Row operations per second by type: + +```promql +sum by (operation) (rate(postgresql_operations_total[1m])) +``` + +### Cluster Health + +| Metric | Type | Description | +|--------|------|-------------| +| `cnpg_collector_up` | Gauge | 1 if the CNPG metrics collector is running | +| `cnpg_pg_postmaster_start_time` | Gauge | PostgreSQL start timestamp | + +#### Example Queries + +Detect pods where the metrics collector is down: + +```promql +cnpg_collector_up{pod=~".*documentdb.*"} == 0 +``` + +## OpenTelemetry Metric Names + +When using the OpenTelemetry `kubeletstats` receiver, metric names use the OpenTelemetry naming convention instead of Prometheus-style names: + +| OpenTelemetry Name | Prometheus Equivalent | +|---|---| +| `k8s.container.cpu.time` | `container_cpu_usage_seconds_total` | +| `k8s.container.memory.usage` | `container_memory_working_set_bytes` | +| `k8s.container.cpu.limit` | `container_spec_cpu_quota` | +| `k8s.container.memory.limit` | `container_spec_memory_limit_bytes` | +| `k8s.pod.network.io` | `container_network_*_bytes_total` | + +When writing queries, use the naming convention matching your collection method. The telemetry playground uses the OpenTelemetry names; a direct Prometheus scrape of cAdvisor uses Prometheus names. 
diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md new file mode 100644 index 00000000..eb99aaea --- /dev/null +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -0,0 +1,284 @@ +# Monitoring Overview + +This guide describes how to monitor DocumentDB clusters running on Kubernetes using OpenTelemetry, Prometheus, and Grafana. + +## Prerequisites + +- A running Kubernetes cluster with the DocumentDB operator installed +- [Helm 3](https://helm.sh/docs/intro/install/) for deploying Prometheus and Grafana +- [kubectl](https://kubernetes.io/docs/tasks/tools/) configured for your cluster +- [`jq`](https://jqlang.github.io/jq/) for processing JSON in verification commands +- (Optional) [OpenTelemetry Operator](https://opentelemetry.io/docs/kubernetes/operator/) for managed collector deployments + +## Architecture + +A DocumentDB pod contains two containers: + +- **PostgreSQL container** — the DocumentDB engine (PostgreSQL with DocumentDB extensions) +- **Gateway container** — MongoDB-compatible API sidecar that exports telemetry via OTLP + +The gateway sidecar injector automatically configures each gateway container with: + +- `OTEL_EXPORTER_OTLP_ENDPOINT` — points to an OpenTelemetry Collector service +- `OTEL_RESOURCE_ATTRIBUTES` — sets `service.instance.id` to the pod name for per-instance metric attribution + +The recommended monitoring stack collects three signals — **metrics**, **traces**, and **logs** — from these containers and stores them for visualization and alerting. 
+
+```
+┌──────────────────────────────────────────────────────┐
+│                       Grafana                        │
+│          (dashboards, alerts, trace viewer)          │
+└──────────┬──────────────┬──────────────┬─────────────┘
+           │              │              │
+     ┌─────┴──────┐  ┌────┴────┐    ┌────┴────┐
+     │ Prometheus │  │  Tempo  │    │  Loki   │
+     │ (metrics)  │  │(traces) │    │ (logs)  │
+     └─────┬──────┘  └────┬────┘    └────┬────┘
+           │              │              │
+┌──────────┴──────────────┴──────────────┴─────────────┐
+│               OpenTelemetry Collector                │
+│   Receivers: otlp, postgresql, kubeletstats          │
+│   Processors: batch, resource                        │
+│   Exporters: prometheus, otlp/tempo, otlphttp/loki   │
+└──────────┬──────────────┬────────────────────────────┘
+           │              │
+    ┌──────┴──────┐   ┌───┴──────────────┐
+    │  OTLP push  │   │   SQL scrape     │
+    │  (gateway)  │   │  (PG receiver)   │
+    └──────┬──────┘   └───┬──────────────┘
+           │              │
+┌──────────┴──────────────┴────────────────────────────┐
+│                  DocumentDB Pods                     │
+│  ┌──────────────┐  ┌──────────────┐                  │
+│  │  PostgreSQL  │  │   Gateway    │──── OTLP push    │
+│  │  container   │  │  container   │     (metrics,    │
+│  │    ◄──SQL scrape   │            traces, logs)     │
+│  └──────────────┘  └──────────────┘                  │
+└──────────────────────────────────────────────────────┘
+```
+
+### How gateway telemetry reaches the collector
+
+The gateway sidecar injector (a CNPG plugin) injects an `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable into every gateway container. The endpoint follows the pattern:
+
+```
+http://<cluster-name>-collector.<collector-namespace>.svc.cluster.local:4317
+```
+
+Here `<cluster-name>` is the name of the DocumentDB resource and `<collector-namespace>` is the namespace the collector service lives in. The collector must be reachable at this address. In the local telemetry playground, an `ExternalName` service bridges the namespace gap between the DocumentDB namespace and the observability namespace.
+
+### Collector deployment modes
+
+The [telemetry design document](https://github.com/documentdb/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) recommends the OpenTelemetry Collector as a **DaemonSet** (one collector per node) for single-tenant clusters. 
This provides: + +- Lower resource overhead — one collector per node instead of one per pod +- Node-level metrics visibility (CPU, memory, filesystem) +- Simpler configuration and management + +The [telemetry playground](https://github.com/documentdb/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) implements a **Deployment** (one collector per namespace) instead, which is better suited for multi-tenant setups requiring per-namespace metric isolation. Choose the mode that fits your isolation requirements. + +## Prometheus Integration + +### Operator Metrics + +The DocumentDB operator exposes a metrics endpoint via controller-runtime. By default: + +- **Bind address**: controlled by `--metrics-bind-address` (default `0`, disabled) +- **Secure mode**: `--metrics-secure=true` serves via HTTPS with authn/authz +- **Certificates**: supply `--metrics-cert-path` for custom TLS, otherwise self-signed certs are generated + +To enable metrics scraping, set the bind address in the operator deployment (for example, `:8443` for HTTPS or `:8080` for HTTP). + +### CNPG Cluster Metrics + +The underlying CloudNative-PG cluster exposes PostgreSQL metrics on each pod. These are collected by the OpenTelemetry Collector's `postgresql` receiver via direct SQL queries, or by the `prometheus` receiver via Kubernetes service discovery. Key metric sources: + +| Source | Method | Metrics | +|--------|--------|---------| +| kubelet/cAdvisor | `kubeletstats` receiver | Container CPU, memory, network, filesystem | +| PostgreSQL | `postgresql` receiver (SQL) | Backends, commits, rollbacks, replication lag, DB size | +| Gateway | OTLP push | Operations, latency, connections, request/response size | +| Kubernetes API | `k8s_cluster` receiver | Pod status, restart counts, resource requests/limits | + +### ServiceMonitor / PodMonitor + +The operator does not ship a metrics `Service` or `ServiceMonitor` by default. 
If you use the Prometheus Operator and want to scrape controller-runtime metrics, create a `Service` and `ServiceMonitor` matching your deployment. For example, with a Helm release named `documentdb`: + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: documentdb-operator-metrics + namespace: documentdb-operator + labels: + app: documentdb +spec: + selector: + app: documentdb # must match your Helm release name + ports: + - name: metrics + port: 8443 + targetPort: 8443 +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: documentdb-operator + namespace: documentdb-operator +spec: + selector: + matchLabels: + app: documentdb # must match the Service labels above + endpoints: + - port: metrics + scheme: https + tlsConfig: + insecureSkipVerify: true # use a proper CA bundle in production +``` + +!!! note + Adjust the `app` label to match your Helm release name. The operator must be started with `--metrics-bind-address=:8443` for the endpoint to be available. 
+ +## Key Metrics + +### Gateway Application Metrics + +These metrics are pushed via OTLP from the gateway sidecar to the OpenTelemetry Collector: + +| Metric | Description | +|--------|-------------| +| `db_client_operations_total` | Total MongoDB operations by command type | +| `db_client_operation_duration_seconds_total` | Cumulative operation latency | +| `gateway_client_connections_active` | Current active client connections | +| `gateway_client_connections_total` | Cumulative connections accepted | +| `db_client_connection_active` | Active backend pool connections | +| `db_client_connection_idle` | Idle backend pool connections | +| `db_client_connection_max` | Maximum backend pool size | +| `db_client_request_size_bytes_total` | Cumulative request payload size | +| `db_client_response_size_bytes_total` | Cumulative response payload size | + +### Container Resource Metrics + +| Metric | Description | Container | +|--------|-------------|-----------| +| `container_cpu_usage_seconds_total` | Cumulative CPU time consumed | postgres, documentdb-gateway | +| `container_memory_working_set_bytes` | Current memory usage | postgres, documentdb-gateway | +| `container_spec_memory_limit_bytes` | Memory limit | postgres, documentdb-gateway | +| `container_network_receive_bytes_total` | Network bytes received | pod-level | +| `container_fs_reads_bytes_total` | Filesystem read bytes | postgres | + +### Controller-Runtime Metrics + +| Metric | Description | +|--------|-------------| +| `controller_runtime_reconcile_total` | Total reconciliations by controller and result | +| `controller_runtime_reconcile_errors_total` | Total reconciliation errors | +| `controller_runtime_reconcile_time_seconds` | Reconciliation duration histogram | +| `workqueue_depth` | Current depth of the work queue | +| `workqueue_adds_total` | Total items added to the work queue | + +### PostgreSQL Metrics + +When using the OTel `postgresql` receiver or CNPG monitoring, additional PostgreSQL-level 
metrics are available: + +| Metric | Description | +|--------|-------------| +| `postgresql_backends` | Number of active backends | +| `postgresql_commits_total` | Total committed transactions | +| `postgresql_rollbacks_total` | Total rolled-back transactions | +| `postgresql_replication_data_delay` | Replication data delay (seconds) | +| `postgresql_db_size_bytes` | Database size | +| `cnpg_pg_replication_lag` | Replication lag in seconds (CNPG) | +| `cnpg_pg_stat_activity_count` | Number of active connections (CNPG) | + +For the full CNPG metrics reference, see the [CloudNative-PG monitoring documentation](https://cloudnative-pg.io/documentation/current/monitoring/). + +## Telemetry Playground + +The [`documentdb-playground/telemetry/`](https://github.com/documentdb/documentdb-kubernetes-operator/tree/main/documentdb-playground/telemetry) directory contains reference implementations: + +### Local (Kind) + +The `local/` subdirectory provides a self-contained local demo on a Kind cluster with: + +- 3-node DocumentDB HA cluster (1 primary + 2 streaming replicas) +- Full observability stack: OTel Collector, Prometheus, Tempo, Loki, Grafana +- Gateway metrics, traces, and logs via OTLP push +- PostgreSQL metrics via the OTel `postgresql` receiver +- System resource metrics via the `kubeletstats` receiver +- Pre-built Grafana dashboard with Gateway, PostgreSQL, and System Resources sections +- Traffic generator for demo workload + +```bash +cd documentdb-playground/telemetry/local/scripts/ + +# Create Kind cluster with local registry +./setup-kind.sh + +# Deploy operator, DocumentDB HA, and observability stack +# (see documentdb-playground/telemetry/local/README.md for full steps) +``` + +### Cloud (Multi-tenant) + +The cloud setup supports multi-tenant namespace isolation with: + +- Separate Prometheus + Grafana per team +- OpenTelemetry Collector configurations for cAdvisor metric scraping +- Automated Grafana dashboard provisioning scripts +- AKS cluster setup with 
the OpenTelemetry Operator + +```bash +cd documentdb-playground/telemetry/scripts/ + +# One-time infrastructure setup +./create-cluster.sh --install-all + +# Deploy multi-tenant DocumentDB + monitoring +./deploy-multi-tenant-telemetry.sh + +# Create Grafana dashboards +./setup-grafana-dashboards.sh sales-namespace + +# Access Grafana +kubectl port-forward -n sales-namespace svc/grafana-sales 3001:3000 & +``` + +See the [telemetry design document](https://github.com/documentdb/documentdb-kubernetes-operator/blob/main/documentdb-playground/telemetry/telemetry-design.md) for the full architecture rationale including DaemonSet vs. sidecar trade-offs, OTLP receiver plans, and future application-level metrics. + +## Verification + +After deploying the monitoring stack, confirm that metrics are flowing: + +```bash +# Check that the OpenTelemetry Collector pods are running +kubectl get pods -l app=otel-collector -n observability + +# Verify Prometheus is receiving metrics (port-forward first) +kubectl port-forward svc/prometheus 9090:9090 -n observability & +curl -s 'http://localhost:9090/api/v1/query?query=up' | jq '.data.result | length' + +# Confirm gateway metrics are present +curl -s 'http://localhost:9090/api/v1/query?query=db_client_operations_total' \ + | jq '.data.result | length' + +# Confirm PostgreSQL metrics are present +curl -s 'http://localhost:9090/api/v1/query?query=postgresql_backends' \ + | jq '.data.result | length' + +# Confirm kubeletstats metrics are present +curl -s 'http://localhost:9090/api/v1/query?query=k8s_pod_cpu_usage' \ + | jq '.data.result | length' +``` + +If no metrics appear, check: + +- The collector's service account has RBAC access to the kubelet metrics API (`nodes/stats` resource) +- The `ExternalName` service bridges the DocumentDB namespace to the collector namespace +- The sidecar injector is running and injecting `OTEL_EXPORTER_OTLP_ENDPOINT` into gateway containers +- Namespace label filters in the collector config match your 
DocumentDB namespace
+
+## Next Steps
+
+- [Metrics Reference](metrics.md) — detailed metric descriptions and PromQL query examples
+- [CloudNative-PG Monitoring](https://cloudnative-pg.io/documentation/current/monitoring/) — upstream PostgreSQL metrics
diff --git a/documentdb-playground/telemetry/local/README.md b/documentdb-playground/telemetry/local/README.md
new file mode 100644
index 00000000..8bdfeb49
--- /dev/null
+++ b/documentdb-playground/telemetry/local/README.md
@@ -0,0 +1,127 @@
+# DocumentDB Telemetry Playground (Local)
+
+A full observability stack for DocumentDB running on a local Kind cluster. Provides pre-configured Grafana dashboards with traces, metrics, and logs out of the box.
+
+## Prerequisites
+
+- **Docker** (running)
+- **kind** — [install](https://kind.sigs.k8s.io/docs/user/quick-start/#installation)
+- **kubectl**
+- **DocumentDB operator images** pushed to `localhost:5001`
+  - `localhost:5001/documentdb-gateway:latest`
+  - Operator + sidecar injector installed in the cluster
+
+## Quick Start
+
+```bash
+# Deploy everything (cluster + observability + DocumentDB + traffic)
+./scripts/deploy.sh
+
+# Access Grafana (admin/admin, anonymous access enabled)
+kubectl port-forward svc/grafana 3000:3000 -n observability --context kind-documentdb-telemetry
+
+# Access Prometheus
+kubectl port-forward svc/prometheus 9090:9090 -n observability --context kind-documentdb-telemetry
+
+# Tear down
+./scripts/teardown.sh
+```
+
+## Architecture
+
+```mermaid
+graph TB
+    subgraph cluster["Kind Cluster (documentdb-telemetry)"]
+        subgraph obs["observability namespace"]
+            collector["OTel Collector<br/>:4317 gRPC / :4318 HTTP / :8889 prom"]
+            tempo["Tempo<br/>(traces)"]
+            loki["Loki<br/>(logs)"]
+            prometheus["Prometheus<br/>(metrics)"]
+            grafana["Grafana<br/>:3000<br/>Gateway + Internals dashboards"]
+
+            collector -->|OTLP| tempo
+            collector -->|OTLP/HTTP| loki
+            collector -->|remote write| prometheus
+            prometheus --> grafana
+            tempo --> grafana
+            loki --> grafana
+        end
+
+        subgraph docdb["documentdb-preview-ns"]
+            subgraph pod1["Pod: preview-1 (primary)"]
+                pg1["PostgreSQL :5432"]
+                gw1["Gateway :10260"]
+            end
+            subgraph pod2["Pod: preview-2 (replica)"]
+                pg2["PostgreSQL :5432"]
+                gw2["Gateway :10260"]
+            end
+            subgraph pod3["Pod: preview-3 (replica)"]
+                pg3["PostgreSQL :5432"]
+                gw3["Gateway :10260"]
+            end
+            traffic_rw["Traffic Gen (RW)<br/>writes → primary"]
+            traffic_ro["Traffic Gen (RO)<br/>reads → replicas"]
+            bridge["ExternalName Service<br/>collector bridge"]
+        end
+
+        traffic_rw --> gw1
+        traffic_ro --> gw2
+        traffic_ro --> gw3
+        gw1 -.->|OTLP| bridge
+        gw2 -.->|OTLP| bridge
+        gw3 -.->|OTLP| bridge
+        bridge -.-> collector
+    end
+
+    user["Browser"] --> grafana
+```
+
+## Directory Layout
+
+```
+local/
+├── scripts/
+│   ├── setup-kind.sh   # Creates Kind cluster + local registry
+│   ├── deploy.sh       # One-command full deployment
+│   └── teardown.sh     # Deletes cluster and proxy containers
+├── k8s/
+│   ├── observability/  # Namespace, Tempo, Loki, Prometheus, OTel Collector, Grafana
+│   ├── documentdb/     # DocumentDB CR, credentials, collector bridge
+│   └── traffic/        # Traffic generator services + jobs
+└── dashboards/
+    ├── gateway.json    # Gateway-level metrics dashboard
+    └── internals.json  # Internal metrics dashboard
+```
+
+## Dashboards
+
+| Dashboard | Description |
+|-----------|-------------|
+| **Gateway** | Request rates, latency (p50/p95/p99), error rates, active connections, command breakdown by type |
+| **Internals** | PostgreSQL metrics, container resource usage (CPU/memory), OTel Collector pipeline stats |
+
+Dashboards are automatically provisioned into Grafana on startup via ConfigMap mounts. Edits made in the Grafana UI are kept only until the pod restarts; to make a change permanent, update the JSON files in `dashboards/` and re-apply.
+
+## Restarting Traffic Generators
+
+Traffic generators run as Kubernetes Jobs. To restart them:
+
+```bash
+CONTEXT="kind-documentdb-telemetry"
+NS="documentdb-preview-ns"
+
+# Delete completed jobs
+kubectl delete job traffic-generator-rw traffic-generator-ro -n $NS --context $CONTEXT --ignore-not-found
+
+# Re-apply
+kubectl apply -f k8s/traffic/ --context $CONTEXT
+```
+
+## Teardown
+
+```bash
+./scripts/teardown.sh
+```
+
+This deletes the Kind cluster and any proxy containers. The local Docker registry is kept for reuse. 
diff --git a/documentdb-playground/telemetry/local/dashboards/gateway.json b/documentdb-playground/telemetry/local/dashboards/gateway.json new file mode 100644 index 00000000..0dba9430 --- /dev/null +++ b/documentdb-playground/telemetry/local/dashboards/gateway.json @@ -0,0 +1,1051 @@ +{ + "uid": "documentdb-gateway", + "title": "DocumentDB Gateway", + "tags": [ + "documentdb", + "gateway" + ], + "timezone": "browser", + "editable": true, + "graphTooltip": 1, + "time": { + "from": "now-1h", + "to": "now" + }, + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(gateway_client_connections_total, service_instance_id)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "instance", + "options": [], + "query": { + "query": "label_values(gateway_client_connections_total, service_instance_id)" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query", + "label": "Instance" + } + ] + }, + "links": [ + { + "title": "DocumentDB Internals", + "type": "link", + "icon": "bolt", + "url": "/d/documentdb-internals/documentdb-internals", + "tooltip": "Database & Infrastructure metrics" + } + ], + "panels": [ + { + "type": "stat", + "title": "Operations/sec", + "id": 1, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 5000 + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "refId": "A", + "expr": 
"sum(rate(db_client_operations_total{service_instance_id=~\"$instance\"}[1m]))", + "legendFormat": "ops/sec" + } + ] + }, + { + "type": "stat", + "title": "Avg Latency (ms)", + "id": 2, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "ms", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 500 + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(db_client_operation_duration_seconds_total{service_instance_id=~\"$instance\",db_operation_phase=\"\"}[1m])) / sum(rate(db_client_operations_total{service_instance_id=~\"$instance\"}[1m])) * 1000", + "legendFormat": "avg latency" + } + ] + }, + { + "type": "stat", + "title": "Error Rate %", + "id": 3, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(db_client_operations_total{service_instance_id=~\"$instance\",error_type!=\"\"}[1m])) / sum(rate(db_client_operations_total{service_instance_id=~\"$instance\"}[1m])) * 100", + "legendFormat": "error rate" + } + ] + }, + { + "type": "stat", + "title": "Active Connections", + "id": 5, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "datasource": { + "type": "prometheus", + 
"uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "spanNulls": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 100 + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "graphMode": "area", + "colorMode": "value", + "textMode": "auto" + }, + "targets": [ + { + "refId": "A", + "expr": "gateway_client_connections_active{service_instance_id=~\"$instance\"}", + "legendFormat": "{{service_instance_id}}" + } + ] + }, + { + "type": "table", + "title": "Recent Traces", + "id": 19, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 4 + }, + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "targets": [ + { + "refId": "A", + "queryType": "traceql", + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "query": "{resource.service.name=\"documentdb_gateway\"}", + "limit": 20, + "tableType": "traces" + } + ], + "options": { + "showHeader": true, + "cellHeight": "sm", + "footer": { + "show": false + } + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + } + }, + { + "type": "row", + "title": "Traffic & Performance", + "collapsed": true, + "panels": [ + { + "type": "timeseries", + "title": "Ops/sec by Operation", + "id": 14, + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "spanNulls": false + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "refId": "A", + "expr": "sum by 
(db_operation_name) (rate(db_client_operations_total{service_instance_id=~\"$instance\"}[1m]))", + "legendFormat": "{{db_operation_name}}" + } + ] + }, + { + "type": "timeseries", + "title": "Latency by Operation", + "id": 15, + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "spanNulls": false + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "refId": "A", + "expr": "sum by (db_operation_name) (rate(db_client_operation_duration_seconds_total{service_instance_id=~\"$instance\",db_operation_phase=\"\"}[1m])) / sum by (db_operation_name) (rate(db_client_operations_total{service_instance_id=~\"$instance\"}[1m])) * 1000", + "legendFormat": "{{db_operation_name}}" + } + ] + }, + { + "type": "timeseries", + "title": "Documents/sec", + "id": 8, + "gridPos": { + "x": 0, + "y": 8, + "w": 8, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "docs/s", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "spanNulls": false + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(db_client_documents_returned_total{service_instance_id=~\"$instance\"}[1m]))", + "legendFormat": "returned" + }, + { + "refId": "B", + "expr": "sum(rate(db_client_documents_inserted_total{service_instance_id=~\"$instance\"}[1m]))", + "legendFormat": "inserted" + }, + { + "refId": "C", + "expr": "sum(rate(db_client_documents_updated_total{service_instance_id=~\"$instance\"}[1m]))", + 
"legendFormat": "updated" + }, + { + "refId": "D", + "expr": "sum(rate(db_client_documents_deleted_total{service_instance_id=~\"$instance\"}[1m]))", + "legendFormat": "deleted" + } + ] + }, + { + "type": "timeseries", + "title": "Documents/sec by Collection", + "id": 9, + "gridPos": { + "x": 8, + "y": 8, + "w": 8, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "docs/s", + "custom": { + "lineWidth": 1, + "fillOpacity": 30, + "showPoints": "never", + "stacking": { + "mode": "normal", + "group": "A" + } + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (db_collection_name) (rate(db_client_documents_returned_total{service_instance_id=~\"$instance\"}[1m]) + rate(db_client_documents_inserted_total{service_instance_id=~\"$instance\"}[1m]))", + "legendFormat": "{{db_collection_name}}", + "refId": "A" + } + ] + }, + { + "type": "timeseries", + "title": "Request & Response Size", + "id": 16, + "gridPos": { + "x": 16, + "y": 8, + "w": 8, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "spanNulls": false + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(db_client_request_size_bytes_total{service_instance_id=~\"$instance\"}[1m]))", + "legendFormat": "request" + }, + { + "refId": "B", + "expr": "sum(rate(db_client_response_size_bytes_total{service_instance_id=~\"$instance\"}[1m]))", + "legendFormat": 
"response" + } + ] + }, + { + "type": "timeseries", + "title": "Errors by Type", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (error_type) (rate(db_client_operations_total{service_instance_id=~\"$instance\", error_type!=\"\"}[5m]))", + "legendFormat": "{{error_type}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "lineWidth": 1, + "fillOpacity": 30, + "showPoints": "never", + "stacking": { + "mode": "normal", + "group": "A" + } + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "gridPos": { + "x": 0, + "y": 16, + "w": 24, + "h": 8 + } + } + ], + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + } + }, + { + "type": "row", + "title": "Connections", + "collapsed": true, + "panels": [ + { + "type": "timeseries", + "title": "Active Client Connections", + "id": 5, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "spanNulls": false + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "refId": "A", + "expr": "gateway_client_connections_active{service_instance_id=~\"$instance\"}", + "legendFormat": "{{service_instance_id}}" + } + ] + }, + { + "type": "timeseries", + "title": "Client Connection Rate", + "id": 21, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "conn/s", + 
"custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "spanNulls": false + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "refId": "A", + "expr": "rate(gateway_client_connections_total{service_instance_id=~\"$instance\"}[1m])", + "legendFormat": "{{service_instance_id}}" + } + ] + }, + { + "type": "timeseries", + "title": "Pool Connections", + "id": 6, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 30, + "spanNulls": false, + "stacking": { + "mode": "normal", + "group": "A" + }, + "lineWidth": 1 + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*active" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*idle" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "blue" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*waiting" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "orange" + } + } + ] + } + ] + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "refId": "A", + "expr": "sum by (service_instance_id) (db_client_connection_active{service_instance_id=~\"$instance\"})", + "legendFormat": "{{service_instance_id}} active" + }, + { + "refId": "B", + "expr": "sum by (service_instance_id) (db_client_connection_idle{service_instance_id=~\"$instance\"})", + "legendFormat": "{{service_instance_id}} idle" + }, + { + "refId": 
"C", + "expr": "sum by (service_instance_id) (db_client_connection_waiting{service_instance_id=~\"$instance\"})", + "legendFormat": "{{service_instance_id}} waiting" + } + ] + }, + { + "type": "timeseries", + "title": "Pool Utilization %", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (service_instance_id) (db_client_connection_active{service_instance_id=~\"$instance\"}) / sum by (service_instance_id) (db_client_connection_active{service_instance_id=~\"$instance\"} + db_client_connection_idle{service_instance_id=~\"$instance\"} + db_client_connection_waiting{service_instance_id=~\"$instance\"}) * 100", + "legendFormat": "{{service_instance_id}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "gridPos": { + "x": 0, + "y": 8, + "w": 24, + "h": 6 + } + } + ], + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + } + }, + { + "type": "row", + "title": "Resource Usage", + "collapsed": true, + "panels": [ + { + "type": "timeseries", + "title": "Gateway CPU", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "container_cpu_usage{k8s_namespace_name=\"documentdb-preview-ns\", k8s_container_name=\"documentdb-gateway\"}", + "legendFormat": "{{k8s_pod_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + }, + "min": 0 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", 
+ "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "gridPos": { + "x": 0, + "y": 0, + "w": 8, + "h": 8 + } + }, + { + "type": "timeseries", + "title": "Gateway Memory", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "container_memory_working_set_bytes{k8s_namespace_name=\"documentdb-preview-ns\", k8s_container_name=\"documentdb-gateway\"}", + "legendFormat": "{{k8s_pod_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "gridPos": { + "x": 8, + "y": 0, + "w": 8, + "h": 8 + } + }, + { + "type": "timeseries", + "title": "Pod Network I/O", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(k8s_pod_network_io_bytes_total{k8s_namespace_name=\"documentdb-preview-ns\", direction=\"receive\"}[1m])", + "legendFormat": "{{k8s_pod_name}} rx", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(k8s_pod_network_io_bytes_total{k8s_namespace_name=\"documentdb-preview-ns\", direction=\"transmit\"}[1m])", + "legendFormat": "{{k8s_pod_name}} tx", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + 
}, + "gridPos": { + "x": 16, + "y": 0, + "w": 8, + "h": 8 + } + } + ], + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + } + } + ] +} \ No newline at end of file diff --git a/documentdb-playground/telemetry/local/dashboards/internals.json b/documentdb-playground/telemetry/local/dashboards/internals.json new file mode 100644 index 00000000..d8fba6e8 --- /dev/null +++ b/documentdb-playground/telemetry/local/dashboards/internals.json @@ -0,0 +1,1166 @@ +{ + "uid": "documentdb-internals", + "title": "DocumentDB Internals", + "tags": [ + "documentdb", + "internals", + "postgresql" + ], + "timezone": "browser", + "editable": true, + "graphTooltip": 1, + "time": { + "from": "now-1h", + "to": "now" + }, + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(gateway_client_connections_total, service_instance_id)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "instance", + "options": [], + "query": { + "query": "label_values(gateway_client_connections_total, service_instance_id)" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query", + "label": "Instance" + } + ] + }, + "links": [ + { + "title": "DocumentDB Gateway", + "type": "link", + "icon": "apps", + "url": "/d/documentdb-gateway/documentdb-gateway", + "tooltip": "Gateway operations & performance" + } + ], + "panels": [ + { + "type": "row", + "title": "Database", + "collapsed": true, + "panels": [ + { + "type": "stat", + "title": "PG Backends", + "id": 10, + "gridPos": { + "x": 0, + "y": 0, + "w": 8, + "h": 4 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 
100 + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "refId": "A", + "expr": "postgresql_backends", + "legendFormat": "{{postgresql_database_name}}" + } + ] + }, + { + "type": "stat", + "title": "PG Max Connections", + "id": 11, + "gridPos": { + "x": 8, + "y": 0, + "w": 8, + "h": 4 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 80 + }, + { + "color": "red", + "value": 150 + } + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "colorMode": "value", + "graphMode": "none" + }, + "targets": [ + { + "refId": "A", + "expr": "postgresql_connection_max", + "legendFormat": "{{postgresql_database_name}}" + } + ] + }, + { + "type": "timeseries", + "title": "PG Backends Over Time", + "id": 22, + "gridPos": { + "x": 0, + "y": 4, + "w": 12, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "spanNulls": false, + "showPoints": "never" + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "refId": "A", + "expr": "postgresql_backends", + "legendFormat": "{{postgresql_database_name}}" + } + ] + }, + { + "type": "timeseries", + "title": "PG Replication Lag", + "id": 12, + "gridPos": { + "x": 12, + "y": 4, + "w": 12, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "bytes", + "noValue": "N/A (single node)", + "custom": { + "drawStyle": "line", 
+ "lineInterpolation": "smooth", + "fillOpacity": 10, + "spanNulls": false, + "showPoints": "never", + "thresholdsStyle": { + "mode": "line+area" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1048576 + }, + { + "color": "red", + "value": 10485760 + } + ] + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "postgresql_replication_data_delay_bytes", + "legendFormat": "{{replication_client}}", + "refId": "A" + } + ] + }, + { + "type": "timeseries", + "title": "PG Commits/Rollbacks", + "id": 13, + "gridPos": { + "x": 0, + "y": 12, + "w": 8, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "spanNulls": false, + "showPoints": "never" + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "refId": "A", + "expr": "rate(postgresql_commits_total[1m])", + "legendFormat": "commits" + }, + { + "refId": "B", + "expr": "rate(postgresql_rollbacks_total[1m])", + "legendFormat": "rollbacks" + } + ] + }, + { + "type": "timeseries", + "title": "PG Operations/sec", + "id": 24, + "gridPos": { + "x": 8, + "y": 12, + "w": 8, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "spanNulls": false, + "showPoints": "never" + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + 
"displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "refId": "A", + "expr": "sum by (operation) (rate(postgresql_operations_total[1m]))", + "legendFormat": "{{operation}}" + } + ] + }, + { + "type": "timeseries", + "title": "PG Database Size", + "id": 23, + "gridPos": { + "x": 16, + "y": 12, + "w": 8, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { + "drawStyle": "line", + "lineInterpolation": "smooth", + "fillOpacity": 10, + "spanNulls": false, + "showPoints": "never" + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom" + } + }, + "targets": [ + { + "refId": "A", + "expr": "postgresql_db_size_bytes", + "legendFormat": "{{postgresql_database_name}}" + } + ] + }, + { + "type": "bargauge", + "title": "Index Size by Table", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (postgresql_table_name) (postgresql_index_size_bytes)", + "legendFormat": "{{postgresql_table_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "horizontal", + "displayMode": "gradient" + }, + "gridPos": { + "x": 0, + "y": 28, + "w": 8, + "h": 8 + } + }, + { + "type": "bargauge", + "title": "Indexes per Table", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "count by (postgresql_table_name) (postgresql_index_size_bytes)", + "legendFormat": 
"{{postgresql_table_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "horizontal", + "displayMode": "gradient" + }, + "gridPos": { + "x": 8, + "y": 28, + "w": 8, + "h": 8 + } + }, + { + "type": "timeseries", + "title": "Index Scans/sec", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "topk(10, rate(postgresql_index_scans_total[5m]))", + "legendFormat": "{{postgresql_table_name}}/{{postgresql_index_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops", + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "gridPos": { + "x": 16, + "y": 28, + "w": 8, + "h": 8 + } + }, + { + "type": "stat", + "title": "PG Connection Utilization", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "postgresql_backends / postgresql_connection_max * 100", + "legendFormat": "{{postgresql_database_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 60 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "graphMode": 
"area", + "colorMode": "value" + }, + "gridPos": { + "x": 16, + "y": 0, + "w": 8, + "h": 4 + } + }, + { + "type": "timeseries", + "title": "WAL Age", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "postgresql_wal_age_seconds", + "legendFormat": "WAL age", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "gridPos": { + "x": 0, + "y": 20, + "w": 8, + "h": 8 + } + }, + { + "type": "timeseries", + "title": "Vacuum Count by Table", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "topk(10, increase(postgresql_table_vacuum_count_total[1h]))", + "legendFormat": "{{postgresql_table_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "gridPos": { + "x": 8, + "y": 20, + "w": 8, + "h": 8 + } + } + ], + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + } + }, + { + "type": "row", + "title": "Infrastructure", + "collapsed": true, + "panels": [ + { + "type": "timeseries", + "title": "Pod CPU Usage", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"expr": "k8s_pod_cpu_usage{k8s_namespace_name=\"documentdb-preview-ns\"}", + "legendFormat": "{{k8s_pod_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": "timeseries", + "title": "Pod Memory (Working Set)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "k8s_pod_memory_working_set_bytes{k8s_namespace_name=\"documentdb-preview-ns\"}", + "legendFormat": "{{k8s_pod_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": "timeseries", + "title": "Container CPU Usage", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "container_cpu_usage{k8s_namespace_name=\"documentdb-preview-ns\"}", + "legendFormat": "{{k8s_pod_name}}/{{k8s_container_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + } + } + }, 
+ { + "type": "timeseries", + "title": "Container Memory RSS", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "container_memory_rss_bytes{k8s_namespace_name=\"documentdb-preview-ns\"}", + "legendFormat": "{{k8s_pod_name}}/{{k8s_container_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": "timeseries", + "title": "Container Memory Working Set vs Available", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "container_memory_working_set_bytes{k8s_namespace_name=\"documentdb-preview-ns\"}", + "legendFormat": "{{k8s_pod_name}}/{{k8s_container_name}} working_set", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "container_memory_available_bytes{k8s_namespace_name=\"documentdb-preview-ns\"}", + "legendFormat": "{{k8s_pod_name}}/{{k8s_container_name}} available", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": "timeseries", + "title": "Container Page Faults", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(container_memory_major_page_faults_ratio{k8s_namespace_name=\"documentdb-preview-ns\"}[5m])", + "legendFormat": "{{k8s_pod_name}}/{{k8s_container_name}} major", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(container_memory_page_faults_ratio{k8s_namespace_name=\"documentdb-preview-ns\"}[5m])", + "legendFormat": "{{k8s_pod_name}}/{{k8s_container_name}} all", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "cps", + "custom": { + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": "timeseries", + "title": "Pod Network I/O", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(k8s_pod_network_io_bytes_total{k8s_namespace_name=\"documentdb-preview-ns\", direction=\"receive\"}[1m])", + "legendFormat": "{{k8s_pod_name}} rx", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(k8s_pod_network_io_bytes_total{k8s_namespace_name=\"documentdb-preview-ns\", direction=\"transmit\"}[1m])", + "legendFormat": "{{k8s_pod_name}} tx", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "custom": { + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + } + } + }, + { + "type": 
"timeseries", + "title": "Pod Filesystem Usage", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "k8s_pod_filesystem_usage_bytes{k8s_namespace_name=\"documentdb-preview-ns\"}", + "legendFormat": "{{k8s_pod_name}} used", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "k8s_pod_filesystem_available_bytes{k8s_namespace_name=\"documentdb-preview-ns\"}", + "legendFormat": "{{k8s_pod_name}} avail", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "custom": { + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + } + } + } + ], + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + } + } + ] +} \ No newline at end of file diff --git a/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml b/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml new file mode 100644 index 00000000..f0f294af --- /dev/null +++ b/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: documentdb-preview-ns +--- +apiVersion: v1 +kind: Secret +metadata: + name: documentdb-credentials + namespace: documentdb-preview-ns +type: Opaque +stringData: + username: demo_user + password: DemoPassword100 +--- +apiVersion: documentdb.io/preview +kind: DocumentDB +metadata: + name: documentdb-preview + namespace: documentdb-preview-ns +spec: + nodeCount: 1 + instancesPerNode: 3 + documentDbCredentialSecret: documentdb-credentials + gatewayImage: "localhost:5001/documentdb-gateway:latest" + resource: + storage: + pvcSize: 5Gi + sidecarInjectorPluginName: 
cnpg-i-sidecar-injector.documentdb.io diff --git a/documentdb-playground/telemetry/local/k8s/documentdb/collector-bridge.yaml b/documentdb-playground/telemetry/local/k8s/documentdb/collector-bridge.yaml new file mode 100644 index 00000000..80072989 --- /dev/null +++ b/documentdb-playground/telemetry/local/k8s/documentdb/collector-bridge.yaml @@ -0,0 +1,15 @@ +# ExternalName service bridges the DocumentDB namespace to the observability namespace. +# The sidecar injector sets OTEL_EXPORTER_OTLP_ENDPOINT to: +# http://documentdb-preview-collector.documentdb-preview-ns.svc.cluster.local:4317 +# This service routes that to the actual OTel Collector in the observability namespace. +apiVersion: v1 +kind: Service +metadata: + name: documentdb-preview-collector + namespace: documentdb-preview-ns +spec: + type: ExternalName + externalName: otel-collector.observability.svc.cluster.local + ports: + - name: otlp-grpc + port: 4317 diff --git a/documentdb-playground/telemetry/local/k8s/observability/grafana.yaml b/documentdb-playground/telemetry/local/k8s/observability/grafana.yaml new file mode 100644 index 00000000..73caddcb --- /dev/null +++ b/documentdb-playground/telemetry/local/k8s/observability/grafana.yaml @@ -0,0 +1,105 @@ +# ============================================================ +# Grafana - Dashboards & Visualization +# ============================================================ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana + namespace: observability +spec: + replicas: 1 + selector: + matchLabels: + app: grafana + template: + metadata: + labels: + app: grafana + spec: + containers: + - name: grafana + image: grafana/grafana:11.6.0 + ports: + - containerPort: 3000 + env: + - name: GF_SECURITY_ADMIN_PASSWORD + value: admin + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "true" + - name: GF_AUTH_ANONYMOUS_ORG_ROLE + value: Admin + volumeMounts: + - name: datasources + mountPath: /etc/grafana/provisioning/datasources + - name: dashboard-provisioning 
+ mountPath: /etc/grafana/provisioning/dashboards + - name: dashboards + mountPath: /var/lib/grafana/dashboards + volumes: + - name: datasources + configMap: + name: grafana-datasources + - name: dashboard-provisioning + configMap: + name: grafana-dashboard-provisioning + - name: dashboards + configMap: + name: grafana-dashboards +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasources + namespace: observability +data: + datasources.yaml: | + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + uid: prometheus + access: proxy + url: http://prometheus.observability.svc:9090 + isDefault: true + - name: Tempo + type: tempo + uid: tempo + access: proxy + url: http://tempo.observability.svc:3200 + - name: Loki + type: loki + uid: loki + access: proxy + url: http://loki.observability.svc:3100 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-provisioning + namespace: observability +data: + dashboards.yaml: | + apiVersion: 1 + providers: + - name: default + orgId: 1 + folder: DocumentDB + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana + namespace: observability +spec: + selector: + app: grafana + type: NodePort + ports: + - port: 3000 + nodePort: 30300 diff --git a/documentdb-playground/telemetry/local/k8s/observability/loki.yaml b/documentdb-playground/telemetry/local/k8s/observability/loki.yaml new file mode 100644 index 00000000..b6eee301 --- /dev/null +++ b/documentdb-playground/telemetry/local/k8s/observability/loki.yaml @@ -0,0 +1,74 @@ +# ============================================================ +# Loki - Log Aggregation Backend +# ============================================================ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: loki + namespace: observability +spec: + replicas: 1 + selector: + matchLabels: + app: loki + template: + 
metadata: + labels: + app: loki + spec: + containers: + - name: loki + image: grafana/loki:3.5.0 + args: ["-config.file=/etc/loki/loki.yaml"] + ports: + - containerPort: 3100 + volumeMounts: + - name: config + mountPath: /etc/loki + volumes: + - name: config + configMap: + name: loki-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: loki-config + namespace: observability +data: + loki.yaml: | + auth_enabled: false + server: + http_listen_port: 3100 + common: + ring: + instance_addr: 127.0.0.1 + kvstore: + store: inmemory + replication_factor: 1 + path_prefix: /tmp/loki + schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + storage_config: + filesystem: + directory: /tmp/loki/chunks + limits_config: + allow_structured_metadata: true +--- +apiVersion: v1 +kind: Service +metadata: + name: loki + namespace: observability +spec: + selector: + app: loki + ports: + - port: 3100 diff --git a/documentdb-playground/telemetry/local/k8s/observability/namespace.yaml b/documentdb-playground/telemetry/local/k8s/observability/namespace.yaml new file mode 100644 index 00000000..4f75b8c5 --- /dev/null +++ b/documentdb-playground/telemetry/local/k8s/observability/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: observability diff --git a/documentdb-playground/telemetry/local/k8s/observability/otel-collector.yaml b/documentdb-playground/telemetry/local/k8s/observability/otel-collector.yaml new file mode 100644 index 00000000..5e6919d3 --- /dev/null +++ b/documentdb-playground/telemetry/local/k8s/observability/otel-collector.yaml @@ -0,0 +1,175 @@ +# ============================================================ +# OpenTelemetry Collector - DaemonSet (hostPort 4412) +# ============================================================ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: observability +--- +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: otel-collector +rules: + - apiGroups: [""] + resources: ["nodes", "nodes/proxy", "nodes/metrics", "nodes/stats", "pods", "services", "endpoints"] + verbs: ["get", "list", "watch"] + - nonResourceURLs: ["/metrics", "/metrics/cadvisor"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: otel-collector +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: otel-collector +subjects: + - kind: ServiceAccount + name: otel-collector + namespace: observability +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otel-collector + namespace: observability +spec: + selector: + matchLabels: + app: otel-collector + template: + metadata: + labels: + app: otel-collector + spec: + serviceAccountName: otel-collector + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:0.143.1 + args: ["--config=/etc/otel/config.yaml"] + ports: + - containerPort: 4317 + protocol: TCP + - containerPort: 4318 + protocol: TCP + - containerPort: 8889 + env: + - name: PG_MONITOR_USER + value: postgres + - name: PG_MONITOR_PASSWORD + value: "unused" + - name: K8S_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: config + mountPath: /etc/otel + volumes: + - name: config + configMap: + name: otel-collector-config +--- +apiVersion: v1 +kind: Service +metadata: + name: otel-collector + namespace: observability +spec: + selector: + app: otel-collector + ports: + - name: otlp-grpc + port: 4317 + - name: otlp-http + port: 4318 + - name: prometheus + port: 8889 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-collector-config + namespace: observability +data: + config.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + postgresql: + endpoint: documentdb-preview-rw.documentdb-preview-ns.svc.cluster.local:5432 + username: 
postgres + password: "unused" + databases: + - postgres + collection_interval: 10s + tls: + insecure: true + + kubeletstats: + auth_type: serviceAccount + collection_interval: 15s + endpoint: "https://${env:K8S_NODE_NAME}:10250" + insecure_skip_verify: true + metric_groups: + - node + - pod + - container + extra_metadata_labels: + - container.id + - k8s.volume.type + k8s_api_config: + auth_type: serviceAccount + + processors: + batch: + timeout: 5s + send_batch_size: 512 + + resource: + attributes: + - key: deployment.environment + value: development + action: upsert + + exporters: + otlp/tempo: + endpoint: tempo.observability.svc:4417 + tls: + insecure: true + + otlphttp/loki: + endpoint: http://loki.observability.svc:3100/otlp + tls: + insecure: true + + prometheus: + endpoint: 0.0.0.0:8889 + resource_to_telemetry_conversion: + enabled: true + + debug: + verbosity: basic + + service: + pipelines: + traces: + receivers: [otlp] + processors: [batch, resource] + exporters: [otlp/tempo] + metrics: + receivers: [otlp, postgresql, kubeletstats] + processors: [batch, resource] + exporters: [prometheus] + logs: + receivers: [otlp] + processors: [batch, resource] + exporters: [otlphttp/loki] diff --git a/documentdb-playground/telemetry/local/k8s/observability/prometheus.yaml b/documentdb-playground/telemetry/local/k8s/observability/prometheus.yaml new file mode 100644 index 00000000..b6183272 --- /dev/null +++ b/documentdb-playground/telemetry/local/k8s/observability/prometheus.yaml @@ -0,0 +1,59 @@ +# ============================================================ +# Prometheus - Metrics Store +# ============================================================ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + namespace: observability +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + containers: + - name: prometheus + image: prom/prometheus:v3.3.0 + args: + - 
--config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.retention.time=1d + - --web.enable-lifecycle + ports: + - containerPort: 9090 + volumeMounts: + - name: config + mountPath: /etc/prometheus + volumes: + - name: config + configMap: + name: prometheus-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: observability +data: + prometheus.yml: | + global: + scrape_interval: 15s + scrape_configs: + - job_name: otel-collector + static_configs: + - targets: ['otel-collector.observability.svc:8889'] +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: observability +spec: + selector: + app: prometheus + ports: + - port: 9090 diff --git a/documentdb-playground/telemetry/local/k8s/observability/tempo.yaml b/documentdb-playground/telemetry/local/k8s/observability/tempo.yaml new file mode 100644 index 00000000..49ecdb07 --- /dev/null +++ b/documentdb-playground/telemetry/local/k8s/observability/tempo.yaml @@ -0,0 +1,76 @@ +# ============================================================ +# Tempo - Distributed Tracing Backend +# ============================================================ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tempo + namespace: observability +spec: + replicas: 1 + selector: + matchLabels: + app: tempo + template: + metadata: + labels: + app: tempo + spec: + containers: + - name: tempo + image: grafana/tempo:2.7.2 + args: ["-config.file=/etc/tempo/tempo.yaml"] + ports: + - containerPort: 3200 # HTTP + - containerPort: 4417 # OTLP gRPC (non-standard to avoid conflict) + volumeMounts: + - name: config + mountPath: /etc/tempo + volumes: + - name: config + configMap: + name: tempo-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: tempo-config + namespace: observability +data: + tempo.yaml: | + server: + http_listen_port: 3200 + distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4417 + ingester: + trace_idle_period: 10s + 
max_block_bytes: 1048576 + max_block_duration: 5m + compactor: + compaction: + block_retention: 1h + storage: + trace: + backend: local + local: + path: /tmp/tempo/blocks + wal: + path: /tmp/tempo/wal +--- +apiVersion: v1 +kind: Service +metadata: + name: tempo + namespace: observability +spec: + selector: + app: tempo + ports: + - name: http + port: 3200 + - name: otlp-grpc + port: 4417 diff --git a/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml b/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml new file mode 100644 index 00000000..ebd1ad48 --- /dev/null +++ b/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml @@ -0,0 +1,269 @@ +--- +# Service to expose gateway port 10260 on PRIMARY +apiVersion: v1 +kind: Service +metadata: + name: documentdb-preview-gateway + namespace: documentdb-preview-ns +spec: + selector: + cnpg.io/cluster: documentdb-preview + cnpg.io/instanceRole: primary + ports: + - name: gateway + port: 10260 + targetPort: 10260 + type: ClusterIP +--- +# Service to expose gateway port 10260 on REPLICAS +apiVersion: v1 +kind: Service +metadata: + name: documentdb-preview-gateway-ro + namespace: documentdb-preview-ns +spec: + selector: + cnpg.io/cluster: documentdb-preview + cnpg.io/instanceRole: replica + ports: + - name: gateway + port: 10260 + targetPort: 10260 + type: ClusterIP +--- +# Traffic generator - writes to primary, reads to replicas +apiVersion: v1 +kind: ConfigMap +metadata: + name: traffic-generator-script + namespace: documentdb-preview-ns +data: + generate-traffic.js: | + // Traffic generator for DocumentDB telemetry demo + // Writes go to primary (RW), reads go to replicas (RO) + + const DB_NAME = "telemetry_demo"; + const COLLECTION = "events"; + const BATCH_SIZE = 5; + const ITERATIONS = 2250; + const SLEEP_MS = 800; + + // This script runs on the PRIMARY connection + db = db.getSiblingDB(DB_NAME); + + print("=== DocumentDB Traffic Generator (Primary - Writes) 
==="); + print(`Target: ${DB_NAME}.${COLLECTION}`); + print(`Iterations: ${ITERATIONS}, Batch: ${BATCH_SIZE}, Sleep: ${SLEEP_MS}ms`); + + const categories = ["auth", "api", "database", "network", "system"]; + const severities = ["info", "warn", "error", "critical"]; + const sources = ["web-server", "api-gateway", "worker", "scheduler", "monitor"]; + + function randomChoice(arr) { + return arr[Math.floor(Math.random() * arr.length)]; + } + + function generateEvent() { + return { + timestamp: new Date(), + category: randomChoice(categories), + severity: randomChoice(severities), + source: randomChoice(sources), + message: "Event " + Math.random().toString(36).substring(7), + duration_ms: Math.floor(Math.random() * 2000), + statusCode: randomChoice([200, 200, 200, 201, 400, 404, 500]) + }; + } + + for (let i = 0; i < ITERATIONS; i++) { + try { + // WRITES: insert documents + for (let j = 0; j < BATCH_SIZE; j++) { + db[COLLECTION].insertOne(generateEvent()); + } + + // WRITES: update + db[COLLECTION].updateMany( + { severity: "info", source: randomChoice(sources) }, + { $set: { processed: true } } + ); + + // READ on primary too (some mixed workload) + db[COLLECTION].countDocuments({ source: randomChoice(sources) }); + + // ERROR GENERATORS (~10% of iterations) + if (i % 10 === 0) { + try { + // Invalid update operator + db[COLLECTION].updateOne({ _id: 1 }, { $badOp: { x: 1 } }); + } catch (e) { /* expected */ } + + try { + // Duplicate key on unique index (if exists) + db[COLLECTION].insertOne({ _id: "deliberate-dup-" + (i % 3) }); + } catch (e) { /* expected */ } + + try { + // Query non-existent collection with invalid pipeline + db.getSiblingDB("telemetry_demo")["no_such_coll"].aggregate([ + { $merge: { into: { db: "admin", coll: "forbidden" } } } + ]).toArray(); + } catch (e) { /* expected */ } + + try { + // Invalid regex + db[COLLECTION].find({ message: { $regex: "[invalid" } }).toArray(); + } catch (e) { /* expected */ } + } + + // Periodic cleanup + if (i % 
100 === 0 && i > 0) { + db[COLLECTION].deleteMany({ processed: true }); + print(`[${i}/${ITERATIONS}] Cleanup done`); + } + + if (i % 25 === 0) { + print(`[${i}/${ITERATIONS}] OK`); + } + + sleep(SLEEP_MS); + } catch (e) { + print(`[${i}/${ITERATIONS}] Error: ${e.message}`); + sleep(2000); + } + } + + print("=== Primary traffic complete ==="); + + generate-reads.js: | + // Read-only traffic for replica instances + // Runs against the gateway-ro service (load-balanced across replicas) + + const DB_NAME = "telemetry_demo"; + const COLLECTION = "events"; + const ITERATIONS = 2250; + const SLEEP_MS = 600; + + db = db.getSiblingDB(DB_NAME); + + print("=== DocumentDB Traffic Generator (Replicas - Reads) ==="); + print(`Target: ${DB_NAME}.${COLLECTION}`); + print(`Iterations: ${ITERATIONS}, Sleep: ${SLEEP_MS}ms`); + + const categories = ["auth", "api", "database", "network", "system"]; + const severities = ["info", "warn", "error", "critical"]; + const sources = ["web-server", "api-gateway", "worker", "scheduler", "monitor"]; + + function randomChoice(arr) { + return arr[Math.floor(Math.random() * arr.length)]; + } + + for (let i = 0; i < ITERATIONS; i++) { + try { + // Simple find + db[COLLECTION].find({ category: randomChoice(categories) }).limit(10).toArray(); + + // Find with sort + db[COLLECTION].find({ severity: randomChoice(severities) }).sort({ timestamp: -1 }).limit(10).toArray(); + + // Count + db[COLLECTION].countDocuments({ source: randomChoice(sources) }); + + // Aggregate pipeline + db[COLLECTION].aggregate([ + { $match: { category: randomChoice(categories) } }, + { $group: { _id: "$severity", count: { $sum: 1 }, avg_duration: { $avg: "$duration_ms" } } }, + { $sort: { count: -1 } } + ]).toArray(); + + // Distinct + db[COLLECTION].distinct("source", { severity: randomChoice(severities) }); + + if (i % 25 === 0) { + print(`[${i}/${ITERATIONS}] OK`); + } + + sleep(SLEEP_MS); + } catch (e) { + print(`[${i}/${ITERATIONS}] Error: ${e.message}`); + sleep(2000); + 
} + } + + print("=== Replica traffic complete ==="); +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: traffic-generator-rw + namespace: documentdb-preview-ns +spec: + backoffLimit: 3 + template: + metadata: + labels: + app: traffic-generator + spec: + restartPolicy: OnFailure + containers: + - name: traffic-gen + image: mongodb/mongodb-community-server:latest + command: + - mongosh + - "--host" + - "documentdb-preview-gateway.documentdb-preview-ns.svc.cluster.local" + - "--port" + - "10260" + - "--tls" + - "--tlsAllowInvalidCertificates" + - "-u" + - "demo_user" + - "-p" + - "DemoPassword100" + - "--file" + - "/scripts/generate-traffic.js" + volumeMounts: + - name: script + mountPath: /scripts + volumes: + - name: script + configMap: + name: traffic-generator-script +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: traffic-generator-ro + namespace: documentdb-preview-ns +spec: + backoffLimit: 3 + template: + metadata: + labels: + app: traffic-generator + spec: + restartPolicy: OnFailure + containers: + - name: traffic-gen + image: mongodb/mongodb-community-server:latest + command: + - mongosh + - "--host" + - "documentdb-preview-gateway-ro.documentdb-preview-ns.svc.cluster.local" + - "--port" + - "10260" + - "--tls" + - "--tlsAllowInvalidCertificates" + - "-u" + - "demo_user" + - "-p" + - "DemoPassword100" + - "--file" + - "/scripts/generate-reads.js" + volumeMounts: + - name: script + mountPath: /scripts + volumes: + - name: script + configMap: + name: traffic-generator-script diff --git a/documentdb-playground/telemetry/local/scripts/deploy.sh b/documentdb-playground/telemetry/local/scripts/deploy.sh new file mode 100755 index 00000000..1aaad2b4 --- /dev/null +++ b/documentdb-playground/telemetry/local/scripts/deploy.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +LOCAL_DIR="$(dirname "$SCRIPT_DIR")" +CONTEXT="kind-documentdb-telemetry" + +echo "=== DocumentDB Telemetry Playground ===" + +# Step 1: 
Create Kind cluster +echo "[1/5] Setting up Kind cluster..." +"$SCRIPT_DIR/setup-kind.sh" + +# Step 2: Install operators (user must have already built/pushed images) +echo "[2/5] Waiting for cluster to be ready..." +kubectl wait --for=condition=Ready nodes --all --context "$CONTEXT" --timeout=120s + +# Step 3: Deploy observability stack +echo "[3/5] Deploying observability stack..." +kubectl apply -f "$LOCAL_DIR/k8s/observability/" --context "$CONTEXT" + +# Create dashboard ConfigMap from JSON files +echo " Loading Grafana dashboards..." +kubectl create configmap grafana-dashboards \ + --namespace=observability \ + --from-file=gateway.json="$LOCAL_DIR/dashboards/gateway.json" \ + --from-file=internals.json="$LOCAL_DIR/dashboards/internals.json" \ + --context "$CONTEXT" \ + --dry-run=client -o yaml | kubectl apply -f - --context "$CONTEXT" + +kubectl wait --for=condition=Available deployment --all -n observability --context "$CONTEXT" --timeout=120s + +# Step 4: Deploy DocumentDB +echo "[4/5] Deploying DocumentDB..." +kubectl apply -f "$LOCAL_DIR/k8s/documentdb/" --context "$CONTEXT" + +# Step 5: Deploy traffic generators +echo "[5/5] Deploying traffic generators..." +# Wait for DocumentDB pods to be ready first +echo " Waiting for DocumentDB pods (this may take a few minutes)..." 
+kubectl wait --for=condition=Ready pod -l app=documentdb-preview -n documentdb-preview-ns --context "$CONTEXT" --timeout=300s 2>/dev/null || echo " (DocumentDB pods not ready yet - deploy traffic manually later)" +kubectl apply -f "$LOCAL_DIR/k8s/traffic/" --context "$CONTEXT" + +echo "" +echo "=== Deployment Complete ===" +echo "Grafana: kubectl port-forward svc/grafana 3000:3000 -n observability --context $CONTEXT" +echo "Prometheus: kubectl port-forward svc/prometheus 9090:9090 -n observability --context $CONTEXT" diff --git a/documentdb-playground/telemetry/local/scripts/setup-kind.sh b/documentdb-playground/telemetry/local/scripts/setup-kind.sh new file mode 100755 index 00000000..41e2de42 --- /dev/null +++ b/documentdb-playground/telemetry/local/scripts/setup-kind.sh @@ -0,0 +1,73 @@ +#!/bin/bash +set -euo pipefail + +# Configuration +CLUSTER_NAME="${CLUSTER_NAME:-documentdb-telemetry}" +REG_NAME="kind-registry" +REG_PORT="5001" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +LOCAL_DIR="$(dirname "$SCRIPT_DIR")" +K8S_VERSION="${K8S_VERSION:-v1.35.0}" + +echo "=== DocumentDB Telemetry - Kind Cluster Setup ===" + +# 1. Create registry container unless it already exists +if [ "$(docker inspect -f '{{.State.Running}}' "${REG_NAME}" 2>/dev/null || true)" != 'true' ]; then + echo "Starting local registry on port ${REG_PORT}..." + docker run -d --restart=always -p "127.0.0.1:${REG_PORT}:5000" --network bridge --name "${REG_NAME}" registry:2 +else + echo "Registry '${REG_NAME}' already running" +fi + +# 2. Create Kind cluster if it doesn't exist +if kind get clusters 2>/dev/null | grep -q "^${CLUSTER_NAME}$"; then + echo "Kind cluster '${CLUSTER_NAME}' already exists" +else + echo "Creating Kind cluster '${CLUSTER_NAME}' with 3 worker nodes..." + cat </dev/null || true + +# Clean up socat proxy containers +echo "Cleaning up proxy containers..." 
+for container in k8s-grafana-proxy k8s-prometheus-proxy; do + if docker inspect "$container" &>/dev/null; then + docker rm -f "$container" + echo " Removed $container" + fi +done + +echo "Done. Registry container kept for reuse." diff --git a/mkdocs.yml b/mkdocs.yml index 1bc4e919..240c18da 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -35,6 +35,9 @@ nav: - Advanced Configuration: preview/advanced-configuration/README.md - Backup and Restore: preview/backup-and-restore.md - API Reference: preview/api-reference.md + - Monitoring: + - Overview: preview/monitoring/overview.md + - Metrics Reference: preview/monitoring/metrics.md - FAQ: preview/faq.md - Tools: - Kubectl Plugin: preview/kubectl-plugin.md diff --git a/operator/cnpg-plugins/sidecar-injector/internal/lifecycle/lifecycle.go b/operator/cnpg-plugins/sidecar-injector/internal/lifecycle/lifecycle.go index 07688e59..77c094e4 100644 --- a/operator/cnpg-plugins/sidecar-injector/internal/lifecycle/lifecycle.go +++ b/operator/cnpg-plugins/sidecar-injector/internal/lifecycle/lifecycle.go @@ -135,6 +135,22 @@ func (impl Implementation) reconcileMetadata( Name: "OTEL_EXPORTER_OTLP_ENDPOINT", Value: "http://" + cluster.Name + "-collector." 
+ cluster.Namespace + ".svc.cluster.local:4317", }, + { + Name: "OTEL_TRACING_ENABLED", + Value: "true", + }, + { + Name: "POD_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.name", + }, + }, + }, + { + Name: "OTEL_RESOURCE_ATTRIBUTES", + Value: "service.instance.id=$(POD_NAME)", + }, } // Add USERNAME and PASSWORD environment variables from secret defined in configuration From 26e2262511c98ea8cc6503a7b54f6483fd666524 Mon Sep 17 00:00:00 2001 From: urismiley Date: Mon, 16 Mar 2026 12:32:34 -0400 Subject: [PATCH 02/16] Address PR review: fix PG auth, add doc front matter, demo warnings - Fix Critical #1: OTel collector postgresql receiver now uses CNPG superuser secret (copied cross-namespace by deploy.sh) instead of hardcoded 'unused' password - Fix Major #2: Add YAML front matter (title, description, tags) to monitoring overview.md and metrics.md - Fix Major #4: Add DO NOT USE IN PRODUCTION warnings to grafana.yaml, cluster.yaml, traffic-generator.yaml, and otel-collector.yaml - Fix Minor #5: Add note that gateway metric names are versioned independently and may change between releases - Fix Minor #6: Document kubeletstats single-node coverage limitation in otel-collector.yaml - Fix Minor #9: Document hardcoded PG endpoint coupling to CR name and namespace in otel-collector.yaml Signed-off-by: urismiley --- .../preview/monitoring/metrics.md | 13 +++++++ .../preview/monitoring/overview.md | 10 ++++++ .../local/k8s/documentdb/cluster.yaml | 2 ++ .../local/k8s/observability/grafana.yaml | 4 +++ .../k8s/observability/otel-collector.yaml | 28 ++++++++++++--- .../local/k8s/traffic/traffic-generator.yaml | 3 ++ .../telemetry/local/scripts/deploy.sh | 35 +++++++++++++------ 7 files changed, 80 insertions(+), 15 deletions(-) diff --git a/docs/operator-public-documentation/preview/monitoring/metrics.md b/docs/operator-public-documentation/preview/monitoring/metrics.md index 9159bf83..0de60af1 100644 --- 
a/docs/operator-public-documentation/preview/monitoring/metrics.md +++ b/docs/operator-public-documentation/preview/monitoring/metrics.md @@ -1,3 +1,13 @@ +--- +title: Metrics Reference +description: Detailed reference of all metrics available when monitoring DocumentDB clusters, with PromQL examples. +tags: + - monitoring + - metrics + - prometheus + - opentelemetry +--- + # Metrics Reference This page documents the key metrics available when monitoring a DocumentDB cluster, organized by source. Each section includes the metric name, description, labels, and example PromQL queries. @@ -155,6 +165,9 @@ The DocumentDB Gateway exports application-level metrics via OTLP (OpenTelemetry Metrics are exported to an OpenTelemetry Collector, which converts them to Prometheus format via the `prometheus` exporter. +!!! note "Gateway metric names may change between versions" + The metrics below are emitted by the DocumentDB Gateway binary, which is versioned independently from the operator. Metric names, labels, and semantics may change between gateway releases. Always verify metric availability against the gateway version deployed in your cluster. + ### Operations | Metric | Type | Description | diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md index eb99aaea..c2bb923b 100644 --- a/docs/operator-public-documentation/preview/monitoring/overview.md +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -1,3 +1,13 @@ +--- +title: Monitoring Overview +description: How to monitor DocumentDB clusters using OpenTelemetry, Prometheus, and Grafana. +tags: + - monitoring + - observability + - metrics + - opentelemetry +--- + # Monitoring Overview This guide describes how to monitor DocumentDB clusters running on Kubernetes using OpenTelemetry, Prometheus, and Grafana. 
diff --git a/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml b/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml index f0f294af..045fa261 100644 --- a/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml +++ b/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml @@ -1,3 +1,5 @@ +# ⚠️ DEMO/PLAYGROUND ONLY — credentials below are for local development. +# Do NOT use these passwords in production. apiVersion: v1 kind: Namespace metadata: diff --git a/documentdb-playground/telemetry/local/k8s/observability/grafana.yaml b/documentdb-playground/telemetry/local/k8s/observability/grafana.yaml index 73caddcb..390c3879 100644 --- a/documentdb-playground/telemetry/local/k8s/observability/grafana.yaml +++ b/documentdb-playground/telemetry/local/k8s/observability/grafana.yaml @@ -1,5 +1,9 @@ # ============================================================ # Grafana - Dashboards & Visualization +# ⚠️ DEMO/PLAYGROUND ONLY — not for production use. +# Anonymous admin access (GF_AUTH_ANONYMOUS_ORG_ROLE: Admin) and +# the default admin password are intentional for local dev +# convenience. Do NOT deploy this configuration in production. # ============================================================ apiVersion: apps/v1 kind: Deployment diff --git a/documentdb-playground/telemetry/local/k8s/observability/otel-collector.yaml b/documentdb-playground/telemetry/local/k8s/observability/otel-collector.yaml index 5e6919d3..dcc60ca2 100644 --- a/documentdb-playground/telemetry/local/k8s/observability/otel-collector.yaml +++ b/documentdb-playground/telemetry/local/k8s/observability/otel-collector.yaml @@ -1,5 +1,8 @@ # ============================================================ -# OpenTelemetry Collector - DaemonSet (hostPort 4412) +# OpenTelemetry Collector +# ⚠️ DEMO/PLAYGROUND ONLY — not for production use. +# Hardcoded passwords and single-replica deployment are +# intentional for local development convenience. 
# ============================================================ apiVersion: v1 kind: ServiceAccount @@ -57,10 +60,18 @@ spec: protocol: TCP - containerPort: 8889 env: + # PG credentials are copied from the CNPG superuser secret + # by deploy.sh (see "Copy PG credentials" step). - name: PG_MONITOR_USER - value: postgres + valueFrom: + secretKeyRef: + name: pg-monitor-credentials + key: username - name: PG_MONITOR_PASSWORD - value: "unused" + valueFrom: + secretKeyRef: + name: pg-monitor-credentials + key: password - name: K8S_NODE_NAME valueFrom: fieldRef: @@ -104,16 +115,23 @@ data: http: endpoint: 0.0.0.0:4318 + # NOTE: This endpoint is coupled to the DocumentDB CR name + # ("documentdb-preview") and namespace ("documentdb-preview-ns"). + # Update if your CR or namespace differs. postgresql: endpoint: documentdb-preview-rw.documentdb-preview-ns.svc.cluster.local:5432 - username: postgres - password: "unused" + username: ${env:PG_MONITOR_USER} + password: ${env:PG_MONITOR_PASSWORD} databases: - postgres collection_interval: 10s tls: insecure: true + # NOTE: kubeletstats with K8S_NODE_NAME only scrapes the node + # where this collector pod runs. In a multi-node cluster (e.g., + # the 4-node Kind cluster), metrics from other nodes are missed. + # Use a DaemonSet deployment for full-cluster kubelet coverage. kubeletstats: auth_type: serviceAccount collection_interval: 15s diff --git a/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml b/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml index ebd1ad48..a26ec478 100644 --- a/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml +++ b/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml @@ -1,4 +1,7 @@ --- +# ⚠️ DEMO/PLAYGROUND ONLY — hardcoded passwords below are for local development. +# Do NOT use these credentials in production. 
+# # Service to expose gateway port 10260 on PRIMARY apiVersion: v1 kind: Service diff --git a/documentdb-playground/telemetry/local/scripts/deploy.sh b/documentdb-playground/telemetry/local/scripts/deploy.sh index 1aaad2b4..2bbd7de3 100755 --- a/documentdb-playground/telemetry/local/scripts/deploy.sh +++ b/documentdb-playground/telemetry/local/scripts/deploy.sh @@ -7,15 +7,17 @@ CONTEXT="kind-documentdb-telemetry" echo "=== DocumentDB Telemetry Playground ===" # Step 1: Create Kind cluster -echo "[1/5] Setting up Kind cluster..." +echo "[1/6] Setting up Kind cluster..." "$SCRIPT_DIR/setup-kind.sh" -# Step 2: Install operators (user must have already built/pushed images) -echo "[2/5] Waiting for cluster to be ready..." +# Step 2: Wait for cluster +echo "[2/6] Waiting for cluster to be ready..." kubectl wait --for=condition=Ready nodes --all --context "$CONTEXT" --timeout=120s # Step 3: Deploy observability stack -echo "[3/5] Deploying observability stack..." +# NOTE: The OTel collector pod will remain in CreateContainerConfigError +# until PG credentials are copied in step 5. This is expected. +echo "[3/6] Deploying observability stack..." kubectl apply -f "$LOCAL_DIR/k8s/observability/" --context "$CONTEXT" # Create dashboard ConfigMap from JSON files @@ -27,15 +29,28 @@ kubectl create configmap grafana-dashboards \ --context "$CONTEXT" \ --dry-run=client -o yaml | kubectl apply -f - --context "$CONTEXT" -kubectl wait --for=condition=Available deployment --all -n observability --context "$CONTEXT" --timeout=120s - # Step 4: Deploy DocumentDB -echo "[4/5] Deploying DocumentDB..." +echo "[4/6] Deploying DocumentDB..." kubectl apply -f "$LOCAL_DIR/k8s/documentdb/" --context "$CONTEXT" -# Step 5: Deploy traffic generators -echo "[5/5] Deploying traffic generators..." 
-# Wait for DocumentDB pods to be ready first +# Step 5: Copy PG credentials to observability namespace +# CNPG creates a superuser secret (-superuser) that the OTel +# collector needs for the postgresql receiver to authenticate. +echo "[5/6] Waiting for CNPG superuser secret..." +until kubectl get secret documentdb-preview-superuser -n documentdb-preview-ns --context "$CONTEXT" &>/dev/null; do + sleep 5 +done +echo " Copying PG credentials to observability namespace..." +kubectl get secret documentdb-preview-superuser -n documentdb-preview-ns --context "$CONTEXT" -o json \ + | jq 'del(.metadata.namespace,.metadata.uid,.metadata.resourceVersion,.metadata.creationTimestamp,.metadata.ownerReferences,.metadata.labels,.metadata.managedFields) | .metadata.name = "pg-monitor-credentials"' \ + | kubectl apply -n observability --context "$CONTEXT" -f - + +# Wait for all observability deployments (collector should start now) +echo " Waiting for observability stack..." +kubectl wait --for=condition=Available deployment --all -n observability --context "$CONTEXT" --timeout=180s + +# Step 6: Deploy traffic generators +echo "[6/6] Deploying traffic generators..." echo " Waiting for DocumentDB pods (this may take a few minutes)..." kubectl wait --for=condition=Ready pod -l app=documentdb-preview -n documentdb-preview-ns --context "$CONTEXT" --timeout=300s 2>/dev/null || echo " (DocumentDB pods not ready yet - deploy traffic manually later)" kubectl apply -f "$LOCAL_DIR/k8s/traffic/" --context "$CONTEXT" From ec8e74128924ebefa758108582059fa940b40970 Mon Sep 17 00:00:00 2001 From: urismiley Date: Mon, 16 Mar 2026 12:38:22 -0400 Subject: [PATCH 03/16] Replace ASCII architecture diagram with Mermaid in overview.md Convert the monitoring architecture diagram from ASCII art to a Mermaid graph for better rendering in mkdocs. Enable the pymdownx.superfences mermaid custom fence in mkdocs.yml. 
Signed-off-by: urismiley --- .../preview/monitoring/overview.md | 66 ++++++++++--------- mkdocs.yml | 6 +- 2 files changed, 40 insertions(+), 32 deletions(-) diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md index c2bb923b..e4205987 100644 --- a/docs/operator-public-documentation/preview/monitoring/overview.md +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -34,37 +34,41 @@ The gateway sidecar injector automatically configures each gateway container wit The recommended monitoring stack collects three signals — **metrics**, **traces**, and **logs** — from these containers and stores them for visualization and alerting. -``` -┌──────────────────────────────────────────────────────┐ -│ Grafana │ -│ (dashboards, alerts, trace viewer) │ -└──────────┬──────────────┬──────────────┬─────────────┘ - │ │ │ - ┌─────┴─────┐ ┌────┴────┐ ┌────┴────┐ - │ Prometheus │ │ Tempo │ │ Loki │ - │ (metrics) │ │(traces) │ │ (logs) │ - └─────┬─────┘ └────┬────┘ └────┬────┘ - │ │ │ -┌──────────┴──────────────┴──────────────┴─────────────┐ -│ OpenTelemetry Collector │ -│ Receivers: otlp, postgresql, kubeletstats │ -│ Processors: batch, resource │ -│ Exporters: prometheus, otlp/tempo, otlphttp/loki │ -└──────────┬──────────────┬────────────────────────────┘ - │ │ - ┌──────┴──────┐ ┌───┴──────────────┐ - │ OTLP push │ │ SQL scrape │ - │ (gateway) │ │ (PG receiver) │ - └──────┬──────┘ └───┬──────────────┘ - │ │ -┌──────────┴──────────────┴────────────────────────────┐ -│ DocumentDB Pods │ -│ ┌──────────────┐ ┌──────────────┐ │ -│ │ PostgreSQL │ │ Gateway │──── OTLP push │ -│ │ container │ │ container │ (metrics, │ -│ │ ◄──SQL scrape │ traces, logs) │ -│ └──────────────┘ └──────────────┘ │ -└──────────────────────────────────────────────────────┘ +```mermaid +graph TB + subgraph viz["Visualization"] + grafana["Grafana
(dashboards, alerts, trace viewer)"] + end + + subgraph backends["Storage Backends"] + prometheus["Prometheus
(metrics)"] + tempo["Tempo
(traces)"] + loki["Loki
(logs)"] + end + + subgraph collector["OpenTelemetry Collector"] + receivers["Receivers: otlp, postgresql, kubeletstats"] + processors["Processors: batch, resource"] + exporters["Exporters: prometheus, otlp/tempo, otlphttp/loki"] + end + + subgraph pods["DocumentDB Pods"] + pg["PostgreSQL container
:5432"] + gw["Gateway container
:10260"] + end + + grafana --- prometheus + grafana --- tempo + grafana --- loki + + prometheus --- exporters + tempo --- exporters + loki --- exporters + + receivers --> processors --> exporters + + gw -->|"OTLP push
(metrics, traces, logs)"| receivers + pg -->|"SQL scrape
(PG receiver)"| receivers ``` ### How gateway telemetry reaches the collector diff --git a/mkdocs.yml b/mkdocs.yml index 240c18da..fdfdea7f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -49,7 +49,11 @@ markdown_extensions: - admonition - attr_list - md_in_html - - pymdownx.superfences + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format - pymdownx.tabbed: alternate_style: true - pymdownx.highlight: From 37a5951e862e713a2296e17cfc4b71cfe981a47e Mon Sep 17 00:00:00 2001 From: urismiley Date: Mon, 16 Mar 2026 12:54:53 -0400 Subject: [PATCH 04/16] Add prerequisite note for gateway OTEL support (documentdb#443) Document that the gateway image must include OpenTelemetry instrumentation from documentdb/documentdb#443 for the telemetry playground and monitoring features to work. Signed-off-by: urismiley --- .../preview/monitoring/overview.md | 5 +++++ documentdb-playground/telemetry/local/README.md | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md index e4205987..8232d41b 100644 --- a/docs/operator-public-documentation/preview/monitoring/overview.md +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -20,6 +20,11 @@ This guide describes how to monitor DocumentDB clusters running on Kubernetes us - [`jq`](https://jqlang.github.io/jq/) for processing JSON in verification commands - (Optional) [OpenTelemetry Operator](https://opentelemetry.io/docs/kubernetes/operator/) for managed collector deployments +!!! important "Gateway OTEL support required" + Gateway telemetry requires the OpenTelemetry instrumentation added in + [documentdb/documentdb#443](https://github.com/documentdb/documentdb/pull/443). + Ensure your gateway image is built from a branch that includes this change. 
+ ## Architecture A DocumentDB pod contains two containers: diff --git a/documentdb-playground/telemetry/local/README.md b/documentdb-playground/telemetry/local/README.md index 8bdfeb49..f168ba85 100644 --- a/documentdb-playground/telemetry/local/README.md +++ b/documentdb-playground/telemetry/local/README.md @@ -11,6 +11,12 @@ A full observability stack for DocumentDB running on a local Kind cluster. Provi - `localhost:5001/documentdb-gateway:latest` - Operator + sidecar injector installed in the cluster +!!! important "Gateway OTEL support required" + The gateway image must be built with OpenTelemetry instrumentation + ([documentdb/documentdb#443](https://github.com/documentdb/documentdb/pull/443)). + Without it, the gateway will not emit metrics, traces, or logs via OTLP + and the Grafana dashboards will show no data. + ## Quick Start ```bash From 1c682ba3bb701156154c7e53cfdd68a9a9d925dd Mon Sep 17 00:00:00 2001 From: urismiley Date: Mon, 16 Mar 2026 13:08:56 -0400 Subject: [PATCH 05/16] Address Copilot review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix Mermaid diagram: 'remote write' → 'scrape :8889' (Prometheus scrapes the collector, not the other way around) - Derive CONTEXT from CLUSTER_NAME in deploy.sh for consistency with setup-kind.sh - Add 3-minute timeout to CNPG superuser secret wait loop - Pin mongodb-community-server image to 7.0.30-ubuntu2204 - Fix '3-node' → '3-instance' in overview.md (1 node, 3 instances) Signed-off-by: urismiley --- .../preview/monitoring/overview.md | 2 +- documentdb-playground/telemetry/local/README.md | 2 +- .../local/k8s/traffic/traffic-generator.yaml | 4 ++-- .../telemetry/local/scripts/deploy.sh | 11 ++++++++++- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md index 8232d41b..52d51305 100644 --- 
a/docs/operator-public-documentation/preview/monitoring/overview.md +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -220,7 +220,7 @@ The [`documentdb-playground/telemetry/`](https://github.com/documentdb/documentd The `local/` subdirectory provides a self-contained local demo on a Kind cluster with: -- 3-node DocumentDB HA cluster (1 primary + 2 streaming replicas) +- 3-instance DocumentDB HA cluster (1 primary + 2 streaming replicas) - Full observability stack: OTel Collector, Prometheus, Tempo, Loki, Grafana - Gateway metrics, traces, and logs via OTLP push - PostgreSQL metrics via the OTel `postgresql` receiver diff --git a/documentdb-playground/telemetry/local/README.md b/documentdb-playground/telemetry/local/README.md index f168ba85..74306154 100644 --- a/documentdb-playground/telemetry/local/README.md +++ b/documentdb-playground/telemetry/local/README.md @@ -47,7 +47,7 @@ graph TB collector -->|OTLP| tempo collector -->|OTLP/HTTP| loki - collector -->|remote write| prometheus + prometheus -->|scrape :8889| collector prometheus --> grafana tempo --> grafana loki --> grafana diff --git a/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml b/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml index a26ec478..2df53355 100644 --- a/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml +++ b/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml @@ -210,7 +210,7 @@ spec: restartPolicy: OnFailure containers: - name: traffic-gen - image: mongodb/mongodb-community-server:latest + image: mongodb/mongodb-community-server:7.0.30-ubuntu2204 command: - mongosh - "--host" @@ -248,7 +248,7 @@ spec: restartPolicy: OnFailure containers: - name: traffic-gen - image: mongodb/mongodb-community-server:latest + image: mongodb/mongodb-community-server:7.0.30-ubuntu2204 command: - mongosh - "--host" diff --git a/documentdb-playground/telemetry/local/scripts/deploy.sh 
b/documentdb-playground/telemetry/local/scripts/deploy.sh index 2bbd7de3..b6af84ec 100755 --- a/documentdb-playground/telemetry/local/scripts/deploy.sh +++ b/documentdb-playground/telemetry/local/scripts/deploy.sh @@ -2,7 +2,8 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" LOCAL_DIR="$(dirname "$SCRIPT_DIR")" -CONTEXT="kind-documentdb-telemetry" +CLUSTER_NAME="${CLUSTER_NAME:-documentdb-telemetry}" +CONTEXT="kind-${CLUSTER_NAME}" echo "=== DocumentDB Telemetry Playground ===" @@ -37,7 +38,15 @@ kubectl apply -f "$LOCAL_DIR/k8s/documentdb/" --context "$CONTEXT" # CNPG creates a superuser secret (-superuser) that the OTel # collector needs for the postgresql receiver to authenticate. echo "[5/6] Waiting for CNPG superuser secret..." +RETRIES=0 +MAX_RETRIES=36 # 36 × 5s = 3 minutes until kubectl get secret documentdb-preview-superuser -n documentdb-preview-ns --context "$CONTEXT" &>/dev/null; do + RETRIES=$((RETRIES + 1)) + if [ "$RETRIES" -ge "$MAX_RETRIES" ]; then + echo "ERROR: Timed out waiting for CNPG superuser secret (documentdb-preview-superuser)." + echo " Verify the DocumentDB operator is installed and the CR was accepted." + exit 1 + fi sleep 5 done echo " Copying PG credentials to observability namespace..." 
From ffb48c7c133e207d71049a1e7f576396eee3c232 Mon Sep 17 00:00:00 2001 From: urismiley Date: Wed, 25 Mar 2026 01:59:47 -0400 Subject: [PATCH 06/16] Improve telemetry playground: OTel best practices, alerting, logs panel, validation script, kubeletstats DaemonSet Signed-off-by: urismiley --- .../telemetry/local/README.md | 35 ++++- .../telemetry/local/dashboards/gateway.json | 52 ++++++- .../telemetry/local/dashboards/internals.json | 5 +- .../local/k8s/observability/grafana.yaml | 15 +- .../observability/otel-collector-node.yaml | 84 +++++++++++ .../k8s/observability/otel-collector.yaml | 56 ++++---- .../local/k8s/observability/prometheus.yaml | 113 +++++++++++++++ .../local/k8s/traffic/traffic-generator.yaml | 30 +++- .../telemetry/local/scripts/setup-kind.sh | 2 +- .../telemetry/local/scripts/teardown.sh | 9 -- .../telemetry/local/scripts/validate.sh | 130 ++++++++++++++++++ 11 files changed, 479 insertions(+), 52 deletions(-) create mode 100644 documentdb-playground/telemetry/local/k8s/observability/otel-collector-node.yaml create mode 100755 documentdb-playground/telemetry/local/scripts/validate.sh diff --git a/documentdb-playground/telemetry/local/README.md b/documentdb-playground/telemetry/local/README.md index 74306154..b11da4e0 100644 --- a/documentdb-playground/telemetry/local/README.md +++ b/documentdb-playground/telemetry/local/README.md @@ -40,6 +40,7 @@ graph TB subgraph cluster["Kind Cluster (documentdb-telemetry)"] subgraph obs["observability namespace"] collector["OTel Collector
:4317 gRPC / :4318 HTTP / :8889 prom"] + node_collector["OTel Node Collector
(DaemonSet — kubeletstats)"] tempo["Tempo
(traces)"] loki["Loki
(logs)"] prometheus["Prometheus
(metrics)"] @@ -47,6 +48,7 @@ graph TB collector -->|OTLP| tempo collector -->|OTLP/HTTP| loki + node_collector -->|OTLP| collector prometheus -->|scrape :8889| collector prometheus --> grafana tempo --> grafana @@ -90,24 +92,49 @@ local/ ├── scripts/ │ ├── setup-kind.sh # Creates Kind cluster + local registry │ ├── deploy.sh # One-command full deployment +│ ├── validate.sh # Health check — verifies data flow │ └── teardown.sh # Deletes cluster and proxy containers ├── k8s/ │ ├── observability/ # Namespace, Tempo, Loki, Prometheus, OTel Collector, Grafana +│ │ └── otel-collector-node.yaml # DaemonSet for per-node kubelet metrics │ ├── documentdb/ # DocumentDB CR, credentials, collector bridge │ └── traffic/ # Traffic generator services + jobs └── dashboards/ - ├── gateway.json # Gateway-level metrics dashboard - └── internals.json # Internal metrics dashboard + ├── gateway.json # Gateway metrics + logs dashboard + └── internals.json # PostgreSQL + infrastructure dashboard ``` ## Dashboards | Dashboard | Description | |-----------|-------------| -| **Gateway** | Request rates, latency (p50/p95/p99), error rates, active connections, command breakdown by type | +| **Gateway** | Request rates, latency (p50/p95/p99), error rates, active connections, command breakdown, gateway logs (Loki) | | **Internals** | PostgreSQL metrics, container resource usage (CPU/memory), OTel Collector pipeline stats | -Dashboards are automatically provisioned into Grafana on startup via ConfigMap mounts. Edits made in the Grafana UI will persist until the pod restarts. +Dashboards auto-refresh every 10 seconds and are automatically provisioned into Grafana on startup via ConfigMap mounts. Edits made in the Grafana UI will persist until the pod restarts. 
+ +## Alerting Rules + +Prometheus includes sample alerting rules: + +| Alert | Condition | +|-------|-----------| +| **GatewayHighErrorRate** | Error rate > 5% for 5 minutes | +| **PostgresReplicationLagHigh** | Replication lag > 10MB for 5 minutes | +| **PostgresConnectionSaturation** | Connection usage > 80% for 5 minutes | +| **GatewayDown** | No gateway metrics for 2 minutes | + +View firing alerts at `http://localhost:9090/alerts` (after port-forwarding Prometheus). + +## Validation + +After deployment, verify everything is working: + +```bash +./scripts/validate.sh +``` + +This checks all pods are running, Prometheus has active targets, and metrics/traces/logs are flowing. ## Restarting Traffic Generators diff --git a/documentdb-playground/telemetry/local/dashboards/gateway.json b/documentdb-playground/telemetry/local/dashboards/gateway.json index 0dba9430..0b8a4595 100644 --- a/documentdb-playground/telemetry/local/dashboards/gateway.json +++ b/documentdb-playground/telemetry/local/dashboards/gateway.json @@ -1046,6 +1046,54 @@ "x": 0, "y": 11 } + }, + { + "type": "row", + "title": "Logs", + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "panels": [ + { + "type": "logs", + "title": "Gateway Logs", + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 26 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "options": { + "showTime": true, + "showLabels": false, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "sortOrder": "Descending", + "dedupStrategy": "none" + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "expr": "{service_name=~\"documentdb.*|gateway.*\"} |= ``", + "refId": "A" + } + ] + } + ] } - ] -} \ No newline at end of file + ], + "refresh": "10s" +} diff --git a/documentdb-playground/telemetry/local/dashboards/internals.json b/documentdb-playground/telemetry/local/dashboards/internals.json index 
d8fba6e8..0cd22b44 100644 --- a/documentdb-playground/telemetry/local/dashboards/internals.json +++ b/documentdb-playground/telemetry/local/dashboards/internals.json @@ -1162,5 +1162,6 @@ "y": 1 } } - ] -} \ No newline at end of file + ], + "refresh": "10s" +} diff --git a/documentdb-playground/telemetry/local/k8s/observability/grafana.yaml b/documentdb-playground/telemetry/local/k8s/observability/grafana.yaml index 390c3879..6c960661 100644 --- a/documentdb-playground/telemetry/local/k8s/observability/grafana.yaml +++ b/documentdb-playground/telemetry/local/k8s/observability/grafana.yaml @@ -70,6 +70,18 @@ data: uid: tempo access: proxy url: http://tempo.observability.svc:3200 + jsonData: + tracesToLogsV2: + datasourceUid: loki + filterByTraceID: true + filterBySpanID: false + tracesToMetrics: + datasourceUid: prometheus + queries: + - name: Request rate + query: sum(rate(db_client_operations_total{service_instance_id="$${__span.tags.service.instance.id}"}[5m])) + - name: Error rate + query: sum(rate(db_client_operations_total{service_instance_id="$${__span.tags.service.instance.id}",error_type!=""}[5m])) - name: Loki type: loki uid: loki @@ -103,7 +115,6 @@ metadata: spec: selector: app: grafana - type: NodePort + type: ClusterIP ports: - port: 3000 - nodePort: 30300 diff --git a/documentdb-playground/telemetry/local/k8s/observability/otel-collector-node.yaml b/documentdb-playground/telemetry/local/k8s/observability/otel-collector-node.yaml new file mode 100644 index 00000000..6e71f10c --- /dev/null +++ b/documentdb-playground/telemetry/local/k8s/observability/otel-collector-node.yaml @@ -0,0 +1,84 @@ +# ============================================================ +# OTel Collector DaemonSet — kubeletstats only +# ⚠️ DEMO/PLAYGROUND ONLY — not for production use. +# Runs on every node to collect kubelet metrics and forwards +# them to the central OTel Collector via OTLP. 
+# ============================================================ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: otel-collector-node + namespace: observability +spec: + selector: + matchLabels: + app: otel-collector-node + template: + metadata: + labels: + app: otel-collector-node + spec: + serviceAccountName: otel-collector + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:0.143.1 + args: ["--config=/etc/otel/config.yaml"] + env: + - name: K8S_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + memory: 128Mi + volumeMounts: + - name: config + mountPath: /etc/otel + volumes: + - name: config + configMap: + name: otel-collector-node-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-collector-node-config + namespace: observability +data: + config.yaml: | + receivers: + kubeletstats: + auth_type: serviceAccount + collection_interval: 15s + endpoint: "https://${env:K8S_NODE_NAME}:10250" + insecure_skip_verify: true + metric_groups: + - node + - pod + - container + extra_metadata_labels: + - container.id + - k8s.volume.type + k8s_api_config: + auth_type: serviceAccount + + processors: + batch: + timeout: 10s + send_batch_size: 256 + + exporters: + otlp: + endpoint: otel-collector.observability.svc:4317 + tls: + insecure: true + + service: + pipelines: + metrics: + receivers: [kubeletstats] + processors: [batch] + exporters: [otlp] diff --git a/documentdb-playground/telemetry/local/k8s/observability/otel-collector.yaml b/documentdb-playground/telemetry/local/k8s/observability/otel-collector.yaml index dcc60ca2..a5561fc7 100644 --- a/documentdb-playground/telemetry/local/k8s/observability/otel-collector.yaml +++ b/documentdb-playground/telemetry/local/k8s/observability/otel-collector.yaml @@ -59,6 +59,19 @@ spec: - containerPort: 4318 protocol: TCP - containerPort: 8889 + - containerPort: 13133 + readinessProbe: + httpGet: + path: / + 
port: 13133 + initialDelaySeconds: 5 + periodSeconds: 10 + livenessProbe: + httpGet: + path: / + port: 13133 + initialDelaySeconds: 10 + periodSeconds: 15 env: # PG credentials are copied from the CNPG superuser secret # by deploy.sh (see "Copy PG credentials" step). @@ -72,10 +85,6 @@ spec: secretKeyRef: name: pg-monitor-credentials key: password - - name: K8S_NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName volumeMounts: - name: config mountPath: /etc/otel @@ -128,26 +137,12 @@ data: tls: insecure: true - # NOTE: kubeletstats with K8S_NODE_NAME only scrapes the node - # where this collector pod runs. In a multi-node cluster (e.g., - # the 4-node Kind cluster), metrics from other nodes are missed. - # Use a DaemonSet deployment for full-cluster kubelet coverage. - kubeletstats: - auth_type: serviceAccount - collection_interval: 15s - endpoint: "https://${env:K8S_NODE_NAME}:10250" - insecure_skip_verify: true - metric_groups: - - node - - pod - - container - extra_metadata_labels: - - container.id - - k8s.volume.type - k8s_api_config: - auth_type: serviceAccount - processors: + memory_limiter: + check_interval: 5s + limit_mib: 256 + spike_limit_mib: 64 + batch: timeout: 5s send_batch_size: 512 @@ -177,17 +172,22 @@ data: debug: verbosity: basic + extensions: + health_check: + endpoint: 0.0.0.0:13133 + service: + extensions: [health_check] pipelines: traces: receivers: [otlp] - processors: [batch, resource] + processors: [memory_limiter, batch, resource] exporters: [otlp/tempo] metrics: - receivers: [otlp, postgresql, kubeletstats] - processors: [batch, resource] + receivers: [otlp, postgresql] + processors: [memory_limiter, batch, resource] exporters: [prometheus] logs: receivers: [otlp] - processors: [batch, resource] - exporters: [otlphttp/loki] + processors: [memory_limiter, batch, resource] + exporters: [otlphttp/loki, debug] diff --git a/documentdb-playground/telemetry/local/k8s/observability/prometheus.yaml 
b/documentdb-playground/telemetry/local/k8s/observability/prometheus.yaml index b6183272..c9e2092f 100644 --- a/documentdb-playground/telemetry/local/k8s/observability/prometheus.yaml +++ b/documentdb-playground/telemetry/local/k8s/observability/prometheus.yaml @@ -1,6 +1,34 @@ # ============================================================ # Prometheus - Metrics Store # ============================================================ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: observability +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: [""] + resources: ["pods", "services", "endpoints"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: observability +--- apiVersion: apps/v1 kind: Deployment metadata: @@ -16,6 +44,7 @@ spec: labels: app: prometheus spec: + serviceAccountName: prometheus containers: - name: prometheus image: prom/prometheus:v3.3.0 @@ -42,10 +71,94 @@ data: prometheus.yml: | global: scrape_interval: 15s + + # Alerting rules + rule_files: + - /etc/prometheus/alerts.yml + scrape_configs: - job_name: otel-collector static_configs: - targets: ['otel-collector.observability.svc:8889'] + + # CNPG pods expose PostgreSQL metrics on port 9187. + - job_name: cnpg + kubernetes_sd_configs: + - role: pod + namespaces: + names: ['documentdb-preview-ns'] + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster] + regex: documentdb-preview + action: keep + - source_labels: [__meta_kubernetes_pod_ip] + target_label: __address__ + replacement: $1:9187 + + # Uncomment to scrape DocumentDB operator controller-runtime metrics. + # Requires the operator to be started with --metrics-bind-address=:8443. 
+ # - job_name: documentdb-operator + # scheme: https + # tls_config: + # insecure_skip_verify: true + # kubernetes_sd_configs: + # - role: pod + # namespaces: + # names: ['documentdb-operator'] + # relabel_configs: + # - source_labels: [__meta_kubernetes_pod_label_app] + # regex: documentdb + # action: keep + # - source_labels: [__meta_kubernetes_pod_ip] + # target_label: __address__ + # replacement: $1:8443 + + alerts.yml: | + groups: + - name: documentdb + rules: + - alert: GatewayHighErrorRate + expr: | + ( + sum(rate(db_client_operations_total{error_type!=""}[5m])) + / sum(rate(db_client_operations_total[5m])) + ) * 100 > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "Gateway error rate above 5%" + description: "{{ $value | printf \"%.1f\" }}% of gateway operations are failing." + + - alert: PostgresReplicationLagHigh + expr: pg_stat_replication_byte_lag > 10485760 + for: 5m + labels: + severity: warning + annotations: + summary: "PostgreSQL replication lag exceeds 10MB" + description: "Replication lag is {{ $value | humanize1024 }}B on {{ $labels.instance }}." + + - alert: PostgresConnectionSaturation + expr: | + (sum by (instance) (pg_stat_activity_count) + / sum by (instance) (pg_settings_max_connections)) + * 100 > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "PostgreSQL connections above 80% capacity" + description: "{{ $value | printf \"%.0f\" }}% of max connections in use." + + - alert: GatewayDown + expr: absent(gateway_client_connections_active) == 1 + for: 2m + labels: + severity: critical + annotations: + summary: "No gateway metrics — gateway may be down" + description: "The gateway_client_connections_active metric has been absent for 2 minutes." 
--- apiVersion: v1 kind: Service diff --git a/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml b/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml index 2df53355..3ece3989 100644 --- a/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml +++ b/documentdb-playground/telemetry/local/k8s/traffic/traffic-generator.yaml @@ -211,6 +211,17 @@ spec: containers: - name: traffic-gen image: mongodb/mongodb-community-server:7.0.30-ubuntu2204 + env: + - name: DOCDB_USER + valueFrom: + secretKeyRef: + name: documentdb-credentials + key: username + - name: DOCDB_PASSWORD + valueFrom: + secretKeyRef: + name: documentdb-credentials + key: password command: - mongosh - "--host" @@ -220,9 +231,9 @@ spec: - "--tls" - "--tlsAllowInvalidCertificates" - "-u" - - "demo_user" + - "$(DOCDB_USER)" - "-p" - - "DemoPassword100" + - "$(DOCDB_PASSWORD)" - "--file" - "/scripts/generate-traffic.js" volumeMounts: @@ -249,6 +260,17 @@ spec: containers: - name: traffic-gen image: mongodb/mongodb-community-server:7.0.30-ubuntu2204 + env: + - name: DOCDB_USER + valueFrom: + secretKeyRef: + name: documentdb-credentials + key: username + - name: DOCDB_PASSWORD + valueFrom: + secretKeyRef: + name: documentdb-credentials + key: password command: - mongosh - "--host" @@ -258,9 +280,9 @@ spec: - "--tls" - "--tlsAllowInvalidCertificates" - "-u" - - "demo_user" + - "$(DOCDB_USER)" - "-p" - - "DemoPassword100" + - "$(DOCDB_PASSWORD)" - "--file" - "/scripts/generate-reads.js" volumeMounts: diff --git a/documentdb-playground/telemetry/local/scripts/setup-kind.sh b/documentdb-playground/telemetry/local/scripts/setup-kind.sh index 41e2de42..e0483401 100755 --- a/documentdb-playground/telemetry/local/scripts/setup-kind.sh +++ b/documentdb-playground/telemetry/local/scripts/setup-kind.sh @@ -70,4 +70,4 @@ echo "" echo "✓ Kind cluster '${CLUSTER_NAME}' ready with 3 worker nodes" echo "✓ Local registry available at localhost:${REG_PORT}" echo "" -echo 
"Next: run ./scripts/deploy-operator.sh to install the DocumentDB operator" +echo "Next: run ./scripts/deploy.sh to deploy the full stack" diff --git a/documentdb-playground/telemetry/local/scripts/teardown.sh b/documentdb-playground/telemetry/local/scripts/teardown.sh index 309f438b..a236ffd8 100755 --- a/documentdb-playground/telemetry/local/scripts/teardown.sh +++ b/documentdb-playground/telemetry/local/scripts/teardown.sh @@ -8,13 +8,4 @@ echo "=== Tearing down DocumentDB Telemetry Playground ===" echo "Deleting Kind cluster '$CLUSTER_NAME'..." kind delete cluster --name "$CLUSTER_NAME" 2>/dev/null || true -# Clean up socat proxy containers -echo "Cleaning up proxy containers..." -for container in k8s-grafana-proxy k8s-prometheus-proxy; do - if docker inspect "$container" &>/dev/null; then - docker rm -f "$container" - echo " Removed $container" - fi -done - echo "Done. Registry container kept for reuse." diff --git a/documentdb-playground/telemetry/local/scripts/validate.sh b/documentdb-playground/telemetry/local/scripts/validate.sh new file mode 100755 index 00000000..45657d74 --- /dev/null +++ b/documentdb-playground/telemetry/local/scripts/validate.sh @@ -0,0 +1,130 @@ +#!/bin/bash +set -euo pipefail +CLUSTER_NAME="${CLUSTER_NAME:-documentdb-telemetry}" +CONTEXT="kind-${CLUSTER_NAME}" +PASS=0 +FAIL=0 + +green() { echo -e "\033[32m✓ $1\033[0m"; PASS=$((PASS + 1)); } +red() { echo -e "\033[31m✗ $1\033[0m"; FAIL=$((FAIL + 1)); } +warn() { echo -e "\033[33m⚠ $1\033[0m"; } + +echo "=== DocumentDB Telemetry Playground - Validation ===" +echo "" + +# 1. 
Check all observability pods +echo "--- Observability Stack ---" +for deploy in otel-collector prometheus grafana tempo loki; do + if kubectl get deployment "$deploy" -n observability --context "$CONTEXT" &>/dev/null; then + ready=$(kubectl get deployment "$deploy" -n observability --context "$CONTEXT" -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0") + if [ "${ready:-0}" -ge 1 ]; then + green "$deploy is running" + else + red "$deploy is not ready (readyReplicas=${ready:-0})" + fi + else + red "$deploy deployment not found" + fi +done + +# Check DaemonSet node collector +ds_desired=$(kubectl get daemonset otel-collector-node -n observability --context "$CONTEXT" -o jsonpath='{.status.desiredNumberScheduled}' 2>/dev/null || echo "0") +ds_ready=$(kubectl get daemonset otel-collector-node -n observability --context "$CONTEXT" -o jsonpath='{.status.numberReady}' 2>/dev/null || echo "0") +if [ "${ds_ready:-0}" -ge 1 ] && [ "$ds_ready" = "$ds_desired" ]; then + green "otel-collector-node DaemonSet is running ($ds_ready/$ds_desired nodes)" +elif [ "${ds_ready:-0}" -ge 1 ]; then + warn "otel-collector-node DaemonSet partially ready ($ds_ready/$ds_desired nodes)" +else + red "otel-collector-node DaemonSet not ready" +fi + +# 2. Check DocumentDB pods +echo "" +echo "--- DocumentDB ---" +pod_count=$(kubectl get pods -l app=documentdb-preview -n documentdb-preview-ns --context "$CONTEXT" --no-headers 2>/dev/null | grep -c "Running" || echo "0") +if [ "$pod_count" -ge 1 ]; then + green "DocumentDB pods running ($pod_count)" +else + red "No DocumentDB pods running" +fi + +# 3. 
Check Prometheus targets +echo "" +echo "--- Data Flow ---" +PROM_POD=$(kubectl get pod -l app=prometheus -n observability --context "$CONTEXT" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") +if [ -n "$PROM_POD" ]; then + # Check if OTel collector target is up + target_up=$(kubectl exec "$PROM_POD" -n observability --context "$CONTEXT" -- \ + wget -qO- "http://localhost:9090/api/v1/query?query=up" 2>/dev/null || echo "") + if echo "$target_up" | grep -q '"value"'; then + up_count=$(echo "$target_up" | python3 -c "import sys,json; d=json.load(sys.stdin); print(sum(1 for r in d.get('data',{}).get('result',[]) if r['value'][1]=='1'))" 2>/dev/null || echo "0") + green "Prometheus has $up_count active targets" + else + red "Cannot query Prometheus targets" + fi + + # Check for gateway metrics + gw_metrics=$(kubectl exec "$PROM_POD" -n observability --context "$CONTEXT" -- \ + wget -qO- "http://localhost:9090/api/v1/query?query=db_client_operations_total" 2>/dev/null || echo "") + if echo "$gw_metrics" | grep -q '"result":\[{'; then + green "Gateway metrics (db_client_operations_total) present" + else + warn "No gateway metrics yet (traffic generator may still be starting)" + fi + + # Check for PG metrics + pg_metrics=$(kubectl exec "$PROM_POD" -n observability --context "$CONTEXT" -- \ + wget -qO- "http://localhost:9090/api/v1/query?query=postgresql_backends" 2>/dev/null || echo "") + if echo "$pg_metrics" | grep -q '"result":\[{'; then + green "PostgreSQL metrics (postgresql_backends) present" + else + warn "No PostgreSQL metrics yet" + fi + + # Check for kubeletstats metrics + kube_metrics=$(kubectl exec "$PROM_POD" -n observability --context "$CONTEXT" -- \ + wget -qO- "http://localhost:9090/api/v1/query?query=k8s_pod_cpu_time" 2>/dev/null || echo "") + if echo "$kube_metrics" | grep -q '"result":\[{'; then + green "Kubelet metrics (k8s_pod_cpu_time) present" + else + warn "No kubelet metrics yet" + fi +else + red "Prometheus pod not found" +fi + +# 
4. Check Tempo for traces +TEMPO_POD=$(kubectl get pod -l app=tempo -n observability --context "$CONTEXT" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") +if [ -n "$TEMPO_POD" ]; then + trace_check=$(kubectl exec "$TEMPO_POD" -n observability --context "$CONTEXT" -- \ + wget -qO- "http://localhost:3200/api/search?limit=1" 2>/dev/null || echo "") + if echo "$trace_check" | grep -q '"traces":\[{'; then + green "Traces present in Tempo" + else + warn "No traces in Tempo yet" + fi +else + red "Tempo pod not found" +fi + +# 5. Check Loki for logs +LOKI_POD=$(kubectl get pod -l app=loki -n observability --context "$CONTEXT" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") +if [ -n "$LOKI_POD" ]; then + loki_check=$(kubectl exec "$LOKI_POD" -n observability --context "$CONTEXT" -- \ + wget -qO- 'http://localhost:3100/loki/api/v1/label' 2>/dev/null || echo "") + if echo "$loki_check" | grep -q '"values":\['; then + green "Loki is receiving log labels" + else + warn "No log labels in Loki yet" + fi +else + red "Loki pod not found" +fi + +# Summary +echo "" +echo "=== Results: $PASS passed, $FAIL failed ===" +if [ "$FAIL" -gt 0 ]; then + echo "Some checks failed. Components may still be starting up — retry in a minute." 
+ exit 1 +fi From dae61fede0600b6c31ee54473ed0601cd8fed3e5 Mon Sep 17 00:00:00 2001 From: urismiley Date: Wed, 25 Mar 2026 02:11:26 -0400 Subject: [PATCH 07/16] Fix monitoring docs: correct controller names, add CNPG caveat, clarify OTel naming, trim verbose examples Signed-off-by: urismiley --- .../preview/monitoring/metrics.md | 105 ++++-------------- .../preview/monitoring/overview.md | 16 +-- 2 files changed, 28 insertions(+), 93 deletions(-) diff --git a/docs/operator-public-documentation/preview/monitoring/metrics.md b/docs/operator-public-documentation/preview/monitoring/metrics.md index 0de60af1..6b52b1f4 100644 --- a/docs/operator-public-documentation/preview/monitoring/metrics.md +++ b/docs/operator-public-documentation/preview/monitoring/metrics.md @@ -26,7 +26,7 @@ These metrics are collected via the kubelet/cAdvisor interface (or the OpenTelem **Common labels:** `namespace`, `pod`, `container`, `node` -#### Example Queries +#### Example Query CPU usage rate per container over 5 minutes: @@ -37,35 +37,6 @@ rate(container_cpu_usage_seconds_total{ }[5m]) ``` -CPU utilization as a percentage of limit: - -```promql -(rate(container_cpu_usage_seconds_total{ - container="postgres", - pod=~".*documentdb.*" -}[5m]) -/ on(pod, container) -(container_spec_cpu_quota{ - container="postgres", - pod=~".*documentdb.*" -} -/ container_spec_cpu_period{ - container="postgres", - pod=~".*documentdb.*" -})) * 100 -``` - -Compare gateway vs. 
postgres CPU across all pods: - -```promql -sum by (container) ( - rate(container_cpu_usage_seconds_total{ - container=~"postgres|documentdb-gateway", - pod=~".*documentdb.*" - }[5m]) -) -``` - ### Memory | Metric | Type | Description | @@ -77,16 +48,7 @@ sum by (container) ( **Common labels:** `namespace`, `pod`, `container`, `node` -#### Example Queries - -Memory usage in MiB per container: - -```promql -container_memory_working_set_bytes{ - container=~"postgres|documentdb-gateway", - pod=~".*documentdb.*" -} / 1024 / 1024 -``` +#### Example Query Memory utilization as a percentage of limit: @@ -101,19 +63,6 @@ Memory utilization as a percentage of limit: }) * 100 ``` -Top 5 pods by memory usage: - -```promql -topk(5, - sum by (pod) ( - container_memory_working_set_bytes{ - container=~"postgres|documentdb-gateway", - pod=~".*documentdb.*" - } - ) -) -``` - ### Network | Metric | Type | Description | @@ -173,10 +122,12 @@ Metrics are exported to an OpenTelemetry Collector, which converts them to Prome | Metric | Type | Description | |--------|------|-------------| | `db_client_operations_total` | Counter | Total MongoDB operations processed | -| `db_client_operation_duration_seconds_total` | Counter | Cumulative operation duration | +| `db_client_operation_duration_seconds_total` | Counter | Cumulative operation duration (can be broken down by `db_operation_phase`) | **Common labels:** `db_operation_name` (e.g., `Find`, `Insert`, `Update`, `Aggregate`, `Delete`), `db_namespace`, `db_system_name`, `service_instance_id` (pod name), `error_type` (set on failed operations) +**Phase labels** (on `db_client_operation_duration_seconds_total`): `db_operation_phase` — values include `pg_query`, `cursor_iteration`, `bson_serialization`, `command_parsing`. Empty phase represents total duration. 
+ #### Example Queries Operations per second by command type: @@ -204,12 +155,12 @@ sum(rate(db_client_operations_total{error_type!=""}[1m])) / sum(rate(db_client_operations_total[1m])) * 100 ``` -Operations per second for a specific instance: +Time spent in each operation phase per second: ```promql -sum by (db_operation_name) ( - rate(db_client_operations_total{ - service_instance_id="documentdb-preview-1" +sum by (db_operation_phase) ( + rate(db_client_operation_duration_seconds_total{ + db_operation_phase!="" }[1m]) ) ``` @@ -274,26 +225,6 @@ Average request throughput (bytes/sec): sum(rate(db_client_request_size_bytes_total[1m])) ``` -### Operation Phases - -| Metric | Type | Description | -|--------|------|-------------| -| `db_client_operation_duration_seconds_total` | Counter | Duration broken down by phase | - -**Key `db_operation_phase` values:** `pg_query`, `cursor_iteration`, `bson_serialization`, `command_parsing` - -#### Example Queries - -Time spent in each phase per second: - -```promql -sum by (db_operation_phase) ( - rate(db_client_operation_duration_seconds_total{ - db_operation_phase!="" - }[1m]) -) -``` - ## Operator Metrics (controller-runtime) The DocumentDB operator binary exposes standard controller-runtime metrics on its metrics endpoint. These track reconciliation performance and work queue health. 
@@ -306,7 +237,7 @@ The DocumentDB operator binary exposes standard controller-runtime metrics on it | `controller_runtime_reconcile_errors_total` | Counter | Total reconciliation errors | | `controller_runtime_reconcile_time_seconds` | Histogram | Time spent in reconciliation | -**Common labels:** `controller` (e.g., `documentdb-controller`, `backup`, `scheduledbackup`, `certificate-controller`, `persistentvolume`), `result` (`success`, `error`, `requeue`, `requeue_after`) +**Common labels:** `controller` (e.g., `documentdb-controller`, `backup-controller`, `scheduled-backup-controller`, `certificate-controller`, `pv-controller`), `result` (`success`, `error`, `requeue`, `requeue_after`) #### Example Queries @@ -355,7 +286,7 @@ sum by (controller) ( Work queue depth by controller: ```promql -workqueue_depth{name=~"documentdb-controller|backup|scheduledbackup|certificate-controller"} +workqueue_depth{name=~"documentdb-controller|backup-controller|scheduled-backup-controller|certificate-controller"} ``` Average time items spend waiting in queue: @@ -367,9 +298,12 @@ rate(workqueue_queue_duration_seconds_sum{name="documentdb-controller"}[5m]) ## CNPG / PostgreSQL Metrics -CloudNative-PG exposes PostgreSQL-level metrics from each managed pod. These are available when CNPG monitoring is enabled. For the full list, see the [CloudNative-PG monitoring docs](https://cloudnative-pg.io/documentation/current/monitoring/). +CloudNative-PG can expose PostgreSQL-level metrics from each managed pod. Additionally, the OpenTelemetry Collector's `postgresql` receiver collects metrics directly from PostgreSQL via SQL queries. + +!!! warning "CNPG monitoring must be enabled separately" + The DocumentDB operator does **not** enable CNPG's built-in Prometheus metrics endpoint by default. The `cnpg_*` metrics listed below are only available if you manually configure CNPG monitoring on the underlying Cluster resource. 
The `postgresql_*` metrics from the OTel `postgresql` receiver work without additional configuration. -Additionally, the OpenTelemetry Collector's `postgresql` receiver collects metrics directly from PostgreSQL via SQL queries. +For the full CNPG metrics list, see the [CloudNative-PG monitoring docs](https://cloudnative-pg.io/documentation/current/monitoring/). ### Replication @@ -465,9 +399,9 @@ cnpg_collector_up{pod=~".*documentdb.*"} == 0 ## OpenTelemetry Metric Names -When using the OpenTelemetry `kubeletstats` receiver, metric names use the OpenTelemetry naming convention instead of Prometheus-style names: +When using the OpenTelemetry `kubeletstats` receiver, metric names use the OpenTelemetry naming convention. These are **not identical** to cAdvisor/Prometheus metrics — they measure similar concepts but may differ in semantics (e.g., cumulative vs. gauge, different calculation methods): -| OpenTelemetry Name | Prometheus Equivalent | +| OpenTelemetry Name | Approximate Prometheus Equivalent | |---|---| | `k8s.container.cpu.time` | `container_cpu_usage_seconds_total` | | `k8s.container.memory.usage` | `container_memory_working_set_bytes` | @@ -475,4 +409,5 @@ When using the OpenTelemetry `kubeletstats` receiver, metric names use the OpenT | `k8s.container.memory.limit` | `container_spec_memory_limit_bytes` | | `k8s.pod.network.io` | `container_network_*_bytes_total` | -When writing queries, use the naming convention matching your collection method. The telemetry playground uses the OpenTelemetry names; a direct Prometheus scrape of cAdvisor uses Prometheus names. +!!! note + The OTel Prometheus exporter converts dots to underscores, so `k8s.container.cpu.time` becomes `k8s_container_cpu_time` in Prometheus. Use the naming convention matching your collection method. The telemetry playground uses OpenTelemetry names; a direct Prometheus scrape of cAdvisor uses Prometheus-style names. 
diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md index 52d51305..4439d5a9 100644 --- a/docs/operator-public-documentation/preview/monitoring/overview.md +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -110,14 +110,14 @@ To enable metrics scraping, set the bind address in the operator deployment (for ### CNPG Cluster Metrics -The underlying CloudNative-PG cluster exposes PostgreSQL metrics on each pod. These are collected by the OpenTelemetry Collector's `postgresql` receiver via direct SQL queries, or by the `prometheus` receiver via Kubernetes service discovery. Key metric sources: - -| Source | Method | Metrics | -|--------|--------|---------| -| kubelet/cAdvisor | `kubeletstats` receiver | Container CPU, memory, network, filesystem | -| PostgreSQL | `postgresql` receiver (SQL) | Backends, commits, rollbacks, replication lag, DB size | -| Gateway | OTLP push | Operations, latency, connections, request/response size | -| Kubernetes API | `k8s_cluster` receiver | Pod status, restart counts, resource requests/limits | +The underlying CloudNative-PG cluster can expose PostgreSQL metrics on each pod, but **CNPG monitoring is not enabled by the operator by default**. The OpenTelemetry Collector's `postgresql` receiver collects metrics via direct SQL queries without requiring CNPG monitoring to be enabled. Key metric sources: + +| Source | Method | Metrics | Enabled by Default? 
| +|--------|--------|---------|---------------------| +| kubelet/cAdvisor | `kubeletstats` receiver | Container CPU, memory, network, filesystem | Yes (with OTel Collector) | +| PostgreSQL | `postgresql` receiver (SQL) | Backends, commits, rollbacks, replication lag, DB size | Yes (with OTel Collector) | +| Gateway | OTLP push | Operations, latency, connections, request/response size | Yes (automatic) | +| CNPG | Pod metrics endpoint | Replication lag, connections, DB size (CNPG-native) | No (requires manual config) | ### ServiceMonitor / PodMonitor From 8d975514241f7e18cbe6c42075462af4b1818c0a Mon Sep 17 00:00:00 2001 From: urismiley Date: Wed, 25 Mar 2026 03:03:03 -0400 Subject: [PATCH 08/16] Fix namespace race in deploy.sh and update README with operator prerequisite instructions Signed-off-by: urismiley --- .../telemetry/local/README.md | 35 ++++++++++++++++--- .../telemetry/local/scripts/deploy.sh | 1 + 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/documentdb-playground/telemetry/local/README.md b/documentdb-playground/telemetry/local/README.md index b11da4e0..9f92a155 100644 --- a/documentdb-playground/telemetry/local/README.md +++ b/documentdb-playground/telemetry/local/README.md @@ -7,9 +7,9 @@ A full observability stack for DocumentDB running on a local Kind cluster. Provi - **Docker** (running) - **kind** — [install](https://kind.sigs.k8s.io/docs/user/quick-start/#installation) - **kubectl** -- **DocumentDB operator images** pushed to `localhost:5001` - - `localhost:5001/documentdb-gateway:latest` - - Operator + sidecar injector installed in the cluster +- **Helm 3** — [install](https://helm.sh/docs/intro/install/) +- **jq** — for credential copying +- **DocumentDB operator installed** — the operator, CNPG, and cert-manager must be running in the cluster before deploying the playground. See the [Development Environment Guide](../../../docs/developer-guides/development-environment.md) for setup instructions, or use the quick setup below. 
!!! important "Gateway OTEL support required" The gateway image must be built with OpenTelemetry instrumentation @@ -19,17 +19,42 @@ A full observability stack for DocumentDB running on a local Kind cluster. Provi ## Quick Start +### 1. Set up the operator (if not already installed) + +```bash +cd operator/src + +# Build and deploy operator + CNPG + cert-manager onto a Kind cluster +DEPLOY=true ./scripts/development/deploy.sh +``` + +This creates a Kind cluster named `kind` with the operator running. If you want the telemetry playground on its own cluster instead, run `setup-kind.sh` first, then install the operator into that cluster. + +### 2. Deploy the telemetry playground + ```bash -# Deploy everything (cluster + observability + DocumentDB + traffic) +cd documentdb-playground/telemetry/local + +# Deploy observability stack + DocumentDB + traffic generators ./scripts/deploy.sh +``` + +### 3. Access dashboards +```bash # Access Grafana (admin/admin, anonymous access enabled) kubectl port-forward svc/grafana 3000:3000 -n observability --context kind-documentdb-telemetry # Access Prometheus kubectl port-forward svc/prometheus 9090:9090 -n observability --context kind-documentdb-telemetry -# Tear down +# Validate data is flowing +./scripts/validate.sh +``` + +### 4. Tear down + +```bash ./scripts/teardown.sh ``` diff --git a/documentdb-playground/telemetry/local/scripts/deploy.sh b/documentdb-playground/telemetry/local/scripts/deploy.sh index b6af84ec..9853f729 100755 --- a/documentdb-playground/telemetry/local/scripts/deploy.sh +++ b/documentdb-playground/telemetry/local/scripts/deploy.sh @@ -19,6 +19,7 @@ kubectl wait --for=condition=Ready nodes --all --context "$CONTEXT" --timeout=12 # NOTE: The OTel collector pod will remain in CreateContainerConfigError # until PG credentials are copied in step 5. This is expected. echo "[3/6] Deploying observability stack..." 
+kubectl apply -f "$LOCAL_DIR/k8s/observability/namespace.yaml" --context "$CONTEXT" kubectl apply -f "$LOCAL_DIR/k8s/observability/" --context "$CONTEXT" # Create dashboard ConfigMap from JSON files From 89745bfcb9aa2de79f097df4535e86c3a7465745 Mon Sep 17 00:00:00 2001 From: urismiley Date: Wed, 25 Mar 2026 12:53:07 -0400 Subject: [PATCH 09/16] Fix deploy.sh secret detection for CNPG 1.28 and add exposeViaService to cluster CR Signed-off-by: urismiley --- .../local/k8s/documentdb/cluster.yaml | 2 ++ .../telemetry/local/scripts/deploy.sh | 29 ++++++++++++------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml b/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml index 045fa261..0ebf8aa5 100644 --- a/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml +++ b/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml @@ -25,6 +25,8 @@ spec: instancesPerNode: 3 documentDbCredentialSecret: documentdb-credentials gatewayImage: "localhost:5001/documentdb-gateway:latest" + exposeViaService: + serviceType: ClusterIP resource: storage: pvcSize: 5Gi diff --git a/documentdb-playground/telemetry/local/scripts/deploy.sh b/documentdb-playground/telemetry/local/scripts/deploy.sh index 9853f729..7c7ed616 100755 --- a/documentdb-playground/telemetry/local/scripts/deploy.sh +++ b/documentdb-playground/telemetry/local/scripts/deploy.sh @@ -38,20 +38,29 @@ kubectl apply -f "$LOCAL_DIR/k8s/documentdb/" --context "$CONTEXT" # Step 5: Copy PG credentials to observability namespace # CNPG creates a superuser secret (-superuser) that the OTel # collector needs for the postgresql receiver to authenticate. -echo "[5/6] Waiting for CNPG superuser secret..." +echo "[5/6] Waiting for CNPG app secret..." 
RETRIES=0 MAX_RETRIES=36 # 36 × 5s = 3 minutes -until kubectl get secret documentdb-preview-superuser -n documentdb-preview-ns --context "$CONTEXT" &>/dev/null; do - RETRIES=$((RETRIES + 1)) - if [ "$RETRIES" -ge "$MAX_RETRIES" ]; then - echo "ERROR: Timed out waiting for CNPG superuser secret (documentdb-preview-superuser)." - echo " Verify the DocumentDB operator is installed and the CR was accepted." - exit 1 +SECRET_NAME="" +until [ -n "$SECRET_NAME" ]; do + # CNPG 1.28+ uses -app; older versions use -superuser + if kubectl get secret documentdb-preview-app -n documentdb-preview-ns --context "$CONTEXT" &>/dev/null; then + SECRET_NAME="documentdb-preview-app" + elif kubectl get secret documentdb-preview-superuser -n documentdb-preview-ns --context "$CONTEXT" &>/dev/null; then + SECRET_NAME="documentdb-preview-superuser" + fi + if [ -z "$SECRET_NAME" ]; then + RETRIES=$((RETRIES + 1)) + if [ "$RETRIES" -ge "$MAX_RETRIES" ]; then + echo "ERROR: Timed out waiting for CNPG secrets." + echo " Verify the DocumentDB operator is installed and the CR was accepted." + exit 1 + fi + sleep 5 fi - sleep 5 done -echo " Copying PG credentials to observability namespace..." -kubectl get secret documentdb-preview-superuser -n documentdb-preview-ns --context "$CONTEXT" -o json \ +echo " Copying PG credentials ($SECRET_NAME) to observability namespace..." 
+kubectl get secret "$SECRET_NAME" -n documentdb-preview-ns --context "$CONTEXT" -o json \ | jq 'del(.metadata.namespace,.metadata.uid,.metadata.resourceVersion,.metadata.creationTimestamp,.metadata.ownerReferences,.metadata.labels,.metadata.managedFields) | .metadata.name = "pg-monitor-credentials"' \ | kubectl apply -n observability --context "$CONTEXT" -f - From 6355d5021b1562d4294ca5f9d1e220c1c19d8ac1 Mon Sep 17 00:00:00 2001 From: urismiley Date: Wed, 25 Mar 2026 12:58:55 -0400 Subject: [PATCH 10/16] Fix gateway dashboard instance variable to use db_client_operations_total Signed-off-by: urismiley --- documentdb-playground/telemetry/local/dashboards/gateway.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/documentdb-playground/telemetry/local/dashboards/gateway.json b/documentdb-playground/telemetry/local/dashboards/gateway.json index 0b8a4595..313261c9 100644 --- a/documentdb-playground/telemetry/local/dashboards/gateway.json +++ b/documentdb-playground/telemetry/local/dashboards/gateway.json @@ -24,14 +24,14 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(gateway_client_connections_total, service_instance_id)", + "definition": "label_values(db_client_operations_total, service_instance_id)", "hide": 0, "includeAll": true, "multi": true, "name": "instance", "options": [], "query": { - "query": "label_values(gateway_client_connections_total, service_instance_id)" + "query": "label_values(db_client_operations_total, service_instance_id)" }, "refresh": 2, "regex": "", From dfeb8fb9778c08360c9aa09b7197ccba1a1c671a Mon Sep 17 00:00:00 2001 From: urismiley Date: Wed, 25 Mar 2026 13:12:37 -0400 Subject: [PATCH 11/16] Fix dashboards: remap to cnpg_* metrics, remove panels for unimplemented gateway metrics Signed-off-by: urismiley --- .../telemetry/local/dashboards/internals.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git 
a/documentdb-playground/telemetry/local/dashboards/internals.json b/documentdb-playground/telemetry/local/dashboards/internals.json index 0cd22b44..7e8f7a6b 100644 --- a/documentdb-playground/telemetry/local/dashboards/internals.json +++ b/documentdb-playground/telemetry/local/dashboards/internals.json @@ -106,7 +106,7 @@ "targets": [ { "refId": "A", - "expr": "postgresql_backends", + "expr": "cnpg_backends_total", "legendFormat": "{{postgresql_database_name}}" } ] @@ -203,7 +203,7 @@ "targets": [ { "refId": "A", - "expr": "postgresql_backends", + "expr": "cnpg_backends_total", "legendFormat": "{{postgresql_database_name}}" } ] @@ -271,7 +271,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "postgresql_replication_data_delay_bytes", + "expr": "cnpg_pg_replication_lag", "legendFormat": "{{replication_client}}", "refId": "A" } @@ -316,12 +316,12 @@ "targets": [ { "refId": "A", - "expr": "rate(postgresql_commits_total[1m])", + "expr": "rate(cnpg_pg_stat_database_xact_commit[1m])", "legendFormat": "commits" }, { "refId": "B", - "expr": "rate(postgresql_rollbacks_total[1m])", + "expr": "rate(cnpg_pg_stat_database_xact_rollback[1m])", "legendFormat": "rollbacks" } ] @@ -409,7 +409,7 @@ "targets": [ { "refId": "A", - "expr": "postgresql_db_size_bytes", + "expr": "cnpg_pg_database_size_bytes", "legendFormat": "{{postgresql_database_name}}" } ] @@ -577,7 +577,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "postgresql_backends / postgresql_connection_max * 100", + "expr": "cnpg_backends_total / postgresql_connection_max * 100", "legendFormat": "{{postgresql_database_name}}", "refId": "A" } From 153a76e6e1088aaeaee13e736a7356262b3c909d Mon Sep 17 00:00:00 2001 From: urismiley Date: Wed, 25 Mar 2026 14:24:01 -0400 Subject: [PATCH 12/16] Fix dashboards: remap PG ops to cnpg tup_ metrics, WAL size, split doc/sec by collection Signed-off-by: urismiley --- .../telemetry/local/dashboards/gateway.json | 23 +++++--- 
.../telemetry/local/dashboards/internals.json | 53 ++++++++++++------- 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/documentdb-playground/telemetry/local/dashboards/gateway.json b/documentdb-playground/telemetry/local/dashboards/gateway.json index 313261c9..eefeab97 100644 --- a/documentdb-playground/telemetry/local/dashboards/gateway.json +++ b/documentdb-playground/telemetry/local/dashboards/gateway.json @@ -505,13 +505,24 @@ }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "sum by (db_collection_name) (rate(db_client_documents_returned_total{service_instance_id=~\"$instance\"}[1m]) + rate(db_client_documents_inserted_total{service_instance_id=~\"$instance\"}[1m]))", - "legendFormat": "{{db_collection_name}}", + "expr": "sum by (db_collection_name) (rate(db_client_documents_returned_total{service_instance_id=~\"$instance\"}[1m]))", + "legendFormat": "{{db_collection_name}} returned", "refId": "A" + }, + { + "expr": "sum by (db_collection_name) (rate(db_client_documents_inserted_total{service_instance_id=~\"$instance\"}[1m]))", + "legendFormat": "{{db_collection_name}} inserted", + "refId": "B" + }, + { + "expr": "sum by (db_collection_name) (rate(db_client_documents_updated_total{service_instance_id=~\"$instance\"}[1m]))", + "legendFormat": "{{db_collection_name}} updated", + "refId": "C" + }, + { + "expr": "sum by (db_collection_name) (rate(db_client_documents_deleted_total{service_instance_id=~\"$instance\"}[1m]))", + "legendFormat": "{{db_collection_name}} deleted", + "refId": "D" } ] }, diff --git a/documentdb-playground/telemetry/local/dashboards/internals.json b/documentdb-playground/telemetry/local/dashboards/internals.json index 7e8f7a6b..8598a814 100644 --- a/documentdb-playground/telemetry/local/dashboards/internals.json +++ b/documentdb-playground/telemetry/local/dashboards/internals.json @@ -328,7 +328,7 @@ }, { "type": "timeseries", - "title": "PG Operations/sec", + "title": "PG Row 
Operations/sec", "id": 24, "gridPos": { "x": 8, @@ -364,9 +364,24 @@ }, "targets": [ { - "refId": "A", - "expr": "sum by (operation) (rate(postgresql_operations_total[1m]))", - "legendFormat": "{{operation}}" + "expr": "sum by (datname) (rate(cnpg_pg_stat_database_tup_inserted[1m]))", + "legendFormat": "inserted", + "refId": "A" + }, + { + "expr": "sum by (datname) (rate(cnpg_pg_stat_database_tup_updated[1m]))", + "legendFormat": "updated", + "refId": "B" + }, + { + "expr": "sum by (datname) (rate(cnpg_pg_stat_database_tup_deleted[1m]))", + "legendFormat": "deleted", + "refId": "C" + }, + { + "expr": "sum by (datname) (rate(cnpg_pg_stat_database_tup_fetched[1m]))", + "legendFormat": "fetched", + "refId": "D" } ] }, @@ -416,7 +431,7 @@ }, { "type": "bargauge", - "title": "Index Size by Table", + "title": "Index Size by Table (N/A)", "datasource": { "type": "prometheus", "uid": "prometheus" @@ -463,11 +478,12 @@ "y": 28, "w": 8, "h": 8 - } + }, + "description": "No index size metrics available from CNPG." }, { "type": "bargauge", - "title": "Indexes per Table", + "title": "Indexes per Table (N/A)", "datasource": { "type": "prometheus", "uid": "prometheus" @@ -514,11 +530,12 @@ "y": 28, "w": 8, "h": 8 - } + }, + "description": "No index count metrics available from CNPG." }, { "type": "timeseries", - "title": "Index Scans/sec", + "title": "Index Scans/sec (N/A)", "datasource": { "type": "prometheus", "uid": "prometheus" @@ -562,7 +579,8 @@ "y": 28, "w": 8, "h": 8 - } + }, + "description": "No index scan metrics available from CNPG. Enable pg_stat_user_indexes monitoring for this data." 
}, { "type": "stat", @@ -625,19 +643,15 @@ }, { "type": "timeseries", - "title": "WAL Age", + "title": "WAL Size", "datasource": { "type": "prometheus", "uid": "prometheus" }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "expr": "postgresql_wal_age_seconds", - "legendFormat": "WAL age", + "expr": "cnpg_collector_pg_wal", + "legendFormat": "{{pod}}", "refId": "A" } ], @@ -674,7 +688,7 @@ }, { "type": "timeseries", - "title": "Vacuum Count by Table", + "title": "Vacuum Count by Table (N/A)", "datasource": { "type": "prometheus", "uid": "prometheus" @@ -719,7 +733,8 @@ "y": 20, "w": 8, "h": 8 - } + }, + "description": "No vacuum metrics available from CNPG. Enable pg_stat_user_tables monitoring for this data." } ], "gridPos": { From 39680c383cf4a301ca33dc5fe77911641875ef4b Mon Sep 17 00:00:00 2001 From: urismiley Date: Thu, 26 Mar 2026 12:18:42 -0400 Subject: [PATCH 13/16] =?UTF-8?q?docs:=20clarify=20gateway=20OTel=20prereq?= =?UTF-8?q?uisite=20=E2=80=94=20base=20instrumentation=20vs=20full=20metri?= =?UTF-8?q?cs-expansion=20branch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: urismiley --- .../preview/monitoring/overview.md | 10 +++++++--- documentdb-playground/telemetry/local/README.md | 8 ++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md index 4439d5a9..204595d8 100644 --- a/docs/operator-public-documentation/preview/monitoring/overview.md +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -21,9 +21,13 @@ This guide describes how to monitor DocumentDB clusters running on Kubernetes us - (Optional) [OpenTelemetry Operator](https://opentelemetry.io/docs/kubernetes/operator/) for managed collector deployments !!! 
important "Gateway OTEL support required" - Gateway telemetry requires the OpenTelemetry instrumentation added in - [documentdb/documentdb#443](https://github.com/documentdb/documentdb/pull/443). - Ensure your gateway image is built from a branch that includes this change. + Gateway telemetry requires the OpenTelemetry instrumentation from the + [documentdb/documentdb](https://github.com/microsoft/documentdb) repository. + Base instrumentation (operations, latency, request/response size) is in + [documentdb#443](https://github.com/documentdb/documentdb/pull/443). + Full metric coverage (client connections, connection pool, document throughput) + requires the `users/urismiley/metrics-expansion` branch. + Ensure your gateway image is built from a branch that includes these changes. ## Architecture diff --git a/documentdb-playground/telemetry/local/README.md b/documentdb-playground/telemetry/local/README.md index 9f92a155..aa2d3e6b 100644 --- a/documentdb-playground/telemetry/local/README.md +++ b/documentdb-playground/telemetry/local/README.md @@ -13,8 +13,12 @@ A full observability stack for DocumentDB running on a local Kind cluster. Provi !!! important "Gateway OTEL support required" The gateway image must be built with OpenTelemetry instrumentation - ([documentdb/documentdb#443](https://github.com/documentdb/documentdb/pull/443)). - Without it, the gateway will not emit metrics, traces, or logs via OTLP + from the [documentdb/documentdb](https://github.com/microsoft/documentdb) repository. + The base instrumentation (operations, latency, request/response size) is in + [documentdb#443](https://github.com/documentdb/documentdb/pull/443). + Full metric coverage (client connections, connection pool, document throughput) + requires the `users/urismiley/metrics-expansion` branch. + Without these changes, the gateway will not emit metrics, traces, or logs via OTLP and the Grafana dashboards will show no data. 
## Quick Start From f28dd955bfbebf9363b47af7ca015520146d7194 Mon Sep 17 00:00:00 2001 From: urismiley Date: Thu, 26 Mar 2026 12:26:46 -0400 Subject: [PATCH 14/16] Make deploy.sh self-contained: install operator from GHCR, default to published images, add custom image docs Signed-off-by: urismiley --- .../telemetry/local/README.md | 34 +++++------ .../local/k8s/documentdb/cluster.yaml | 6 +- .../telemetry/local/scripts/deploy.sh | 59 +++++++++++++++---- 3 files changed, 65 insertions(+), 34 deletions(-) diff --git a/documentdb-playground/telemetry/local/README.md b/documentdb-playground/telemetry/local/README.md index aa2d3e6b..677cad7e 100644 --- a/documentdb-playground/telemetry/local/README.md +++ b/documentdb-playground/telemetry/local/README.md @@ -23,29 +23,12 @@ A full observability stack for DocumentDB running on a local Kind cluster. Provi ## Quick Start -### 1. Set up the operator (if not already installed) - -```bash -cd operator/src - -# Build and deploy operator + CNPG + cert-manager onto a Kind cluster -DEPLOY=true ./scripts/development/deploy.sh -``` - -This creates a Kind cluster named `kind` with the operator running. If you want the telemetry playground on its own cluster instead, run `setup-kind.sh` first, then install the operator into that cluster. - -### 2. Deploy the telemetry playground - ```bash cd documentdb-playground/telemetry/local -# Deploy observability stack + DocumentDB + traffic generators +# Deploy everything (cluster + operator + observability + DocumentDB + traffic) ./scripts/deploy.sh -``` -### 3. Access dashboards - -```bash # Access Grafana (admin/admin, anonymous access enabled) kubectl port-forward svc/grafana 3000:3000 -n observability --context kind-documentdb-telemetry @@ -54,12 +37,23 @@ kubectl port-forward svc/prometheus 9090:9090 -n observability --context kind-do # Validate data is flowing ./scripts/validate.sh + +# Tear down +./scripts/teardown.sh ``` -### 4. 
Tear down +### Using a custom gateway image + +To test unreleased gateway changes (e.g. OTel instrumentation), push your custom image to the local Kind registry and uncomment `gatewayImage` in `k8s/documentdb/cluster.yaml`: ```bash -./scripts/teardown.sh +# Build/tag your gateway image and push to the local registry +docker tag my-gateway:latest localhost:5001/documentdb-gateway:latest +docker push localhost:5001/documentdb-gateway:latest + +# Uncomment gatewayImage in cluster.yaml, then deploy +sed -i 's|# gatewayImage:|gatewayImage:|' k8s/documentdb/cluster.yaml +./scripts/deploy.sh ``` ## Architecture diff --git a/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml b/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml index 0ebf8aa5..2791bea3 100644 --- a/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml +++ b/documentdb-playground/telemetry/local/k8s/documentdb/cluster.yaml @@ -24,7 +24,11 @@ spec: nodeCount: 1 instancesPerNode: 3 documentDbCredentialSecret: documentdb-credentials - gatewayImage: "localhost:5001/documentdb-gateway:latest" + # To use a custom gateway image (e.g. with OTel instrumentation from a local build): + # 1. Push your image: docker tag my-gateway localhost:5001/documentdb-gateway:latest + # docker push localhost:5001/documentdb-gateway:latest + # 2. 
Uncomment the line below: + # gatewayImage: "localhost:5001/documentdb-gateway:latest" exposeViaService: serviceType: ClusterIP resource: diff --git a/documentdb-playground/telemetry/local/scripts/deploy.sh b/documentdb-playground/telemetry/local/scripts/deploy.sh index 7c7ed616..d906b4e1 100755 --- a/documentdb-playground/telemetry/local/scripts/deploy.sh +++ b/documentdb-playground/telemetry/local/scripts/deploy.sh @@ -4,21 +4,53 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" LOCAL_DIR="$(dirname "$SCRIPT_DIR")" CLUSTER_NAME="${CLUSTER_NAME:-documentdb-telemetry}" CONTEXT="kind-${CLUSTER_NAME}" +OPERATOR_CHART_VERSION="${OPERATOR_CHART_VERSION:-0.1.3}" echo "=== DocumentDB Telemetry Playground ===" # Step 1: Create Kind cluster -echo "[1/6] Setting up Kind cluster..." +echo "[1/7] Setting up Kind cluster..." "$SCRIPT_DIR/setup-kind.sh" # Step 2: Wait for cluster -echo "[2/6] Waiting for cluster to be ready..." +echo "[2/7] Waiting for cluster to be ready..." kubectl wait --for=condition=Ready nodes --all --context "$CONTEXT" --timeout=120s -# Step 3: Deploy observability stack +# Step 3: Install cert-manager + DocumentDB operator +echo "[3/7] Installing cert-manager and DocumentDB operator..." +if helm list -n documentdb-operator --kube-context "$CONTEXT" 2>/dev/null | grep -q documentdb-operator; then + echo " DocumentDB operator already installed, skipping." +else + # cert-manager + if kubectl get namespace cert-manager --context "$CONTEXT" &>/dev/null; then + echo " cert-manager already installed, skipping." + else + echo " Installing cert-manager..." 
+ helm repo add jetstack https://charts.jetstack.io --force-update 2>/dev/null + helm repo update jetstack 2>/dev/null + helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --set installCRDs=true \ + --kube-context "$CONTEXT" \ + --wait --timeout 120s + fi + + # DocumentDB operator (includes CNPG as a dependency) + echo " Installing DocumentDB operator v${OPERATOR_CHART_VERSION}..." + helm install documentdb-operator \ + oci://ghcr.io/documentdb/documentdb-operator \ + --version "${OPERATOR_CHART_VERSION}" \ + --namespace documentdb-operator \ + --create-namespace \ + --kube-context "$CONTEXT" \ + --wait --timeout 180s +fi + +# Step 4: Deploy observability stack # NOTE: The OTel collector pod will remain in CreateContainerConfigError -# until PG credentials are copied in step 5. This is expected. -echo "[3/6] Deploying observability stack..." +# until PG credentials are copied in step 6. This is expected. +echo "[4/7] Deploying observability stack..." kubectl apply -f "$LOCAL_DIR/k8s/observability/namespace.yaml" --context "$CONTEXT" kubectl apply -f "$LOCAL_DIR/k8s/observability/" --context "$CONTEXT" @@ -31,14 +63,14 @@ kubectl create configmap grafana-dashboards \ --context "$CONTEXT" \ --dry-run=client -o yaml | kubectl apply -f - --context "$CONTEXT" -# Step 4: Deploy DocumentDB -echo "[4/6] Deploying DocumentDB..." +# Step 5: Deploy DocumentDB +echo "[5/7] Deploying DocumentDB..." kubectl apply -f "$LOCAL_DIR/k8s/documentdb/" --context "$CONTEXT" -# Step 5: Copy PG credentials to observability namespace -# CNPG creates a superuser secret (-superuser) that the OTel -# collector needs for the postgresql receiver to authenticate. -echo "[5/6] Waiting for CNPG app secret..." +# Step 6: Copy PG credentials to observability namespace +# CNPG creates a secret that the OTel collector needs for the +# postgresql receiver to authenticate. +echo "[6/7] Waiting for CNPG app secret..." 
RETRIES=0 MAX_RETRIES=36 # 36 × 5s = 3 minutes SECRET_NAME="" @@ -68,8 +100,8 @@ kubectl get secret "$SECRET_NAME" -n documentdb-preview-ns --context "$CONTEXT" echo " Waiting for observability stack..." kubectl wait --for=condition=Available deployment --all -n observability --context "$CONTEXT" --timeout=180s -# Step 6: Deploy traffic generators -echo "[6/6] Deploying traffic generators..." +# Step 7: Deploy traffic generators +echo "[7/7] Deploying traffic generators..." echo " Waiting for DocumentDB pods (this may take a few minutes)..." kubectl wait --for=condition=Ready pod -l app=documentdb-preview -n documentdb-preview-ns --context "$CONTEXT" --timeout=300s 2>/dev/null || echo " (DocumentDB pods not ready yet - deploy traffic manually later)" kubectl apply -f "$LOCAL_DIR/k8s/traffic/" --context "$CONTEXT" @@ -78,3 +110,4 @@ echo "" echo "=== Deployment Complete ===" echo "Grafana: kubectl port-forward svc/grafana 3000:3000 -n observability --context $CONTEXT" echo "Prometheus: kubectl port-forward svc/prometheus 9090:9090 -n observability --context $CONTEXT" +echo "Validate: ./scripts/validate.sh" From eac15b232d0149b073c57d6369f055bff6002ec8 Mon Sep 17 00:00:00 2001 From: urismiley Date: Thu, 26 Mar 2026 12:28:02 -0400 Subject: [PATCH 15/16] docs: remove stale operator prerequisite, fix latency description Signed-off-by: urismiley --- documentdb-playground/telemetry/local/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/documentdb-playground/telemetry/local/README.md b/documentdb-playground/telemetry/local/README.md index 677cad7e..d4a4e6e0 100644 --- a/documentdb-playground/telemetry/local/README.md +++ b/documentdb-playground/telemetry/local/README.md @@ -9,7 +9,6 @@ A full observability stack for DocumentDB running on a local Kind cluster. 
Provi - **kubectl** - **Helm 3** — [install](https://helm.sh/docs/intro/install/) - **jq** — for credential copying -- **DocumentDB operator installed** — the operator, CNPG, and cert-manager must be running in the cluster before deploying the playground. See the [Development Environment Guide](../../../docs/developer-guides/development-environment.md) for setup instructions, or use the quick setup below. !!! important "Gateway OTEL support required" The gateway image must be built with OpenTelemetry instrumentation @@ -131,7 +130,7 @@ local/ | Dashboard | Description | |-----------|-------------| -| **Gateway** | Request rates, latency (p50/p95/p99), error rates, active connections, command breakdown, gateway logs (Loki) | +| **Gateway** | Request rates, average latency, error rates, active connections, command breakdown, gateway logs (Loki) | | **Internals** | PostgreSQL metrics, container resource usage (CPU/memory), OTel Collector pipeline stats | Dashboards auto-refresh every 10 seconds and are automatically provisioned into Grafana on startup via ConfigMap mounts. Edits made in the Grafana UI will persist until the pod restarts. 
From ff27b7ab850212c449f501f704bf1c8b1460d021 Mon Sep 17 00:00:00 2001 From: urismiley Date: Thu, 26 Mar 2026 12:45:01 -0400 Subject: [PATCH 16/16] Fix alert rules to use cnpg_* metrics, replace python3 with grep in validate.sh, fix doc metric name Signed-off-by: urismiley --- .../preview/monitoring/overview.md | 2 +- .../telemetry/local/k8s/observability/prometheus.yaml | 10 +++++----- .../telemetry/local/scripts/validate.sh | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/operator-public-documentation/preview/monitoring/overview.md b/docs/operator-public-documentation/preview/monitoring/overview.md index 204595d8..d86231bb 100644 --- a/docs/operator-public-documentation/preview/monitoring/overview.md +++ b/docs/operator-public-documentation/preview/monitoring/overview.md @@ -290,7 +290,7 @@ curl -s 'http://localhost:9090/api/v1/query?query=postgresql_backends' \ | jq '.data.result | length' # Confirm kubeletstats metrics are present -curl -s 'http://localhost:9090/api/v1/query?query=k8s_pod_cpu_usage' \ +curl -s 'http://localhost:9090/api/v1/query?query=k8s_pod_cpu_time' \ | jq '.data.result | length' ``` diff --git a/documentdb-playground/telemetry/local/k8s/observability/prometheus.yaml b/documentdb-playground/telemetry/local/k8s/observability/prometheus.yaml index c9e2092f..18790376 100644 --- a/documentdb-playground/telemetry/local/k8s/observability/prometheus.yaml +++ b/documentdb-playground/telemetry/local/k8s/observability/prometheus.yaml @@ -131,18 +131,18 @@ data: description: "{{ $value | printf \"%.1f\" }}% of gateway operations are failing." - alert: PostgresReplicationLagHigh - expr: pg_stat_replication_byte_lag > 10485760 + expr: cnpg_pg_replication_lag > 10 for: 5m labels: severity: warning annotations: - summary: "PostgreSQL replication lag exceeds 10MB" - description: "Replication lag is {{ $value | humanize1024 }}B on {{ $labels.instance }}." 
+ summary: "PostgreSQL replication lag exceeds 10 seconds" + description: "Replication lag is {{ $value | printf \"%.1f\" }}s on {{ $labels.instance }}." - alert: PostgresConnectionSaturation expr: | - (sum by (instance) (pg_stat_activity_count) - / sum by (instance) (pg_settings_max_connections)) + (cnpg_backends_total + / postgresql_connection_max) * 100 > 80 for: 5m labels: diff --git a/documentdb-playground/telemetry/local/scripts/validate.sh b/documentdb-playground/telemetry/local/scripts/validate.sh index 45657d74..6f8d6ed2 100755 --- a/documentdb-playground/telemetry/local/scripts/validate.sh +++ b/documentdb-playground/telemetry/local/scripts/validate.sh @@ -57,7 +57,7 @@ if [ -n "$PROM_POD" ]; then target_up=$(kubectl exec "$PROM_POD" -n observability --context "$CONTEXT" -- \ wget -qO- "http://localhost:9090/api/v1/query?query=up" 2>/dev/null || echo "") if echo "$target_up" | grep -q '"value"'; then - up_count=$(echo "$target_up" | python3 -c "import sys,json; d=json.load(sys.stdin); print(sum(1 for r in d.get('data',{}).get('result',[]) if r['value'][1]=='1'))" 2>/dev/null || echo "0") + up_count=$(echo "$target_up" | grep -o '"value":\[' | wc -l) green "Prometheus has $up_count active targets" else red "Cannot query Prometheus targets"