IBM · christian-pinto · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026 · Mar 30, 2026
diff --git a/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiment_executor.py b/plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiment_executor.py
@@ -423,6 +423,7 @@ def run_resource_and_workload_experiment(
             if experiment.identifier in [
                 "test-geospatial-deployment-v1",
                 "test-geospatial-deployment-custom-dataset-v1",
+                "test-geospatial-endpoint-custom-dataset-v1",
             ]:
                 logger.info("Using geospatial benchmark for deployment")
                 result = execute_geospatial_benchmark(
@@ -571,7 +572,7 @@ def run_workload_experiment(
             # Will raise VLLMBenchmarkError if there is a problem
             logger.info(f"Executing experiment: {experiment.identifier}")
             result: BenchmarkResult
-            if experiment.identifier == "test-geospatial-endpoint-v1":
+            if experiment.identifier in ["test-geospatial-endpoint-v1", "test-geospatial-endpoint-custom-dataset-v1"]:
                 logger.info("Using geospatial benchmark for endpoint")
                 result = execute_geospatial_benchmark(
                     base_url=benchmark_parameters.endpoint,

diff --git a/...erformance/ado_actuators/vllm_performance/experiments/performance_testing_geospatial.yaml b/...erformance/ado_actuators/vllm_performance/experiments/performance_testing_geospatial.yaml
@@ -463,3 +463,78 @@ performance_testing-geospatial-full-custom-dataset:
     - identifier: "p99_e2el_ms"
   metadata:
     description: 'VLLM performance testing across compute resource and workload configuration'
+performance_testing-geospatial-endpoint-custom-dataset:
+  identifier: test-geospatial-endpoint-custom-dataset-v1
+  actuatorIdentifier: "vllm_performance"
+  requiredProperties: # Any entity passed to this experiment must have constitutive properties with these values
+    - identifier: 'model'
+      metadata:
+        description: 'model to use for testing. Assumed to be served by all endpoints tested. Required to obtain correct tokenizer for benchmarking metrics calculation'
+      propertyDomain:
+        variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE"
+        values: ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
+    - identifier: 'endpoint'
+      metadata:
+        description: 'The endpoint(s) to test'
+      propertyDomain:
+        variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE"
+        values: ["http://localhost:8000"]
+    - identifier: 'request_rate'
+      metadata:
+        description: "The number of requests to send per second"
+      propertyDomain:
+        variableType: 'DISCRETE_VARIABLE_TYPE'
+        domainRange: [-1, 1000] # -1 means send all requests at time 0
+        interval: 1
+    - identifier: 'dataset'
+      metadata:
+        description: "(benchmark) The dataset to be used for the experiment"
+      propertyDomain:
+        variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE"
+        values: ["custom_dataset.jsonl"]
+  optionalProperties:
+    - identifier: 'num_prompts'
+      metadata:
+        description: "The number of prompts to send (total number of requests)"
+      propertyDomain:
+        variableType: 'DISCRETE_VARIABLE_TYPE'
+        domainRange: [1, 10001]
+        interval: 1
+    - identifier: 'burstiness'
+      metadata:
+        description: "The burstiness of the requests - 1.0 is a Poisson distribution with rate = request_rate. Others are gamma distributions with lambda = request_rate and shape = burstiness."
+      propertyDomain:
+        variableType: 'DISCRETE_VARIABLE_TYPE'
+        domainRange: [0, 10]
+        interval: 1
+    - identifier: 'max_concurrency'
+      metadata:
+        description: "The maximum number of concurrent requests to send"
+      propertyDomain:
+        variableType: 'DISCRETE_VARIABLE_TYPE'
+        domainRange: [-1, 500] # -1 means no concurrency control
+        interval: 1
+  defaultParameterization:
+    - value: 100
+      property:
+        identifier: 'num_prompts'
+    - value: -1
+      property:
+        identifier: 'max_concurrency'
+    - value: 1.0
+      property:
+        identifier: 'burstiness'
+  # measurements
+  targetProperties:
+    - identifier: "duration"
+    - identifier: "completed"
+    - identifier: "request_throughput"
+    - identifier: "mean_e2el_ms"
+    - identifier: "median_e2el_ms"
+    - identifier: "std_e2el_ms"
+    - identifier: "p25_e2el_ms"
+    - identifier: "p50_e2el_ms"
+    - identifier: "p75_e2el_ms"
+    - identifier: "p99_e2el_ms"
+  metadata:
+    description: 'Test inference performance of a geospatial model served by vLLM endpoint across inference workload configurations'
diff --git a/...llm_performance/ado_actuators/vllm_performance/vllm_performance_test/execute_benchmark.py b/...llm_performance/ado_actuators/vllm_performance/vllm_performance_test/execute_benchmark.py
@@ -135,7 +135,9 @@ def execute_benchmark(
         command.extend(["--max-concurrency", f"{max_concurrency!s}"])
     if custom_args is not None:
         for key, value in custom_args.items():
-            command.extend([key, f"{value!s}"])
+            command.append(key)  # the option name is always emitted
+            if value is not None:  # None marks a bare flag; 0/False/"" are real values and must be passed
+                command.append(f"{value!s}")
 
     logger.debug(f"Command line: {command}")
 
@@ -273,7 +275,7 @@ def execute_geospatial_benchmark(
 
     return execute_benchmark(
         base_url=base_url,
-        backend="io-processor-plugin",
+        backend="vllm-pooling",
         model=model,
         dataset="custom",
         interpreter=interpreter,
@@ -287,7 +289,7 @@ def execute_geospatial_benchmark(
         custom_args={
             "--dataset-path": f"{dataset_path.resolve()}",
             "--endpoint": "/pooling",
-            "--skip-tokenizer-init": True,
+            "--skip-tokenizer-init": None,
         },
     )