Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions .github/workflows/e2e_tests_providers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
name: E2E Inference Provider Tests

on:
schedule:
- cron: "0 0 * * *" # Runs once a day at midnight UTC
workflow_dispatch:
push
# schedule:
# - cron: "0 0 * * *" # Runs once a day at midnight UTC
# workflow_dispatch:

jobs:
e2e_tests:
Expand Down Expand Up @@ -259,11 +260,17 @@ jobs:
exit 1
}

# Wait for watsonx library mode to finish before running server mode
# watsonx has a rate limit of 2 calls / second
- name: Wait for watsonx library mode to finish
if: matrix.environment == 'watsonx' && matrix.mode == 'server'
run: sleep 2400 # 40 minutes

# watsonx has a different convention than "<provider>/<model>"
- name: Set watsonx test overrides
if: matrix.environment == 'watsonx'
run: |
echo "E2E_DEFAULT_MODEL_OVERRIDE=watsonx/watsonx/meta-llama/llama-3-3-70b-instruct" >> $GITHUB_ENV
echo "E2E_DEFAULT_MODEL_OVERRIDE=meta-llama/llama-4-maverick-17b-128e-instruct-fp8" >> $GITHUB_ENV
echo "E2E_DEFAULT_PROVIDER_OVERRIDE=watsonx" >> $GITHUB_ENV

- name: Run e2e tests
Expand Down
9 changes: 6 additions & 3 deletions .github/workflows/e2e_tests_rhaiis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
name: RHAIIS E2E Tests

on:
schedule:
- cron: "0 0 * * *" # Runs once a day at midnight UTC
workflow_dispatch:
push
# schedule:
# - cron: "0 0 * * *" # Runs once a day at midnight UTC
# workflow_dispatch:


jobs:
Expand All @@ -26,6 +27,8 @@ jobs:
RHAIIS_API_KEY: ${{ secrets.RHAIIS_API_KEY }}
RHAIIS_MODEL: ${{ vars.RHAIIS_MODEL }}
FAISS_VECTOR_STORE_ID: ${{ vars.FAISS_VECTOR_STORE_ID }}
E2E_DEFAULT_MODEL_OVERRIDE: ${{ vars.RHAIIS_MODEL }}
E2E_DEFAULT_PROVIDER_OVERRIDE: vllm

steps:
- uses: actions/checkout@v4
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ test-integration: ## Run integration tests tests
COVERAGE_FILE="${ARTIFACT_DIR}/.coverage.integration" uv run python -m pytest tests/integration --cov=src --cov-report term-missing --cov-report "json:${ARTIFACT_DIR}/coverage_integration.json" --junit-xml="${ARTIFACT_DIR}/junit_integration.xml" --cov-fail-under=10

test-e2e: ## Run end to end tests for the service
uv run behave --color --format pretty --tags=-skip -D dump_errors=true @tests/e2e/test_list.txt
script -q -e -c "uv run behave --color --format pretty --tags=-skip -D dump_errors=true @tests/e2e/test_list.txt"

test-e2e-local: ## Run end to end tests for the service
uv run behave --color --format pretty --tags=-skip -D dump_errors=true @tests/e2e/test_list.txt
Expand Down
1 change: 1 addition & 0 deletions docker-compose-library.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ services:
- WATSONX_BASE_URL=${WATSONX_BASE_URL:-}
- WATSONX_PROJECT_ID=${WATSONX_PROJECT_ID:-}
- WATSONX_API_KEY=${WATSONX_API_KEY:-}
- LITELLM_DROP_PARAMS=true
# Enable debug logging if needed
- LLAMA_STACK_LOGGING=${LLAMA_STACK_LOGGING:-}
# FAISS test
Expand Down
1 change: 1 addition & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ services:
- WATSONX_BASE_URL=${WATSONX_BASE_URL:-}
- WATSONX_PROJECT_ID=${WATSONX_PROJECT_ID:-}
- WATSONX_API_KEY=${WATSONX_API_KEY:-}
- LITELLM_DROP_PARAMS=true
# Enable debug logging if needed
- LLAMA_STACK_LOGGING=${LLAMA_STACK_LOGGING:-}
# FAISS test
Expand Down
27 changes: 22 additions & 5 deletions examples/azure-run.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ providers:
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY}
api_base: https://ols-test.openai.azure.com/
base_url: https://ols-test.openai.azure.com/openai/v1
api_version: 2024-02-15-preview
- provider_id: openai
provider_type: remote::openai
Expand All @@ -50,14 +50,17 @@ providers:
provider_id: basic
provider_type: inline::basic
tool_runtime:
- config: {}
- config: {} # Enable the RAG tool
provider_id: rag-runtime
provider_type: inline::rag-runtime
- config: {} # Enable MCP (Model Context Protocol) support
provider_id: model-context-protocol
provider_type: remote::model-context-protocol
vector_io:
- config:
persistence:
namespace: vector_io::faiss
backend: kv_default
backend: kv_rag
provider_id: faiss
provider_type: inline::faiss
agents:
Expand Down Expand Up @@ -105,7 +108,10 @@ storage:
backends:
kv_default:
type: kv_sqlite
db_path: ${env.KV_STORE_PATH:=~/.llama/storage/rag/kv_store.db}
db_path: ${env.KV_STORE_PATH:=~/.llama/storage/kv_store.db}
kv_rag: # Define the storage backend type for RAG
type: kv_sqlite
db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
sql_default:
type: sql_sqlite
db_path: ${env.SQL_STORE_PATH:=~/.llama/storage/sql_store.db}
Expand All @@ -130,10 +136,21 @@ registered_resources:
provider_id: azure
model_type: llm
provider_model_id: gpt-4o-mini
- model_id: all-mpnet-base-v2
model_type: embedding
provider_id: sentence-transformers
provider_model_id: all-mpnet-base-v2
metadata:
embedding_dimension: 768
shields:
- shield_id: llama-guard
provider_id: llama-guard
provider_shield_id: openai/gpt-4o-mini
vector_stores:
- embedding_dimension: 768
embedding_model: sentence-transformers/all-mpnet-base-v2
provider_id: faiss
vector_store_id: ${env.FAISS_VECTOR_STORE_ID}
datasets: []
scoring_fns: []
benchmarks: []
Expand All @@ -144,6 +161,6 @@ vector_stores:
default_provider_id: faiss
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
model_id: all-mpnet-base-v2
safety:
default_shield_id: llama-guard
29 changes: 22 additions & 7 deletions examples/vertexai-run.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,17 @@ providers:
provider_id: basic
provider_type: inline::basic
tool_runtime:
- config: {}
- config: {} # Enable the RAG tool
provider_id: rag-runtime
provider_type: inline::rag-runtime
- config: {} # Enable MCP (Model Context Protocol) support
provider_id: model-context-protocol
provider_type: remote::model-context-protocol
vector_io:
- config:
persistence:
namespace: vector_io::faiss
backend: kv_default
backend: kv_rag
provider_id: faiss
provider_type: inline::faiss
agents:
Expand Down Expand Up @@ -105,7 +108,10 @@ storage:
backends:
kv_default:
type: kv_sqlite
db_path: ${env.KV_STORE_PATH:=~/.llama/storage/rag/kv_store.db}
db_path: ${env.KV_STORE_PATH:=~/.llama/storage/kv_store.db}
kv_rag: # Define the storage backend type for RAG
type: kv_sqlite
db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
sql_default:
type: sql_sqlite
db_path: ${env.SQL_STORE_PATH:=~/.llama/storage/sql_store.db}
Expand All @@ -125,11 +131,22 @@ storage:
namespace: prompts
backend: kv_default
registered_resources:
models: []
models:
- model_id: all-mpnet-base-v2
model_type: embedding
provider_id: sentence-transformers
provider_model_id: all-mpnet-base-v2
metadata:
embedding_dimension: 768
shields:
- shield_id: llama-guard
provider_id: llama-guard
provider_shield_id: openai/gpt-4o-mini
vector_stores:
- embedding_dimension: 768
embedding_model: sentence-transformers/all-mpnet-base-v2
provider_id: faiss
vector_store_id: ${env.FAISS_VECTOR_STORE_ID}
datasets: []
scoring_fns: []
benchmarks: []
Expand All @@ -140,8 +157,6 @@ vector_stores:
default_provider_id: faiss
default_embedding_model:
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
model_id: all-mpnet-base-v2
safety:
default_shield_id: llama-guard
telemetry:
enabled: true
30 changes: 23 additions & 7 deletions examples/watsonx-run.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ providers:
- provider_id: watsonx
provider_type: remote::watsonx
config:
url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
base_url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
api_key: ${env.WATSONX_API_KEY:=key-not-set}
project_id: ${env.WATSONX_PROJECT_ID:=project-not-set}
timeout: 1200
Expand Down Expand Up @@ -61,11 +61,14 @@ providers:
- config: {} # Enable the RAG tool
provider_id: rag-runtime
provider_type: inline::rag-runtime
- config: {} # Enable MCP (Model Context Protocol) support
provider_id: model-context-protocol
provider_type: remote::model-context-protocol
vector_io:
- config: # Define the storage backend for RAG
persistence:
namespace: vector_io::faiss
backend: kv_default
backend: kv_rag
provider_id: faiss
provider_type: inline::faiss
agents:
Expand Down Expand Up @@ -111,12 +114,15 @@ server:
port: 8321
storage:
backends:
kv_default: # Define the storage backend type for RAG, in this case registry and RAG are unified i.e. information on registered resources (e.g. models, vector_stores) are saved together with the RAG chunks
kv_default:
type: kv_sqlite
db_path: ${env.KV_STORE_PATH:=~/.llama/storage/kv_store.db}
kv_rag: # Define the storage backend type for RAG
type: kv_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/storage/rag/kv_store.db}
db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
sql_default:
type: sql_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/storage/sql_store.db}
db_path: ${env.SQL_STORE_PATH:=~/.llama/storage/sql_store.db}
stores:
metadata:
namespace: registry
Expand All @@ -138,11 +144,21 @@ registered_resources:
provider_id: watsonx
model_type: llm
provider_model_id: watsonx/meta-llama/llama-3-3-70b-instruct
- model_id: all-mpnet-base-v2
model_type: embedding
provider_id: sentence-transformers
provider_model_id: all-mpnet-base-v2
metadata:
embedding_dimension: 768
shields:
- shield_id: llama-guard
provider_id: llama-guard
provider_shield_id: openai/gpt-4o-mini
vector_stores: []
vector_stores:
- embedding_dimension: 768
embedding_model: sentence-transformers/all-mpnet-base-v2
provider_id: faiss
vector_store_id: ${env.FAISS_VECTOR_STORE_ID}
datasets: []
scoring_fns: []
benchmarks: []
Expand All @@ -153,4 +169,4 @@ vector_stores:
default_provider_id: faiss
default_embedding_model: # Define the default embedding model for RAG
provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
model_id: all-mpnet-base-v2
23 changes: 18 additions & 5 deletions src/utils/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
OpenAIResponseInputMessageContent as InputMessageContent,
OpenAIResponseInputMessageContentFile as InputFilePart,
OpenAIResponseInputMessageContentText as InputTextPart,
OpenAIResponseInputTool as InputTool,
OpenAIResponseInputToolChoice as ToolChoice,
OpenAIResponseInputToolChoiceMode as ToolChoiceMode,
OpenAIResponseInputToolFileSearch as InputToolFileSearch,
OpenAIResponseInputToolMCP as InputToolMCP,
OpenAIResponseMCPApprovalRequest as MCPApprovalRequest,
Expand All @@ -28,17 +31,14 @@
OpenAIResponseOutputMessageMCPListTools as MCPListTools,
OpenAIResponseOutputMessageWebSearchToolCall as WebSearchCall,
OpenAIResponseUsage as ResponseUsage,
OpenAIResponseInputTool as InputTool,
OpenAIResponseUsageInputTokensDetails as UsageInputTokensDetails,
OpenAIResponseUsageOutputTokensDetails as UsageOutputTokensDetails,
OpenAIResponseInputToolChoiceMode as ToolChoiceMode,
OpenAIResponseInputToolChoice as ToolChoice,
)
from llama_stack_client import APIConnectionError, APIStatusError, AsyncLlamaStackClient

from client import AsyncLlamaStackClientHolder
import constants
import metrics
from client import AsyncLlamaStackClientHolder
from configuration import configuration
from constants import DEFAULT_RAG_TOOL
from log import get_logger
Expand Down Expand Up @@ -334,7 +334,6 @@ async def prepare_responses_params( # pylint: disable=too-many-arguments,too-ma

# Build x-llamastack-provider-data header from MCP tool headers
extra_headers = _build_provider_data_headers(tools)

return ResponsesApiParams(
input=input_text,
model=model,
Expand Down Expand Up @@ -1006,6 +1005,12 @@ async def check_model_configured(
for model in models:
if model.id == model_id:
return True

# Workaround to llama-stack watsonx bug
if model_id.startswith("watsonx/") and model.id == model_id.removeprefix(
"watsonx/"
):
return True
return False
except APIStatusError as e:
response = InternalServerErrorResponse.generic()
Expand Down Expand Up @@ -1080,6 +1085,14 @@ async def select_model_for_responses(

model = llm_models[0]
logger.info("Selected first LLM model: %s", model.id)

# Workaround to llama-stack bug for watsonx
# model needs to be "watsonx/<model_id>" in the response request
metadata = model.custom_metadata or {}
if metadata.get("provider_id") == "watsonx":
provider_resource_id = metadata.get("provider_resource_id")
if isinstance(provider_resource_id, str):
return provider_resource_id
return model.id


Expand Down
3 changes: 2 additions & 1 deletion test.containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ COPY src ./src
RUN uv sync --locked --no-install-project --group llslibdev

# Add virtual environment to PATH for llama command
ENV PATH="/opt/app-root/.venv/bin:$PATH"
ENV PATH="/opt/app-root/.venv/bin:$PATH" \
PYTHONPATH="/opt/app-root/src"

# Set HOME directory so llama-stack uses /opt/app-root/src/.llama
ENV HOME="/opt/app-root/src"
Expand Down
8 changes: 6 additions & 2 deletions tests/e2e/configs/run-azure.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ providers:
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY}
api_base: https://ols-test.openai.azure.com/
base_url: https://ols-test.openai.azure.com/openai/v1
api_version: 2024-02-15-preview
allowed_models: ["gpt-4o-mini"]
- provider_id: openai
provider_type: remote::openai
config:
Expand All @@ -50,9 +51,12 @@ providers:
provider_id: basic
provider_type: inline::basic
tool_runtime:
- config: {}
- config: {} # Enable the RAG tool
provider_id: rag-runtime
provider_type: inline::rag-runtime
- config: {} # Enable MCP (Model Context Protocol) support
provider_id: model-context-protocol
provider_type: remote::model-context-protocol
vector_io:
- config:
persistence:
Expand Down
Loading
Loading