diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index b617cfa7da..74d5539046 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -733,6 +733,8 @@ def _split_identifier_for_json(self, identifier: str | Identifier) -> dict[str, return {"namespace": identifier_tuple[:-1], "name": identifier_tuple[-1]} def _init_sigv4(self, session: Session) -> None: + import base64 + import hashlib from urllib import parse import boto3 @@ -741,6 +743,25 @@ def _init_sigv4(self, session: Session) -> None: from requests import PreparedRequest from requests.adapters import HTTPAdapter + class _IcebergSigV4Auth(SigV4Auth): + def canonical_request(self, request: Any) -> str: + # Reuses the logic from botocore's SigV4Auth.canonical_request + # (https://github.com/boto/botocore/blob/develop/botocore/auth.py) + # but always uses self.payload(request) for the body checksum. + # Validated against botocore <= 1.42.x + # (https://github.com/boto/botocore/blob/1.42.85/botocore/auth.py#L622-L637) + cr = [request.method.upper()] + path = self._normalize_url_path(parse.urlsplit(request.url).path) + cr.append(path) + cr.append(self.canonical_query_string(request)) + headers_to_sign = self.headers_to_sign(request) + cr.append(self.canonical_headers(headers_to_sign) + "\n") + cr.append(self.signed_headers(headers_to_sign)) + # Always use hex-encoded payload hash per SigV4 spec, + # regardless of the x-amz-content-sha256 header value (which may be base64). + cr.append(self.payload(request)) + return "\n".join(cr) + class SigV4Adapter(HTTPAdapter): def __init__(self, **properties: str): self._properties = properties @@ -767,17 +788,27 @@ def add_headers(self, request: PreparedRequest, **kwargs: Any) -> None: # pylin # remove the connection header as it will be updated after signing if "connection" in request.headers: del request.headers["connection"] - # For empty bodies, explicitly set the content hash header to the SHA256 of an empty string - if not request.body: - request.headers["x-amz-content-sha256"] = EMPTY_BODY_SHA256 + + # Compute the x-amz-content-sha256 header to match Iceberg Java SDK: + # - empty body → hex (EMPTY_BODY_SHA256) + # - non-empty body → base64 + if request.body: + body_bytes = request.body.encode("utf-8") if isinstance(request.body, str) else request.body + content_sha256_header = base64.b64encode(hashlib.sha256(body_bytes).digest()).decode() + else: + content_sha256_header = EMPTY_BODY_SHA256 + + signing_headers = dict(request.headers) + signing_headers["x-amz-content-sha256"] = content_sha256_header aws_request = AWSRequest( - method=request.method, url=url, params=params, data=request.body, headers=dict(request.headers) + method=request.method, url=url, params=params, data=request.body, headers=signing_headers ) - SigV4Auth(credentials, service, region).add_auth(aws_request) - original_header = request.headers - signed_headers = aws_request.headers + _IcebergSigV4Auth(credentials, service, region).add_auth(aws_request) + + original_header = dict(request.headers) + signed_headers = dict(aws_request.headers) relocated_headers = {} # relocate headers if there is a conflict with signed headers diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 99d1ef947b..de9a6ca7fb 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -18,6 +18,7 @@ from __future__ import annotations import base64 +import hashlib import os from collections.abc import Callable from typing import Any, cast @@ -512,9 +513,16 @@ def test_sigv4_sign_request_without_body(rest_mock: Mocker) -> None: assert isinstance(adapter, HTTPAdapter) adapter.add_headers(prepared) - assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256") + auth_header = prepared.headers["Authorization"] + assert auth_header.startswith("AWS4-HMAC-SHA256 Credential=") assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" assert prepared.headers["x-amz-content-sha256"] == EMPTY_BODY_SHA256 + # Verify the signature format: Credential, SignedHeaders, Signature + assert "Credential=" in auth_header + assert "SignedHeaders=" in auth_header + assert "Signature=" in auth_header + # x-amz-content-sha256 should be in signed headers + assert "x-amz-content-sha256" in auth_header def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: @@ -543,9 +551,135 @@ def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: assert isinstance(adapter, HTTPAdapter) adapter.add_headers(prepared) - assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256") + auth_header = prepared.headers["Authorization"] + assert auth_header.startswith("AWS4-HMAC-SHA256 Credential=") + assert "SignedHeaders=" in auth_header + # Conflicting Authorization header is relocated assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" - assert prepared.headers.get("x-amz-content-sha256") != EMPTY_BODY_SHA256 + # Non-empty body should have base64-encoded SHA256 + content_sha256 = prepared.headers["x-amz-content-sha256"] + assert prepared.body is not None + body_bytes = prepared.body.encode("utf-8") if isinstance(prepared.body, str) else prepared.body + expected_sha256 = base64.b64encode(hashlib.sha256(body_bytes).digest()).decode() + assert content_sha256 == expected_sha256 + # x-amz-content-sha256 should be in signed headers + assert "x-amz-content-sha256" in auth_header + + +def test_sigv4_content_sha256_with_bytes_body(rest_mock: Mocker) -> None: + existing_token = "existing_token" + + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "token": existing_token, + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-west-2", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + + body_content = b'{"namespace": "test_namespace"}' + prepared = catalog._session.prepare_request( + Request( + "POST", + f"{TEST_URI}v1/namespaces", + data=body_content, + ) + ) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + adapter.add_headers(prepared) + + assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") + assert "SignedHeaders=" in prepared.headers["Authorization"] + content_sha256 = prepared.headers["x-amz-content-sha256"] + expected_sha256 = base64.b64encode(hashlib.sha256(body_content).digest()).decode() + assert content_sha256 == expected_sha256 + + +def test_sigv4_conflicting_sigv4_headers(rest_mock: Mocker) -> None: + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-west-2", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + + prepared = catalog._session.prepare_request(Request("GET", f"{TEST_URI}v1/config")) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + + # Inject conflicting SigV4 headers before signing + prepared.headers["x-amz-content-sha256"] = "fake" + prepared.headers["X-Amz-Date"] = "fake" + + adapter.add_headers(prepared) + + # Matching Java SDK: conflicting headers are relocated with "Original-" prefix + assert prepared.headers.get("Original-x-amz-content-sha256") == "fake" + assert prepared.headers.get("Original-X-Amz-Date") == "fake" + # SigV4 headers are set correctly after signing + assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") + assert prepared.headers["x-amz-content-sha256"] == EMPTY_BODY_SHA256 + assert "X-Amz-Date" in prepared.headers + + +def test_sigv4_canonical_request_uses_hex_payload(rest_mock: Mocker) -> None: + """Verify that the canonical request uses hex-encoded payload hash, not the base64 header value.""" + from unittest.mock import patch + + from botocore.auth import SigV4Auth + + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "token": "token", + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-west-2", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + + body_content = b'{"namespace": "test"}' + prepared = catalog._session.prepare_request( + Request( + "POST", + f"{TEST_URI}v1/namespaces", + data=body_content, + ) + ) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + + # Capture the canonical request string during signing + captured_canonical = [] + original_add_auth = SigV4Auth.add_auth + + def capturing_add_auth(self: Any, request: Any) -> None: + captured_canonical.append(self.canonical_request(request)) + original_add_auth(self, request) + + with patch.object(SigV4Auth, "add_auth", capturing_add_auth): + adapter.add_headers(prepared) + + assert len(captured_canonical) == 1 + canonical_lines = captured_canonical[0].split("\n") + # Last line of canonical request is the payload hash + payload_hash = canonical_lines[-1] + # Must be hex-encoded (64 hex chars), not base64 + assert len(payload_hash) == 64 + assert payload_hash == hashlib.sha256(body_content).hexdigest() + # Meanwhile the header is base64-encoded + assert prepared.headers["x-amz-content-sha256"] == base64.b64encode(hashlib.sha256(body_content).digest()).decode() def test_sigv4_adapter_default_retry_config(rest_mock: Mocker) -> None: