From 8c5af07dd4958783ae0d348caacbcee14fc96511 Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Wed, 4 Mar 2026 18:33:42 +0800 Subject: [PATCH 1/5] Fix SigV4 auth to use base64-encoded content SHA256 and custom canonical request --- pyiceberg/catalog/rest/__init__.py | 32 ++++++++++--- tests/catalog/test_rest.py | 72 ++++++++++++++++++++++++++++-- 2 files changed, 94 insertions(+), 10 deletions(-) diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index b617cfa7da..3fc501182e 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -733,6 +733,8 @@ def _split_identifier_for_json(self, identifier: str | Identifier) -> dict[str, return {"namespace": identifier_tuple[:-1], "name": identifier_tuple[-1]} def _init_sigv4(self, session: Session) -> None: + import base64 + import hashlib from urllib import parse import boto3 @@ -741,6 +743,12 @@ def _init_sigv4(self, session: Session) -> None: from requests import PreparedRequest from requests.adapters import HTTPAdapter + class _IcebergSigV4Auth(SigV4Auth): + def canonical_request(self, request: Any) -> str: + cr = super().canonical_request(request) + # Replace the last line (body_checksum) with hex-encoded payload hash. + return cr.rsplit("\n", 1)[0] + "\n" + self.payload(request) + class SigV4Adapter(HTTPAdapter): def __init__(self, **properties: str): self._properties = properties @@ -767,17 +775,27 @@ def add_headers(self, request: PreparedRequest, **kwargs: Any) -> None: # pylin # remove the connection header as it will be updated after signing if "connection" in request.headers: del request.headers["connection"] - # For empty bodies, explicitly set the content hash header to the SHA256 of an empty string - if not request.body: - request.headers["x-amz-content-sha256"] = EMPTY_BODY_SHA256 + + # Compute the x-amz-content-sha256 header to match Iceberg Java SDK: + # - empty body → hex (EMPTY_BODY_SHA256) + # - non-empty body → base64 + if request.body: + body_bytes = request.body.encode("utf-8") if isinstance(request.body, str) else request.body + content_sha256_header = base64.b64encode(hashlib.sha256(body_bytes).digest()).decode() + else: + content_sha256_header = EMPTY_BODY_SHA256 + + signing_headers = dict(request.headers) + signing_headers["x-amz-content-sha256"] = content_sha256_header aws_request = AWSRequest( - method=request.method, url=url, params=params, data=request.body, headers=dict(request.headers) + method=request.method, url=url, params=params, data=request.body, headers=signing_headers ) - SigV4Auth(credentials, service, region).add_auth(aws_request) - original_header = request.headers - signed_headers = aws_request.headers + _IcebergSigV4Auth(credentials, service, region).add_auth(aws_request) + + original_header = dict(request.headers) + signed_headers = dict(aws_request.headers) relocated_headers = {} # relocate headers if there is a conflict with signed headers diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 99d1ef947b..de58cdbaa7 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -512,9 +512,10 @@ def test_sigv4_sign_request_without_body(rest_mock: Mocker) -> None: assert isinstance(adapter, HTTPAdapter) adapter.add_headers(prepared) - assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256") + assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" assert prepared.headers["x-amz-content-sha256"] == EMPTY_BODY_SHA256 + assert "SignedHeaders=" in prepared.headers["Authorization"] def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: @@ -543,9 +544,74 @@ def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: assert isinstance(adapter, HTTPAdapter) adapter.add_headers(prepared) - assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256") + assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") + assert "SignedHeaders=" in prepared.headers["Authorization"] + # Conflicting Authorization header is relocated assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" - assert prepared.headers.get("x-amz-content-sha256") != EMPTY_BODY_SHA256 + assert prepared.headers["x-amz-content-sha256"] == "nhKdVGKGU3IMGjYlod9xKUVc7/H5K6zTWj60yJOM80k=" + + +def test_sigv4_content_sha256_with_bytes_body(rest_mock: Mocker) -> None: + existing_token = "existing_token" + + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "token": existing_token, + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-west-2", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + + body_content = b'{"namespace": "test_namespace"}' + prepared = catalog._session.prepare_request( + Request( + "POST", + f"{TEST_URI}v1/namespaces", + data=body_content, + ) + ) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + adapter.add_headers(prepared) + + assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") + assert "SignedHeaders=" in prepared.headers["Authorization"] + assert prepared.headers["x-amz-content-sha256"] == "sD20bEQP+WnwKPT7jxn7PIACGciAeWjQPlzFCK5Fifo=" + + +def test_sigv4_conflicting_sigv4_headers(rest_mock: Mocker) -> None: + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-west-2", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + + prepared = catalog._session.prepare_request(Request("GET", f"{TEST_URI}v1/config")) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + + # Inject conflicting SigV4 headers before signing + prepared.headers["x-amz-content-sha256"] = "fake" + prepared.headers["X-Amz-Date"] = "fake" + + adapter.add_headers(prepared) + + # Matching Java SDK: conflicting headers are relocated with "Original-" prefix + assert prepared.headers.get("Original-x-amz-content-sha256") == "fake" + assert prepared.headers.get("Original-X-Amz-Date") == "fake" + # SigV4 headers are set correctly after signing + assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") + assert prepared.headers["x-amz-content-sha256"] == EMPTY_BODY_SHA256 + assert "X-Amz-Date" in prepared.headers def test_sigv4_adapter_default_retry_config(rest_mock: Mocker) -> None: From 1b680854cbb8241e21b6b9c557240e2c8a1b6188 Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Sat, 21 Mar 2026 16:43:35 +0800 Subject: [PATCH 2/5] Refactor _IcebergSigV4Auth to reuse canonical_request logic instead of rsplit --- pyiceberg/catalog/rest/__init__.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index 3fc501182e..ee2885fe09 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -745,9 +745,20 @@ def _init_sigv4(self, session: Session) -> None: class _IcebergSigV4Auth(SigV4Auth): def canonical_request(self, request: Any) -> str: - cr = super().canonical_request(request) - # Replace the last line (body_checksum) with hex-encoded payload hash. - return cr.rsplit("\n", 1)[0] + "\n" + self.payload(request) + # Reuses the logic from botocore's SigV4Auth.canonical_request + # (https://github.com/boto/botocore/blob/develop/botocore/auth.py) + # but always uses self.payload(request) for the body checksum. + cr = [request.method.upper()] + path = self._normalize_url_path(parse.urlsplit(request.url).path) + cr.append(path) + cr.append(self.canonical_query_string(request)) + headers_to_sign = self.headers_to_sign(request) + cr.append(self.canonical_headers(headers_to_sign) + "\n") + cr.append(self.signed_headers(headers_to_sign)) + # Always use hex-encoded payload hash per SigV4 spec, + # regardless of the x-amz-content-sha256 header value (which may be base64). + cr.append(self.payload(request)) + return "\n".join(cr) class SigV4Adapter(HTTPAdapter): def __init__(self, **properties: str): From b48917eb80b95096e2a9844d29f3367d106e3a94 Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Sat, 21 Mar 2026 17:19:09 +0800 Subject: [PATCH 3/5] update test --- tests/catalog/test_rest.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index de58cdbaa7..e1aa081923 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -18,6 +18,7 @@ from __future__ import annotations import base64 +import hashlib import os from collections.abc import Callable from typing import Any, cast @@ -512,10 +513,16 @@ def test_sigv4_sign_request_without_body(rest_mock: Mocker) -> None: assert isinstance(adapter, HTTPAdapter) adapter.add_headers(prepared) - assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") + auth_header = prepared.headers["Authorization"] + assert auth_header.startswith("AWS4-HMAC-SHA256 Credential=") assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" assert prepared.headers["x-amz-content-sha256"] == EMPTY_BODY_SHA256 - assert "SignedHeaders=" in prepared.headers["Authorization"] + # Verify the signature format: Credential, SignedHeaders, Signature + assert "Credential=" in auth_header + assert "SignedHeaders=" in auth_header + assert "Signature=" in auth_header + # x-amz-content-sha256 should be in signed headers + assert "x-amz-content-sha256" in auth_header def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: @@ -544,11 +551,19 @@ def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: assert isinstance(adapter, HTTPAdapter) adapter.add_headers(prepared) - assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") - assert "SignedHeaders=" in prepared.headers["Authorization"] + auth_header = prepared.headers["Authorization"] + assert auth_header.startswith("AWS4-HMAC-SHA256 Credential=") + assert "SignedHeaders=" in auth_header # Conflicting Authorization header is relocated assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" - assert prepared.headers["x-amz-content-sha256"] == "nhKdVGKGU3IMGjYlod9xKUVc7/H5K6zTWj60yJOM80k=" + # Non-empty body should have base64-encoded SHA256 + content_sha256 = prepared.headers["x-amz-content-sha256"] + assert content_sha256 == "nhKdVGKGU3IMGjYlod9xKUVc7/H5K6zTWj60yJOM80k=" + # Verify it's valid base64 and matches the body + decoded = base64.b64decode(content_sha256) + assert len(decoded) == 32 # SHA256 produces 32 bytes + # x-amz-content-sha256 should be in signed headers + assert "x-amz-content-sha256" in auth_header def test_sigv4_content_sha256_with_bytes_body(rest_mock: Mocker) -> None: @@ -580,7 +595,12 @@ def test_sigv4_content_sha256_with_bytes_body(rest_mock: Mocker) -> None: assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") assert "SignedHeaders=" in prepared.headers["Authorization"] - assert prepared.headers["x-amz-content-sha256"] == "sD20bEQP+WnwKPT7jxn7PIACGciAeWjQPlzFCK5Fifo=" + content_sha256 = prepared.headers["x-amz-content-sha256"] + assert content_sha256 == "sD20bEQP+WnwKPT7jxn7PIACGciAeWjQPlzFCK5Fifo=" + # Verify it's valid base64 and matches the body + decoded = base64.b64decode(content_sha256) + assert len(decoded) == 32 # SHA256 produces 32 bytes + assert decoded == hashlib.sha256(body_content).digest() def test_sigv4_conflicting_sigv4_headers(rest_mock: Mocker) -> None: From ab0c6467c95e83f8a98bc99eb57749422af871e5 Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Wed, 8 Apr 2026 10:54:23 +0800 Subject: [PATCH 4/5] Improve SigV4 tests and add botocore version reference --- pyiceberg/catalog/rest/__init__.py | 2 + tests/catalog/test_rest.py | 65 +++++++++++++++++++++++++----- 2 files changed, 58 insertions(+), 9 deletions(-) diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index ee2885fe09..74d5539046 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -748,6 +748,8 @@ def canonical_request(self, request: Any) -> str: # Reuses the logic from botocore's SigV4Auth.canonical_request # (https://github.com/boto/botocore/blob/develop/botocore/auth.py) # but always uses self.payload(request) for the body checksum. + # Validated against botocore <= 1.42.x + # (https://github.com/boto/botocore/blob/1.42.85/botocore/auth.py#L622-L637) cr = [request.method.upper()] path = self._normalize_url_path(parse.urlsplit(request.url).path) cr.append(path) diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index e1aa081923..5e11673c37 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -558,10 +558,9 @@ def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" # Non-empty body should have base64-encoded SHA256 content_sha256 = prepared.headers["x-amz-content-sha256"] - assert content_sha256 == "nhKdVGKGU3IMGjYlod9xKUVc7/H5K6zTWj60yJOM80k=" - # Verify it's valid base64 and matches the body - decoded = base64.b64decode(content_sha256) - assert len(decoded) == 32 # SHA256 produces 32 bytes + body_bytes = prepared.body.encode("utf-8") if isinstance(prepared.body, str) else prepared.body + expected_sha256 = base64.b64encode(hashlib.sha256(body_bytes).digest()).decode() + assert content_sha256 == expected_sha256 # x-amz-content-sha256 should be in signed headers assert "x-amz-content-sha256" in auth_header @@ -596,11 +595,8 @@ def test_sigv4_content_sha256_with_bytes_body(rest_mock: Mocker) -> None: assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") assert "SignedHeaders=" in prepared.headers["Authorization"] content_sha256 = prepared.headers["x-amz-content-sha256"] - assert content_sha256 == "sD20bEQP+WnwKPT7jxn7PIACGciAeWjQPlzFCK5Fifo=" - # Verify it's valid base64 and matches the body - decoded = base64.b64decode(content_sha256) - assert len(decoded) == 32 # SHA256 produces 32 bytes - assert decoded == hashlib.sha256(body_content).digest() + expected_sha256 = base64.b64encode(hashlib.sha256(body_content).digest()).decode() + assert content_sha256 == expected_sha256 def test_sigv4_conflicting_sigv4_headers(rest_mock: Mocker) -> None: @@ -634,6 +630,57 @@ def test_sigv4_conflicting_sigv4_headers(rest_mock: Mocker) -> None: assert "X-Amz-Date" in prepared.headers +def test_sigv4_canonical_request_uses_hex_payload(rest_mock: Mocker) -> None: + """Verify that the canonical request uses hex-encoded payload hash, not the base64 header value.""" + from unittest.mock import patch + + from botocore.auth import SigV4Auth + + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "token": "token", + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-west-2", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + + body_content = b'{"namespace": "test"}' + prepared = catalog._session.prepare_request( + Request( + "POST", + f"{TEST_URI}v1/namespaces", + data=body_content, + ) + ) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + + # Capture the canonical request string during signing + captured_canonical = [] + original_add_auth = SigV4Auth.add_auth + + def capturing_add_auth(self: Any, request: Any) -> None: + captured_canonical.append(self.canonical_request(request)) + original_add_auth(self, request) + + with patch.object(SigV4Auth, "add_auth", capturing_add_auth): + adapter.add_headers(prepared) + + assert len(captured_canonical) == 1 + canonical_lines = captured_canonical[0].split("\n") + # Last line of canonical request is the payload hash + payload_hash = canonical_lines[-1] + # Must be hex-encoded (64 hex chars), not base64 + assert len(payload_hash) == 64 + assert payload_hash == hashlib.sha256(body_content).hexdigest() + # Meanwhile the header is base64-encoded + assert prepared.headers["x-amz-content-sha256"] == base64.b64encode(hashlib.sha256(body_content).digest()).decode() + + def test_sigv4_adapter_default_retry_config(rest_mock: Mocker) -> None: catalog = RestCatalog( "rest", From d6af40add74e21d86c56c0503c59479f91cdc94d Mon Sep 17 00:00:00 2001 From: Li Jiajia Date: Wed, 8 Apr 2026 15:35:35 +0800 Subject: [PATCH 5/5] Fix mypy error: assert prepared.body is not None before hashing --- tests/catalog/test_rest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 5e11673c37..de9a6ca7fb 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -558,6 +558,7 @@ def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" # Non-empty body should have base64-encoded SHA256 content_sha256 = prepared.headers["x-amz-content-sha256"] + assert prepared.body is not None body_bytes = prepared.body.encode("utf-8") if isinstance(prepared.body, str) else prepared.body expected_sha256 = base64.b64encode(hashlib.sha256(body_bytes).digest()).decode() assert content_sha256 == expected_sha256