From 71a6526459ad07bb1ba7bd20db0229038cca95df Mon Sep 17 00:00:00 2001 From: Soham Kukreti Date: Fri, 6 Mar 2026 13:10:35 +0530 Subject: [PATCH] fix(docker): narrow from_serializable_dict to ignore plain data dicts with "type" key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The typed-object entry condition (`"type" in data`) was too broad: it also matched plain business dicts that happen to carry a "type" key, such as JsonCssExtractionStrategy field specs ({"type": "text"}) and LLMExtractionStrategy JSON Schema fragments ({"type": "string"}). These were never config objects, but the deserializer tried to treat them as such, hit the ALLOWED_DESERIALIZE_TYPES allowlist, and raised a ValueError — causing /crawl to return HTTP 500 for perfectly valid extraction-strategy payloads. Fix: narrow the entry condition to require "params" (or "type":"dict" + "value"), matching only the shapes that to_serializable_dict() actually produces. Dicts with "type" but no "params"/"value" fall through to the raw-dict path and are passed as plain data. The RCE protection from commit 0104db6 is fully preserved: any real class-instantiation attack still requires "type" + "params", still enters the typed path, and is still blocked by the allowlist. Fixes #1797 --- crawl4ai/async_configs.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index e7946f0a6..da3df8769 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -242,8 +242,18 @@ def from_serializable_dict(data: Any) -> Any: if isinstance(data, (str, int, float, bool)): return data - # Handle typed data - if isinstance(data, dict) and "type" in data: + # Handle typed data. + # Only enter the typed-object path for dicts that match the shapes produced + # by to_serializable_dict(): {"type": "<ClassName>", "params": {...}} or + # {"type": "dict", "value": {...}}. 
Plain business dicts that happen to + # carry a "type" key (e.g. JSON-Schema fragments, JsonCss field specs like + # {"type": "text", "name": "..."}) have neither "params" nor "value" and + # must fall through to the raw-dict path below so they are passed as data. + if ( + isinstance(data, dict) + and "type" in data + and ("params" in data or (data["type"] == "dict" and "value" in data)) + ): # Handle plain dictionaries if data["type"] == "dict" and "value" in data: return {k: from_serializable_dict(v) for k, v in data["value"].items()}