From 45a33f88cc644812054e0d2893cc8f8ff58a6419 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Mon, 30 Mar 2026 09:24:38 -0700 Subject: [PATCH 1/3] Fix parquet loading crash from datasets version mismatch When local parquet files contain HF datasets metadata written by a different version of the `datasets` library, `load_dataset("parquet")` can raise a TypeError during feature deserialization. Fall back to reading via PyArrow directly in that case. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Ye Yu --- .../specdec_bench/specdec_bench/datasets/speed.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/examples/specdec_bench/specdec_bench/datasets/speed.py b/examples/specdec_bench/specdec_bench/datasets/speed.py index e3429126d9..49157e9037 100644 --- a/examples/specdec_bench/specdec_bench/datasets/speed.py +++ b/examples/specdec_bench/specdec_bench/datasets/speed.py @@ -716,7 +716,18 @@ def _load_dataset(self, config_name_or_dataset_path: config_type | str) -> "Data } else: data_files = {"test": [str(config_name_or_dataset_path_path)]} - dataset = load_dataset("parquet", data_files=data_files, split="test") + try: + dataset = load_dataset("parquet", data_files=data_files, split="test") + except TypeError: + # Fallback: parquet metadata may be incompatible with the installed + # ``datasets`` version. Read via PyArrow and convert directly. + import pyarrow + import pyarrow.parquet as pq + from datasets import Dataset as HFDataset + + tables = [pq.read_table(f) for f in data_files["test"]] + table = pyarrow.concat_tables(tables) if len(tables) > 1 else tables[0] + dataset = HFDataset(table) if self.num_samples is not None: dataset = dataset.select(range(self.num_samples)) return dataset From 111d312c530b36b5e4238db576de560f82690c13 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Tue, 31 Mar 2026 09:54:37 -0700 Subject: [PATCH 2/3] Strip HF metadata from arrow table in parquet fallback The PyArrow fallback still failed because HFDataset(table) parses the huggingface metadata embedded in the arrow schema, hitting the same TypeError. Strip that metadata before constructing the Dataset. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Ye Yu --- examples/specdec_bench/specdec_bench/datasets/speed.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/specdec_bench/specdec_bench/datasets/speed.py b/examples/specdec_bench/specdec_bench/datasets/speed.py index 49157e9037..a22de37d11 100644 --- a/examples/specdec_bench/specdec_bench/datasets/speed.py +++ b/examples/specdec_bench/specdec_bench/datasets/speed.py @@ -727,6 +727,15 @@ def _load_dataset(self, config_name_or_dataset_path: config_type | str) -> "Data tables = [pq.read_table(f) for f in data_files["test"]] table = pyarrow.concat_tables(tables) if len(tables) > 1 else tables[0] + # Strip HF metadata from the schema to avoid Feature parsing errors + schema = table.schema + if schema.metadata and b"huggingface" in schema.metadata: + new_meta = { + k: v + for k, v in schema.metadata.items() + if k != b"huggingface" + } + table = table.replace_schema_metadata(new_meta or None) dataset = HFDataset(table) if self.num_samples is not None: dataset = dataset.select(range(self.num_samples)) From 94fadbd58f47e23778bfdd30c4df39c68492a441 Mon Sep 17 00:00:00 2001 From: Ye Yu Date: Tue, 31 Mar 2026 09:55:28 -0700 Subject: [PATCH 3/3] Relax datasets version pin to avoid conflict with TRT-LLM The tensorrt_llm 1.3.0rc5 container pins datasets==3.1.0. The previous pin (>=4.4.0) caused concurrent pip installs across ranks to race and corrupt the datasets package, breaking tensorrt_llm imports entirely. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Ye Yu --- examples/specdec_bench/requirements_speed.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/specdec_bench/requirements_speed.txt b/examples/specdec_bench/requirements_speed.txt index 5b0117e3a7..549a5d73e8 100644 --- a/examples/specdec_bench/requirements_speed.txt +++ b/examples/specdec_bench/requirements_speed.txt @@ -1,4 +1,4 @@ -datasets>=4.4.0,<5.0.0 +datasets>=3.1.0 rich>=14.2.0 seaborn>=0.13.2 tiktoken>=0.12.0