From 2702f0f2cb63e567d26329648889727a048939c1 Mon Sep 17 00:00:00 2001 From: Samuel Shi Date: Tue, 31 Mar 2026 20:43:37 -0400 Subject: [PATCH 1/3] forbid case sensitive columns during df validation --- datacompy/spark.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/datacompy/spark.py b/datacompy/spark.py index 9df8852f..2d1a604f 100644 --- a/datacompy/spark.py +++ b/datacompy/spark.py @@ -249,6 +249,20 @@ def _validate_dataframe( self._df1 = dataframe.toDF(*[str(c).lower() for c in dataframe.columns]) if index == "df2": self._df2 = dataframe.toDF(*[str(c).lower() for c in dataframe.columns]) + else: + # Don't allow case sensitive columns + lower_cols = [c.lower() for c in dataframe.columns] + if len(set(lower_cols)) < len(lower_cols): + dupes = { + c for c in dataframe.columns if lower_cols.count(c.lower()) > 1 + } + raise ValueError( + f"{index} has columns that differ only by case: {dupes}. " + "Spark strongly discourages use of case sensitive column names." + " Rename columns to be unique regardless of case. " + "See: https://spark.apache.org/docs/latest/api/python/tutorial/" + "pandas_on_spark/best_practices.html#do-not-use-duplicated-column-names" + ) # Check if join_columns are present in the dataframe dataframe = getattr(self, index) # refresh From ae967b8ea8680a3c492ece2b5d0bd833e46ece3b Mon Sep 17 00:00:00 2001 From: Samuel Shi Date: Tue, 31 Mar 2026 21:32:17 -0400 Subject: [PATCH 2/3] add unit test --- tests/test_spark.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_spark.py b/tests/test_spark.py index b0ae5bad..16cce4ad 100644 --- a/tests/test_spark.py +++ b/tests/test_spark.py @@ -2433,3 +2433,29 @@ def test_columns_with_mismatches_multiple_join_columns(spark_session): assert "id1" not in result assert "id2" not in result assert sorted(result) == ["value1", "value2"] + + +def test_forbid_case_sensitvive_columns(spark_session): + """Test error case for case sensitive columns in dataframes.""" + df1 = spark_session.createDataFrame( + [{"a": 1, "b": 2, "B": 1}, {"a": 3, "b": 1, "B": 0}] + ) + df2 = spark_session.createDataFrame( + [{"a": 1, "b": 2, "B": 2}, {"a": 2, "b": 0, "B": 0}] + ) + + with pytest.raises( + ValueError, + match=r"df1 has columns that differ only by case: {'b', 'B'}. " + r"Spark strongly discourages use of case sensitive column names. " + r"Rename columns to be unique regardless of case. " + r"See: https://spark.apache.org/docs/latest/api/python/tutorial/" + r"pandas_on_spark/best_practices.html#do-not-use-duplicated-column-names", + ): + SparkSQLCompare( + spark_session, + df1, + df2, + join_columns=["a"], + cast_column_names_lower=False, + ) From 077383a13f9d99572b0fab5664262a29461b4a65 Mon Sep 17 00:00:00 2001 From: Samuel Shi Date: Tue, 31 Mar 2026 21:56:20 -0400 Subject: [PATCH 3/3] add regex to match diff set orders --- datacompy/spark.py | 4 ++-- tests/test_spark.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/datacompy/spark.py b/datacompy/spark.py index 2d1a604f..7769d170 100644 --- a/datacompy/spark.py +++ b/datacompy/spark.py @@ -258,8 +258,8 @@ def _validate_dataframe( } raise ValueError( f"{index} has columns that differ only by case: {dupes}. " - "Spark strongly discourages use of case sensitive column names." - " Rename columns to be unique regardless of case. " + "Spark strongly discourages use of case sensitive column names. " + "Rename columns to be unique regardless of case. " "See: https://spark.apache.org/docs/latest/api/python/tutorial/" "pandas_on_spark/best_practices.html#do-not-use-duplicated-column-names" ) diff --git a/tests/test_spark.py b/tests/test_spark.py index 16cce4ad..fa1973fc 100644 --- a/tests/test_spark.py +++ b/tests/test_spark.py @@ -2446,11 +2446,11 @@ def test_forbid_case_sensitvive_columns(spark_session): with pytest.raises( ValueError, - match=r"df1 has columns that differ only by case: {'b', 'B'}. " - r"Spark strongly discourages use of case sensitive column names. " - r"Rename columns to be unique regardless of case. " - r"See: https://spark.apache.org/docs/latest/api/python/tutorial/" - r"pandas_on_spark/best_practices.html#do-not-use-duplicated-column-names", + match=r"df1 has columns that differ only by case: \{(?:'b', 'B'|'B', 'b')\}. " + "Spark strongly discourages use of case sensitive column names. " + "Rename columns to be unique regardless of case. " + "See: https://spark.apache.org/docs/latest/api/python/tutorial/" + "pandas_on_spark/best_practices.html#do-not-use-duplicated-column-names", ): SparkSQLCompare( spark_session,