From 2702f0f2cb63e567d26329648889727a048939c1 Mon Sep 17 00:00:00 2001
From: Samuel Shi <samuel.shi2004@gmail.com>
Date: Tue, 31 Mar 2026 20:43:37 -0400
Subject: [PATCH 1/3] forbid columns that differ only by case during df validation

---
 datacompy/spark.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/datacompy/spark.py b/datacompy/spark.py
index 9df8852f..2d1a604f 100644
--- a/datacompy/spark.py
+++ b/datacompy/spark.py
@@ -249,6 +249,20 @@ def _validate_dataframe(
                 self._df1 = dataframe.toDF(*[str(c).lower() for c in dataframe.columns])
             if index == "df2":
                 self._df2 = dataframe.toDF(*[str(c).lower() for c in dataframe.columns])
+        else:
+            # Reject columns whose names differ only by case (e.g. "b" vs "B")
+            lower_cols = [c.lower() for c in dataframe.columns]
+            if len(set(lower_cols)) < len(lower_cols):
+                dupes = {
+                    c for c in dataframe.columns if lower_cols.count(c.lower()) > 1
+                }
+                raise ValueError(
+                    f"{index} has columns that differ only by case: {dupes}. "
+                    "Spark strongly discourages use of case sensitive column names."
+                    " Rename columns to be unique regardless of case. "
+                    "See: https://spark.apache.org/docs/latest/api/python/tutorial/"
+                    "pandas_on_spark/best_practices.html#do-not-use-duplicated-column-names"
+                )
 
         # Check if join_columns are present in the dataframe
         dataframe = getattr(self, index)  # refresh

From ae967b8ea8680a3c492ece2b5d0bd833e46ece3b Mon Sep 17 00:00:00 2001
From: Samuel Shi <samuel.shi2004@gmail.com>
Date: Tue, 31 Mar 2026 21:32:17 -0400
Subject: [PATCH 2/3] add unit test

---
 tests/test_spark.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/test_spark.py b/tests/test_spark.py
index b0ae5bad..16cce4ad 100644
--- a/tests/test_spark.py
+++ b/tests/test_spark.py
@@ -2433,3 +2433,29 @@ def test_columns_with_mismatches_multiple_join_columns(spark_session):
     assert "id1" not in result
     assert "id2" not in result
     assert sorted(result) == ["value1", "value2"]
+
+
+def test_forbid_case_sensitvive_columns(spark_session):
+    """Test that columns differing only by case raise a ValueError."""
+    df1 = spark_session.createDataFrame(
+        [{"a": 1, "b": 2, "B": 1}, {"a": 3, "b": 1, "B": 0}]
+    )
+    df2 = spark_session.createDataFrame(
+        [{"a": 1, "b": 2, "B": 2}, {"a": 2, "b": 0, "B": 0}]
+    )
+
+    with pytest.raises(
+        ValueError,
+        match=r"df1 has columns that differ only by case: {'b', 'B'}. "
+        r"Spark strongly discourages use of case sensitive column names. "
+        r"Rename columns to be unique regardless of case. "
+        r"See: https://spark.apache.org/docs/latest/api/python/tutorial/"
+        r"pandas_on_spark/best_practices.html#do-not-use-duplicated-column-names",
+    ):
+        SparkSQLCompare(
+            spark_session,
+            df1,
+            df2,
+            join_columns=["a"],
+            cast_column_names_lower=False,
+        )

From 077383a13f9d99572b0fab5664262a29461b4a65 Mon Sep 17 00:00:00 2001
From: Samuel Shi <samuel.shi2004@gmail.com>
Date: Tue, 31 Mar 2026 21:56:20 -0400
Subject: [PATCH 3/3] make test regex match either ordering of the duplicate-column set

---
 datacompy/spark.py  |  4 ++--
 tests/test_spark.py | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/datacompy/spark.py b/datacompy/spark.py
index 2d1a604f..7769d170 100644
--- a/datacompy/spark.py
+++ b/datacompy/spark.py
@@ -258,8 +258,8 @@ def _validate_dataframe(
                 }
                 raise ValueError(
                     f"{index} has columns that differ only by case: {dupes}. "
-                    "Spark strongly discourages use of case sensitive column names."
-                    " Rename columns to be unique regardless of case. "
+                    "Spark strongly discourages use of case sensitive column names. "
+                    "Rename columns to be unique regardless of case. "
                     "See: https://spark.apache.org/docs/latest/api/python/tutorial/"
                     "pandas_on_spark/best_practices.html#do-not-use-duplicated-column-names"
                 )
diff --git a/tests/test_spark.py b/tests/test_spark.py
index 16cce4ad..fa1973fc 100644
--- a/tests/test_spark.py
+++ b/tests/test_spark.py
@@ -2446,11 +2446,11 @@ def test_forbid_case_sensitvive_columns(spark_session):
 
     with pytest.raises(
         ValueError,
-        match=r"df1 has columns that differ only by case: {'b', 'B'}. "
-        r"Spark strongly discourages use of case sensitive column names. "
-        r"Rename columns to be unique regardless of case. "
-        r"See: https://spark.apache.org/docs/latest/api/python/tutorial/"
-        r"pandas_on_spark/best_practices.html#do-not-use-duplicated-column-names",
+        match=r"df1 has columns that differ only by case: \{(?:'b', 'B'|'B', 'b')\}. "
+        "Spark strongly discourages use of case sensitive column names. "
+        "Rename columns to be unique regardless of case. "
+        "See: https://spark.apache.org/docs/latest/api/python/tutorial/"
+        "pandas_on_spark/best_practices.html#do-not-use-duplicated-column-names",
     ):
         SparkSQLCompare(
             spark_session,