From 42c47cb950689deec61fc31227d5940906b163f7 Mon Sep 17 00:00:00 2001
From: hao <alveinwang@163.com>
Date: Tue, 10 Mar 2026 01:25:03 +0800
Subject: [PATCH] Fix KeyError: 'low_quality' in AlanDataset (#2)

- Add filter_low_quality parameter (defaults to True for backward compatibility)
- Check that the 'low_quality' column exists in info.csv before filtering on it
- Raise a KeyError with a clear message when the required columns
  (ROI_id, ROI_anomaly) are missing
- Print a warning and use all samples when filtering is requested but the
  'low_quality' column is not found
---
 near/datasets/refine_dataset.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/near/datasets/refine_dataset.py b/near/datasets/refine_dataset.py
index ea77c47..89be5fd 100644
--- a/near/datasets/refine_dataset.py
+++ b/near/datasets/refine_dataset.py
@@ -8,7 +8,7 @@
 
 class AlanDataset(Dataset):
 
-    def __init__(self, root='../../data/Alan', appearance_path='appearance', shape_path='shape', resolution=128, n_samples=None):
+    def __init__(self, root='../../data/Alan', appearance_path='appearance', shape_path='shape', resolution=128, n_samples=None, filter_low_quality=True):
 
         self.root = root
         self.resolution = resolution
@@ -16,7 +16,21 @@ def __init__(self, root='../../data/Alan', appearance_path='appearance', shape_p
         self.shape_dir = os.path.join(root, shape_path)
 
         df = pd.read_csv(os.path.join(self.root, 'info.csv'))
-        info = df[df['low_quality'].isnull()]
+
+        # Handle optional 'low_quality' column
+        if filter_low_quality and 'low_quality' in df.columns:
+            info = df[df['low_quality'].isnull()]
+        else:
+            if filter_low_quality and 'low_quality' not in df.columns:
+                print("Warning: 'low_quality' column not found in info.csv. Using all samples.")
+            info = df
+
+        # Check for required columns
+        required_cols = ['ROI_id', 'ROI_anomaly']
+        missing_cols = [col for col in required_cols if col not in info.columns]
+        if missing_cols:
+            raise KeyError(f"Required columns {missing_cols} not found in info.csv. Available columns: {list(df.columns)}")
+
         self.info = info[['ROI_id', 'ROI_anomaly']]
         self.info.reset_index(drop=True, inplace=True)