Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Virtual environment — never commit this
venv/
env/
.env/
.venv/

# Python cache
__pycache__/
*.py[cod]
*.pyo
*.pyd
.Python

# Pytest cache
.pytest_cache/
.cache/

# Distribution / packaging
dist/
build/
*.egg-info/

# Jupyter notebooks checkpoints
.ipynb_checkpoints/

# OS files
.DS_Store
Thumbs.db

# VSCode settings (optional — remove if you want to share these)
.vscode/

# Environment variable files
.env
*.env
Empty file added astats/__init__.py
Empty file.
Empty file added astats/profiler/__init__.py
Empty file.
183 changes: 183 additions & 0 deletions astats/profiler/data_profiler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""
AStats DataProfiler:
Auto-discovers dataset structure and statistical properties.
Outputs a structured JSON profile with agent_hints for the AStats agent.
"""
# imports
import json
import sys
from typing import Any

import numpy as np
import pandas as pd
from scipy import stats

# functions
def _normality_test(series: pd.Series) -> dict:
"""Shapiro-Wilk for n<=5000, D'Agostino-Pearson for larger samples."""
n = len(series)
if n < 3:
return {"test": "none", "reason": "insufficient_data"}
if n <= 5000:
stat, p = stats.shapiro(series)
test_name = "shapiro-wilk"
else:
stat, p = stats.normaltest(series)
test_name = "dagostino-pearson"
return {
"test": test_name,
"statistic": round(float(stat), 4),
"p_value": round(float(p), 4),
"is_normal": bool(p > 0.05),
"note": "p>0.05 suggests normality (fail to reject H0)",
}

def _variance_homogeneity(df: pd.DataFrame, numeric_col: str, group_col: str) -> dict:
"""Levene's test for equal variances across groups."""
groups = [
grp[numeric_col].dropna().values
for _, grp in df.groupby(group_col)
if grp[numeric_col].dropna().shape[0] >= 2
]
if len(groups) < 2:
return {"test": "none", "reason": "need_at_least_2_groups"}
stat, p = stats.levene(*groups)
return {
"test": "levene",
"statistic": round(float(stat), 4),
"p_value": round(float(p), 4),
"equal_variance": bool(p > 0.05),
"note": "p>0.05 suggests equal variances",
}

def _outlier_summary(series: pd.Series) -> dict:
"""IQR-based outlier detection."""
q1, q3 = series.quantile(0.25), series.quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
outliers = series[(series < lower) | (series > upper)]
return {
"method": "IQR",
"lower_fence": round(float(lower), 4),
"upper_fence": round(float(upper), 4),
"outlier_count": int(len(outliers)),
"outlier_pct": round(float(len(outliers) / len(series) * 100), 2),
}

def profile_dataset(df: pd.DataFrame, group_col: str | None = None) -> dict[str, Any]:
    """
    Build a full statistical profile of a DataFrame.

    Every column gets dtype, missingness and cardinality info. Numeric
    columns additionally get descriptive statistics, a normality test, an
    IQR outlier summary and (when ``group_col`` is given) Levene's
    variance-homogeneity test. The profile ends with ``agent_hints`` that
    route columns towards parametric / non-parametric test families.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataset.
    group_col : str, optional
        If provided, variance homogeneity is tested between groups
        for all numeric columns. The grouping column itself is skipped
        (testing it against itself is meaningless).

    Returns
    -------
    dict
        Structured JSON-serialisable profile with agent_hints.
    """
    profile: dict[str, Any] = {
        "shape": {"rows": int(df.shape[0]), "columns": int(df.shape[1])},
        "columns": {},
    }

    for col in df.columns:
        series = df[col].dropna()
        col_info: dict[str, Any] = {
            "dtype": str(df[col].dtype),
            "missing": int(df[col].isna().sum()),
            "missing_pct": round(float(df[col].isna().mean()) * 100, 2),
            "unique": int(df[col].nunique()),
        }

        if pd.api.types.is_numeric_dtype(df[col]):
            col_info["role"] = "continuous"
            if series.empty:
                # All values missing: descriptive stats and tests are
                # undefined (the unguarded code crashed with a
                # ZeroDivisionError in the outlier summary here).
                col_info["descriptive"] = {"note": "all_values_missing"}
            else:
                col_info["descriptive"] = {
                    "mean": round(float(series.mean()), 4),
                    "median": round(float(series.median()), 4),
                    "std": round(float(series.std()), 4),
                    "min": round(float(series.min()), 4),
                    "max": round(float(series.max()), 4),
                    "skewness": round(float(stats.skew(series)), 4),
                    "kurtosis": round(float(stats.kurtosis(series)), 4),
                }
                col_info["normality"] = _normality_test(series)
                col_info["outliers"] = _outlier_summary(series)

                # Skip when col == group_col: a numeric grouping column
                # would otherwise be Levene-tested against itself.
                if group_col and group_col in df.columns and col != group_col:
                    col_info["variance_homogeneity"] = _variance_homogeneity(
                        df, col, group_col
                    )

        elif isinstance(df[col].dtype, pd.CategoricalDtype) or df[col].dtype == object or pd.api.types.is_string_dtype(df[col]):
            col_info["role"] = "categorical"
            # Counts of the five most frequent values.
            col_info["top_values"] = df[col].value_counts().head(5).to_dict()

        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            col_info["role"] = "datetime"
            col_info["range"] = {"min": str(series.min()), "max": str(series.max())}

        else:
            col_info["role"] = "unknown"

        profile["columns"][col] = col_info

    # Agent hints: classify columns by test outcome so the agent can pick
    # an appropriate statistical test family without re-deriving this.
    normal_cols = [
        c for c, v in profile["columns"].items()
        if v.get("normality", {}).get("is_normal", False)
    ]
    non_normal_cols = [
        c for c, v in profile["columns"].items()
        if "normality" in v and not v["normality"]["is_normal"]
    ]
    high_outlier_cols = [
        c for c, v in profile["columns"].items()
        if v.get("outliers", {}).get("outlier_pct", 0) > 5
    ]
    equal_variance_cols = [
        c for c, v in profile["columns"].items()
        if v.get("variance_homogeneity", {}).get("equal_variance", None) is True
    ]
    unequal_variance_cols = [
        c for c, v in profile["columns"].items()
        if v.get("variance_homogeneity", {}).get("equal_variance", None) is False
    ]

    profile["agent_hints"] = {
        "normal_columns": normal_cols,
        "non_normal_columns": non_normal_cols,
        "high_outlier_columns": high_outlier_cols,
        "equal_variance_columns": equal_variance_cols,
        "unequal_variance_columns": unequal_variance_cols,
        "test_routing": {
            "parametric_candidates": normal_cols,
            "nonparametric_candidates": non_normal_cols,
            # Unequal variances → Welch-style tests instead of pooled-variance.
            "welch_candidates": unequal_variance_cols,
        },
        "summary": (
            f"{len(normal_cols)} normal column(s), "
            f"{len(non_normal_cols)} non-normal column(s), "
            f"{len(high_outlier_cols)} column(s) with >5% outliers."
        ),
    }

    return profile

def profile_csv(filepath: str, group_col: str | None = None) -> dict[str, Any]:
    """Load *filepath* as a CSV and return its full statistical profile."""
    frame = pd.read_csv(filepath)
    return profile_dataset(frame, group_col=group_col)

if __name__ == "__main__":
    # CLI entry point: profile a CSV (optionally grouped) and emit JSON.
    args = sys.argv[1:]
    if not args:
        print("Usage: python data_profiler.py <csv_path> [group_col]")
        sys.exit(1)
    group = args[1] if len(args) > 1 else None
    print(json.dumps(profile_csv(args[0], group_col=group), indent=2))
34 changes: 34 additions & 0 deletions examples/data/generate_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
Generates simulated datasets with KNOWN statistical properties.
Used as ground-truth inputs for evaluating the AStats agent.
"""
import numpy as np
import pandas as pd

np.random.seed(42)
N = 300

df = pd.DataFrame({
# Normal: should pass normality test → parametric tests valid
"score_normal": np.random.normal(loc=50, scale=10, size=N),
# Skewed: should fail normality → non-parametric tests needed
"score_skewed": np.random.exponential(scale=5, size=N),
# Group column for between-group comparisons
"group": np.random.choice(["control", "treatment"], size=N),
# Categorical variable
"category": np.random.choice(["A", "B", "C"], size=N),
# Age with some missing values
"age": np.random.randint(18, 70, size=N).astype(float),
})

# Inject missing values
df.loc[np.random.choice(df.index, 15), "age"] = np.nan

df.to_csv("sample_dataset.csv", index=False)

print("sample_dataset.csv created.")
print("\nKnown ground-truth properties:")
print(" score_normal → NORMAL → use t-test / ANOVA")
print(" score_skewed → NON-NORMAL → use Mann-Whitney / Kruskal-Wallis")
print(" group → categorical (2 levels) → suitable grouping variable")
print(" age → continuous with 15 missing values")
Loading