Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Virtual environment — never commit this
venv/
env/
.env/
.venv/

# Python cache
__pycache__/
*.py[cod]
*.pyo
*.pyd
.Python

# Pytest cache
.pytest_cache/
.cache/

# Distribution / packaging
dist/
build/
*.egg-info/

# Jupyter notebooks checkpoints
.ipynb_checkpoints/

# OS files
.DS_Store
Thumbs.db

# VSCode settings (optional — remove if you want to share these)
.vscode/

# Environment variable files
.env
*.env
Empty file added astats/__init__.py
Empty file.
Empty file added astats/profiler/__init__.py
Empty file.
183 changes: 183 additions & 0 deletions astats/profiler/data_profiler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""
AStats DataProfiler:
Auto-discovers dataset structure and statistical properties.
Outputs a structured JSON profile with agent_hints for the AStats agent.
"""
# imports
import json
import sys
from typing import Any

import numpy as np
import pandas as pd
from scipy import stats

# functions
def _normality_test(series: pd.Series) -> dict:
"""Shapiro-Wilk for n<=5000, D'Agostino-Pearson for larger samples."""
n = len(series)
if n < 3:
return {"test": "none", "reason": "insufficient_data"}
if n <= 5000:
stat, p = stats.shapiro(series)
test_name = "shapiro-wilk"
else:
stat, p = stats.normaltest(series)
test_name = "dagostino-pearson"
return {
"test": test_name,
"statistic": round(float(stat), 4),
"p_value": round(float(p), 4),
"is_normal": bool(p > 0.05),
"note": "p>0.05 suggests normality (fail to reject H0)",
}

def _variance_homogeneity(df: pd.DataFrame, numeric_col: str, group_col: str) -> dict:
"""Levene's test for equal variances across groups."""
groups = [
grp[numeric_col].dropna().values
for _, grp in df.groupby(group_col)
if grp[numeric_col].dropna().shape[0] >= 2
]
if len(groups) < 2:
return {"test": "none", "reason": "need_at_least_2_groups"}
stat, p = stats.levene(*groups)
return {
"test": "levene",
"statistic": round(float(stat), 4),
"p_value": round(float(p), 4),
"equal_variance": bool(p > 0.05),
"note": "p>0.05 suggests equal variances",
}

def _outlier_summary(series: pd.Series) -> dict:
"""IQR-based outlier detection."""
q1, q3 = series.quantile(0.25), series.quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
outliers = series[(series < lower) | (series > upper)]
return {
"method": "IQR",
"lower_fence": round(float(lower), 4),
"upper_fence": round(float(upper), 4),
"outlier_count": int(len(outliers)),
"outlier_pct": round(float(len(outliers) / len(series) * 100), 2),
}

def profile_dataset(df: pd.DataFrame, group_col: str | None = None) -> dict[str, Any]:
    """
    Build a full statistical profile of a DataFrame.

    Every column gets dtype, missingness and cardinality info. Numeric
    columns additionally get descriptive statistics, a normality test, an
    IQR outlier summary and (when ``group_col`` is given) Levene's
    variance-homogeneity test. The profile ends with ``agent_hints`` that
    route columns towards parametric / non-parametric test families.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataset.
    group_col : str, optional
        If provided, variance homogeneity is tested between groups
        for all numeric columns. The grouping column itself is skipped
        (testing it against itself is meaningless).

    Returns
    -------
    dict
        Structured JSON-serialisable profile with agent_hints.
    """
    profile: dict[str, Any] = {
        "shape": {"rows": int(df.shape[0]), "columns": int(df.shape[1])},
        "columns": {},
    }

    for col in df.columns:
        series = df[col].dropna()
        col_info: dict[str, Any] = {
            "dtype": str(df[col].dtype),
            "missing": int(df[col].isna().sum()),
            "missing_pct": round(float(df[col].isna().mean()) * 100, 2),
            "unique": int(df[col].nunique()),
        }

        if pd.api.types.is_numeric_dtype(df[col]):
            col_info["role"] = "continuous"
            if series.empty:
                # All values missing: descriptive stats and tests are
                # undefined (the unguarded code crashed with a
                # ZeroDivisionError in the outlier summary here).
                col_info["descriptive"] = {"note": "all_values_missing"}
            else:
                col_info["descriptive"] = {
                    "mean": round(float(series.mean()), 4),
                    "median": round(float(series.median()), 4),
                    "std": round(float(series.std()), 4),
                    "min": round(float(series.min()), 4),
                    "max": round(float(series.max()), 4),
                    "skewness": round(float(stats.skew(series)), 4),
                    "kurtosis": round(float(stats.kurtosis(series)), 4),
                }
                col_info["normality"] = _normality_test(series)
                col_info["outliers"] = _outlier_summary(series)

                # Skip when col == group_col: a numeric grouping column
                # would otherwise be Levene-tested against itself.
                if group_col and group_col in df.columns and col != group_col:
                    col_info["variance_homogeneity"] = _variance_homogeneity(
                        df, col, group_col
                    )

        elif isinstance(df[col].dtype, pd.CategoricalDtype) or df[col].dtype == object or pd.api.types.is_string_dtype(df[col]):
            col_info["role"] = "categorical"
            # Counts of the five most frequent values.
            col_info["top_values"] = df[col].value_counts().head(5).to_dict()

        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            col_info["role"] = "datetime"
            col_info["range"] = {"min": str(series.min()), "max": str(series.max())}

        else:
            col_info["role"] = "unknown"

        profile["columns"][col] = col_info

    # Agent hints: classify columns by test outcome so the agent can pick
    # an appropriate statistical test family without re-deriving this.
    normal_cols = [
        c for c, v in profile["columns"].items()
        if v.get("normality", {}).get("is_normal", False)
    ]
    non_normal_cols = [
        c for c, v in profile["columns"].items()
        if "normality" in v and not v["normality"]["is_normal"]
    ]
    high_outlier_cols = [
        c for c, v in profile["columns"].items()
        if v.get("outliers", {}).get("outlier_pct", 0) > 5
    ]
    equal_variance_cols = [
        c for c, v in profile["columns"].items()
        if v.get("variance_homogeneity", {}).get("equal_variance", None) is True
    ]
    unequal_variance_cols = [
        c for c, v in profile["columns"].items()
        if v.get("variance_homogeneity", {}).get("equal_variance", None) is False
    ]

    profile["agent_hints"] = {
        "normal_columns": normal_cols,
        "non_normal_columns": non_normal_cols,
        "high_outlier_columns": high_outlier_cols,
        "equal_variance_columns": equal_variance_cols,
        "unequal_variance_columns": unequal_variance_cols,
        "test_routing": {
            "parametric_candidates": normal_cols,
            "nonparametric_candidates": non_normal_cols,
            # Unequal variances → Welch-style tests instead of pooled-variance.
            "welch_candidates": unequal_variance_cols,
        },
        "summary": (
            f"{len(normal_cols)} normal column(s), "
            f"{len(non_normal_cols)} non-normal column(s), "
            f"{len(high_outlier_cols)} column(s) with >5% outliers."
        ),
    }

    return profile

def profile_csv(filepath: str, group_col: str | None = None) -> dict[str, Any]:
    """Load *filepath* as a CSV and return its full statistical profile."""
    frame = pd.read_csv(filepath)
    return profile_dataset(frame, group_col=group_col)

if __name__ == "__main__":
    # CLI entry point: profile a CSV (optionally grouped) and emit JSON.
    args = sys.argv[1:]
    if not args:
        print("Usage: python data_profiler.py <csv_path> [group_col]")
        sys.exit(1)
    group = args[1] if len(args) > 1 else None
    print(json.dumps(profile_csv(args[0], group_col=group), indent=2))
34 changes: 34 additions & 0 deletions examples/data/generate_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
Generates simulated datasets with KNOWN statistical properties.
Used as ground-truth inputs for evaluating the AStats agent.
"""
import numpy as np
import pandas as pd

np.random.seed(42)
N = 300

df = pd.DataFrame({
# Normal: should pass normality test → parametric tests valid
"score_normal": np.random.normal(loc=50, scale=10, size=N),
# Skewed: should fail normality → non-parametric tests needed
"score_skewed": np.random.exponential(scale=5, size=N),
# Group column for between-group comparisons
"group": np.random.choice(["control", "treatment"], size=N),
# Categorical variable
"category": np.random.choice(["A", "B", "C"], size=N),
# Age with some missing values
"age": np.random.randint(18, 70, size=N).astype(float),
})

# Inject missing values
df.loc[np.random.choice(df.index, 15), "age"] = np.nan

df.to_csv("sample_dataset.csv", index=False)

print("sample_dataset.csv created.")
print("\nKnown ground-truth properties:")
print(" score_normal → NORMAL → use t-test / ANOVA")
print(" score_skewed → NON-NORMAL → use Mann-Whitney / Kruskal-Wallis")
print(" group → categorical (2 levels) → suitable grouping variable")
print(" age → continuous with 15 missing values")
Loading