In [1]:
# Dataset test: explore the sample datasets and compare models against the stored baselines
In [2]:
!pip install jupyter matplotlib seaborn pandas scikit-learn polars numpy
Requirement already satisfied: jupyter in ./.venv/lib/python3.14/site-packages (1.1.1) Requirement already satisfied: matplotlib in ./.venv/lib/python3.14/site-packages (3.10.8) Requirement already satisfied: seaborn in ./.venv/lib/python3.14/site-packages (0.13.2) Requirement already satisfied: pandas in ./.venv/lib/python3.14/site-packages (3.0.1) Requirement already satisfied: scikit-learn in ./.venv/lib/python3.14/site-packages (1.8.0) Collecting polars Downloading polars-1.38.1-py3-none-any.whl.metadata (10 kB) Requirement already satisfied: numpy in ./.venv/lib/python3.14/site-packages (2.4.2) Requirement already satisfied: notebook in ./.venv/lib/python3.14/site-packages (from jupyter) (7.5.3) Requirement already satisfied: jupyter-console in ./.venv/lib/python3.14/site-packages (from jupyter) (6.6.3) Requirement already satisfied: nbconvert in ./.venv/lib/python3.14/site-packages (from jupyter) (7.17.0) Requirement already satisfied: ipykernel in ./.venv/lib/python3.14/site-packages (from jupyter) (7.2.0) Requirement already satisfied: ipywidgets in ./.venv/lib/python3.14/site-packages (from jupyter) (8.1.8) Requirement already satisfied: jupyterlab in ./.venv/lib/python3.14/site-packages (from jupyter) (4.5.4) Requirement already satisfied: contourpy>=1.0.1 in ./.venv/lib/python3.14/site-packages (from matplotlib) (1.3.3) Requirement already satisfied: cycler>=0.10 in ./.venv/lib/python3.14/site-packages (from matplotlib) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in ./.venv/lib/python3.14/site-packages (from matplotlib) (4.61.1) Requirement already satisfied: kiwisolver>=1.3.1 in ./.venv/lib/python3.14/site-packages (from matplotlib) (1.4.9) Requirement already satisfied: packaging>=20.0 in ./.venv/lib/python3.14/site-packages (from matplotlib) (26.0) Requirement already satisfied: pillow>=8 in ./.venv/lib/python3.14/site-packages (from matplotlib) (12.1.1) Requirement already satisfied: pyparsing>=3 in ./.venv/lib/python3.14/site-packages 
(from matplotlib) (3.3.2) Requirement already satisfied: python-dateutil>=2.7 in ./.venv/lib/python3.14/site-packages (from matplotlib) (2.9.0.post0) Requirement already satisfied: scipy>=1.10.0 in ./.venv/lib/python3.14/site-packages (from scikit-learn) (1.17.0) Requirement already satisfied: joblib>=1.3.0 in ./.venv/lib/python3.14/site-packages (from scikit-learn) (1.5.3) Requirement already satisfied: threadpoolctl>=3.2.0 in ./.venv/lib/python3.14/site-packages (from scikit-learn) (3.6.0) Collecting polars-runtime-32==1.38.1 (from polars) Downloading polars_runtime_32-1.38.1-cp310-abi3-macosx_11_0_arm64.whl.metadata (1.5 kB) Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.14/site-packages (from python-dateutil>=2.7->matplotlib) (1.17.0) Requirement already satisfied: appnope>=0.1.2 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (0.1.4) Requirement already satisfied: comm>=0.1.1 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (0.2.3) Requirement already satisfied: debugpy>=1.6.5 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (1.8.20) Requirement already satisfied: ipython>=7.23.1 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (9.10.0) Requirement already satisfied: jupyter-client>=8.8.0 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (8.8.0) Requirement already satisfied: jupyter-core!=6.0.*,>=5.1 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (5.9.1) Requirement already satisfied: matplotlib-inline>=0.1 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (0.2.1) Requirement already satisfied: nest-asyncio>=1.4 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (1.6.0) Requirement already satisfied: psutil>=5.7 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (7.2.2) Requirement already satisfied: pyzmq>=25 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (27.1.0) Requirement 
already satisfied: tornado>=6.4.1 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (6.5.4) Requirement already satisfied: traitlets>=5.4.0 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (5.14.3) Requirement already satisfied: decorator>=4.3.2 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (5.2.1) Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (1.1.1) Requirement already satisfied: jedi>=0.18.1 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.19.2) Requirement already satisfied: pexpect>4.3 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (4.9.0) Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (3.0.52) Requirement already satisfied: pygments>=2.11.0 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (2.19.2) Requirement already satisfied: stack_data>=0.6.0 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.6.3) Requirement already satisfied: wcwidth in ./.venv/lib/python3.14/site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel->jupyter) (0.6.0) Requirement already satisfied: parso<0.9.0,>=0.8.4 in ./.venv/lib/python3.14/site-packages (from jedi>=0.18.1->ipython>=7.23.1->ipykernel->jupyter) (0.8.6) Requirement already satisfied: platformdirs>=2.5 in ./.venv/lib/python3.14/site-packages (from jupyter-core!=6.0.*,>=5.1->ipykernel->jupyter) (4.6.0) Requirement already satisfied: ptyprocess>=0.5 in ./.venv/lib/python3.14/site-packages (from pexpect>4.3->ipython>=7.23.1->ipykernel->jupyter) (0.7.0) Requirement already satisfied: executing>=1.2.0 in ./.venv/lib/python3.14/site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) 
(2.2.1) Requirement already satisfied: asttokens>=2.1.0 in ./.venv/lib/python3.14/site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (3.0.1) Requirement already satisfied: pure-eval in ./.venv/lib/python3.14/site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (0.2.3) Requirement already satisfied: widgetsnbextension~=4.0.14 in ./.venv/lib/python3.14/site-packages (from ipywidgets->jupyter) (4.0.15) Requirement already satisfied: jupyterlab_widgets~=3.0.15 in ./.venv/lib/python3.14/site-packages (from ipywidgets->jupyter) (3.0.16) Requirement already satisfied: async-lru>=1.0.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (2.1.0) Requirement already satisfied: httpx<1,>=0.25.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (0.28.1) Requirement already satisfied: jinja2>=3.0.3 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (3.1.6) Requirement already satisfied: jupyter-lsp>=2.0.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (2.3.0) Requirement already satisfied: jupyter-server<3,>=2.4.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (2.17.0) Requirement already satisfied: jupyterlab-server<3,>=2.28.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (2.28.0) Requirement already satisfied: notebook-shim>=0.2 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (0.2.4) Requirement already satisfied: setuptools>=41.1.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (82.0.0) Requirement already satisfied: anyio in ./.venv/lib/python3.14/site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (4.12.1) Requirement already satisfied: certifi in ./.venv/lib/python3.14/site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (2026.1.4) Requirement already satisfied: httpcore==1.* in ./.venv/lib/python3.14/site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (1.0.9) 
Requirement already satisfied: idna in ./.venv/lib/python3.14/site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (3.11) Requirement already satisfied: h11>=0.16 in ./.venv/lib/python3.14/site-packages (from httpcore==1.*->httpx<1,>=0.25.0->jupyterlab->jupyter) (0.16.0) Requirement already satisfied: argon2-cffi>=21.1 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.1.0) Requirement already satisfied: jupyter-events>=0.11.0 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.12.0) Requirement already satisfied: jupyter-server-terminals>=0.4.4 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.5.4) Requirement already satisfied: nbformat>=5.3.0 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (5.10.4) Requirement already satisfied: prometheus-client>=0.9 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.24.1) Requirement already satisfied: send2trash>=1.8.2 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.1.0) Requirement already satisfied: terminado>=0.8.3 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.18.1) Requirement already satisfied: websocket-client>=1.7 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.9.0) Requirement already satisfied: babel>=2.10 in ./.venv/lib/python3.14/site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.18.0) Requirement already satisfied: json5>=0.9.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.13.0) Requirement already satisfied: jsonschema>=4.18.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (4.26.0) Requirement already satisfied: 
requests>=2.31 in ./.venv/lib/python3.14/site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.32.5) Requirement already satisfied: argon2-cffi-bindings in ./.venv/lib/python3.14/site-packages (from argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.1.0) Requirement already satisfied: MarkupSafe>=2.0 in ./.venv/lib/python3.14/site-packages (from jinja2>=3.0.3->jupyterlab->jupyter) (3.0.3) Requirement already satisfied: attrs>=22.2.0 in ./.venv/lib/python3.14/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (25.4.0) Requirement already satisfied: jsonschema-specifications>=2023.03.6 in ./.venv/lib/python3.14/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2025.9.1) Requirement already satisfied: referencing>=0.28.4 in ./.venv/lib/python3.14/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.37.0) Requirement already satisfied: rpds-py>=0.25.0 in ./.venv/lib/python3.14/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.30.0) Requirement already satisfied: python-json-logger>=2.0.4 in ./.venv/lib/python3.14/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (4.0.0) Requirement already satisfied: pyyaml>=5.3 in ./.venv/lib/python3.14/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (6.0.3) Requirement already satisfied: rfc3339-validator in ./.venv/lib/python3.14/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.1.4) Requirement already satisfied: rfc3986-validator>=0.1.1 in ./.venv/lib/python3.14/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.1.1) Requirement already satisfied: fqdn in ./.venv/lib/python3.14/site-packages (from 
jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.5.1) Requirement already satisfied: isoduration in ./.venv/lib/python3.14/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (20.11.0) Requirement already satisfied: jsonpointer>1.13 in ./.venv/lib/python3.14/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (3.0.0) Requirement already satisfied: rfc3987-syntax>=1.1.0 in ./.venv/lib/python3.14/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.1.0) Requirement already satisfied: uri-template in ./.venv/lib/python3.14/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.0) Requirement already satisfied: webcolors>=24.6.0 in ./.venv/lib/python3.14/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.10.0) Requirement already satisfied: beautifulsoup4 in ./.venv/lib/python3.14/site-packages (from nbconvert->jupyter) (4.14.3) Requirement already satisfied: bleach!=5.0.0 in ./.venv/lib/python3.14/site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter) (6.3.0) Requirement already satisfied: defusedxml in ./.venv/lib/python3.14/site-packages (from nbconvert->jupyter) (0.7.1) Requirement already satisfied: jupyterlab-pygments in ./.venv/lib/python3.14/site-packages (from nbconvert->jupyter) (0.3.0) Requirement already satisfied: mistune<4,>=2.0.3 in ./.venv/lib/python3.14/site-packages (from nbconvert->jupyter) (3.2.0) Requirement already satisfied: nbclient>=0.5.0 in ./.venv/lib/python3.14/site-packages (from nbconvert->jupyter) (0.10.4) Requirement already satisfied: pandocfilters>=1.4.1 in ./.venv/lib/python3.14/site-packages (from 
nbconvert->jupyter) (1.5.1) Requirement already satisfied: webencodings in ./.venv/lib/python3.14/site-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert->jupyter) (0.5.1) Requirement already satisfied: tinycss2<1.5,>=1.1.0 in ./.venv/lib/python3.14/site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter) (1.4.0) Requirement already satisfied: fastjsonschema>=2.15 in ./.venv/lib/python3.14/site-packages (from nbformat>=5.3.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.21.2) Requirement already satisfied: charset_normalizer<4,>=2 in ./.venv/lib/python3.14/site-packages (from requests>=2.31->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (3.4.4) Requirement already satisfied: urllib3<3,>=1.21.1 in ./.venv/lib/python3.14/site-packages (from requests>=2.31->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.6.3) Requirement already satisfied: lark>=1.2.2 in ./.venv/lib/python3.14/site-packages (from rfc3987-syntax>=1.1.0->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.1) Requirement already satisfied: cffi>=2.0.0b1 in ./.venv/lib/python3.14/site-packages (from argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.0.0) Requirement already satisfied: pycparser in ./.venv/lib/python3.14/site-packages (from cffi>=2.0.0b1->argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (3.0) Requirement already satisfied: soupsieve>=1.6.1 in ./.venv/lib/python3.14/site-packages (from beautifulsoup4->nbconvert->jupyter) (2.8.3) Requirement already satisfied: typing-extensions>=4.0.0 in ./.venv/lib/python3.14/site-packages (from beautifulsoup4->nbconvert->jupyter) (4.15.0) Requirement already satisfied: arrow>=0.15.0 in ./.venv/lib/python3.14/site-packages (from isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.4.0) Requirement already satisfied: tzdata in 
./.venv/lib/python3.14/site-packages (from arrow>=0.15.0->isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2025.3) Downloading polars-1.38.1-py3-none-any.whl (810 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 810.4/810.4 kB 22.7 MB/s 0:00:00 Downloading polars_runtime_32-1.38.1-cp310-abi3-macosx_11_0_arm64.whl (40.2 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.2/40.2 MB 29.7 MB/s 0:00:01 28.6 MB/s eta 0:00:01 Installing collected packages: polars-runtime-32, polars ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2/2 [polars]━━━━ 1/2 [polars] Successfully installed polars-1.38.1 polars-runtime-32-1.38.1 [notice] A new release of pip is available: 25.3 -> 26.0.1 [notice] To update, run: pip install --upgrade pip
In [4]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, accuracy_score, r2_score, mean_absolute_error
# Use seaborn's "whitegrid" theme for the plots in this notebook.
sns.set_theme(style="whitegrid")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Root folder that holds one sub-folder per dataset (read by load_dataset and the listing below).
DATASETS_DIR = "./sample_datasets"
def load_dataset(name, datasets_dir=None):
    """Load a dataset's CSV splits and its JSON metadata.

    Parameters
    ----------
    name : str
        Name of the dataset sub-folder.
    datasets_dir : str or path-like, optional
        Folder that contains the dataset sub-folders. When omitted, the
        notebook-level ``DATASETS_DIR`` is used.

    Returns
    -------
    dict
        ``"full"``, ``"train"`` and ``"test"`` hold DataFrames read from
        ``dataset.csv``, ``train.csv`` and ``test.csv``; ``"config"``,
        ``"report"`` and ``"baseline"`` hold the parsed contents of
        ``config.json``, ``quality_report.json`` and
        ``baseline_evaluation.json``.

    Raises
    ------
    FileNotFoundError
        If any of the six files is missing.
    """
    root = DATASETS_DIR if datasets_dir is None else datasets_dir
    base = os.path.join(root, name)

    csv_files = (
        ("full", "dataset.csv"),
        ("train", "train.csv"),
        ("test", "test.csv"),
    )
    json_files = (
        ("config", "config.json"),
        ("report", "quality_report.json"),
        ("baseline", "baseline_evaluation.json"),
    )

    result = {key: pd.read_csv(os.path.join(base, filename)) for key, filename in csv_files}
    for key, filename in json_files:
        # Read JSON as UTF-8 explicitly instead of relying on the locale's default encoding.
        with open(os.path.join(base, filename), encoding="utf-8") as f:
            result[key] = json.load(f)
    return result
# A sub-folder counts as a dataset only when it contains a dataset.csv file.
print("Available datasets:")
for entry in sorted(os.listdir(DATASETS_DIR)):
    candidate = os.path.join(DATASETS_DIR, entry)
    if not os.path.isdir(candidate):
        continue
    if not os.path.exists(os.path.join(candidate, "dataset.csv")):
        continue
    print(f" - {entry}")
Available datasets: - correlated_regression - healthcare_readmission - housing_price - imperfect_binary
In [5]:
# Load the healthcare readmission dataset and unpack the pieces used by the cells below.
ds = load_dataset("healthcare_readmission")
config, report, baseline = ds["config"], ds["report"], ds["baseline"]
full, train, test = ds["full"], ds["train"], ds["test"]
target = config["target"]["name"]

# Headline facts about the dataset, followed by summary statistics of the numeric columns.
print(f"Dataset: {config['name']}")
print(f"Task type: {config['task_type']}")
print(f"Shape: {full.shape}")
print(f"Target column: {target}")
n_train, n_test = len(train), len(test)
print(f"Train/test split: {n_train} / {n_test}")
print()
full.describe()
Dataset: Hospital Readmission Task type: binary_classification Shape: (2000, 7) Target column: readmitted Train/test split: 1600 / 400
Out[5]:
| age | num_procedures | length_of_stay | num_medications | |
|---|---|---|---|---|
| count | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 |
| mean | 68.607597 | 3.743760 | 6.889521 | 9.595137 |
| std | 13.466581 | 1.939329 | 3.864062 | 4.700236 |
| min | 18.000000 | 0.000000 | 1.000000 | 0.000000 |
| 25% | 59.564149 | 2.262493 | 2.701904 | 6.142470 |
| 50% | 69.351417 | 4.145051 | 7.987439 | 10.159562 |
| 75% | 78.362426 | 5.211250 | 9.838393 | 12.894204 |
| max | 100.000000 | 9.465672 | 17.377155 | 22.569377 |
In [6]:
# Left panel: class counts from the data; right panel: class ratios from the quality report.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax_counts, ax_ratios = axes

class_counts = full[target].value_counts()
class_counts.plot.bar(ax=ax_counts, color=["#4A90D9", "#E8734A"])
ax_counts.set_title("Class Distribution")
ax_counts.set_ylabel("Count")

# The pie chart is drawn only when the report carries a class-balance section.
if "class_balance" in report:
    class_balance = report["class_balance"]
    ratios = class_balance["label_ratios"]
    pd.Series(ratios).plot.pie(
        ax=ax_ratios,
        autopct="%1.1f%%",
        colors=["#4A90D9", "#E8734A", "#50C878", "#9B59B6"],
    )
    ax_ratios.set_ylabel("")
    imbalance = class_balance["imbalance_ratio"]
    ax_ratios.set_title(f"Class Ratios (imbalance ratio: {imbalance:.2f})")

plt.tight_layout()
plt.show()
In [7]:
# One histogram panel per numeric feature, with one overlaid histogram per target class.
numeric_features = [f["name"] for f in config["features"] if f["feature_type"] == "numeric"]
n = len(numeric_features)

if n == 0:
    # plt.subplots(1, 0) raises ValueError, so there is nothing to draw in this case.
    print("No numeric features to plot.")
else:
    # squeeze=False always returns a 2-D array of Axes, so n == 1 needs no special case.
    fig, axes = plt.subplots(1, n, figsize=(5 * n, 4), squeeze=False)
    axes = axes.ravel()

    # The set of class labels is the same for every feature; compute it once.
    class_labels = sorted(full[target].unique())

    for ax, feat in zip(axes, numeric_features):
        for label in class_labels:
            subset = full.loc[full[target] == label, feat].dropna()
            ax.hist(subset, bins=30, alpha=0.6, label=str(label))
        ax.set_title(feat)
        ax.set_xlabel("Value")
        ax.legend()

    fig.suptitle("Feature Distributions by Class", y=1.02, fontsize=14)
    plt.tight_layout()
    plt.show()
In [8]:
# Horizontal bar chart of the feature-target correlations stored in the quality report.
correlations = report["feature_target_correlations"]
if not correlations:
    print("No feature-target correlations available.")
else:
    corr_s = pd.Series(correlations).sort_values()
    # Negative correlations are drawn in orange, all others in blue.
    colors = ["#E8734A" if value < 0 else "#4A90D9" for value in corr_s]
    fig_height = max(3, len(corr_s) * 0.6)
    fig, ax = plt.subplots(figsize=(8, fig_height))
    corr_s.plot.barh(ax=ax, color=colors)
    ax.set_title("Feature-Target Correlations (Pearson)")
    ax.set_xlabel("Correlation Coefficient")
    # Dashed reference line at zero separates negative from positive bars.
    ax.axvline(0, color="gray", linestyle="--", linewidth=0.8)
    plt.tight_layout()
    plt.show()
In [9]:
# Grouped bars comparing each feature's configured strength with the measured one.
strength = report.get("predictive_strength_verification") or []
df_s = pd.DataFrame(strength)
if "measured_strength" in df_s.columns:
    # Features without a measurement cannot be compared, so drop them.
    df_s = df_s.dropna(subset=["measured_strength"])
else:
    # An empty/missing report entry yields a frame without columns, on which
    # dropna(subset=[...]) would raise KeyError; treat it as "nothing to plot".
    df_s = df_s.iloc[0:0]

if len(df_s) > 0:
    x = np.arange(len(df_s))
    width = 0.35
    fig, ax = plt.subplots(figsize=(10, 5))
    # Shift the two series half a bar width left/right so they sit side by side.
    ax.bar(x - width / 2, df_s["configured_strength"], width, label="Configured", color="#4A90D9")
    ax.bar(x + width / 2, df_s["measured_strength"].abs(), width, label="Measured (abs)", color="#E8734A")
    ax.set_xticks(x)
    ax.set_xticklabels(df_s["feature"], rotation=45, ha="right")
    ax.set_ylabel("Strength")
    ax.set_title("Predictive Strength: Configured vs Measured")
    ax.legend()
    plt.tight_layout()
    plt.show()
else:
    print("No predictive strength measurements available.")
In [10]:
# Print every baseline model with its scalar metrics; the nested per-class breakdown is skipped.
print("Baseline Model Results")
print("=" * 60)
for model in baseline["models"]:
    print(f"\n {model['name']}")
    print(f" {model['description']}")
    for metric_name, value in model["metrics"].items():
        if metric_name != "per_class":
            # Floats are shown with four decimals; anything else is printed as-is.
            shown = f"{value:.4f}" if isinstance(value, float) else value
            print(f" {metric_name}: {shown}")
Baseline Model Results
============================================================
Majority Class
Always predicts 'no' (most common in training set)
accuracy: 0.7000
macro_precision: 0.3500
macro_recall: 0.5000
macro_f1: 0.4118
Threshold Classifier
Single-feature threshold on 'length_of_stay' (strength: 70%)
accuracy: 0.9575
macro_precision: 0.9399
macro_recall: 0.9649
macro_f1: 0.9522
In [11]:
# Encode categoricals, fill missing values
X_train = pd.get_dummies(train.drop(columns=[target]), drop_first=True)
# Encode the test split with ALL dummy columns and let reindex keep exactly the
# columns seen in training. Using drop_first=True on the test split independently
# can drop a different category than in train (when train's first category is absent
# from test); reindex would then zero-fill it and silently mis-encode those rows.
X_test = pd.get_dummies(test.drop(columns=[target])).reindex(columns=X_train.columns, fill_value=0)
y_train = train[target]
y_test = test[target]

# Impute both splits with medians learned from the training split only.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("Random Forest Classification Report:\n")
print(classification_report(y_test, y_pred))

# Comparison table
rows = [
    {"Model": model["name"], "Accuracy": model["metrics"].get("accuracy")}
    for model in baseline["models"]
]
rows.append({"Model": "Random Forest", "Accuracy": accuracy_score(y_test, y_pred)})
comparison = pd.DataFrame(rows).set_index("Model")
print("\nAccuracy Comparison:")
print(comparison.to_string(float_format="%.4f"))
Random Forest Classification Report:
precision recall f1-score support
no 1.00 0.99 1.00 280
yes 0.98 1.00 0.99 120
accuracy 0.99 400
macro avg 0.99 1.00 0.99 400
weighted avg 1.00 0.99 1.00 400
Accuracy Comparison:
Accuracy
Model
Majority Class 0.7000
Threshold Classifier 0.9575
Random Forest 0.9950
In [12]:
# Sort ascending so the most important feature ends up as the top bar of the chart.
feature_names = X_train.columns
importances = pd.Series(rf.feature_importances_, index=feature_names).sort_values()

fig_height = max(3, len(importances) * 0.4)
fig, ax = plt.subplots(figsize=(8, fig_height))
importances.plot.barh(ax=ax, color="#4A90D9")
ax.set_title("Random Forest Feature Importances")
ax.set_xlabel("Importance")
plt.tight_layout()
plt.show()
In [13]:
# Regression example: fit a random forest on the housing price dataset and
# compare it with the stored baseline models.
ds_r = load_dataset("housing_price")
config_r = ds_r["config"]
train_r = ds_r["train"]
test_r = ds_r["test"]
baseline_r = ds_r["baseline"]
target_r = config_r["target"]["name"]

# Prepare
X_train_r = pd.get_dummies(train_r.drop(columns=[target_r]), drop_first=True)
# Encode the test split with ALL dummy columns and let reindex keep exactly the
# columns seen in training. Using drop_first=True on the test split independently
# can drop a different category than in train; reindex would then zero-fill it
# and silently mis-encode those rows.
X_test_r = pd.get_dummies(test_r.drop(columns=[target_r])).reindex(
    columns=X_train_r.columns, fill_value=0
)
# Impute both splits with medians learned from the training split only.
train_medians_r = X_train_r.median()
X_train_r = X_train_r.fillna(train_medians_r)
X_test_r = X_test_r.fillna(train_medians_r)

# Train
rf_r = RandomForestRegressor(n_estimators=100, random_state=42)
rf_r.fit(X_train_r, train_r[target_r])
y_pred_r = rf_r.predict(X_test_r)
r2 = r2_score(test_r[target_r], y_pred_r)
mae = mean_absolute_error(test_r[target_r], y_pred_r)

# Actual vs predicted scatter
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(test_r[target_r], y_pred_r, alpha=0.4, s=12, color="#4A90D9")
# The diagonal spans the joint range of actual and predicted values; points on it are perfect predictions.
lims = [
    min(test_r[target_r].min(), y_pred_r.min()),
    max(test_r[target_r].max(), y_pred_r.max()),
]
ax.plot(lims, lims, "r--", linewidth=1)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title(f"Housing Price: Actual vs Predicted (R2={r2:.3f}, MAE={mae:,.0f})")
plt.tight_layout()
plt.show()

# Comparison
rows = [
    {
        "Model": model["name"],
        "R2": model["metrics"].get("r2"),
        "MAE": model["metrics"].get("mae"),
    }
    for model in baseline_r["models"]
]
rows.append({"Model": "Random Forest", "R2": r2, "MAE": mae})
print(pd.DataFrame(rows).set_index("Model").to_string(float_format="%.4f"))
R2 MAE Model Mean Predictor -0.0103 130442.3719 Linear Regression 0.6596 73272.0933 Random Forest 0.6614 72774.3558
In [14]:
# Compare the correlations configured for this dataset with those measured in the data.
ds_c = load_dataset("correlated_regression")
df_c, config_c = ds_c["full"], ds_c["config"]

numeric_cols = df_c.select_dtypes(include=[np.number]).columns.tolist()
corr_matrix = df_c[numeric_cols].corr()

# Fixed [-1, 1] colour scale centred on zero.
heatmap_options = dict(annot=True, fmt=".2f", cmap="RdBu_r", center=0, vmin=-1, vmax=1)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title("Feature Correlation Matrix")
plt.tight_layout()
plt.show()

# Show configured correlations
print("Configured feature correlations:")
for fc in config_c.get("feature_correlations", []):
    feat_a, feat_b = fc["feature_a"], fc["feature_b"]
    actual = df_c[feat_a].corr(df_c[feat_b])
    print(f" {feat_a} <-> {feat_b}: configured={fc['correlation']:.2f}, actual={actual:.2f}")
Configured feature correlations: marketing_spend <-> customer_count: configured=0.70, actual=0.81 team_size <-> customer_count: configured=0.50, actual=0.60
In [15]:
# Compare the imperfections configured for this dataset with what is measured in the data.
ds_i = load_dataset("imperfect_binary")
df_i = ds_i["full"]
config_i = ds_i["config"]

print("Data Imperfections Analysis")
print("=" * 50)
print(f" Configured rows: {config_i['row_count']}")
print(f" Actual rows: {len(df_i)}")
print(f" Duplicate rate: {config_i['duplicate_rate']}")
print(f" Label noise: {config_i['label_noise_rate']}")
print(f" Outlier rate: {config_i['outlier_rate']}")

# Missing values per column
print("\nMissing Values:")
for col in df_i.columns:
    null_count = df_i[col].isna().sum()
    if null_count > 0:
        rate = null_count / len(df_i) * 100
        # Find configured rate
        feat_cfg = next((f for f in config_i["features"] if f["name"] == col), None)
        missing_rate = feat_cfg.get("missing_rate") if feat_cfg else None
        # Attach the percent sign only to real numbers so an unknown rate prints
        # "N/A" (not "N/A%"), and round to hide float noise such as 7.000000000000001.
        configured = f"{missing_rate * 100:.1f}%" if missing_rate is not None else "N/A"
        print(f" {col}: {null_count} nulls ({rate:.1f}%), configured: {configured}")

# Duplicate rows
dups = df_i.duplicated().sum()
print(f"\nDuplicate rows: {dups} ({dups / len(df_i) * 100:.1f}%)")

# Outlier detection (values beyond 3 std from mean)
print("\nPotential outliers (beyond 3 std):")
numeric_features_i = [f["name"] for f in config_i["features"] if f["feature_type"] == "numeric"]
for feat in numeric_features_i:
    # Separate name for the Series so `col` (a column name above) is not reused with another type.
    values = df_i[feat].dropna()
    center, spread = values.mean(), values.std()
    outliers = ((values < center - 3 * spread) | (values > center + 3 * spread)).sum()
    print(f" {feat}: {outliers} ({outliers / len(values) * 100:.1f}%)")
Data Imperfections Analysis ================================================== Configured rows: 1500 Actual rows: 1545 Duplicate rate: 0.03 Label noise: 0.05 Outlier rate: 0.02 Missing Values: score: 76 nulls (4.9%), configured: 5.0% age: 31 nulls (2.0%), configured: 2.0% category: 46 nulls (3.0%), configured: 3.0% Duplicate rows: 45 (2.9%) Potential outliers (beyond 3 std): score: 29 (2.0%) age: 31 (2.0%)
In [16]:
# Turn the per-column statistics of the quality report into a table with one row per column.
col_stats = ds["report"]["column_stats"]
rows = [{"column": col_name, **stats} for col_name, stats in col_stats.items()]
stats_df = pd.DataFrame(rows).set_index("column")
stats_df
Out[16]:
| null_count | unique_count | mean | std | min | max | median | true_count | true_ratio | |
|---|---|---|---|---|---|---|---|---|---|
| column | |||||||||
| age | 0 | 1986 | 68.607597 | 13.463214 | 18.0 | 100.000000 | 69.351417 | NaN | NaN |
| num_procedures | 0 | 1905 | 3.743760 | 1.938844 | 0.0 | 9.465672 | 4.145051 | NaN | NaN |
| length_of_stay | 0 | 1670 | 6.889521 | 3.863096 | 1.0 | 17.377155 | 7.987439 | NaN | NaN |
| num_medications | 0 | 1939 | 9.595137 | 4.699061 | 0.0 | 22.569377 | 10.159562 | NaN | NaN |
| diagnosis_category | 0 | 5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| has_diabetes | 0 | 2 | NaN | NaN | NaN | NaN | NaN | 323.0 | 0.1615 |
| readmitted | 0 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
In [17]:
# Summarise the train/test split recorded in the quality report, if there is one.
split_stats = ds["report"].get("split_stats")
if not split_stats:
    print("No split stats (split was not enabled).")
else:
    print(f"Train size: {split_stats['train_size']}")
    print(f"Test size: {split_stats['test_size']}")
    print(f"Train ratio: {split_stats['train_ratio']:.3f}")

    # Class ratios are shown only when the report has them for both splits.
    ratio_keys = ("train_class_ratios", "test_class_ratios")
    if all(key in split_stats for key in ratio_keys):
        train_ratios, test_ratios = (split_stats[key] for key in ratio_keys)
        split_df = pd.DataFrame({"Train": train_ratios, "Test": test_ratios})
        print("\nClass ratios per split:")
        print(split_df.to_string(float_format="%.4f"))
        split_df.plot.bar(figsize=(8, 4), color=["#4A90D9", "#E8734A"])
        plt.title("Class Distribution: Train vs Test")
        plt.ylabel("Ratio")
        plt.xticks(rotation=0)
        plt.tight_layout()
        plt.show()
Train size: 1600
Test size: 400
Train ratio: 0.800
Class ratios per split:
Train Test
no 0.7000 0.7000
yes 0.3000 0.3000