In [1]:
# Dataset test: explore the sample datasets and compare them against baseline models
In [2]:
!pip install jupyter matplotlib seaborn pandas scikit-learn polars numpy
Requirement already satisfied: jupyter in ./.venv/lib/python3.14/site-packages (1.1.1)
Requirement already satisfied: matplotlib in ./.venv/lib/python3.14/site-packages (3.10.8)
Requirement already satisfied: seaborn in ./.venv/lib/python3.14/site-packages (0.13.2)
Requirement already satisfied: pandas in ./.venv/lib/python3.14/site-packages (3.0.1)
Requirement already satisfied: scikit-learn in ./.venv/lib/python3.14/site-packages (1.8.0)
Collecting polars
  Downloading polars-1.38.1-py3-none-any.whl.metadata (10 kB)
Requirement already satisfied: numpy in ./.venv/lib/python3.14/site-packages (2.4.2)
Requirement already satisfied: notebook in ./.venv/lib/python3.14/site-packages (from jupyter) (7.5.3)
Requirement already satisfied: jupyter-console in ./.venv/lib/python3.14/site-packages (from jupyter) (6.6.3)
Requirement already satisfied: nbconvert in ./.venv/lib/python3.14/site-packages (from jupyter) (7.17.0)
Requirement already satisfied: ipykernel in ./.venv/lib/python3.14/site-packages (from jupyter) (7.2.0)
Requirement already satisfied: ipywidgets in ./.venv/lib/python3.14/site-packages (from jupyter) (8.1.8)
Requirement already satisfied: jupyterlab in ./.venv/lib/python3.14/site-packages (from jupyter) (4.5.4)
Requirement already satisfied: contourpy>=1.0.1 in ./.venv/lib/python3.14/site-packages (from matplotlib) (1.3.3)
Requirement already satisfied: cycler>=0.10 in ./.venv/lib/python3.14/site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in ./.venv/lib/python3.14/site-packages (from matplotlib) (4.61.1)
Requirement already satisfied: kiwisolver>=1.3.1 in ./.venv/lib/python3.14/site-packages (from matplotlib) (1.4.9)
Requirement already satisfied: packaging>=20.0 in ./.venv/lib/python3.14/site-packages (from matplotlib) (26.0)
Requirement already satisfied: pillow>=8 in ./.venv/lib/python3.14/site-packages (from matplotlib) (12.1.1)
Requirement already satisfied: pyparsing>=3 in ./.venv/lib/python3.14/site-packages (from matplotlib) (3.3.2)
Requirement already satisfied: python-dateutil>=2.7 in ./.venv/lib/python3.14/site-packages (from matplotlib) (2.9.0.post0)
Requirement already satisfied: scipy>=1.10.0 in ./.venv/lib/python3.14/site-packages (from scikit-learn) (1.17.0)
Requirement already satisfied: joblib>=1.3.0 in ./.venv/lib/python3.14/site-packages (from scikit-learn) (1.5.3)
Requirement already satisfied: threadpoolctl>=3.2.0 in ./.venv/lib/python3.14/site-packages (from scikit-learn) (3.6.0)
Collecting polars-runtime-32==1.38.1 (from polars)
  Downloading polars_runtime_32-1.38.1-cp310-abi3-macosx_11_0_arm64.whl.metadata (1.5 kB)
Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.14/site-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)
Requirement already satisfied: appnope>=0.1.2 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (0.1.4)
Requirement already satisfied: comm>=0.1.1 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (0.2.3)
Requirement already satisfied: debugpy>=1.6.5 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (1.8.20)
Requirement already satisfied: ipython>=7.23.1 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (9.10.0)
Requirement already satisfied: jupyter-client>=8.8.0 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (8.8.0)
Requirement already satisfied: jupyter-core!=6.0.*,>=5.1 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (5.9.1)
Requirement already satisfied: matplotlib-inline>=0.1 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (0.2.1)
Requirement already satisfied: nest-asyncio>=1.4 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (1.6.0)
Requirement already satisfied: psutil>=5.7 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (7.2.2)
Requirement already satisfied: pyzmq>=25 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (27.1.0)
Requirement already satisfied: tornado>=6.4.1 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (6.5.4)
Requirement already satisfied: traitlets>=5.4.0 in ./.venv/lib/python3.14/site-packages (from ipykernel->jupyter) (5.14.3)
Requirement already satisfied: decorator>=4.3.2 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (5.2.1)
Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (1.1.1)
Requirement already satisfied: jedi>=0.18.1 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.19.2)
Requirement already satisfied: pexpect>4.3 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (4.9.0)
Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (3.0.52)
Requirement already satisfied: pygments>=2.11.0 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (2.19.2)
Requirement already satisfied: stack_data>=0.6.0 in ./.venv/lib/python3.14/site-packages (from ipython>=7.23.1->ipykernel->jupyter) (0.6.3)
Requirement already satisfied: wcwidth in ./.venv/lib/python3.14/site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel->jupyter) (0.6.0)
Requirement already satisfied: parso<0.9.0,>=0.8.4 in ./.venv/lib/python3.14/site-packages (from jedi>=0.18.1->ipython>=7.23.1->ipykernel->jupyter) (0.8.6)
Requirement already satisfied: platformdirs>=2.5 in ./.venv/lib/python3.14/site-packages (from jupyter-core!=6.0.*,>=5.1->ipykernel->jupyter) (4.6.0)
Requirement already satisfied: ptyprocess>=0.5 in ./.venv/lib/python3.14/site-packages (from pexpect>4.3->ipython>=7.23.1->ipykernel->jupyter) (0.7.0)
Requirement already satisfied: executing>=1.2.0 in ./.venv/lib/python3.14/site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (2.2.1)
Requirement already satisfied: asttokens>=2.1.0 in ./.venv/lib/python3.14/site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (3.0.1)
Requirement already satisfied: pure-eval in ./.venv/lib/python3.14/site-packages (from stack_data>=0.6.0->ipython>=7.23.1->ipykernel->jupyter) (0.2.3)
Requirement already satisfied: widgetsnbextension~=4.0.14 in ./.venv/lib/python3.14/site-packages (from ipywidgets->jupyter) (4.0.15)
Requirement already satisfied: jupyterlab_widgets~=3.0.15 in ./.venv/lib/python3.14/site-packages (from ipywidgets->jupyter) (3.0.16)
Requirement already satisfied: async-lru>=1.0.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (2.1.0)
Requirement already satisfied: httpx<1,>=0.25.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (0.28.1)
Requirement already satisfied: jinja2>=3.0.3 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (3.1.6)
Requirement already satisfied: jupyter-lsp>=2.0.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (2.3.0)
Requirement already satisfied: jupyter-server<3,>=2.4.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (2.17.0)
Requirement already satisfied: jupyterlab-server<3,>=2.28.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (2.28.0)
Requirement already satisfied: notebook-shim>=0.2 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (0.2.4)
Requirement already satisfied: setuptools>=41.1.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab->jupyter) (82.0.0)
Requirement already satisfied: anyio in ./.venv/lib/python3.14/site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (4.12.1)
Requirement already satisfied: certifi in ./.venv/lib/python3.14/site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (2026.1.4)
Requirement already satisfied: httpcore==1.* in ./.venv/lib/python3.14/site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (1.0.9)
Requirement already satisfied: idna in ./.venv/lib/python3.14/site-packages (from httpx<1,>=0.25.0->jupyterlab->jupyter) (3.11)
Requirement already satisfied: h11>=0.16 in ./.venv/lib/python3.14/site-packages (from httpcore==1.*->httpx<1,>=0.25.0->jupyterlab->jupyter) (0.16.0)
Requirement already satisfied: argon2-cffi>=21.1 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.1.0)
Requirement already satisfied: jupyter-events>=0.11.0 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.12.0)
Requirement already satisfied: jupyter-server-terminals>=0.4.4 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.5.4)
Requirement already satisfied: nbformat>=5.3.0 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (5.10.4)
Requirement already satisfied: prometheus-client>=0.9 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.24.1)
Requirement already satisfied: send2trash>=1.8.2 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.1.0)
Requirement already satisfied: terminado>=0.8.3 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.18.1)
Requirement already satisfied: websocket-client>=1.7 in ./.venv/lib/python3.14/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.9.0)
Requirement already satisfied: babel>=2.10 in ./.venv/lib/python3.14/site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.18.0)
Requirement already satisfied: json5>=0.9.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.13.0)
Requirement already satisfied: jsonschema>=4.18.0 in ./.venv/lib/python3.14/site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (4.26.0)
Requirement already satisfied: requests>=2.31 in ./.venv/lib/python3.14/site-packages (from jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.32.5)
Requirement already satisfied: argon2-cffi-bindings in ./.venv/lib/python3.14/site-packages (from argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.1.0)
Requirement already satisfied: MarkupSafe>=2.0 in ./.venv/lib/python3.14/site-packages (from jinja2>=3.0.3->jupyterlab->jupyter) (3.0.3)
Requirement already satisfied: attrs>=22.2.0 in ./.venv/lib/python3.14/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (25.4.0)
Requirement already satisfied: jsonschema-specifications>=2023.03.6 in ./.venv/lib/python3.14/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2025.9.1)
Requirement already satisfied: referencing>=0.28.4 in ./.venv/lib/python3.14/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.37.0)
Requirement already satisfied: rpds-py>=0.25.0 in ./.venv/lib/python3.14/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (0.30.0)
Requirement already satisfied: python-json-logger>=2.0.4 in ./.venv/lib/python3.14/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (4.0.0)
Requirement already satisfied: pyyaml>=5.3 in ./.venv/lib/python3.14/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (6.0.3)
Requirement already satisfied: rfc3339-validator in ./.venv/lib/python3.14/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.1.4)
Requirement already satisfied: rfc3986-validator>=0.1.1 in ./.venv/lib/python3.14/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (0.1.1)
Requirement already satisfied: fqdn in ./.venv/lib/python3.14/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.5.1)
Requirement already satisfied: isoduration in ./.venv/lib/python3.14/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (20.11.0)
Requirement already satisfied: jsonpointer>1.13 in ./.venv/lib/python3.14/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (3.0.0)
Requirement already satisfied: rfc3987-syntax>=1.1.0 in ./.venv/lib/python3.14/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.1.0)
Requirement already satisfied: uri-template in ./.venv/lib/python3.14/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.0)
Requirement already satisfied: webcolors>=24.6.0 in ./.venv/lib/python3.14/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (25.10.0)
Requirement already satisfied: beautifulsoup4 in ./.venv/lib/python3.14/site-packages (from nbconvert->jupyter) (4.14.3)
Requirement already satisfied: bleach!=5.0.0 in ./.venv/lib/python3.14/site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter) (6.3.0)
Requirement already satisfied: defusedxml in ./.venv/lib/python3.14/site-packages (from nbconvert->jupyter) (0.7.1)
Requirement already satisfied: jupyterlab-pygments in ./.venv/lib/python3.14/site-packages (from nbconvert->jupyter) (0.3.0)
Requirement already satisfied: mistune<4,>=2.0.3 in ./.venv/lib/python3.14/site-packages (from nbconvert->jupyter) (3.2.0)
Requirement already satisfied: nbclient>=0.5.0 in ./.venv/lib/python3.14/site-packages (from nbconvert->jupyter) (0.10.4)
Requirement already satisfied: pandocfilters>=1.4.1 in ./.venv/lib/python3.14/site-packages (from nbconvert->jupyter) (1.5.1)
Requirement already satisfied: webencodings in ./.venv/lib/python3.14/site-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert->jupyter) (0.5.1)
Requirement already satisfied: tinycss2<1.5,>=1.1.0 in ./.venv/lib/python3.14/site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter) (1.4.0)
Requirement already satisfied: fastjsonschema>=2.15 in ./.venv/lib/python3.14/site-packages (from nbformat>=5.3.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.21.2)
Requirement already satisfied: charset_normalizer<4,>=2 in ./.venv/lib/python3.14/site-packages (from requests>=2.31->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (3.4.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in ./.venv/lib/python3.14/site-packages (from requests>=2.31->jupyterlab-server<3,>=2.28.0->jupyterlab->jupyter) (2.6.3)
Requirement already satisfied: lark>=1.2.2 in ./.venv/lib/python3.14/site-packages (from rfc3987-syntax>=1.1.0->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.3.1)
Requirement already satisfied: cffi>=2.0.0b1 in ./.venv/lib/python3.14/site-packages (from argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2.0.0)
Requirement already satisfied: pycparser in ./.venv/lib/python3.14/site-packages (from cffi>=2.0.0b1->argon2-cffi-bindings->argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (3.0)
Requirement already satisfied: soupsieve>=1.6.1 in ./.venv/lib/python3.14/site-packages (from beautifulsoup4->nbconvert->jupyter) (2.8.3)
Requirement already satisfied: typing-extensions>=4.0.0 in ./.venv/lib/python3.14/site-packages (from beautifulsoup4->nbconvert->jupyter) (4.15.0)
Requirement already satisfied: arrow>=0.15.0 in ./.venv/lib/python3.14/site-packages (from isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (1.4.0)
Requirement already satisfied: tzdata in ./.venv/lib/python3.14/site-packages (from arrow>=0.15.0->isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter) (2025.3)
Downloading polars-1.38.1-py3-none-any.whl (810 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 810.4/810.4 kB 22.7 MB/s  0:00:00
Downloading polars_runtime_32-1.38.1-cp310-abi3-macosx_11_0_arm64.whl (40.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.2/40.2 MB 29.7 MB/s  0:00:01 28.6 MB/s eta 0:00:01
Installing collected packages: polars-runtime-32, polars
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2/2 [polars]━━━━ 1/2 [polars]
Successfully installed polars-1.38.1 polars-runtime-32-1.38.1

[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: pip install --upgrade pip
In [4]:
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, accuracy_score, r2_score, mean_absolute_error

# Apply seaborn's "whitegrid" theme to the figures drawn in this notebook.
sns.set_theme(style="whitegrid")
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

# Root folder scanned for datasets; each dataset lives in its own sub-directory.
DATASETS_DIR = "./sample_datasets"

def load_dataset(name):
    """Load a dataset and all its metadata.

    Reads the three CSV files and the three JSON sidecar files stored under
    ``{DATASETS_DIR}/{name}``.

    Args:
        name: Name of the dataset sub-directory inside ``DATASETS_DIR``.

    Returns:
        dict with the keys ``"full"``, ``"train"`` and ``"test"`` (DataFrames
        read from ``dataset.csv``, ``train.csv`` and ``test.csv``) and
        ``"config"``, ``"report"`` and ``"baseline"`` (objects parsed from
        ``config.json``, ``quality_report.json`` and
        ``baseline_evaluation.json``).

    Raises:
        FileNotFoundError: If one of the expected files does not exist.
    """
    base = f"{DATASETS_DIR}/{name}"
    result = {
        "full": pd.read_csv(f"{base}/dataset.csv"),
        "train": pd.read_csv(f"{base}/train.csv"),
        "test": pd.read_csv(f"{base}/test.csv"),
    }

    # Result key -> JSON file that provides it.
    json_sources = {
        "config": "config.json",
        "report": "quality_report.json",
        "baseline": "baseline_evaluation.json",
    }
    for key, filename in json_sources.items():
        # JSON is UTF-8 encoded; do not depend on the platform's locale
        # encoding, which differs between operating systems.
        with open(f"{base}/{filename}", encoding="utf-8") as f:
            result[key] = json.load(f)
    return result

# List every sub-directory of DATASETS_DIR that contains a dataset.csv file.
print("Available datasets:")
for entry in sorted(os.listdir(DATASETS_DIR)):
    candidate = os.path.join(DATASETS_DIR, entry)
    if not os.path.isdir(candidate):
        continue
    if os.path.exists(os.path.join(candidate, "dataset.csv")):
        print(f"  - {entry}")
Available datasets:
  - correlated_regression
  - healthcare_readmission
  - housing_price
  - imperfect_binary
In [5]:
# Load the "healthcare_readmission" dataset and unpack its parts into the
# notebook-level names used by the cells below.
ds = load_dataset("healthcare_readmission")
full, train, test = ds["full"], ds["train"], ds["test"]
config, report, baseline = ds["config"], ds["report"], ds["baseline"]

# Name of the target column, as declared in the dataset config.
target = config["target"]["name"]

summary_lines = [
    f"Dataset: {config['name']}",
    f"Task type: {config['task_type']}",
    f"Shape: {full.shape}",
    f"Target column: {target}",
    f"Train/test split: {len(train)} / {len(test)}",
    "",  # trailing blank line before the rendered table
]
print("\n".join(summary_lines))

# Last expression of the cell: shown as the rich output.
full.describe()
Dataset: Hospital Readmission
Task type: binary_classification
Shape: (2000, 7)
Target column: readmitted
Train/test split: 1600 / 400

Out[5]:
age num_procedures length_of_stay num_medications
count 2000.000000 2000.000000 2000.000000 2000.000000
mean 68.607597 3.743760 6.889521 9.595137
std 13.466581 1.939329 3.864062 4.700236
min 18.000000 0.000000 1.000000 0.000000
25% 59.564149 2.262493 2.701904 6.142470
50% 69.351417 4.145051 7.987439 10.159562
75% 78.362426 5.211250 9.838393 12.894204
max 100.000000 9.465672 17.377155 22.569377
In [6]:
# Left panel: class counts in the full dataset.
# Right panel: class ratios taken from the quality report, when it has them.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
count_ax, ratio_ax = axes

class_counts = full[target].value_counts()
class_counts.plot.bar(ax=count_ax, color=["#4A90D9", "#E8734A"])
count_ax.set(title="Class Distribution", ylabel="Count")

if "class_balance" in report:
    class_balance = report["class_balance"]
    ratios = class_balance["label_ratios"]
    pie_colors = ["#4A90D9", "#E8734A", "#50C878", "#9B59B6"]
    pd.Series(ratios).plot.pie(ax=ratio_ax, autopct="%1.1f%%", colors=pie_colors)
    ratio_ax.set_ylabel("")
    imbalance = class_balance["imbalance_ratio"]
    ratio_ax.set_title(f"Class Ratios (imbalance ratio: {imbalance:.2f})")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [7]:
# One histogram panel per numeric feature, with one overlaid histogram per class.
numeric_features = [f["name"] for f in config["features"] if f["feature_type"] == "numeric"]
n = len(numeric_features)

if n == 0:
    # plt.subplots(1, 0) raises, so report the situation instead of crashing.
    print("No numeric features to plot.")
else:
    # squeeze=False always returns a 2-D array of axes, so a single feature
    # needs no special casing; ravel() flattens it to one axis per feature.
    fig, axes = plt.subplots(1, n, figsize=(5 * n, 4), squeeze=False)
    axes = axes.ravel()

    # The class labels do not depend on the feature, so compute them once.
    class_labels = sorted(full[target].unique())

    for ax, feat in zip(axes, numeric_features):
        for label in class_labels:
            # Values of this feature for one class; missing values are skipped.
            subset = full.loc[full[target] == label, feat].dropna()
            ax.hist(subset, bins=30, alpha=0.6, label=str(label))
        ax.set_title(feat)
        ax.set_xlabel("Value")
        ax.legend()

    fig.suptitle("Feature Distributions by Class", y=1.02, fontsize=14)
    plt.tight_layout()
    plt.show()
No description has been provided for this image
In [8]:
# Horizontal bar chart of the feature-target correlations stored in the
# quality report; negative values are drawn in orange, the rest in blue.
correlations = report["feature_target_correlations"]
if not correlations:
    print("No feature-target correlations available.")
else:
    corr_s = pd.Series(correlations).sort_values()
    colors = corr_s.lt(0).map({True: "#E8734A", False: "#4A90D9"}).tolist()

    fig_height = max(3, len(corr_s) * 0.6)
    fig, ax = plt.subplots(figsize=(8, fig_height))
    corr_s.plot.barh(ax=ax, color=colors)
    ax.set(
        title="Feature-Target Correlations (Pearson)",
        xlabel="Correlation Coefficient",
    )
    # Dashed reference line at zero correlation.
    ax.axvline(0, color="gray", linestyle="--", linewidth=0.8)
    plt.tight_layout()
    plt.show()
No description has been provided for this image
In [9]:
# Grouped bar chart comparing each feature's configured predictive strength
# with the absolute value of the strength measured in the generated data.
strength = report["predictive_strength_verification"]
df_s = pd.DataFrame(strength)
if "measured_strength" in df_s.columns:
    # Keep only the features for which a strength was actually measured.
    df_s = df_s.dropna(subset=["measured_strength"])
else:
    # Empty report (or no such column): dropna(subset=...) would raise KeyError.
    df_s = df_s.iloc[0:0]

if len(df_s) > 0:
    x = np.arange(len(df_s))
    width = 0.35  # width of one bar; the two bars of a feature sit side by side

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(x - width / 2, df_s["configured_strength"], width, label="Configured", color="#4A90D9")
    ax.bar(x + width / 2, df_s["measured_strength"].abs(), width, label="Measured (abs)", color="#E8734A")
    ax.set_xticks(x)
    ax.set_xticklabels(df_s["feature"], rotation=45, ha="right")
    ax.set_ylabel("Strength")
    ax.set_title("Predictive Strength: Configured vs Measured")
    ax.legend()
    plt.tight_layout()
    plt.show()
else:
    print("No measured predictive strengths available.")
No description has been provided for this image
In [10]:
# Print name, description and scalar metrics of every baseline model.
# The nested "per_class" entry is skipped; floats are shown with 4 decimals.
print("Baseline Model Results")
print("=" * 60)
for entry in baseline["models"]:
    print(f"\n  {entry['name']}")
    print(f"  {entry['description']}")
    for metric_name, value in entry["metrics"].items():
        if metric_name != "per_class":
            rendered = f"{value:.4f}" if isinstance(value, float) else f"{value}"
            print(f"    {metric_name}: {rendered}")
Baseline Model Results
============================================================

  Majority Class
  Always predicts 'no' (most common in training set)
    accuracy: 0.7000
    macro_precision: 0.3500
    macro_recall: 0.5000
    macro_f1: 0.4118

  Threshold Classifier
  Single-feature threshold on 'length_of_stay' (strength: 70%)
    accuracy: 0.9575
    macro_precision: 0.9399
    macro_recall: 0.9649
    macro_f1: 0.9522
In [11]:
# Encode categoricals, fill missing values.
# Only the training frame is encoded with drop_first=True. The test frame keeps
# every dummy column and is then aligned to the training columns, so the
# reference category is always the one dropped from *train*. Encoding the test
# frame with drop_first=True as well would drop the first category level found
# in the test split; if that level differs from train's, reindex() would
# zero-fill its column and those rows would silently be treated as the
# reference category.
X_train = pd.get_dummies(train.drop(columns=[target]), drop_first=True)
X_test = pd.get_dummies(test.drop(columns=[target]))
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

y_train = train[target]
y_test = test[target]

# Impute both splits with medians computed once, from the training split only.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("Random Forest Classification Report:\n")
print(classification_report(y_test, y_pred))

# Comparison table: accuracy of each baseline model next to the random forest.
rows = [
    {"Model": model["name"], "Accuracy": model["metrics"].get("accuracy")}
    for model in baseline["models"]
]
rows.append({"Model": "Random Forest", "Accuracy": accuracy_score(y_test, y_pred)})
comparison = pd.DataFrame(rows).set_index("Model")
print("\nAccuracy Comparison:")
print(comparison.to_string(float_format="%.4f"))
Random Forest Classification Report:

              precision    recall  f1-score   support

          no       1.00      0.99      1.00       280
         yes       0.98      1.00      0.99       120

    accuracy                           0.99       400
   macro avg       0.99      1.00      0.99       400
weighted avg       1.00      0.99      1.00       400


Accuracy Comparison:
                      Accuracy
Model                         
Majority Class          0.7000
Threshold Classifier    0.9575
Random Forest           0.9950
In [12]:
# Horizontal bar chart of the fitted forest's feature importances, sorted
# ascending so the largest importance ends up at the top of the chart.
importances = (
    pd.Series(rf.feature_importances_, index=X_train.columns)
    .sort_values()
)

fig_height = max(3, len(importances) * 0.4)
fig, ax = plt.subplots(figsize=(8, fig_height))
importances.plot.barh(ax=ax, color="#4A90D9")
ax.set(title="Random Forest Feature Importances", xlabel="Importance")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [13]:
# Regression example: fit a random forest on the "housing_price" dataset and
# compare it with the stored baseline models.
ds_r = load_dataset("housing_price")
config_r = ds_r["config"]
train_r = ds_r["train"]
test_r = ds_r["test"]
baseline_r = ds_r["baseline"]
target_r = config_r["target"]["name"]

# Prepare
# Only the training frame is encoded with drop_first=True. The test frame keeps
# every dummy column and is aligned to the training columns afterwards, so the
# reference category is always the one dropped from *train*. Using
# drop_first=True on the test frame too could drop a different category there,
# and reindex() would then zero-fill that column and mis-encode those rows.
X_train_r = pd.get_dummies(train_r.drop(columns=[target_r]), drop_first=True)
X_test_r = pd.get_dummies(test_r.drop(columns=[target_r]))
X_test_r = X_test_r.reindex(columns=X_train_r.columns, fill_value=0)

# Impute both splits with medians computed once, from the training split only.
train_medians_r = X_train_r.median()
X_train_r = X_train_r.fillna(train_medians_r)
X_test_r = X_test_r.fillna(train_medians_r)

# Train
rf_r = RandomForestRegressor(n_estimators=100, random_state=42)
rf_r.fit(X_train_r, train_r[target_r])
y_pred_r = rf_r.predict(X_test_r)

r2 = r2_score(test_r[target_r], y_pred_r)
mae = mean_absolute_error(test_r[target_r], y_pred_r)

# Actual vs predicted scatter
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(test_r[target_r], y_pred_r, alpha=0.4, s=12, color="#4A90D9")
# Common value range of actuals and predictions, used for the diagonal
# "perfect prediction" reference line.
lims = [
    min(test_r[target_r].min(), y_pred_r.min()),
    max(test_r[target_r].max(), y_pred_r.max()),
]
ax.plot(lims, lims, "r--", linewidth=1)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title(f"Housing Price: Actual vs Predicted (R2={r2:.3f}, MAE={mae:,.0f})")
plt.tight_layout()
plt.show()

# Comparison
rows = [
    {
        "Model": model["name"],
        "R2": model["metrics"].get("r2"),
        "MAE": model["metrics"].get("mae"),
    }
    for model in baseline_r["models"]
]
rows.append({"Model": "Random Forest", "R2": r2, "MAE": mae})
print(pd.DataFrame(rows).set_index("Model").to_string(float_format="%.4f"))
No description has been provided for this image
                       R2         MAE
Model                                
Mean Predictor    -0.0103 130442.3719
Linear Regression  0.6596  73272.0933
Random Forest      0.6614  72774.3558
In [14]:
# Correlation heatmap of the numeric columns of the "correlated_regression"
# dataset, followed by a configured-vs-actual check for each configured pair.
ds_c = load_dataset("correlated_regression")
df_c, config_c = ds_c["full"], ds_c["config"]

numeric_cols = list(df_c.select_dtypes(include=[np.number]).columns)
corr_matrix = df_c[numeric_cols].corr()

# Diverging palette centred on zero and pinned to the full [-1, 1] range.
heatmap_options = dict(annot=True, fmt=".2f", cmap="RdBu_r", center=0, vmin=-1, vmax=1)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title("Feature Correlation Matrix")
plt.tight_layout()
plt.show()

# Show configured correlations
print("Configured feature correlations:")
for pair in config_c.get("feature_correlations", []):
    feat_a, feat_b = pair["feature_a"], pair["feature_b"]
    actual = df_c[feat_a].corr(df_c[feat_b])
    print(f"  {feat_a} <-> {feat_b}: configured={pair['correlation']:.2f}, actual={actual:.2f}")
No description has been provided for this image
Configured feature correlations:
  marketing_spend <-> customer_count: configured=0.70, actual=0.81
  team_size <-> customer_count: configured=0.50, actual=0.60
In [15]:
# Compare the imperfections configured for the "imperfect_binary" dataset
# (missing values, duplicates, outliers) with what is found in the data.
ds_i = load_dataset("imperfect_binary")
df_i = ds_i["full"]
config_i = ds_i["config"]

print("Data Imperfections Analysis")
print("=" * 50)
print(f"  Configured rows: {config_i['row_count']}")
print(f"  Actual rows:     {len(df_i)}")
print(f"  Duplicate rate:  {config_i['duplicate_rate']}")
print(f"  Label noise:     {config_i['label_noise_rate']}")
print(f"  Outlier rate:    {config_i['outlier_rate']}")

# Missing values per column (only columns that actually contain nulls)
print("\nMissing Values:")
null_counts = df_i.isna().sum()
for col_name, null_count in null_counts[null_counts > 0].items():
    rate = null_count / len(df_i) * 100
    # Find configured rate; columns without a feature entry (or without a
    # "missing_rate" key) are reported as "N/A" with no percent sign.
    feat_cfg = next((f for f in config_i["features"] if f["name"] == col_name), None)
    configured_rate = feat_cfg.get("missing_rate") if feat_cfg else None
    if configured_rate is None:
        configured = "N/A"
    else:
        # Fixed precision avoids float artefacts such as 7.000000000000001.
        configured = f"{configured_rate * 100:.1f}%"
    print(f"  {col_name}: {null_count} nulls ({rate:.1f}%), configured: {configured}")

# Duplicate rows
dups = df_i.duplicated().sum()
print(f"\nDuplicate rows: {dups} ({dups / len(df_i) * 100:.1f}%)")

# Outlier detection (values beyond 3 std from mean)
print("\nPotential outliers (beyond 3 std):")
numeric_features_i = [f["name"] for f in config_i["features"] if f["feature_type"] == "numeric"]
for feat in numeric_features_i:
    values = df_i[feat].dropna()
    if values.empty:
        # Nothing to measure; also avoids dividing by a zero length below.
        print(f"  {feat}: no non-null values")
        continue
    center, spread = values.mean(), values.std()
    outliers = ((values < center - 3 * spread) | (values > center + 3 * spread)).sum()
    print(f"  {feat}: {outliers} ({outliers / len(values) * 100:.1f}%)")
Data Imperfections Analysis
==================================================
  Configured rows: 1500
  Actual rows:     1545
  Duplicate rate:  0.03
  Label noise:     0.05
  Outlier rate:    0.02

Missing Values:
  score: 76 nulls (4.9%), configured: 5.0%
  age: 31 nulls (2.0%), configured: 2.0%
  category: 46 nulls (3.0%), configured: 3.0%

Duplicate rows: 45 (2.9%)

Potential outliers (beyond 3 std):
  score: 29 (2.0%)
  age: 31 (2.0%)
In [16]:
# Turn the per-column statistics of the quality report into one table with a
# row per column; statistics a column lacks show up as NaN.
col_stats = ds["report"]["column_stats"]

rows = [{"column": col_name, **stats} for col_name, stats in col_stats.items()]

stats_df = pd.DataFrame(rows).set_index("column")
stats_df
Out[16]:
null_count unique_count mean std min max median true_count true_ratio
column
age 0 1986 68.607597 13.463214 18.0 100.000000 69.351417 NaN NaN
num_procedures 0 1905 3.743760 1.938844 0.0 9.465672 4.145051 NaN NaN
length_of_stay 0 1670 6.889521 3.863096 1.0 17.377155 7.987439 NaN NaN
num_medications 0 1939 9.595137 4.699061 0.0 22.569377 10.159562 NaN NaN
diagnosis_category 0 5 NaN NaN NaN NaN NaN NaN NaN
has_diabetes 0 2 NaN NaN NaN NaN NaN 323.0 0.1615
readmitted 0 2 NaN NaN NaN NaN NaN NaN NaN
In [17]:
# Report the train/test split sizes and, when the report contains class
# ratios for both splits, show them as a table and as a grouped bar chart.
split_stats = ds["report"].get("split_stats")
if not split_stats:
    print("No split stats (split was not enabled).")
else:
    print(f"Train size: {split_stats['train_size']}")
    print(f"Test size:  {split_stats['test_size']}")
    print(f"Train ratio: {split_stats['train_ratio']:.3f}")

    ratio_keys = ("train_class_ratios", "test_class_ratios")
    if all(key in split_stats for key in ratio_keys):
        split_df = pd.DataFrame(
            {
                "Train": split_stats["train_class_ratios"],
                "Test": split_stats["test_class_ratios"],
            }
        )
        print("\nClass ratios per split:")
        print(split_df.to_string(float_format="%.4f"))

        split_df.plot.bar(figsize=(8, 4), color=["#4A90D9", "#E8734A"])
        plt.title("Class Distribution: Train vs Test")
        plt.ylabel("Ratio")
        plt.xticks(rotation=0)
        plt.tight_layout()
        plt.show()
Train size: 1600
Test size:  400
Train ratio: 0.800

Class ratios per split:
     Train   Test
no  0.7000 0.7000
yes 0.3000 0.3000
No description has been provided for this image