Interpretable California Housing Price Modeling¶

This portfolio case study compares two interpretable modeling strategies for predicting median home values across California census districts: a sparse linear model and a small decision tree. The goal is not only to predict prices, but also to explain which socioeconomic and geographic variables drive the predictions.

# Core setup
from __future__ import annotations

import json
import os
import random
import textwrap
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Markdown, display

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor, export_text, plot_tree

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 140)

Reproducible Project Configuration¶

All modeling choices that control the experiment are centralized here. This makes the notebook easier to rerun and easier to adapt for a website write-up.

PROJECT_NAME = "california_housing_interpretable_models"
RANDOM_STATE = 42
TEST_SIZE = 0.20
INNER_CV_SPLITS = 5
OUTER_CV_SPLITS = 5
N_JOBS = 1
N_ALPHA_GRID = 100

TARGET_NAME = "MedHouseVal"
TARGET_UNIT = "$100,000s"
LOCAL_DATA_PATH = Path("california_housing.csv")
OUTPUT_DIR = Path("portfolio_outputs") / PROJECT_NAME
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

plt.rcParams.update({
    "figure.figsize": (8, 5),
    "axes.titlesize": 13,
    "axes.labelsize": 11,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

inner_cv = KFold(n_splits=INNER_CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)
outer_cv = KFold(n_splits=OUTER_CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)

config = pd.DataFrame({
    "setting": ["project", "target", "target unit", "test size", "inner CV", "outer CV", "random state", "parallel jobs", "Lasso alpha grid", "output folder"],
    "value": [PROJECT_NAME, TARGET_NAME, TARGET_UNIT, TEST_SIZE, INNER_CV_SPLITS, OUTER_CV_SPLITS, RANDOM_STATE, N_JOBS, N_ALPHA_GRID, str(OUTPUT_DIR)],
})

display(config)

# Reusable helpers

def rmse(y_true, y_pred) -> float:
    '''Root mean squared error in target units.'''
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


rmse_scorer = make_scorer(rmse, greater_is_better=False)


def target_to_usd(value: float) -> str:
    '''Convert MedHouseVal units into approximate dollar values.'''
    return f"${value * 100_000:,.0f}"


def save_current_figure(filename: str) -> Path:
    '''Save the active Matplotlib figure into the project output folder.'''
    path = OUTPUT_DIR / filename
    plt.tight_layout()
    plt.savefig(path, dpi=180, bbox_inches="tight")
    return path


def show_table(df: pd.DataFrame, title: str | None = None, formats: dict | None = None):
    '''Display compact, readable tables for notebook and exported HTML views.'''
    if title:
        display(Markdown(f"**{title}**"))
    if formats:
        display(df.style.format(formats))
    else:
        display(df)


def save_table(df: pd.DataFrame, filename: str) -> Path:
    path = OUTPUT_DIR / filename
    df.to_csv(path, index=False)
    return path

Problem Framing and Dataset¶

The applied task is to estimate median home value for California census districts using a small set of demographic, housing, and geographic predictors. For a portfolio project, the key question is broader than model accuracy: which features consistently explain housing value predictions, and how stable are those explanations?

Interpretability matters because housing models can affect policy analysis, affordability research, investment planning, and model governance. A transparent model helps reveal whether predictions are driven by expected economic signals such as income, by geographic proxies such as latitude and longitude, or by potentially unstable artifacts.

def load_california_data(local_path: Path = LOCAL_DATA_PATH) -> tuple[pd.DataFrame, list[str]]:
    '''Load the California Housing dataset and return a modeling DataFrame plus feature names.

    The preferred source is scikit-learn's built-in dataset. If a local CSV named
    `california_housing.csv` is available with the same sklearn-style columns, the
    notebook can use that file instead.
    '''
    expected_features = [
        "MedInc", "HouseAge", "AveRooms", "AveBedrms",
        "Population", "AveOccup", "Latitude", "Longitude",
    ]

    if local_path.exists():
        local_df = pd.read_csv(local_path)
        missing = [col for col in expected_features + [TARGET_NAME] if col not in local_df.columns]
        if missing:
            raise ValueError(
                f"Local file {local_path} is missing expected columns: {missing}. "
                "Use sklearn's California Housing format or remove the local file to fetch the dataset."
            )
        return local_df[expected_features + [TARGET_NAME]].copy(), expected_features

    try:
        housing = fetch_california_housing(as_frame=True)
    except Exception as exc:
        raise RuntimeError(
            "Could not load the California Housing dataset. Run this notebook in an environment "
            "with internet access once so scikit-learn can download and cache the data, or place "
            "a sklearn-style `california_housing.csv` beside the notebook."
        ) from exc

    data = housing.data.copy()
    data[TARGET_NAME] = housing.target
    return data, list(housing.feature_names)


housing_df, feature_names = load_california_data()
X = housing_df[feature_names].copy()
y = housing_df[TARGET_NAME].copy()

feature_descriptions = pd.DataFrame({
    "feature": feature_names,
    "description": [
        "Median income in the census district",
        "Median house age",
        "Average number of rooms per household",
        "Average number of bedrooms per household",
        "Population in the census district",
        "Average number of household members",
        "Latitude of the census district",
        "Longitude of the census district",
    ],
})

dataset_summary = pd.DataFrame({
    "metric": ["rows", "features", "target", "target units", "missing values", "duplicate rows"],
    "value": [len(housing_df), len(feature_names), TARGET_NAME, TARGET_UNIT, int(housing_df.isna().sum().sum()), int(housing_df.duplicated().sum())],
})

show_table(dataset_summary, "Dataset overview")
show_table(feature_descriptions, "Feature dictionary")
show_table(housing_df.head(), "Sample records", {col: "{:.3f}" for col in housing_df.select_dtypes("number").columns})

Data Profile¶

Before fitting models, I summarize the feature distributions and their simple correlations with median home value. These summaries provide context for interpreting model behavior later, but they are not used as final evidence by themselves because correlations do not capture nonlinear interactions.

feature_profile = X.describe().T.reset_index().rename(columns={"index": "feature"})
feature_profile = feature_profile[["feature", "mean", "std", "min", "25%", "50%", "75%", "max"]]

correlations = (
    housing_df.corr(numeric_only=True)[TARGET_NAME]
    .drop(TARGET_NAME)
    .sort_values(key=lambda s: s.abs(), ascending=False)
    .reset_index()
    .rename(columns={"index": "feature", TARGET_NAME: "correlation_with_target"})
)

show_table(
    feature_profile,
    "Feature distribution summary",
    {"mean": "{:.3f}", "std": "{:.3f}", "min": "{:.3f}", "25%": "{:.3f}", "50%": "{:.3f}", "75%": "{:.3f}", "max": "{:.3f}"},
)
show_table(correlations, "Feature correlations with median home value", {"correlation_with_target": "{:.3f}"})

save_table(feature_profile, "feature_profile.csv")
save_table(correlations, "target_correlations.csv")

PosixPath('portfolio_outputs/california_housing_interpretable_models/target_correlations.csv')

fig, ax = plt.subplots(figsize=(8, 4.5))
ax.hist(y, bins=40)
ax.set_title("Distribution of Median Home Values")
ax.set_xlabel(f"{TARGET_NAME} ({TARGET_UNIT})")
ax.set_ylabel("Number of census districts")
target_distribution_path = save_current_figure("target_distribution.png")
plt.show()

Modeling Strategy¶

I compare two interpretable approaches that explain predictions in different ways:

Lasso regression: a standardized sparse linear model. Coefficients show the direction and approximate strength of each feature while the 1-standard-error rule encourages a simpler model.
Decision tree regression: a constrained tree that produces human-readable threshold rules. The tree can model interactions, but it must be kept small enough to interpret.

Both approaches are evaluated on a held-out test set. Cross-validation on the training set is used to check whether performance is stable rather than tied to one lucky split.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

split_summary = pd.DataFrame({
    "split": ["training", "test"],
    "rows": [len(X_train), len(X_test)],
    "share": [len(X_train) / len(X), len(X_test) / len(X)],
})

show_table(split_summary, "Train/test split", {"share": "{:.1%}"})

Sparse Linear Model: Lasso Regression¶

The Lasso model is useful as a transparent global benchmark. Features are standardized before fitting, so coefficient magnitudes are directly comparable: a larger absolute coefficient means that a one-standard-deviation change in that feature has a larger modeled association with predicted home value.

# Accuracy-oriented LassoCV model
lasso_cv_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LassoCV(cv=inner_cv, n_alphas=N_ALPHA_GRID, random_state=RANDOM_STATE, max_iter=10_000)),
])

lasso_cv_pipe.fit(X_train, y_train)
lasso_cv_pred = lasso_cv_pipe.predict(X_test)
lasso_cv_model = lasso_cv_pipe.named_steps["model"]

lasso_cv_metrics = {
    "model": "LassoCV",
    "role": "accuracy-oriented sparse linear model",
    "test_rmse": rmse(y_test, lasso_cv_pred),
    "test_mae": mean_absolute_error(y_test, lasso_cv_pred),
    "test_r2": r2_score(y_test, lasso_cv_pred),
    "selected_alpha": float(lasso_cv_model.alpha_),
}

lasso_cv_rmse = -cross_val_score(
    lasso_cv_pipe,
    X_train,
    y_train,
    cv=outer_cv,
    scoring=rmse_scorer,
    n_jobs=N_JOBS,
)

# 1-standard-error rule for a simpler coefficient profile
mse_path = lasso_cv_model.mse_path_
mean_mse = mse_path.mean(axis=1)
se_mse = mse_path.std(axis=1, ddof=1) / np.sqrt(mse_path.shape[1])
best_idx = int(np.argmin(mean_mse))
threshold = mean_mse[best_idx] + se_mse[best_idx]
alpha_1se = float(lasso_cv_model.alphas_[mean_mse <= threshold].max())

lasso_1se_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Lasso(alpha=alpha_1se, max_iter=10_000)),
])
lasso_1se_pipe.fit(X_train, y_train)
lasso_1se_pred = lasso_1se_pipe.predict(X_test)

lasso_1se_rmse_cv = -cross_val_score(
    lasso_1se_pipe,
    X_train,
    y_train,
    cv=outer_cv,
    scoring=rmse_scorer,
    n_jobs=N_JOBS,
)

lasso_metrics = pd.DataFrame([
    {
        **lasso_cv_metrics,
        "cv_rmse_mean": float(lasso_cv_rmse.mean()),
        "cv_rmse_std": float(lasso_cv_rmse.std()),
        "rmse_usd": target_to_usd(lasso_cv_metrics["test_rmse"]),
    },
    {
        "model": "Lasso 1-SE",
        "role": "more interpretable sparse linear model",
        "test_rmse": rmse(y_test, lasso_1se_pred),
        "test_mae": mean_absolute_error(y_test, lasso_1se_pred),
        "test_r2": r2_score(y_test, lasso_1se_pred),
        "selected_alpha": alpha_1se,
        "cv_rmse_mean": float(lasso_1se_rmse_cv.mean()),
        "cv_rmse_std": float(lasso_1se_rmse_cv.std()),
        "rmse_usd": target_to_usd(rmse(y_test, lasso_1se_pred)),
    },
])

lasso_coef_series = pd.Series(lasso_1se_pipe.named_steps["model"].coef_, index=feature_names)
lasso_coefficients = (
    pd.DataFrame({
        "feature": feature_names,
        "standardized_coefficient": lasso_coef_series.values,
        "absolute_coefficient": np.abs(lasso_coef_series.values),
        "kept_by_lasso_1se": lasso_coef_series.values != 0,
    })
    .sort_values("absolute_coefficient", ascending=False)
    .reset_index(drop=True)
)

show_table(
    lasso_metrics,
    "Lasso model performance",
    {"test_rmse": "{:.4f}", "test_mae": "{:.4f}", "test_r2": "{:.3f}", "selected_alpha": "{:.6f}", "cv_rmse_mean": "{:.4f}", "cv_rmse_std": "{:.4f}"},
)
show_table(
    lasso_coefficients,
    "Lasso 1-SE standardized coefficients",
    {"standardized_coefficient": "{:+.3f}", "absolute_coefficient": "{:.3f}"},
)

save_table(lasso_metrics, "lasso_metrics.csv")
save_table(lasso_coefficients, "lasso_coefficients.csv")

PosixPath('portfolio_outputs/california_housing_interpretable_models/lasso_coefficients.csv')

Rule-Based Model: Small Decision Tree¶

The decision tree is constrained to remain readable. Instead of optimizing for the deepest possible tree, the grid search considers only shallow trees and minimum leaf sizes large enough to avoid overly specific rules.

tree_param_grid = {
    "max_depth": [2, 3, 4, 5],
    "min_samples_leaf": [25, 50, 100],
}

tree_search = GridSearchCV(
    DecisionTreeRegressor(random_state=RANDOM_STATE),
    param_grid=tree_param_grid,
    scoring=rmse_scorer,
    cv=inner_cv,
    n_jobs=N_JOBS,
    refit=True,
)

tree_cv_rmse = -cross_val_score(
    tree_search,
    X_train,
    y_train,
    cv=outer_cv,
    scoring=rmse_scorer,
    n_jobs=N_JOBS,
)

tree_search.fit(X_train, y_train)
tree_model = tree_search.best_estimator_
tree_pred = tree_model.predict(X_test)

tree_metrics = pd.DataFrame([
    {
        "model": "Decision Tree",
        "role": "interpretable nonlinear rule model",
        "test_rmse": rmse(y_test, tree_pred),
        "test_mae": mean_absolute_error(y_test, tree_pred),
        "test_r2": r2_score(y_test, tree_pred),
        "cv_rmse_mean": float(tree_cv_rmse.mean()),
        "cv_rmse_std": float(tree_cv_rmse.std()),
        "rmse_usd": target_to_usd(rmse(y_test, tree_pred)),
        "max_depth": tree_model.get_depth(),
        "n_leaves": tree_model.get_n_leaves(),
        "best_params": json.dumps(tree_search.best_params_),
    }
])

tree_importances = (
    pd.Series(tree_model.feature_importances_, index=feature_names, name="importance")
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={"index": "feature"})
)

tree_rules_full = export_text(tree_model, feature_names=feature_names)
tree_rules_preview = export_text(tree_model, feature_names=feature_names, max_depth=3)
(OUTPUT_DIR / "decision_tree_rules.txt").write_text(tree_rules_full)

show_table(
    tree_metrics,
    "Decision tree performance and size",
    {"test_rmse": "{:.4f}", "test_mae": "{:.4f}", "test_r2": "{:.3f}", "cv_rmse_mean": "{:.4f}", "cv_rmse_std": "{:.4f}"},
)
show_table(tree_importances, "Decision tree feature importances", {"importance": "{:.3f}"})

display(Markdown("**Preview of the top decision rules**"))
display(Markdown("```text\n" + tree_rules_preview + "\n```"))

save_table(tree_metrics, "decision_tree_metrics.csv")
save_table(tree_importances, "decision_tree_importances.csv")

|--- MedInc <= 5.09
|   |--- MedInc <= 3.07
|   |   |--- AveRooms <= 4.31
|   |   |   |--- MedInc <= 2.21
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- MedInc >  2.21
|   |   |   |   |--- truncated branch of depth 2
|   |   |--- AveRooms >  4.31
|   |   |   |--- MedInc <= 2.42
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- MedInc >  2.42
|   |   |   |   |--- truncated branch of depth 2
|   |--- MedInc >  3.07
|   |   |--- AveOccup <= 2.42
|   |   |   |--- Latitude <= 37.93
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- Latitude >  37.93
|   |   |   |   |--- truncated branch of depth 2
|   |   |--- AveOccup >  2.42
|   |   |   |--- MedInc <= 4.12
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- MedInc >  4.12
|   |   |   |   |--- truncated branch of depth 2
|--- MedInc >  5.09
|   |--- MedInc <= 6.89
|   |   |--- AveOccup <= 2.67
|   |   |   |--- HouseAge <= 21.50
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- HouseAge >  21.50
|   |   |   |   |--- truncated branch of depth 2
|   |   |--- AveOccup >  2.67
|   |   |   |--- MedInc <= 5.74
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- MedInc >  5.74
|   |   |   |   |--- truncated branch of depth 2
|   |--- MedInc >  6.89
|   |   |--- MedInc <= 7.82
|   |   |   |--- HouseAge <= 26.50
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- HouseAge >  26.50
|   |   |   |   |--- truncated branch of depth 2
|   |   |--- MedInc >  7.82
|   |   |   |--- MedInc <= 8.87
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- MedInc >  8.87
|   |   |   |   |--- truncated branch of depth 2

PosixPath('portfolio_outputs/california_housing_interpretable_models/decision_tree_importances.csv')

fig, ax = plt.subplots(figsize=(18, 7))
plot_tree(
    tree_model,
    feature_names=feature_names,
    filled=True,
    rounded=True,
    fontsize=9,
    max_depth=2,
    ax=ax,
)
ax.set_title("Top Levels of the Selected Decision Tree")
tree_preview_path = save_current_figure("decision_tree_top_levels.png")
plt.show()

Model Comparison¶

The comparison focuses on two dimensions: predictive accuracy and explanation style. The Lasso model gives a compact global coefficient profile, while the decision tree explains predictions through threshold rules and feature importances.

model_comparison = pd.concat([
    lasso_metrics[["model", "role", "test_rmse", "test_mae", "test_r2", "cv_rmse_mean", "cv_rmse_std", "rmse_usd"]],
    tree_metrics[["model", "role", "test_rmse", "test_mae", "test_r2", "cv_rmse_mean", "cv_rmse_std", "rmse_usd"]],
], ignore_index=True)

model_comparison = model_comparison.sort_values("test_rmse").reset_index(drop=True)

show_table(
    model_comparison,
    "Model performance comparison",
    {"test_rmse": "{:.4f}", "test_mae": "{:.4f}", "test_r2": "{:.3f}", "cv_rmse_mean": "{:.4f}", "cv_rmse_std": "{:.4f}"},
)

save_table(model_comparison, "model_comparison.csv")

PosixPath('portfolio_outputs/california_housing_interpretable_models/model_comparison.csv')

feature_comparison = pd.DataFrame({
    "feature": feature_names,
    "lasso_abs_coefficient": lasso_coef_series.abs().reindex(feature_names).values,
    "tree_importance": pd.Series(tree_model.feature_importances_, index=feature_names).reindex(feature_names).values,
})
feature_comparison["lasso_abs_coefficient_normalized"] = feature_comparison["lasso_abs_coefficient"] / feature_comparison["lasso_abs_coefficient"].max()
feature_comparison["tree_importance_normalized"] = feature_comparison["tree_importance"] / feature_comparison["tree_importance"].max()
feature_comparison = feature_comparison.sort_values("lasso_abs_coefficient", ascending=False).reset_index(drop=True)

x = np.arange(len(feature_comparison))
width = 0.38

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(x - width / 2, feature_comparison["lasso_abs_coefficient_normalized"], width, label="Lasso 1-SE |coefficient|")
ax.bar(x + width / 2, feature_comparison["tree_importance_normalized"], width, label="Decision tree importance")
ax.set_xticks(x)
ax.set_xticklabels(feature_comparison["feature"], rotation=35, ha="right")
ax.set_ylabel("Normalized importance")
ax.set_title("Global Feature Importance: Lasso vs Decision Tree")
ax.legend()
importance_comparison_path = save_current_figure("feature_importance_comparison.png")
plt.show()

save_table(feature_comparison, "feature_importance_comparison.csv")

PosixPath('portfolio_outputs/california_housing_interpretable_models/feature_importance_comparison.csv')

plot_df = model_comparison.copy()
x = np.arange(len(plot_df))
width = 0.38

fig, ax = plt.subplots(figsize=(8, 4.5))
ax.bar(x - width / 2, plot_df["test_rmse"], width, label="Held-out test RMSE")
ax.bar(x + width / 2, plot_df["cv_rmse_mean"], width, yerr=plot_df["cv_rmse_std"], capsize=5, label="Training CV RMSE")
ax.set_xticks(x)
ax.set_xticklabels(plot_df["model"], rotation=20, ha="right")
ax.set_ylabel(f"RMSE ({TARGET_UNIT})")
ax.set_title("Out-of-Sample Error Comparison")
ax.legend()
error_comparison_path = save_current_figure("model_error_comparison.png")
plt.show()

Robustness Check: Tree Hyperparameter Sensitivity¶

Decision tree explanations can be sensitive to depth and leaf-size choices. To test whether the main interpretability conclusion is stable, I sweep a wider range of tree settings and track which features appear as the most important and among the top three most important features.

from collections import Counter

sensitivity_depths = [2, 3, 4, 5, 6, 7, 8]
sensitivity_leaf_sizes = [20, 50, 100, 200]

sensitivity_rows = []
for max_depth in sensitivity_depths:
    for min_samples_leaf in sensitivity_leaf_sizes:
        candidate_tree = DecisionTreeRegressor(
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            random_state=RANDOM_STATE,
        )
        cv_scores = -cross_val_score(
            candidate_tree,
            X_train,
            y_train,
            cv=outer_cv,
            scoring=rmse_scorer,
            n_jobs=N_JOBS,
        )
        candidate_tree.fit(X_train, y_train)
        importances = pd.Series(candidate_tree.feature_importances_, index=feature_names).sort_values(ascending=False)
        top3 = tuple(importances.head(3).index)

        sensitivity_rows.append({
            "max_depth": max_depth,
            "min_samples_leaf": min_samples_leaf,
            "cv_rmse_mean": float(cv_scores.mean()),
            "cv_rmse_std": float(cv_scores.std()),
            "top_feature": importances.index[0],
            "top_feature_importance": float(importances.iloc[0]),
            "top3_features": top3,
        })

sensitivity_results = pd.DataFrame(sensitivity_rows).sort_values("cv_rmse_mean").reset_index(drop=True)

top_feature_counts = (
    sensitivity_results["top_feature"]
    .value_counts()
    .rename_axis("feature")
    .reset_index(name="top_feature_count")
)
top_feature_counts["top_feature_rate"] = top_feature_counts["top_feature_count"] / len(sensitivity_results)

top3_counts = Counter(feature for features in sensitivity_results["top3_features"] for feature in features)
top3_stability = (
    pd.DataFrame({"feature": list(top3_counts.keys()), "top3_count": list(top3_counts.values())})
    .sort_values("top3_count", ascending=False)
    .reset_index(drop=True)
)
top3_stability["top3_rate"] = top3_stability["top3_count"] / len(sensitivity_results)

show_table(
    sensitivity_results.head(10),
    "Best tree settings in sensitivity sweep",
    {"cv_rmse_mean": "{:.4f}", "cv_rmse_std": "{:.4f}", "top_feature_importance": "{:.3f}"},
)
show_table(top_feature_counts, "Most important feature stability", {"top_feature_rate": "{:.1%}"})
show_table(top3_stability, "Top-three feature stability", {"top3_rate": "{:.1%}"})

save_table(sensitivity_results, "tree_sensitivity_results.csv")
save_table(top_feature_counts, "tree_top_feature_stability.csv")
save_table(top3_stability, "tree_top3_stability.csv")

PosixPath('portfolio_outputs/california_housing_interpretable_models/tree_top3_stability.csv')

heatmap_data = sensitivity_results.pivot(index="max_depth", columns="min_samples_leaf", values="cv_rmse_mean").sort_index()

fig, ax = plt.subplots(figsize=(8, 5))
im = ax.imshow(heatmap_data.values, aspect="auto")
ax.set_xticks(np.arange(len(heatmap_data.columns)))
ax.set_xticklabels(heatmap_data.columns)
ax.set_yticks(np.arange(len(heatmap_data.index)))
ax.set_yticklabels(heatmap_data.index)
ax.set_xlabel("Minimum samples per leaf")
ax.set_ylabel("Maximum tree depth")
ax.set_title("Tree Sensitivity: Cross-Validated RMSE")

for i in range(heatmap_data.shape[0]):
    for j in range(heatmap_data.shape[1]):
        ax.text(j, i, f"{heatmap_data.values[i, j]:.3f}", ha="center", va="center", fontsize=9)

fig.colorbar(im, ax=ax, label=f"CV RMSE ({TARGET_UNIT})")
sensitivity_heatmap_path = save_current_figure("tree_sensitivity_heatmap.png")
plt.show()

Case Study Summary¶

The final summary is generated from the computed results so the notebook remains accurate if it is rerun with updated package versions or a different data split.

best_model_row = model_comparison.sort_values("test_rmse").iloc[0]
strongest_lasso = lasso_coefficients.iloc[0]
strongest_tree = tree_importances.iloc[0]
most_stable_tree_feature = top_feature_counts.iloc[0]

summary_text = f'''
### Main findings

- The lowest held-out test RMSE in this run is from **{best_model_row['model']}**, with RMSE = **{best_model_row['test_rmse']:.3f}** ({target_to_usd(best_model_row['test_rmse'])} in approximate dollar terms).
- The strongest Lasso 1-SE coefficient is **{strongest_lasso['feature']}** with standardized coefficient **{strongest_lasso['standardized_coefficient']:+.3f}**.
- The most important decision-tree feature is **{strongest_tree['feature']}** with importance **{strongest_tree['importance']:.3f}**.
- In the tree sensitivity sweep, **{most_stable_tree_feature['feature']}** is the top feature in **{most_stable_tree_feature['top_feature_count']} of {len(sensitivity_results)}** tested settings.

### Interpretation

Across the interpretable models, median income is the most stable signal for predicting median home value. The sparse linear model also assigns large weight to latitude and longitude, suggesting that geography captures broad regional price gradients. The decision tree uses income as the first major split and then refines predictions with variables such as occupancy, age, and location depending on the branch.

### Limitations

This project uses a historical census dataset, so the results should be interpreted as a modeling and interpretability exercise rather than a current housing-market analysis. The features are aggregated at the district level, and location variables can act as proxies for unobserved economic, regulatory, or neighborhood factors.
'''

display(Markdown(summary_text))
(OUTPUT_DIR / "case_study_summary.md").write_text(summary_text.strip())

1206

	MedInc	HouseAge	AveRooms	AveBedrms	Population	AveOccup	Latitude	Longitude	MedHouseVal
0	8.325	41.000	6.984	1.024	322.000	2.556	37.880	-122.230	4.526
1	8.301	21.000	6.238	0.972	2401.000	2.110	37.860	-122.220	3.585
2	7.257	52.000	8.288	1.073	496.000	2.802	37.850	-122.240	3.521
3	5.643	52.000	5.817	1.073	558.000	2.548	37.850	-122.250	3.413
4	3.846	52.000	6.282	1.081	565.000	2.181	37.850	-122.250	3.422

	feature	mean	std	min	25%	50%	75%	max
0	MedInc	3.871	1.900	0.500	2.563	3.535	4.743	15.000
1	HouseAge	28.639	12.586	1.000	18.000	29.000	37.000	52.000
2	AveRooms	5.429	2.474	0.846	4.441	5.229	6.052	141.909
3	AveBedrms	1.097	0.474	0.333	1.006	1.049	1.100	34.067
4	Population	1425.477	1132.462	3.000	787.000	1166.000	1725.000	35682.000
5	AveOccup	3.071	10.386	0.692	2.430	2.818	3.282	1243.333
6	Latitude	35.632	2.136	32.540	33.930	34.260	37.710	41.950
7	Longitude	-119.570	2.004	-124.350	-121.800	-118.490	-118.010	-114.310

	model	role	test_rmse	test_mae	test_r2	selected_alpha	cv_rmse_mean	cv_rmse_std	rmse_usd
0	LassoCV	accuracy-oriented sparse linear model	0.7448	0.5332	0.577	0.000799	0.7220	0.0145	$74,482
1	Lasso 1-SE	more interpretable sparse linear model	0.7408	0.5373	0.581	0.013014	0.7265	0.0131	$74,084

	model	role	test_rmse	test_mae	test_r2	cv_rmse_mean	cv_rmse_std	rmse_usd
0	Decision Tree	interpretable nonlinear rule model	0.7248	0.5225	0.599	0.7211	0.0098	$72,483
1	Lasso 1-SE	more interpretable sparse linear model	0.7408	0.5373	0.581	0.7265	0.0131	$74,084
2	LassoCV	accuracy-oriented sparse linear model	0.7448	0.5332	0.577	0.7220	0.0145	$74,482

	max_depth	min_samples_leaf	cv_rmse_mean	cv_rmse_std	top_feature	top_feature_importance	top3_features
0	8	20	0.6441	0.0155	MedInc	0.675	('MedInc', 'AveOccup', 'Latitude')
1	8	50	0.6528	0.0102	MedInc	0.688	('MedInc', 'AveOccup', 'Latitude')
2	8	100	0.6669	0.0125	MedInc	0.706	('MedInc', 'AveOccup', 'Latitude')
3	7	20	0.6688	0.0129	MedInc	0.701	('MedInc', 'AveOccup', 'Latitude')
4	7	50	0.6697	0.0103	MedInc	0.708	('MedInc', 'AveOccup', 'HouseAge')
5	7	100	0.6777	0.0131	MedInc	0.722	('MedInc', 'AveOccup', 'HouseAge')
6	6	50	0.6893	0.0112	MedInc	0.736	('MedInc', 'AveOccup', 'HouseAge')
7	6	20	0.6898	0.0135	MedInc	0.733	('MedInc', 'AveOccup', 'HouseAge')
8	6	100	0.6922	0.0110	MedInc	0.740	('MedInc', 'AveOccup', 'HouseAge')
9	8	200	0.6950	0.0095	MedInc	0.736	('MedInc', 'AveOccup', 'HouseAge')