{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-01",
   "metadata": {},
   "source": [
    "# Evidence Readiness for a Credit Risk Pipeline\n",
    "\n",
    "This notebook is a local, end-to-end Python demo. It creates a synthetic credit-risk dataset, trains a small model, logs run evidence, generates SHAP explanations, writes a model-card HTML artifact, and saves an Evidently drift report.\n",
    "\n",
    "Run it from the folder where you downloaded the notebook. Generated files are written to `data/` and `output/`.\n",
    "\n",
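    "Required Python packages: `numpy`, `pandas`, `matplotlib`, `scikit-learn`, `mlflow`, `shap`, `evidently`, and `pyarrow`. The code uses the MLflow 3 `log_model(..., name=...)` signature and the Evidently 0.7+ `from evidently import Report` API, so older releases of those two packages will not run it unchanged."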
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-02",
   "metadata": {},
   "source": [
    "## 0 · Setup\n",
    "\n",
    "Create local output folders and load the Python libraries used throughout the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "code-03",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "from dataclasses import dataclass\n",
    "from html import escape\n",
    "from pathlib import Path\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# All inputs and outputs live under the directory the notebook is run from.\n",
    "ROOT = Path.cwd()\n",
    "DATA, OUT = ROOT / \"data\", ROOT / \"output\"\n",
    "MLFLOW_DB = OUT / \"mlflow.db\"\n",
    "MLFLOW_ARTIFACTS = OUT / \"mlflow_artifacts\"\n",
    "\n",
    "# Make sure every target folder exists before any cell writes to it.\n",
    "for directory in (DATA, OUT, MLFLOW_ARTIFACTS):\n",
    "    directory.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# Paths the notebook is expected to produce; list_artifacts() reports the ones that exist.\n",
    "ARTIFACTS = [\n",
    "    *(DATA / name for name in (\"credit_reference.parquet\", \"credit_production.parquet\")),\n",
    "    *(\n",
    "        OUT / name\n",
    "        for name in (\n",
    "            \"lineage.json\",\n",
    "            \"shap_summary.png\",\n",
    "            \"shap_waterfall.png\",\n",
    "            \"model_card.html\",\n",
    "            \"credit_drift_report.html\",\n",
    "            \"drift_report.png\",\n",
    "        )\n",
    "    ),\n",
    "    MLFLOW_DB,\n",
    "    MLFLOW_ARTIFACTS,\n",
    "]\n",
    "\n",
    "def list_artifacts():\n",
    "    \"\"\"Print and return the entries of ARTIFACTS that currently exist on disk.\n",
    "\n",
    "    Returns an empty list, after printing a hint, when none of them exist yet.\n",
    "    \"\"\"\n",
    "    found = [candidate for candidate in ARTIFACTS if candidate.exists()]\n",
    "    if found:\n",
    "        print(\"Generated artifacts:\")\n",
    "        for candidate in found:\n",
    "            print(f\"- {candidate.resolve()}\")\n",
    "        return found\n",
    "    print(\"No artifacts yet. Run the notebook from top to bottom first.\")\n",
    "    return []\n",
    "\n",
    "# Echo the resolved folders so it is clear where generated files will land.\n",
    "print(f\"Writing data to: {DATA.resolve()}\")\n",
    "print(f\"Writing outputs to: {OUT.resolve()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-04",
   "metadata": {},
   "source": [
    "## 1 · Synthetic Credit-Risk Data\n",
    "\n",
    "Create two snapshots: a reference distribution representing training data and a later production distribution with intentional drift. The drift is visible in age, income, employment length, and especially credit score."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "code-05",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_credit_data(n: int, drift: bool = False, seed: int = 42):\n",
    "    \"\"\"Simulate `n` credit applications with a binary `default` outcome.\n",
    "\n",
    "    Args:\n",
    "        n: Number of rows to generate.\n",
    "        drift: If True, use a lower mean age and credit score and scale income\n",
    "            and employment length down, giving a shifted population.\n",
    "        seed: Seed for the NumPy generator; the same arguments give the same frame.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with eight numeric features, the categorical `purpose` and\n",
    "        `housing` columns, and a 0/1 `default` label.\n",
    "    \"\"\"\n",
    "    rng = np.random.default_rng(seed)\n",
    "    age_mean, score_mean = (38, 645) if drift else (42, 680)\n",
    "\n",
    "    # All draws share one generator, so the order of the sampling calls below\n",
    "    # determines the data produced for a given seed.\n",
    "    age = rng.normal(age_mean, 12, n).clip(18, 80).round()\n",
    "    income_eur = rng.lognormal(mean=10.6, sigma=0.5, size=n).clip(8000, 200000)\n",
    "    employment_years = rng.gamma(shape=2.0, scale=4.0, size=n).clip(0, 40)\n",
    "    loan_amount_eur = rng.lognormal(mean=9.5, sigma=0.6, size=n).clip(1000, 100000)\n",
    "    loan_term_months = rng.choice(\n",
    "        [12, 24, 36, 48, 60, 72], n, p=[0.10, 0.20, 0.30, 0.20, 0.15, 0.05]\n",
    "    )\n",
    "    existing_credits = rng.poisson(lam=1.5, size=n).clip(0, 10)\n",
    "    credit_score = rng.normal(score_mean, 80, n).clip(300, 850).round()\n",
    "    purpose = rng.choice(\n",
    "        [\"car\", \"education\", \"home_improvement\", \"debt_consolidation\", \"business\", \"other\"],\n",
    "        n,\n",
    "        p=[0.25, 0.10, 0.20, 0.25, 0.10, 0.10],\n",
    "    )\n",
    "    housing = rng.choice([\"own\", \"rent\", \"free\"], n, p=[0.55, 0.40, 0.05])\n",
    "\n",
    "    if drift:\n",
    "        # The drift scaling is applied to the already-clipped values.\n",
    "        income_eur = income_eur * 0.92\n",
    "        employment_years = employment_years * 0.85\n",
    "\n",
    "    # Twelve monthly instalments (straight-line instalment times 1.05) relative to income.\n",
    "    dti = (loan_amount_eur / loan_term_months * 1.05 * 12) / income_eur\n",
    "\n",
    "    columns = {\n",
    "        \"age\": age.astype(int),\n",
    "        \"income_eur\": income_eur.round(2),\n",
    "        \"employment_years\": employment_years.round(1),\n",
    "        \"loan_amount_eur\": loan_amount_eur.round(2),\n",
    "        \"loan_term_months\": loan_term_months,\n",
    "        \"existing_credits\": existing_credits,\n",
    "        \"credit_score\": credit_score.astype(int),\n",
    "        \"dti_ratio\": dti.round(3),\n",
    "        \"purpose\": purpose,\n",
    "        \"housing\": housing,\n",
    "    }\n",
    "    df = pd.DataFrame(columns)\n",
    "\n",
    "    # The label is drawn from a logistic model over the rounded feature columns.\n",
    "    logit_terms = (\n",
    "        -0.015 * (df[\"credit_score\"] - 680),\n",
    "        2.0 * df[\"dti_ratio\"],\n",
    "        -0.05 * df[\"employment_years\"],\n",
    "        -0.000005 * df[\"income_eur\"],\n",
    "        0.3 * (df[\"existing_credits\"] > 3).astype(int),\n",
    "    )\n",
    "    risk_logit = -2.5\n",
    "    for term in logit_terms:\n",
    "        risk_logit = risk_logit + term\n",
    "\n",
    "    prob_default = 1 / (1 + np.exp(-risk_logit))\n",
    "    df[\"default\"] = (rng.uniform(size=n) < prob_default).astype(int)\n",
    "    return df\n",
    "\n",
    "\n",
    "# Fixed, distinct seeds make both snapshots reproducible across runs.\n",
    "reference_df = generate_credit_data(n=2000, drift=False, seed=42)\n",
    "production_df = generate_credit_data(n=1500, drift=True, seed=99)\n",
    "\n",
    "# Persist both snapshots; the lineage cell records these file paths.\n",
    "for snapshot_df, snapshot_name in (\n",
    "    (reference_df, \"credit_reference.parquet\"),\n",
    "    (production_df, \"credit_production.parquet\"),\n",
    "):\n",
    "    snapshot_df.to_parquet(DATA / snapshot_name, index=False)\n",
    "\n",
    "print(f\"Reference:  {reference_df.shape} - default rate {reference_df['default'].mean():.1%}\")\n",
    "print(f\"Production: {production_df.shape} - default rate {production_df['default'].mean():.1%}\")\n",
    "reference_df.head()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-06",
   "metadata": {},
   "source": [
    "---\n",
    "\n",
    "## 2 · Data Lineage Evidence\n",
    "\n",
    "A high-risk ML system should be able to reconstruct which data trained a model. In a real pipeline this evidence would live in DVC, a data catalog, warehouse lineage, or a feature platform. In this notebook, the reproducible evidence artifacts are the generated parquet snapshots in `data/`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "code-06-lineage",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Record which snapshots and columns fed the model so the run can be traced to its data.\n",
    "training_snapshot = DATA / \"credit_reference.parquet\"\n",
    "production_snapshot = DATA / \"credit_production.parquet\"\n",
    "\n",
    "lineage = {\n",
    "    \"data\": {\n",
    "        \"training_snapshot\": str(training_snapshot.resolve()),\n",
    "        \"production_snapshot\": str(production_snapshot.resolve()),\n",
    "        \"training_rows\": len(reference_df),\n",
    "        \"production_rows\": len(production_df),\n",
    "        \"feature_columns\": list(reference_df.columns.drop(\"default\")),\n",
    "        \"label\": \"default\",\n",
    "    },\n",
    "    \"governance\": {\n",
    "        \"source\": \"synthetic_credit_risk_demo\",\n",
    "        \"owner\": \"risk_analytics\",\n",
    "        \"versioning_note\": \"In production, point this to DVC, feature-store, or warehouse snapshot IDs.\",\n",
    "    },\n",
    "}\n",
    "\n",
    "lineage_path = OUT / \"lineage.json\"\n",
    "lineage_path.write_text(json.dumps(lineage, indent=2), encoding=\"utf-8\")\n",
    "\n",
    "print(f\"Wrote lineage evidence: {lineage_path.resolve()}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-07",
   "metadata": {},
   "source": [
    "---\n",
    "\n",
    "## 3 · Model Decision Evidence\n",
    "\n",
    "Train a small classifier and log the run to MLflow. The run records parameters, AUC, the model artifact, SHAP explanations, and model-card-style metadata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "code-08",
   "metadata": {},
   "outputs": [],
   "source": [
    "import mlflow\n",
    "import shap\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Keep MLflow tracking metadata in the local SQLite file under output/.\n",
    "mlflow.set_tracking_uri(f\"sqlite:///{MLFLOW_DB}\")\n",
    "\n",
    "# Create the experiment only if it does not exist yet, so re-running this cell reuses it;\n",
    "# its artifacts are directed to the output/mlflow_artifacts folder.\n",
    "experiment_name = \"credit_risk_pydata_demo\"\n",
    "client = mlflow.tracking.MlflowClient()\n",
    "if client.get_experiment_by_name(experiment_name) is None:\n",
    "    client.create_experiment(\n",
    "        experiment_name,\n",
    "        artifact_location=MLFLOW_ARTIFACTS.as_uri(),\n",
    "    )\n",
    "mlflow.set_experiment(experiment_name)\n",
    "\n",
    "# One-hot encode the two categorical columns (drop_first drops one level per column),\n",
    "# then hold out 25% of the reference data, stratified on the label, for evaluation.\n",
    "df_enc = pd.get_dummies(reference_df, columns=[\"purpose\", \"housing\"], drop_first=True)\n",
    "X = df_enc.drop(columns=[\"default\"])\n",
    "y = df_enc[\"default\"]\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.25, random_state=42, stratify=y\n",
    ")\n",
    "\n",
    "with mlflow.start_run(run_name=\"credit_risk_v1\") as run:\n",
    "    model = GradientBoostingClassifier(max_depth=3, n_estimators=200, random_state=42)\n",
    "    model.fit(X_train, y_train)\n",
    "\n",
    "    # Parameters, metrics, model artifact, explanations, and model-card facts are logged together.\n",
    "    mlflow.log_params(model.get_params())\n",
    "    auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])\n",
    "    mlflow.log_metric(\"auc\", auc)\n",
    "    # NOTE(review): the `name=` keyword appears to require MLflow 3.x (earlier releases\n",
    "    # used `artifact_path=`) - confirm against the installed version.\n",
    "    mlflow.sklearn.log_model(model, name=\"model\")\n",
    "    mlflow.log_artifact(str(OUT / \"lineage.json\"))\n",
    "\n",
    "    # Explanations are saved as reusable artifacts.\n",
    "    explainer = shap.TreeExplainer(model)\n",
    "    shap_values = explainer.shap_values(X_test)\n",
    "    shap.summary_plot(shap_values, X_test, show=False)\n",
    "    plt.tight_layout()\n",
    "    # The same current figure is logged to the run and saved under output/ before it is closed.\n",
    "    mlflow.log_figure(plt.gcf(), \"shap_summary.png\")\n",
    "    plt.savefig(OUT / \"shap_summary.png\", dpi=120, bbox_inches=\"tight\")\n",
    "    plt.close()\n",
    "\n",
    "    # Lightweight model-card fields as run tags.\n",
    "    mlflow.set_tags({\n",
    "        \"intended_use\": \"credit_risk_scoring\",\n",
    "        \"risk_tier\": \"high_risk_annex_iii\",\n",
    "        \"known_limitations\": \"trained on EU applicants 2022-2025\",\n",
    "        \"human_oversight\": \"loan_officer_review_above_threshold_0.4\",\n",
    "        \"decision_threshold\": \"0.4\",\n",
    "        \"shap_configuration\": \"tree_path_dependent\",\n",
    "    })\n",
    "\n",
    "    # run_id is read again by the model-card cell to fetch this run's tags and artifacts.\n",
    "    run_id = run.info.run_id\n",
    "    print(f\"Run ID: {run_id}\")\n",
    "    print(f\"AUC: {auc:.3f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-09",
   "metadata": {},
   "source": [
    "### Single-Decision Explanation\n",
    "\n",
    "The summary plot explains global feature effects. The waterfall plot below shows one concrete prediction, which is the kind of artifact that helps connect a model score to a review policy."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "code-10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explain only the first test row; the result is indexed with [0] for the waterfall plot.\n",
    "explainer = shap.TreeExplainer(model)\n",
    "shap_explanation = explainer(X_test.iloc[:1])\n",
    "\n",
    "# show=False keeps the figure open so it can be saved to output/ before being displayed.\n",
    "shap.plots.waterfall(shap_explanation[0], show=False)\n",
    "plt.tight_layout()\n",
    "plt.savefig(OUT / \"shap_waterfall.png\", dpi=120, bbox_inches=\"tight\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-11",
   "metadata": {},
   "source": [
    "The local MLflow run now links the trained model, metrics, explanations, lineage file, and intended-use metadata in `output/mlflow.db` and `output/mlflow_artifacts/`."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-11-model-card",
   "metadata": {},
   "source": [
    "### Generated Model Card (Example)\n",
    "\n",
    "The model-card step shown on the accompanying slide is implemented in the next cell: it pulls metadata from the finished MLflow run, renders a small HTML card, and logs it back to the same run as an artifact."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "code-11-model-card",
   "metadata": {},
   "outputs": [],
   "source": [
    "@dataclass\n",
    "class ModelCard:\n",
    "    \"\"\"Model-card facts for one run, renderable as a standalone HTML page.\n",
    "\n",
    "    The string fields are rendered as escaped text; `training_data` and `metrics`\n",
    "    are dicts rendered as two-column tables (list values are comma-joined).\n",
    "    \"\"\"\n",
    "\n",
    "    intended_use: str\n",
    "    risk_tier: str\n",
    "    human_oversight: str\n",
    "    training_data: dict\n",
    "    metrics: dict\n",
    "    limitations: str\n",
    "    shap_configuration: str\n",
    "    decision_threshold: str\n",
    "\n",
    "    @staticmethod\n",
    "    def _render_rows(items):\n",
    "        \"\"\"Return one HTML-escaped `<tr>` per (key, value) pair, joined by newlines.\"\"\"\n",
    "        rows = []\n",
    "        for key, value in items:\n",
    "            if isinstance(value, list):\n",
    "                value = \", \".join(str(item) for item in value)\n",
    "            rows.append(\n",
    "                f\"<tr><th>{escape(str(key))}</th><td>{escape(str(value))}</td></tr>\"\n",
    "            )\n",
    "        return \"\\n\".join(rows)\n",
    "\n",
    "    def render_html(self) -> str:\n",
    "        \"\"\"Return the complete model-card page as an HTML string, without any file I/O.\"\"\"\n",
    "        purpose_rows = self._render_rows([\n",
    "            (\"intended_use\", self.intended_use),\n",
    "            (\"human_oversight\", self.human_oversight),\n",
    "            (\"decision_threshold\", self.decision_threshold),\n",
    "            (\"known_limitations\", self.limitations),\n",
    "            (\"shap_configuration\", self.shap_configuration),\n",
    "        ])\n",
    "        # Placeholder rows keep both tables well-formed when a section has no entries.\n",
    "        data_rows = (\n",
    "            self._render_rows(self.training_data.items())\n",
    "            or '<tr><td colspan=\"2\">No training data recorded.</td></tr>'\n",
    "        )\n",
    "        # Numeric metrics are shown with three decimals; anything else is rendered as-is.\n",
    "        metric_rows = (\n",
    "            self._render_rows(\n",
    "                (name, f\"{value:.3f}\" if isinstance(value, (int, float)) else value)\n",
    "                for name, value in self.metrics.items()\n",
    "            )\n",
    "            or '<tr><td colspan=\"2\">No metrics recorded.</td></tr>'\n",
    "        )\n",
    "\n",
    "        return f\"\"\"<!doctype html>\n",
    "<html lang=\"en\">\n",
    "<head>\n",
    "  <meta charset=\"utf-8\">\n",
    "  <title>Credit Risk Model Card</title>\n",
    "  <style>\n",
    "    body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; margin: 0; color: #172033; background: #f7f9fc; }}\n",
    "    main {{ max-width: 920px; margin: 0 auto; padding: 40px 28px 56px; }}\n",
    "    h1 {{ margin: 0 0 8px; font-size: 32px; }}\n",
    "    h2 {{ margin-top: 30px; border-bottom: 1px solid #d8dee9; padding-bottom: 8px; }}\n",
    "    .summary {{ color: #4b5563; margin-bottom: 26px; }}\n",
    "    .pill {{ display: inline-block; background: #dbeafe; color: #1d4ed8; padding: 5px 10px; border-radius: 999px; font-weight: 700; font-size: 13px; }}\n",
    "    table {{ width: 100%; border-collapse: collapse; background: white; }}\n",
    "    th, td {{ text-align: left; vertical-align: top; padding: 11px 13px; border: 1px solid #d8dee9; }}\n",
    "    th {{ width: 230px; background: #edf2f7; }}\n",
    "  </style>\n",
    "</head>\n",
    "<body>\n",
    "  <main>\n",
    "    <h1>Credit Risk Model Card</h1>\n",
    "    <p class=\"summary\">Generated from MLflow run metadata, metrics, and logged lineage artifacts.</p>\n",
    "    <p><span class=\"pill\">{escape(self.risk_tier)}</span></p>\n",
    "\n",
    "    <h2>Purpose and Oversight</h2>\n",
    "    <table>\n",
    "      {purpose_rows}\n",
    "    </table>\n",
    "\n",
    "    <h2>Training Data</h2>\n",
    "    <table>{data_rows}</table>\n",
    "\n",
    "    <h2>Metrics</h2>\n",
    "    <table>{metric_rows}</table>\n",
    "  </main>\n",
    "</body>\n",
    "</html>\n",
    "\"\"\"\n",
    "\n",
    "    def to_html(self, path: \"Path | str\"):\n",
    "        \"\"\"Write the rendered card to `path` as UTF-8 and return the target as a Path.\n",
    "\n",
    "        `path` may be a `Path` or a plain string.\n",
    "        \"\"\"\n",
    "        target = Path(path)\n",
    "        target.write_text(self.render_html(), encoding=\"utf-8\")\n",
    "        return target\n",
    "\n",
    "\n",
    "# Read the card's facts back from the finished run instead of from notebook variables.\n",
    "client = mlflow.tracking.MlflowClient()\n",
    "run_record = client.get_run(run_id)\n",
    "tags = run_record.data.tags\n",
    "\n",
    "# The lineage file was logged with the run; download it and parse the JSON.\n",
    "downloaded_lineage = Path(client.download_artifacts(run_id, \"lineage.json\"))\n",
    "lineage_from_run = json.loads(downloaded_lineage.read_text(encoding=\"utf-8\"))\n",
    "\n",
    "card = ModelCard(\n",
    "    intended_use=tags[\"intended_use\"],\n",
    "    risk_tier=tags[\"risk_tier\"],\n",
    "    human_oversight=tags[\"human_oversight\"],\n",
    "    training_data=lineage_from_run[\"data\"],\n",
    "    metrics=dict(run_record.data.metrics),\n",
    "    limitations=tags[\"known_limitations\"],\n",
    "    # These two tags fall back to a marker value when the run does not carry them.\n",
    "    shap_configuration=tags.get(\"shap_configuration\", \"not_recorded\"),\n",
    "    decision_threshold=tags.get(\"decision_threshold\", \"not_recorded\"),\n",
    ")\n",
    "\n",
    "# Write the card locally, then attach the same file to the run.\n",
    "model_card_path = OUT / \"model_card.html\"\n",
    "card.to_html(model_card_path)\n",
    "client.log_artifact(run_id, str(model_card_path))\n",
    "\n",
    "print(f\"Generated model card: {model_card_path.resolve()}\")\n",
    "print(\"Logged model_card.html back to the MLflow run.\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-12",
   "metadata": {},
   "source": [
    "## 4 · Drift Monitoring Evidence\n",
    "\n",
    "Compare the training reference distribution to the later production snapshot. Evidently creates a full HTML report and a machine-readable summary of which features drifted."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "code-13",
   "metadata": {},
   "outputs": [],
   "source": [
    "from evidently import Report\n",
    "from evidently.presets import DataDriftPreset\n",
    "\n",
    "# reference = training distribution; current = last 30 days of production\n",
    "ref = reference_df.drop(columns=[\"default\"])\n",
    "cur = production_df.drop(columns=[\"default\"])\n",
    "\n",
    "report = Report(metrics=[DataDriftPreset()])\n",
    "snapshot = report.run(reference_data=ref, current_data=cur)\n",
    "\n",
    "# Save the full interactive HTML report.\n",
    "snapshot.save_html(str(OUT / \"credit_drift_report.html\"))\n",
    "\n",
    "# Inspect: which features drifted?\n",
    "result = snapshot.dict()\n",
    "drifted = result[\"metrics\"][0][\"value\"]\n",
    "print(f\"Drift summary: {drifted['count']:.0f} of {len(ref.columns)} features drifted \"\n",
    "      f\"({drifted['share']:.0%})\")\n",
    "print()\n",
    "print(\"Per-feature Wasserstein distance:\")\n",
    "for m in result[\"metrics\"][1:]:\n",
    "    name = m[\"metric_name\"]\n",
    "    if \"ValueDrift(column=\" in name:\n",
    "        col = name.split(\"column=\")[1].split(\",\")[0]\n",
    "        val = m[\"value\"]\n",
    "        flag = \"  DRIFT\" if val > 0.10 else \"       \"\n",
    "        print(f\"  {flag}  {col:25s}  {val:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-14",
   "metadata": {},
   "source": [
    "### Compact Drift Summary\n",
    "\n",
    "The HTML report is useful for inspection. This PNG gives a quick review view: the credit score distribution shift on the left and feature-level drift distances on the right."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "code-15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compact two-panel drift figure: credit_score histograms on the left,\n",
    "# per-feature drift scores on the right. Saved to output/drift_report.png.\n",
    "plt.rcParams.update({\n",
    "    \"font.family\": \"DejaVu Sans\",\n",
    "    \"font.size\": 12,\n",
    "    \"axes.spines.top\": False,\n",
    "    \"axes.spines.right\": False,\n",
    "    \"figure.facecolor\": \"white\",\n",
    "    \"axes.facecolor\": \"white\",\n",
    "})\n",
    "\n",
    "# Per-feature drift values from the Evidently result computed above.\n",
    "drift_by_feature = {}\n",
    "for metric in result[\"metrics\"][1:]:\n",
    "    name = metric[\"metric_name\"]\n",
    "    if \"ValueDrift(column=\" in name:\n",
    "        column = name.split(\"column=\")[1].split(\",\")[0]\n",
    "        drift_by_feature[column] = metric[\"value\"]\n",
    "\n",
    "# Fail with a clear message if the metric names could not be parsed as expected.\n",
    "if \"credit_score\" not in drift_by_feature:\n",
    "    raise KeyError(\"No ValueDrift metric for 'credit_score' found in the Evidently result\")\n",
    "\n",
    "threshold = 0.10\n",
    "cs_drift = drift_by_feature[\"credit_score\"]\n",
    "# The header wording follows the measured score instead of assuming drift.\n",
    "credit_score_drifted = cs_drift > threshold\n",
    "ref_scores = reference_df[\"credit_score\"]\n",
    "prod_scores = production_df[\"credit_score\"]\n",
    "ref_mean = ref_scores.mean()\n",
    "prod_mean = prod_scores.mean()\n",
    "\n",
    "fig = plt.figure(figsize=(14, 5.15))\n",
    "grid = fig.add_gridspec(\n",
    "    2,\n",
    "    2,\n",
    "    height_ratios=[0.36, 1],\n",
    "    width_ratios=[1.02, 1],\n",
    "    hspace=0.12,\n",
    "    wspace=0.36,\n",
    ")\n",
    "left_header = fig.add_subplot(grid[0, 0])\n",
    "left_header.axis(\"off\")\n",
    "ax_hist = fig.add_subplot(grid[1, 0])\n",
    "ax_drift = fig.add_subplot(grid[:, 1])\n",
    "\n",
    "# Left header: all explanatory text lives outside the histogram.\n",
    "left_header.text(\n",
    "    0.0,\n",
    "    0.96,\n",
    "    f\"credit_score: {'drift detected' if credit_score_drifted else 'no drift detected'}\",\n",
    "    ha=\"left\",\n",
    "    va=\"top\",\n",
    "    fontsize=14,\n",
    "    fontweight=\"bold\",\n",
    ")\n",
    "left_header.text(0.02, 0.44, \"■\", color=\"#3b82f6\", fontsize=14, va=\"center\")\n",
    "left_header.text(0.08, 0.44, f\"Training (n={len(reference_df):,})\", fontsize=10.5, va=\"center\")\n",
    "left_header.text(0.02, 0.13, \"■\", color=\"#ef4444\", fontsize=14, va=\"center\")\n",
    "left_header.text(0.08, 0.13, f\"Production last 30d (n={len(production_df):,})\", fontsize=10.5, va=\"center\")\n",
    "left_header.text(\n",
    "    0.98,\n",
    "    0.43,\n",
    "    f\"Mean shift: {ref_mean:.0f} -> {prod_mean:.0f}\\nWasserstein: {cs_drift:.2f} (threshold {threshold:.2f})\",\n",
    "    ha=\"right\",\n",
    "    va=\"center\",\n",
    "    fontsize=9.5,\n",
    "    bbox=dict(\n",
    "        boxstyle=\"round,pad=0.42\",\n",
    "        facecolor=\"#fff7f7\",\n",
    "        edgecolor=\"#ef4444\",\n",
    "        linewidth=1.1,\n",
    "    ),\n",
    ")\n",
    "\n",
    "# Credit score distribution shift; both histograms share the same bin edges.\n",
    "bins = np.linspace(300, 850, 34)\n",
    "ax_hist.hist(\n",
    "    ref_scores,\n",
    "    bins=bins,\n",
    "    alpha=0.58,\n",
    "    color=\"#3b82f6\",\n",
    "    edgecolor=\"white\",\n",
    ")\n",
    "ax_hist.hist(\n",
    "    prod_scores,\n",
    "    bins=bins,\n",
    "    alpha=0.58,\n",
    "    color=\"#ef4444\",\n",
    "    edgecolor=\"white\",\n",
    ")\n",
    "ax_hist.axvline(ref_mean, color=\"#1e40af\", linestyle=\"--\", linewidth=2, alpha=0.7)\n",
    "ax_hist.axvline(prod_mean, color=\"#991b1b\", linestyle=\"--\", linewidth=2, alpha=0.7)\n",
    "ax_hist.set_xlabel(\"credit_score\")\n",
    "ax_hist.set_ylabel(\"Frequency\")\n",
    "\n",
    "# Drift summary across all features, sorted so the largest score is drawn at the top.\n",
    "drift_pairs = sorted(drift_by_feature.items(), key=lambda item: item[1])\n",
    "features = [feature for feature, _ in drift_pairs]\n",
    "values = [value for _, value in drift_pairs]\n",
    "colors = [\"#ef4444\" if value > threshold else \"#94a3b8\" for value in values]\n",
    "\n",
    "bars = ax_drift.barh(features, values, color=colors, edgecolor=\"white\")\n",
    "ax_drift.axvline(threshold, color=\"#1e293b\", linestyle=\"--\", linewidth=1.4, alpha=0.7)\n",
    "# The label is derived from `threshold` so it cannot disagree with the drawn line.\n",
    "ax_drift.text(threshold + 0.006, 0.35, f\"threshold ({threshold:.2f})\", fontsize=9, color=\"#334155\")\n",
    "# Generic axis label: Evidently selects the drift method per column type, so the bars\n",
    "# are not all Wasserstein distances.\n",
    "ax_drift.set_xlabel(\"Drift score (method chosen per feature by Evidently)\")\n",
    "ax_drift.set_title(\n",
    "    f\"Drift summary: {sum(value > threshold for value in values)} of {len(values)} features drifted\",\n",
    "    loc=\"left\",\n",
    "    fontweight=\"bold\",\n",
    "    pad=16,\n",
    ")\n",
    "\n",
    "# Annotate each bar with its score, just right of the bar end.\n",
    "for bar, value in zip(bars, values):\n",
    "    ax_drift.text(\n",
    "        value + 0.006,\n",
    "        bar.get_y() + bar.get_height() / 2,\n",
    "        f\"{value:.2f}\",\n",
    "        va=\"center\",\n",
    "        fontsize=9.5,\n",
    "        color=\"#ef4444\" if value > threshold else \"#64748b\",\n",
    "    )\n",
    "\n",
    "fig.suptitle(\n",
    "    \"Production drift report: Credit Risk Model v1, last 30 days\",\n",
    "    fontsize=15,\n",
    "    fontweight=\"bold\",\n",
    "    y=0.985,\n",
    ")\n",
    "fig.subplots_adjust(top=0.82, bottom=0.15, left=0.07, right=0.98)\n",
    "\n",
    "fig.savefig(OUT / \"drift_report.png\", dpi=140, bbox_inches=\"tight\")\n",
    "\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-16",
   "metadata": {},
   "source": [
    "---\n",
    "\n",
    "## 5 · Generated Artifacts\n",
    "\n",
    "After running all cells, inspect the files below. MLflow metadata is stored in `output/mlflow.db`; model artifacts are stored in `output/mlflow_artifacts/`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "code-17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run this after the notebook has completed to see the generated files.\n",
    "# Only the expected paths that exist on disk are printed and returned.\n",
    "list_artifacts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9038fb7-90a6-4d1b-af10-fab4830c684c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (pydata2026)",
   "language": "python",
   "name": "pydata2026"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
