Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ uv.lock
*data-uclh*
data-public/*
!data-public/.gitkeep
*.claude/worktrees/*
notebooks/eval-output/*


trained-models/*
Expand Down
206 changes: 141 additions & 65 deletions src/patientflow/train/classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
Initialize a model with given hyperparameters
create_column_transformer
Create a column transformer for a dataframe with dynamic column handling
infer_feature_kind
Classify each column for preprocessing (ordinal, numeric, timedelta, etc.)
calculate_class_balance
Calculate class balance ratios for target labels
get_feature_metadata
Expand All @@ -32,6 +34,7 @@
Train admission prediction models for multiple prediction times
"""

from enum import Enum, auto
from typing import Dict, List, Any, Tuple, Optional, Union, TypedDict, Type
import numpy as np
import numpy.typing as npt
Expand All @@ -43,7 +46,12 @@
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, log_loss, average_precision_score
from sklearn.model_selection import TimeSeriesSplit, ParameterGrid
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.preprocessing import (
FunctionTransformer,
OneHotEncoder,
OrdinalEncoder,
StandardScaler,
)
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn import __version__ as sk_version
Expand All @@ -60,21 +68,111 @@
)


class FeatureKind(Enum):
    """How a column is preprocessed in ``create_column_transformer``.

    Each member names exactly one preprocessing route. ``infer_feature_kind``
    assigns a single kind per column, and ``_make_transformer_for_kind`` maps
    that kind to the concrete sklearn transformer.
    """

    ORDINAL = auto()  # explicit category ordering supplied via ordinal_mappings
    BOOLEAN = auto()  # bool dtype; kept as a single 0/1 feature (passthrough)
    TIMEDELTA = auto()  # durations; converted to seconds then standard-scaled
    DATETIME = auto()  # datetime64 columns; standard-scaled
    NUMERIC_BINARY = auto()  # numeric with exactly two distinct values (passthrough)
    NUMERIC_SCALED = auto()  # general numeric; standard-scaled
    CATEGORICAL = auto()  # string-like fallback; one-hot encoded


def _timedelta_to_float_seconds(X: npt.NDArray[Any]) -> npt.NDArray[np.float64]:
arr = np.asarray(X)
if not np.issubdtype(arr.dtype, np.timedelta64):
return arr.astype(np.float64, copy=False)
flat = arr.ravel()
sec = pd.Series(flat).dt.total_seconds().to_numpy(dtype=np.float64)
return sec.reshape(arr.shape[0], -1)


def infer_feature_kind(
    series: Series,
    col: str,
    ordinal_mappings: Dict[str, List[Any]],
) -> FeatureKind:
    """Assign a single preprocessing role to a column.

    This is the single source of truth for both ``create_column_transformer``
    and ``FeatureColumnTransformer`` defaults. Checks are ordered from explicit
    contracts (ordinal mapping, concrete dtypes) to broad fallbacks.

    Parameters
    ----------
    series : Series
        The column's values, inspected for dtype and cardinality.
    col : str
        Column name, matched against ``ordinal_mappings``.
    ordinal_mappings : Dict[str, List[Any]]
        Columns that have an explicit category ordering.

    Returns
    -------
    FeatureKind
        The preprocessing role for this column.
    """
    # An explicit ordinal contract always wins over dtype heuristics.
    if col in ordinal_mappings:
        return FeatureKind.ORDINAL
    if series.dtype == "bool":
        return FeatureKind.BOOLEAN
    if pd.api.types.is_timedelta64_dtype(series):
        return FeatureKind.TIMEDELTA
    if pd.api.types.is_datetime64_any_dtype(series):
        return FeatureKind.DATETIME

    # Categorical dtypes route on the dtype of their categories, so a
    # categorical of numbers behaves like a plain numeric column.
    if isinstance(series.dtype, pd.CategoricalDtype):
        cats = series.cat.categories.to_series()
        if pd.api.types.is_numeric_dtype(cats):
            nu = series.nunique(dropna=False)
            return FeatureKind.NUMERIC_BINARY if nu == 2 else FeatureKind.NUMERIC_SCALED
        return FeatureKind.CATEGORICAL

    if pd.api.types.is_numeric_dtype(series):
        nu = series.nunique(dropna=False)
        return FeatureKind.NUMERIC_BINARY if nu == 2 else FeatureKind.NUMERIC_SCALED

    # Any remaining string-like dtype (object, StringDtype, ...) falls here.
    return FeatureKind.CATEGORICAL


def _default_for_feature_kind(series: Series, kind: FeatureKind) -> Any:
    """Return the fill value used when a column of this kind is absent at transform time."""
    # Categorical/ordinal columns fall back to the most frequent training
    # value, or "Unknown" when the column held no non-null values at all.
    if kind in (FeatureKind.CATEGORICAL, FeatureKind.ORDINAL):
        mode = series.mode(dropna=True)
        return "Unknown" if mode.empty else mode.iloc[0]

    fixed_defaults = {
        FeatureKind.BOOLEAN: False,
        FeatureKind.NUMERIC_BINARY: 0.0,
        FeatureKind.NUMERIC_SCALED: 0.0,
        FeatureKind.DATETIME: pd.NaT,
        FeatureKind.TIMEDELTA: pd.Timedelta(0),
    }
    # Unknown kinds fall through to a generic missing marker.
    return fixed_defaults.get(kind, pd.NA)


def _make_transformer_for_kind(
    col: str,
    kind: FeatureKind,
    ordinal_mappings: Dict[str, List[Any]],
) -> Union[OrdinalEncoder, OneHotEncoder, StandardScaler, Pipeline, str]:
    """Build the sklearn transformer that implements one ``FeatureKind``.

    ``col`` and ``ordinal_mappings`` are only consulted for the ORDINAL kind,
    which needs its explicit category ordering.
    """
    if kind == FeatureKind.ORDINAL:
        # Categories unseen at fit time map to NaN instead of raising.
        return OrdinalEncoder(
            categories=[ordinal_mappings[col]],
            handle_unknown="use_encoded_value",
            unknown_value=np.nan,
        )
    if kind in (FeatureKind.BOOLEAN, FeatureKind.NUMERIC_BINARY):
        # Already a single 0/1-style feature; no encoding or scaling needed.
        return "passthrough"
    if kind == FeatureKind.TIMEDELTA:
        # Durations become float seconds first, then are standard-scaled.
        to_seconds = FunctionTransformer(
            _timedelta_to_float_seconds,
            feature_names_out="one-to-one",
        )
        return Pipeline([("to_seconds", to_seconds), ("scale", StandardScaler())])
    if kind in (FeatureKind.DATETIME, FeatureKind.NUMERIC_SCALED):
        return StandardScaler()
    if kind == FeatureKind.CATEGORICAL:
        return OneHotEncoder(handle_unknown="ignore")
    raise ValueError(f"Unhandled FeatureKind: {kind!r}")


class FeatureColumnTransformer(BaseEstimator, TransformerMixin):
Expand All @@ -87,29 +185,36 @@ class FeatureColumnTransformer(BaseEstimator, TransformerMixin):
- Drops any extra columns that weren't in training.
- Returns only the columns that match the training feature set, in the original order.

Defaults are derived from the training data rather than from hard-coded
name-based rules:
- Boolean columns -> False
- Numeric columns -> 0.0
- Datetime columns -> pd.NaT
- Object/categorical columns -> mode (most frequent non-null value), or "Unknown"
if no such value exists
- Other dtypes -> pd.NA
Defaults are derived from the training data using the same ``FeatureKind``
classification as ``create_column_transformer`` (via ``infer_feature_kind``):
- Boolean -> False
- Numeric -> 0.0
- Datetime -> pd.NaT
- Timedelta -> pd.Timedelta(0)
- Categorical / ordinal -> mode, or "Unknown" if empty
- Otherwise -> pd.NA
"""

def __init__(
self, explicit_defaults: Optional[Dict[str, Any]] = None, verbose: bool = False
self,
explicit_defaults: Optional[Dict[str, Any]] = None,
ordinal_mappings: Optional[Dict[str, List[Any]]] = None,
verbose: bool = False,
):
"""
Parameters
----------
explicit_defaults : Dict[str, Any], optional
Optional mapping of column name -> default value. These values take
precedence over heuristics learned from the training data.
ordinal_mappings : Dict[str, List[Any]], optional
Same mapping passed to ``create_column_transformer`` so ordinal
columns get consistent kind and defaults.
verbose : bool, default=False
If True, prints which columns were added at transform time.
"""
self.explicit_defaults = explicit_defaults or {}
self.ordinal_mappings: Dict[str, List[Any]] = ordinal_mappings or {}
self.verbose = verbose

def fit(
Expand All @@ -127,21 +232,8 @@ def fit(
continue

series = X[col]
default: Any

if series.dtype == "bool":
default = False
elif pd.api.types.is_numeric_dtype(series):
default = 0.0
elif pd.api.types.is_datetime64_any_dtype(series):
default = pd.NaT
elif _is_string_like_column(series):
mode = series.mode(dropna=True)
default = mode.iloc[0] if not mode.empty else "Unknown"
else:
default = pd.NA

self.column_defaults_[col] = default
kind = infer_feature_kind(series, col, self.ordinal_mappings)
self.column_defaults_[col] = _default_for_feature_kind(series, kind)

return self

Expand Down Expand Up @@ -308,7 +400,7 @@ def initialise_model(
def create_column_transformer(
    df: DataFrame, ordinal_mappings: Optional[Dict[str, List[Any]]] = None
) -> ColumnTransformer:
    """Create a column transformer using :func:`infer_feature_kind` per column.

    Parameters
    ----------
    df : DataFrame
        Training dataframe whose columns determine the transformers.
    ordinal_mappings : Dict[str, List[Any]], optional
        Mapping of column name -> ordered category list for ordinal columns.

    Returns
    -------
    ColumnTransformer
        Configured column transformer
    """
    if ordinal_mappings is None:
        ordinal_mappings = {}

    transformers: List[
        Tuple[
            str,
            Union[OrdinalEncoder, OneHotEncoder, StandardScaler, Pipeline, str],
            List[str],
        ]
    ] = []
    for col in df.columns:
        # One transformer per column, chosen by the shared kind inference so
        # this stays in sync with FeatureColumnTransformer's defaults.
        kind = infer_feature_kind(df[col], col, ordinal_mappings)
        trans = _make_transformer_for_kind(col, kind, ordinal_mappings)
        transformers.append((col, trans, [col]))

    return ColumnTransformer(transformers)

Expand Down Expand Up @@ -722,7 +798,7 @@ def train_classifier(
model = initialise_model(model_class, params)

column_transformer = create_column_transformer(X_train, ordinal_mappings)
feature_columns = FeatureColumnTransformer()
feature_columns = FeatureColumnTransformer(ordinal_mappings=ordinal_mappings)
pipeline = Pipeline(
[
("feature_columns", feature_columns),
Expand Down
73 changes: 72 additions & 1 deletion tests/test_classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,16 @@
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from patientflow.train.classifiers import train_classifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from patientflow.train.classifiers import (
FeatureColumnTransformer,
FeatureKind,
create_column_transformer,
infer_feature_kind,
train_classifier,
)
from patientflow.model_artifacts import TrainedClassifier


Expand Down Expand Up @@ -278,6 +287,68 @@ def test_feature_importance(self):
self.assertTrue(features_info["has_importance_values"])
self.assertEqual(len(features_info["names"]), len(features_info["importances"]))

def test_timedelta_column_uses_standard_scaler_not_one_hot(self):
"""timedelta64 features must be scaled (via seconds), not one-hot encoded."""
df = pd.DataFrame(
{
"elapsed_los": pd.to_timedelta(np.arange(10), unit="h"),
"sex": pd.Series(["M", "F"] * 5, dtype="object"),
}
)
ct = create_column_transformer(df)
by_col = {cols[0]: trans for _, trans, cols in ct.transformers}
self.assertIsInstance(by_col["elapsed_los"], Pipeline)
self.assertIsInstance(
by_col["elapsed_los"].named_steps["scale"], StandardScaler
)
self.assertIsInstance(by_col["sex"], OneHotEncoder)

def test_timedelta_column_transform_matches_float_seconds(self):
"""After fit on timedelta, transform accepts the same durations as float seconds."""
df_td = pd.DataFrame({"elapsed_los": pd.to_timedelta([1, 2, 3], unit="h")})
df_sec = pd.DataFrame({"elapsed_los": [3600.0, 7200.0, 10800.0]})
ct: ColumnTransformer = create_column_transformer(df_td)
ct.fit(df_td)
out_td = ct.transform(df_td)
out_sec = ct.transform(df_sec)
np.testing.assert_allclose(out_td, out_sec, rtol=1e-10, atol=1e-10)

def test_timedelta_binary_column_still_scaled_seconds(self):
"""Durations are always seconds + scaler (no binary passthrough)."""
df = pd.DataFrame(
{
"elapsed_los": pd.to_timedelta([1, 2] * 5, unit="h"),
}
)
ct = create_column_transformer(df)
_, trans, cols = ct.transformers[0]
self.assertEqual(cols, ["elapsed_los"])
self.assertIsInstance(trans, Pipeline)
self.assertIsInstance(trans.named_steps["scale"], StandardScaler)

def test_infer_feature_kind_categorical_with_numeric_categories(self):
"""Pandas categorical with numeric categories follows numeric routing."""
s = pd.Series(pd.Categorical.from_codes([0, 1, 0], categories=[1, 2]))
self.assertEqual(infer_feature_kind(s, "x", {}), FeatureKind.NUMERIC_BINARY)

def test_infer_feature_kind_two_string_object_is_categorical(self):
s = pd.Series(["a", "b"] * 3, dtype="object")
self.assertEqual(infer_feature_kind(s, "x", {}), FeatureKind.CATEGORICAL)

def test_feature_column_transformer_timedelta_default(self):
"""Missing timedelta columns are filled with pd.Timedelta(0)."""
fit_df = pd.DataFrame({"elapsed_los": pd.to_timedelta([1, 2, 3], unit="h")})
fct = FeatureColumnTransformer()
fct.fit(fit_df)
self.assertEqual(fct.column_defaults_["elapsed_los"], pd.Timedelta(0))

out = fct.transform(pd.DataFrame({"other": [1, 2]}))
self.assertIn("elapsed_los", out.columns)
self.assertTrue(
(out["elapsed_los"] == pd.Timedelta(0)).all(),
msg="expected zero timedelta fill for missing column",
)


if __name__ == "__main__":
unittest.main()
Loading