Merge pull request #171 from UCL-CORU/inference-column-handling

zmek · web-flow · commit 5aacf8400891 · 2026-05-04T18:43:16.000+02:00
Align classifier inference with modern vs legacy pipelines
diff --git a/src/patientflow/predict/emergency_demand.py b/src/patientflow/predict/emergency_demand.py
@@ -167,6 +167,33 @@ def get_feature_names_before_encoding(column_transformer):
     return df
 
 
+def dataframe_for_classifier_predict_proba(pipeline, df: pd.DataFrame) -> pd.DataFrame:
+    """Return a copy of ``df`` for ``pipeline.predict_proba``.
+
+    Pipelines with a ``feature_columns`` step (``FeatureColumnTransformer``) are
+    fitted with ``elapsed_los`` as ``timedelta64`` and convert to seconds inside
+    the column transformer. Legacy pipelines expect numeric seconds, so a
+    timedelta ``elapsed_los`` is converted; any other dtype passes through.
+
+    Parameters
+    ----------
+    pipeline : sklearn.pipeline.Pipeline
+        Trained classifier pipeline (calibrated or not).
+    df : pandas.DataFrame
+        Snapshot rows; never mutated. Legacy pipelines require ``elapsed_los``.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Copy suitable to pass to ``predict_proba``.
+    """
+    out = df.copy()
+    is_legacy = "feature_columns" not in pipeline.named_steps
+    if is_legacy and pd.api.types.is_timedelta64_dtype(out["elapsed_los"]):
+        out["elapsed_los"] = out["elapsed_los"].dt.total_seconds()
+    return out
+
+
 def find_probability_threshold_index(sequence: List[float], threshold: float) -> int:
     """Find index k such that P(X >= k) >= threshold.
 
@@ -486,13 +513,9 @@ def create_predictions(
         # Legacy path: use external helper for older model artifacts
         prediction_snapshots = add_missing_columns(pipeline, prediction_snapshots)
 
-    # Before we get predictions, we need to create a temp copy with the elapsed_los column in seconds.
-    # In the training data, elapsed_los is stored as seconds, so this conversion ensures
-    # the model sees the same representation at inference time.
-    prediction_snapshots_temp = prediction_snapshots.copy()
-    prediction_snapshots_temp["elapsed_los"] = prediction_snapshots_temp[
-        "elapsed_los"
-    ].dt.total_seconds()
+    prediction_snapshots_temp = dataframe_for_classifier_predict_proba(
+        pipeline, prediction_snapshots
+    )
 
     # Get predictions of admissions for ED patients
     prob_admission_after_ed = model_input_to_pred_proba(
diff --git a/src/patientflow/predict/service.py b/src/patientflow/predict/service.py
@@ -21,6 +21,7 @@
 
 from patientflow.predict.emergency_demand import (
     add_missing_columns,
+    dataframe_for_classifier_predict_proba,
     get_specialty_probs,
     warn_specialty_mismatch,
 )
@@ -542,20 +543,28 @@ def _prepare_base_probabilities(
     else:
         inpatient_pipeline = None
 
-    # Ensure model expects columns exist
-    if ed_pipeline is not None and ed_snapshots is not None:
+    # Legacy only: add missing columns before the ColumnTransformer step.
+    if (
+        ed_pipeline is not None
+        and ed_snapshots is not None
+        and "feature_columns" not in ed_pipeline.named_steps
+    ):
         ed_snapshots = add_missing_columns(ed_pipeline, ed_snapshots.copy())
-    if inpatient_pipeline is not None and inpatient_snapshots is not None:
+    if (
+        inpatient_pipeline is not None
+        and inpatient_snapshots is not None
+        and "feature_columns" not in inpatient_pipeline.named_steps
+    ):
         inpatient_snapshots = add_missing_columns(
             inpatient_pipeline, inpatient_snapshots.copy()
         )
 
-    # Convert elapsed_los to seconds for the ED classifier pipeline
-    if ed_snapshots is not None:
+    if ed_snapshots is not None and ed_pipeline is not None:
+        ed_snapshots_temp = dataframe_for_classifier_predict_proba(
+            ed_pipeline, ed_snapshots
+        )
+    elif ed_snapshots is not None:
         ed_snapshots_temp = ed_snapshots.copy()
-        ed_snapshots_temp["elapsed_los"] = ed_snapshots_temp[
-            "elapsed_los"
-        ].dt.total_seconds()
     else:
         ed_snapshots_temp = None
 
@@ -569,14 +578,18 @@ def _prepare_base_probabilities(
     else:
         prob_admission_after_ed = pd.Series(dtype=float)
 
-    # Convert elapsed_los to seconds for the inpatient classifier pipeline
-    if inpatient_snapshots is not None:
+    if inpatient_snapshots is not None and inpatient_pipeline is not None:
+        inpatient_snapshots_temp = dataframe_for_classifier_predict_proba(
+            inpatient_pipeline, inpatient_snapshots
+        )
+        elective_snapshots = inpatient_snapshots_temp[
+            inpatient_snapshots_temp["admission_type"] == "elective"
+        ]
+        emergency_snapshots = inpatient_snapshots_temp[
+            inpatient_snapshots_temp["admission_type"] == "emergency"
+        ]
+    elif inpatient_snapshots is not None:
         inpatient_snapshots_temp = inpatient_snapshots.copy()
-        inpatient_snapshots_temp["elapsed_los"] = inpatient_snapshots_temp[
-            "elapsed_los"
-        ].dt.total_seconds()
-
-        # Split inpatient snapshots into elective and emergency
         elective_snapshots = inpatient_snapshots_temp[
             inpatient_snapshots_temp["admission_type"] == "elective"
         ]
diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py
@@ -4,7 +4,9 @@
 from sklearn.pipeline import Pipeline
 from sklearn.compose import ColumnTransformer
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from xgboost import XGBClassifier
 
+from patientflow.predict.emergency_demand import dataframe_for_classifier_predict_proba
 from patientflow.train.classifiers import (
     FeatureColumnTransformer,
     FeatureKind,
@@ -349,6 +351,87 @@ def test_feature_column_transformer_timedelta_default(self):
             msg="expected zero timedelta fill for missing column",
         )
 
+    def test_dataframe_for_classifier_predict_proba_modern_keeps_timedelta(self):
+        """With a ``feature_columns`` step, ``elapsed_los`` stays timedelta."""
+        # Tiny training frame with ``elapsed_los`` stored as timedelta.
+        training = pd.DataFrame(
+            {
+                "elapsed_los": pd.to_timedelta([1, 2, 3, 4], unit="h"),
+                "sex": ["M", "F", "M", "F"],
+                "y": [0, 1, 0, 1],
+            }
+        )
+        features, target = training[["elapsed_los", "sex"]], training["y"]
+        # Modern layout: a ``feature_columns`` step precedes the column transformer.
+        steps = [
+            ("feature_columns", FeatureColumnTransformer()),
+            ("feature_transformer", create_column_transformer(features)),
+            (
+                "classifier",
+                XGBClassifier(n_estimators=5, max_depth=2, eval_metric="logloss"),
+            ),
+        ]
+        modern_pipeline = Pipeline(steps)
+        modern_pipeline.fit(features, target)
+
+        snapshots = pd.DataFrame(
+            {
+                "elapsed_los": pd.to_timedelta([30, 45], unit="m"),
+                "sex": ["F", "M"],
+            }
+        )
+        prepared = dataframe_for_classifier_predict_proba(modern_pipeline, snapshots)
+        # The helper must hand the timedelta column through unconverted.
+        self.assertTrue(pd.api.types.is_timedelta64_dtype(prepared["elapsed_los"]))
+        # The fitted pipeline yields one probability row per snapshot.
+        probabilities = modern_pipeline.predict_proba(prepared)
+        self.assertEqual(probabilities.shape[0], 2)
+
+    def test_dataframe_for_classifier_predict_proba_legacy_seconds(self):
+        """Without a ``feature_columns`` step, ``elapsed_los`` becomes seconds."""
+        # Legacy training data already stores ``elapsed_los`` as numeric seconds.
+        training = pd.DataFrame(
+            {
+                "elapsed_los": [3600, 7200],
+                "sex": ["M", "F"],
+                "y": [0, 1],
+            }
+        )
+        features, target = training[["elapsed_los", "sex"]], training["y"]
+        # Legacy layout: no ``feature_columns`` step ahead of the transformer.
+        encoder = ColumnTransformer(
+            [
+                ("oh", OneHotEncoder(handle_unknown="ignore"), ["sex"]),
+                ("num", "passthrough", ["elapsed_los"]),
+            ]
+        )
+        booster = XGBClassifier(n_estimators=5, max_depth=2, eval_metric="logloss")
+        legacy_pipeline = Pipeline(
+            [("feature_transformer", encoder), ("classifier", booster)]
+        )
+        legacy_pipeline.fit(features, target)
+
+        # Inference snapshots carry ``elapsed_los`` as timedelta.
+        snapshots = pd.DataFrame(
+            {
+                "elapsed_los": pd.to_timedelta([30, 60], unit="m"),
+                "sex": ["F", "M"],
+            }
+        )
+        prepared = dataframe_for_classifier_predict_proba(legacy_pipeline, snapshots)
+        converted = prepared["elapsed_los"]
+        self.assertTrue(
+            pd.api.types.is_float_dtype(converted),
+            msg="legacy path should convert elapsed_los to float seconds",
+        )
+        # 30 and 60 minutes expressed in seconds.
+        np.testing.assert_allclose(
+            converted.values, [1800.0, 3600.0], rtol=0, atol=1e-9
+        )
+        # Converted frame must be accepted by the fitted legacy pipeline.
+        scores = legacy_pipeline.predict_proba(prepared)
+        self.assertEqual(scores.shape[0], 2)
+
 
 if __name__ == "__main__":
     unittest.main()