Skip to content

Commit 5aacf84

Browse files
authored
Merge pull request #171 from UCL-CORU/inference-column-handling
Align classifier inference with modern vs legacy pipelines
2 parents 59b1380 + 30cf245 commit 5aacf84

3 files changed

Lines changed: 141 additions & 22 deletions

File tree

src/patientflow/predict/emergency_demand.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,33 @@ def get_feature_names_before_encoding(column_transformer):
167167
return df
168168

169169

170+
def dataframe_for_classifier_predict_proba(pipeline, df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` prepared for ``pipeline.predict_proba``.

    Pipelines with a ``feature_columns`` step (``FeatureColumnTransformer``) are
    fitted with ``elapsed_los`` as ``timedelta64`` and convert to seconds inside
    the column transformer, so their input is copied and returned unchanged.
    Legacy pipelines expect numeric seconds for ``elapsed_los`` (training
    matched that layout), so a timedelta ``elapsed_los`` column is converted
    with ``.dt.total_seconds()``.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        Trained classifier pipeline (calibrated or not). Only its
        ``named_steps`` mapping is inspected here.
    df : pandas.DataFrame
        Snapshot rows containing an ``elapsed_los`` column. The frame is never
        modified in place.

    Returns
    -------
    pandas.DataFrame
        Copy suitable to pass to ``predict_proba``.

    Raises
    ------
    KeyError
        On the legacy path, if ``df`` has no ``elapsed_los`` column.
    """
    out = df.copy()
    if "feature_columns" in pipeline.named_steps:
        # Modern pipelines handle the timedelta -> seconds conversion themselves.
        return out

    # Legacy path. Convert only when the column is still a timedelta: a column
    # that already holds numeric seconds has no ``.dt`` accessor, and calling
    # it would raise AttributeError even though the data is already in the
    # representation the legacy model expects.
    if pd.api.types.is_timedelta64_dtype(out["elapsed_los"]):
        out["elapsed_los"] = out["elapsed_los"].dt.total_seconds()
    return out
195+
196+
170197
def find_probability_threshold_index(sequence: List[float], threshold: float) -> int:
171198
"""Find index k such that P(X >= k) >= threshold.
172199
@@ -486,13 +513,9 @@ def create_predictions(
486513
# Legacy path: use external helper for older model artifacts
487514
prediction_snapshots = add_missing_columns(pipeline, prediction_snapshots)
488515

489-
# Before we get predictions, we need to create a temp copy with the elapsed_los column in seconds.
490-
# In the training data, elapsed_los is stored as seconds, so this conversion ensures
491-
# the model sees the same representation at inference time.
492-
prediction_snapshots_temp = prediction_snapshots.copy()
493-
prediction_snapshots_temp["elapsed_los"] = prediction_snapshots_temp[
494-
"elapsed_los"
495-
].dt.total_seconds()
516+
prediction_snapshots_temp = dataframe_for_classifier_predict_proba(
517+
pipeline, prediction_snapshots
518+
)
496519

497520
# Get predictions of admissions for ED patients
498521
prob_admission_after_ed = model_input_to_pred_proba(

src/patientflow/predict/service.py

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
from patientflow.predict.emergency_demand import (
2323
add_missing_columns,
24+
dataframe_for_classifier_predict_proba,
2425
get_specialty_probs,
2526
warn_specialty_mismatch,
2627
)
@@ -542,20 +543,28 @@ def _prepare_base_probabilities(
542543
else:
543544
inpatient_pipeline = None
544545

545-
# Ensure model expects columns exist
546-
if ed_pipeline is not None and ed_snapshots is not None:
546+
# Legacy only: add missing columns before the ColumnTransformer step.
547+
if (
548+
ed_pipeline is not None
549+
and ed_snapshots is not None
550+
and "feature_columns" not in ed_pipeline.named_steps
551+
):
547552
ed_snapshots = add_missing_columns(ed_pipeline, ed_snapshots.copy())
548-
if inpatient_pipeline is not None and inpatient_snapshots is not None:
553+
if (
554+
inpatient_pipeline is not None
555+
and inpatient_snapshots is not None
556+
and "feature_columns" not in inpatient_pipeline.named_steps
557+
):
549558
inpatient_snapshots = add_missing_columns(
550559
inpatient_pipeline, inpatient_snapshots.copy()
551560
)
552561

553-
# Convert elapsed_los to seconds for the ED classifier pipeline
554-
if ed_snapshots is not None:
562+
if ed_snapshots is not None and ed_pipeline is not None:
563+
ed_snapshots_temp = dataframe_for_classifier_predict_proba(
564+
ed_pipeline, ed_snapshots
565+
)
566+
elif ed_snapshots is not None:
555567
ed_snapshots_temp = ed_snapshots.copy()
556-
ed_snapshots_temp["elapsed_los"] = ed_snapshots_temp[
557-
"elapsed_los"
558-
].dt.total_seconds()
559568
else:
560569
ed_snapshots_temp = None
561570

@@ -569,14 +578,18 @@ def _prepare_base_probabilities(
569578
else:
570579
prob_admission_after_ed = pd.Series(dtype=float)
571580

572-
# Convert elapsed_los to seconds for the inpatient classifier pipeline
573-
if inpatient_snapshots is not None:
581+
if inpatient_snapshots is not None and inpatient_pipeline is not None:
582+
inpatient_snapshots_temp = dataframe_for_classifier_predict_proba(
583+
inpatient_pipeline, inpatient_snapshots
584+
)
585+
elective_snapshots = inpatient_snapshots_temp[
586+
inpatient_snapshots_temp["admission_type"] == "elective"
587+
]
588+
emergency_snapshots = inpatient_snapshots_temp[
589+
inpatient_snapshots_temp["admission_type"] == "emergency"
590+
]
591+
elif inpatient_snapshots is not None:
574592
inpatient_snapshots_temp = inpatient_snapshots.copy()
575-
inpatient_snapshots_temp["elapsed_los"] = inpatient_snapshots_temp[
576-
"elapsed_los"
577-
].dt.total_seconds()
578-
579-
# Split inpatient snapshots into elective and emergency
580593
elective_snapshots = inpatient_snapshots_temp[
581594
inpatient_snapshots_temp["admission_type"] == "elective"
582595
]

tests/test_classifiers.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
from sklearn.pipeline import Pipeline
55
from sklearn.compose import ColumnTransformer
66
from sklearn.preprocessing import OneHotEncoder, StandardScaler
7+
from xgboost import XGBClassifier
78

9+
from patientflow.predict.emergency_demand import dataframe_for_classifier_predict_proba
810
from patientflow.train.classifiers import (
911
FeatureColumnTransformer,
1012
FeatureKind,
@@ -349,6 +351,87 @@ def test_feature_column_transformer_timedelta_default(self):
349351
msg="expected zero timedelta fill for missing column",
350352
)
351353

354+
def test_dataframe_for_classifier_predict_proba_modern_keeps_timedelta(self):
    """A pipeline with a ``feature_columns`` step keeps ``elapsed_los`` as timedelta."""
    # Tiny training frame: timedelta length-of-stay, one categorical, binary target.
    training = pd.DataFrame(
        {
            "elapsed_los": pd.to_timedelta([1, 2, 3, 4], unit="h"),
            "sex": ["M", "F", "M", "F"],
            "y": [0, 1, 0, 1],
        }
    )
    features = training[["elapsed_los", "sex"]]
    target = training["y"]

    # The "feature_columns" step name is what marks the pipeline as modern.
    steps = [
        ("feature_columns", FeatureColumnTransformer()),
        ("feature_transformer", create_column_transformer(features)),
        (
            "classifier",
            XGBClassifier(n_estimators=5, max_depth=2, eval_metric="logloss"),
        ),
    ]
    modern_pipeline = Pipeline(steps)
    modern_pipeline.fit(features, target)

    snapshots = pd.DataFrame(
        {
            "elapsed_los": pd.to_timedelta([30, 45], unit="m"),
            "sex": ["F", "M"],
        }
    )
    model_input = dataframe_for_classifier_predict_proba(modern_pipeline, snapshots)

    # The helper must leave the dtype alone for modern pipelines.
    still_timedelta = pd.api.types.is_timedelta64_dtype(model_input["elapsed_los"])
    self.assertTrue(still_timedelta)

    # The prepared frame must be accepted by the fitted pipeline, one row per snapshot.
    probabilities = modern_pipeline.predict_proba(model_input)
    self.assertEqual(probabilities.shape[0], 2)
389+
390+
def test_dataframe_for_classifier_predict_proba_legacy_seconds(self):
    """A pipeline without a ``feature_columns`` step receives float seconds."""
    # Legacy training layout: elapsed_los is already numeric seconds.
    training = pd.DataFrame(
        {
            "elapsed_los": [3600, 7200],
            "sex": ["M", "F"],
            "y": [0, 1],
        }
    )
    features = training[["elapsed_los", "sex"]]
    target = training["y"]

    encoder = ColumnTransformer(
        [
            ("oh", OneHotEncoder(handle_unknown="ignore"), ["sex"]),
            ("num", "passthrough", ["elapsed_los"]),
        ]
    )
    booster = XGBClassifier(n_estimators=5, max_depth=2, eval_metric="logloss")
    legacy_pipeline = Pipeline(
        [
            ("feature_transformer", encoder),
            ("classifier", booster),
        ]
    )
    legacy_pipeline.fit(features, target)

    # Inference snapshots arrive with elapsed_los as timedelta.
    snapshots = pd.DataFrame(
        {
            "elapsed_los": pd.to_timedelta([30, 60], unit="m"),
            "sex": ["F", "M"],
        }
    )
    model_input = dataframe_for_classifier_predict_proba(legacy_pipeline, snapshots)

    converted = model_input["elapsed_los"]
    self.assertTrue(
        pd.api.types.is_float_dtype(converted),
        msg="legacy path should convert elapsed_los to float seconds",
    )
    # 30 and 60 minutes expressed in seconds.
    np.testing.assert_allclose(
        converted.values, [1800.0, 3600.0], rtol=0, atol=1e-9
    )

    probabilities = legacy_pipeline.predict_proba(model_input)
    self.assertEqual(probabilities.shape[0], 2)
434+
352435

353436
if __name__ == "__main__":
354437
unittest.main()

0 commit comments

Comments
 (0)