|
4 | 4 | from sklearn.pipeline import Pipeline |
5 | 5 | from sklearn.compose import ColumnTransformer |
6 | 6 | from sklearn.preprocessing import OneHotEncoder, StandardScaler |
| 7 | +from xgboost import XGBClassifier |
7 | 8 |
|
| 9 | +from patientflow.predict.emergency_demand import dataframe_for_classifier_predict_proba |
8 | 10 | from patientflow.train.classifiers import ( |
9 | 11 | FeatureColumnTransformer, |
10 | 12 | FeatureKind, |
@@ -349,6 +351,87 @@ def test_feature_column_transformer_timedelta_default(self): |
349 | 351 | msg="expected zero timedelta fill for missing column", |
350 | 352 | ) |
351 | 353 |
|
def test_dataframe_for_classifier_predict_proba_modern_keeps_timedelta(self):
    """Check that a modern pipeline is handed ``elapsed_los`` as a timedelta.

    A pipeline whose first step is named ``feature_columns`` is fitted on a
    four-row frame holding a timedelta ``elapsed_los`` column.  The frame
    returned by ``dataframe_for_classifier_predict_proba`` for a two-row
    snapshot must still carry a timedelta64 ``elapsed_los`` column, and the
    fitted pipeline must return one row of probabilities per snapshot row.
    """
    feature_names = ["elapsed_los", "sex"]
    training = pd.DataFrame(
        {
            "elapsed_los": pd.to_timedelta([1, 2, 3, 4], unit="h"),
            "sex": ["M", "F", "M", "F"],
            "y": [0, 1, 0, 1],
        }
    )
    features = training[feature_names]
    labels = training["y"]

    # A handful of shallow trees is enough: only the dtype of the prepared
    # frame and the number of output rows are asserted below.
    model = Pipeline(
        steps=[
            ("feature_columns", FeatureColumnTransformer()),
            ("feature_transformer", create_column_transformer(features)),
            (
                "classifier",
                XGBClassifier(n_estimators=5, max_depth=2, eval_metric="logloss"),
            ),
        ]
    )
    model.fit(features, labels)

    # Prediction-time snapshot, with durations given in minutes.
    snapshot = pd.DataFrame(
        {
            "elapsed_los": pd.to_timedelta([30, 45], unit="m"),
            "sex": ["F", "M"],
        }
    )
    ready = dataframe_for_classifier_predict_proba(model, snapshot)

    still_timedelta = pd.api.types.is_timedelta64_dtype(ready["elapsed_los"])
    self.assertTrue(still_timedelta)

    probabilities = model.predict_proba(ready)
    self.assertEqual(probabilities.shape[0], 2)
| 389 | + |
def test_dataframe_for_classifier_predict_proba_legacy_seconds(self):
    """Check that a legacy pipeline is handed ``elapsed_los`` as float seconds.

    The pipeline here has no ``feature_columns`` step and was fitted on
    ``elapsed_los`` values given as plain integers.  For a snapshot whose
    ``elapsed_los`` is a timedelta of 30 and 60 minutes, the prepared frame
    must hold a float column equal to 1800.0 and 3600.0, and the fitted
    pipeline must return one row of probabilities per snapshot row.
    """
    feature_names = ["elapsed_los", "sex"]
    training = pd.DataFrame(
        {
            "elapsed_los": [3600, 7200],
            "sex": ["M", "F"],
            "y": [0, 1],
        }
    )
    features = training[feature_names]
    labels = training["y"]

    # One-hot encode ``sex`` and pass the numeric ``elapsed_los`` through
    # unchanged; the pipeline deliberately has no ``feature_columns`` step.
    model = Pipeline(
        steps=[
            (
                "feature_transformer",
                ColumnTransformer(
                    transformers=[
                        ("oh", OneHotEncoder(handle_unknown="ignore"), ["sex"]),
                        ("num", "passthrough", ["elapsed_los"]),
                    ]
                ),
            ),
            (
                "classifier",
                XGBClassifier(n_estimators=5, max_depth=2, eval_metric="logloss"),
            ),
        ]
    )
    model.fit(features, labels)

    # Prediction-time snapshot still carries a timedelta column.
    snapshot = pd.DataFrame(
        {
            "elapsed_los": pd.to_timedelta([30, 60], unit="m"),
            "sex": ["F", "M"],
        }
    )
    ready = dataframe_for_classifier_predict_proba(model, snapshot)

    self.assertTrue(
        pd.api.types.is_float_dtype(ready["elapsed_los"]),
        msg="legacy path should convert elapsed_los to float seconds",
    )

    # 30 min and 60 min expressed in seconds.
    expected_seconds = [1800.0, 3600.0]
    np.testing.assert_allclose(
        ready["elapsed_los"].values, expected_seconds, rtol=0, atol=1e-9
    )

    probabilities = model.predict_proba(ready)
    self.assertEqual(probabilities.shape[0], 2)
| 434 | + |
352 | 435 |
|
353 | 436 | if __name__ == "__main__": |
354 | 437 | unittest.main() |
0 commit comments