Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/notebooks/2c_Evaluate_patient_snapshot_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ from patientflow.viz.shap import plot_shap
plot_shap(
trained_models,
test_visits,
exclude_from_training_data)
exclude_from_training_data=exclude_from_training_data)


```
Expand Down
60 changes: 34 additions & 26 deletions notebooks/2c_Evaluate_patient_snapshot_models.ipynb

Large diffs are not rendered by default.

64 changes: 31 additions & 33 deletions notebooks/3a_Prepare_group_snapshots.ipynb

Large diffs are not rendered by default.

21 changes: 10 additions & 11 deletions notebooks/3b_Evaluate_group_snapshots.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -58,7 +58,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -102,7 +102,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -145,7 +145,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -219,7 +219,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -277,7 +277,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -312,7 +312,6 @@
" df=test_visits, \n",
" prediction_time=_prediction_time, \n",
" single_snapshot_per_visit=False,\n",
" exclude_columns=exclude_from_training_data, \n",
" visit_col='visit_number'\n",
" )\n",
"\n",
Expand All @@ -338,7 +337,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -380,7 +379,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -435,7 +434,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -479,7 +478,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand Down
23 changes: 19 additions & 4 deletions src/patientflow/predict/emergency_demand.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,18 @@
)
from patientflow.model_artifacts import TrainedClassifier

warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
# SettingWithCopyWarning was removed in pandas 3.0 (CoW is now default)
if hasattr(pd.errors, "SettingWithCopyWarning"):
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)


def add_missing_columns(pipeline, df):
"""Add missing columns required by the prediction pipeline from the training data.

This is a legacy function for older model artifacts that don't include
FeatureColumnTransformer in their pipeline. For newer models, the pipeline
handles column selection automatically via the transformer step.

Parameters
----------
pipeline : sklearn.pipeline.Pipeline
Expand All @@ -78,6 +84,9 @@ def add_missing_columns(pipeline, df):
- latest_ : pd.NA
- arrival_method : "None"
- others : pd.NA

For newer models with FeatureColumnTransformer in the pipeline, this function
is not needed as the transformer handles column selection automatically.
"""
# check input data for missing columns
column_transformer = pipeline.named_steps["feature_transformer"]
Expand Down Expand Up @@ -435,9 +444,15 @@ def create_predictions(
pipeline = classifier.pipeline

# Add missing columns expected by the model
prediction_snapshots = add_missing_columns(pipeline, prediction_snapshots)

# Before we get predictions, we need to create a temp copy with the elapsed_los column in seconds
# For new models with FeatureColumnTransformer, the pipeline handles column selection automatically.
# For legacy models without the transformer, use the external helper function.
if "feature_columns" not in pipeline.named_steps:
# Legacy path: use external helper for older model artifacts
prediction_snapshots = add_missing_columns(pipeline, prediction_snapshots)

# Before we get predictions, we need to create a temp copy with the elapsed_los column in seconds.
# In the training data, elapsed_los is stored as seconds, so this conversion ensures
# the model sees the same representation at inference time.
prediction_snapshots_temp = prediction_snapshots.copy()
prediction_snapshots_temp["elapsed_los"] = prediction_snapshots_temp[
"elapsed_los"
Expand Down
19 changes: 19 additions & 0 deletions src/patientflow/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,25 @@ def prepare_patient_snapshots(
# Filter by the time of day while keeping the original index
df_tod = df[df["prediction_time"] == prediction_time].copy()

# Provide a helpful error message if no snapshots match the requested
# prediction_time. Downstream model training (e.g. time-series CV) fails
# with a less informative error when given an empty dataset.
if df_tod.empty:
available_times = sorted(df["prediction_time"].unique())
arg_type = type(prediction_time).__name__
col_dtype = df["prediction_time"].dtype
raise ValueError(
"No patient snapshots found for prediction_time "
f"{prediction_time}. "
"A common cause is a type/format mismatch between the inputs: "
f"(type of `prediction_time` argument: {arg_type}; "
f"dtype of `df['prediction_time']` column: {col_dtype}). "
"Check that the value you passed matches one of the "
f"available `prediction_time` values in the dataset: {available_times}. "
"If the types and formats match, another possibility is that there are "
"no visits with a snapshot at this time of day."
)

if single_snapshot_per_visit:
# Select one row for each visit
df_single = select_one_snapshot_per_visit(df_tod, visit_col)
Expand Down
Loading
Loading