1 change: 1 addition & 0 deletions CHANGES.md
@@ -78,6 +78,7 @@
* Python examples added for CloudSQL enrichment handler on [Beam website](https://beam.apache.org/documentation/transforms/python/elementwise/enrichment-cloudsql/) (Python) ([#35473](https://github.com/apache/beam/issues/36095)).
* Support for batch mode execution in WriteToPubSub transform added (Python) ([#35990](https://github.com/apache/beam/issues/35990)).
* Added official support for Python 3.13 ([#34869](https://github.com/apache/beam/issues/34869)).
* Added Triton Inference Server ModelHandler for ML inference (Python) ([#36369](https://github.com/apache/beam/issues/36369)).

## Breaking Changes

222 changes: 222 additions & 0 deletions sdks/python/apache_beam/ml/inference/TritonModelHandler.py
@@ -0,0 +1,222 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Apache Beam ModelHandler implementation for Triton Inference Server."""

import json
import logging
from typing import Any, Callable, Dict, Iterable, Optional, Sequence

from apache_beam.ml.inference.base import ModelHandler, PredictionResult

try:
  import tritonserver
  from tritonserver import Model, Server
except ImportError:
  tritonserver = None

LOGGER = logging.getLogger(__name__)


class TritonModelWrapper:
"""Wrapper to manage Triton Server lifecycle with the model."""
def __init__(self, server: 'Server', model: 'Model'):
self.server = server
self.model = model
self._cleaned_up = False

def cleanup(self):
"""Explicitly cleanup server resources.

This method should be called when the model is no longer needed.
It's safe to call multiple times.
"""
if self._cleaned_up:
return

try:
if self.server:
self.server.stop()
self._cleaned_up = True
except Exception as e:
LOGGER.warning("Error stopping Triton server: %s", e)
raise

Comment on lines +55 to +57 (Contributor, high)

Instead of raising the exception, consider logging the error at the error level and then re-raising. This provides more detailed information for debugging purposes. Consider adding exception chaining to preserve the original traceback.

Suggested change:

    except Exception as e:
      LOGGER.error("Error stopping Triton server: %s", e, exc_info=True)  # Log the error with exc_info for traceback
      raise RuntimeError("Error stopping Triton server") from e  # Re-raise with exception chaining

  def __del__(self):
    """Clean up the server when the model is garbage collected.

    Note: __del__ is not guaranteed to be called. Prefer using cleanup()
    explicitly when possible.
    """
    if not self._cleaned_up:
      try:
        if self.server:
          self.server.stop()

Comment on lines +67 to +68 (Contributor, medium)

The __del__ method should set self._cleaned_up = True after stopping the server. This ensures that if cleanup() is called after __del__ has been invoked (e.g., during a complex shutdown sequence), it remains idempotent and does not attempt to stop an already-stopped server, which could lead to errors.

Suggested change:

        if self.server:
          self.server.stop()
          self._cleaned_up = True

        self._cleaned_up = True
      except Exception as e:
        LOGGER.warning("Error stopping Triton server in __del__: %s", e)


class TritonModelHandler(ModelHandler[Any, PredictionResult,
                                      TritonModelWrapper]):
  """Beam ModelHandler for Triton Inference Server.

  This handler supports loading models from a Triton model repository and
  running inference using the Triton Python API.

  Example usage::

    pcoll | RunInference(
        TritonModelHandler(
            model_repository="/workspace/models",
            model_name="my_model",
            input_tensor_name="input",
            output_tensor_name="output"
        )
    )

  Args:
    model_repository: Path to the Triton model repository directory.
    model_name: Name of the model to load from the repository.
    input_tensor_name: Name of the input tensor (default: "INPUT").
    output_tensor_name: Name of the output tensor (default: "OUTPUT").
    parse_output_fn: Optional custom function to parse model outputs.
      Should take (outputs_dict, output_tensor_name) and return the parsed
      result.
  """
  def __init__(
      self,
      model_repository: str,
      model_name: str,
      input_tensor_name: str = "INPUT",
      output_tensor_name: str = "OUTPUT",
      parse_output_fn: Optional[Callable] = None,
  ):
    if tritonserver is None:
      raise ImportError(
          "tritonserver is not installed. "
          "Install it with: pip install tritonserver")

    self._model_repository = model_repository
    self._model_name = model_name
    self._input_tensor_name = input_tensor_name
    self._output_tensor_name = output_tensor_name
    self._parse_output_fn = parse_output_fn

  def load_model(self) -> TritonModelWrapper:
    """Loads and initializes a Triton model for processing.

    Returns:
      TritonModelWrapper containing the server and model instances.

    Raises:
      RuntimeError: If the server fails to start or the model fails to load.
    """
    try:
      server = tritonserver.Server(model_repository=self._model_repository)
      server.start()
    except Exception as e:
      raise RuntimeError(
          f"Failed to start Triton server with repository "
          f"'{self._model_repository}': {e}") from e

Comment on lines +131 to +134 (Contributor, high)

Consider logging the exception with exc_info=True to include the traceback in the logs, which can be helpful for debugging.

Suggested change:

    except Exception as e:
      LOGGER.error("Failed to start Triton server", exc_info=True)
      raise RuntimeError(
          f"Failed to start Triton server with repository "
          f"'{self._model_repository}': {e}") from e

    try:
      model = server.model(self._model_name)
      if model is None:
        raise RuntimeError(
            f"Model '{self._model_name}' not found in repository")
    except Exception as e:
      server.stop()
      raise RuntimeError(
          f"Failed to load model '{self._model_name}': {e}") from e

Comment on lines +141 to +144 (Contributor, high)

Consider logging the exception with exc_info=True to include the traceback in the logs, which can be helpful for debugging.

Suggested change:

    except Exception as e:
      server.stop()
      LOGGER.error("Failed to load model", exc_info=True)
      raise RuntimeError(
          f"Failed to load model '{self._model_name}': {e}") from e

    return TritonModelWrapper(server, model)

  def run_inference(
      self,
      batch: Sequence[Any],
      model: TritonModelWrapper,
      inference_args: Optional[Dict[str, Any]] = None
  ) -> Iterable[PredictionResult]:
    """Runs inferences on a batch of inputs.

    Args:
      batch: A sequence of examples (can be strings, arrays, etc.).
      model: TritonModelWrapper returned by load_model().
      inference_args: Optional dict with 'input_tensor_name' and/or
        'output_tensor_name' to override defaults for this batch.

    Returns:
      An Iterable of PredictionResult objects.

    Raises:
      RuntimeError: If inference fails.
    """
    # Allow per-batch tensor name overrides
    input_name = self._input_tensor_name
    output_name = self._output_tensor_name
    if inference_args:
      input_name = inference_args.get('input_tensor_name', input_name)
      output_name = inference_args.get('output_tensor_name', output_name)

    try:
      responses = model.model.infer(inputs={input_name: batch})
    except Exception as e:
      raise RuntimeError(
          f"Triton inference failed for model '{self._model_name}': {e}") from e

Comment on lines +177 to +179 (Contributor, high)

Consider logging the exception with exc_info=True to include the traceback in the logs, which can be helpful for debugging.

Suggested change:

    except Exception as e:
      LOGGER.error("Triton inference failed", exc_info=True)
      raise RuntimeError(
          f"Triton inference failed for model '{self._model_name}': {e}") from e

    # Parse outputs
    predictions = []
    try:
      for response in responses:
        if output_name not in response.outputs:
          raise RuntimeError(
              f"Output tensor '{output_name}' not found in response. "
              f"Available outputs: {list(response.outputs.keys())}")

        output_tensor = response.outputs[output_name]

        # Use custom parser if provided
        if self._parse_output_fn:
          parsed = self._parse_output_fn(response.outputs, output_name)
        else:
          # Default parsing: try string array, fall back to raw output
          try:
            parsed = [
                json.loads(val)
                for val in output_tensor.to_string_array().tolist()
            ]
          except (json.JSONDecodeError, TypeError, AttributeError):
            # If JSON parsing fails, return raw output
            parsed = output_tensor.to_bytes_array().tolist()

        predictions.extend(parsed if isinstance(parsed, list) else [parsed])

    except Exception as e:
      raise RuntimeError(f"Failed to parse model outputs: {e}") from e

Comment on lines +208 to +209 (Contributor, high)

Consider logging the exception with exc_info=True to include the traceback in the logs, which can be helpful for debugging.

Suggested change:

    except Exception as e:
      LOGGER.error("Failed to parse model outputs", exc_info=True)
      raise RuntimeError(f"Failed to parse model outputs: {e}") from e

    if len(predictions) != len(batch):
      LOGGER.warning(
          "Prediction count (%d) doesn't match "
          "batch size (%d). Truncating or padding.",
          len(predictions),
          len(batch))

Comment on lines +211 to +216 (Contributor, high)

Silently truncating data when the number of predictions does not match the batch size can lead to data loss and hard-to-debug issues. It's safer to raise an exception in this case. The log message is also slightly misleading as it mentions "padding", which is not implemented.

    if len(predictions) != len(batch):
      raise RuntimeError(
          f"Prediction count ({len(predictions)}) doesn't match "
          f"batch size ({len(batch)}).")

Comment on lines +211 to +216 (Contributor, medium)

It might be beneficial to include the actual prediction and batch content in the warning message for debugging purposes. Consider truncating the content if it's too large.

Suggested change:

      LOGGER.warning(
          "Prediction count (%d) doesn't match "
          "batch size (%d). Truncating or padding. "
          "Predictions: %s, Batch: %s",
          len(predictions),
          len(batch),
          str(predictions[:100]),  # Truncate for large content
          str(batch[:100]))  # Truncate for large content

    return [PredictionResult(x, y) for x, y in zip(batch, predictions)]

  def get_metrics_namespace(self) -> str:
    """Returns namespace for metrics."""
    return "BeamML_Triton"