1 change: 1 addition & 0 deletions CHANGES.md
@@ -78,6 +78,7 @@
* Python examples added for CloudSQL enrichment handler on [Beam website](https://beam.apache.org/documentation/transforms/python/elementwise/enrichment-cloudsql/) (Python) ([#35473](https://github.com/apache/beam/issues/36095)).
* Support for batch mode execution in WriteToPubSub transform added (Python) ([#35990](https://github.com/apache/beam/issues/35990)).
* Added official support for Python 3.13 ([#34869](https://github.com/apache/beam/issues/34869)).
* Added Triton Inference Server ModelHandler for ML inference (Python) ([#36369](https://github.com/apache/beam/issues/36369)).

## Breaking Changes

222 changes: 222 additions & 0 deletions sdks/python/apache_beam/ml/inference/TritonModelHandler.py
@@ -0,0 +1,222 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Apache Beam ModelHandler implementation for Triton Inference Server."""

import json
import logging
from typing import Any, Callable, Dict, Iterable, Optional, Sequence

from apache_beam.ml.inference.base import ModelHandler, PredictionResult

try:
  import tritonserver
  from tritonserver import Model, Server
except ImportError:
  tritonserver = None

LOGGER = logging.getLogger(__name__)


class TritonModelWrapper:
"""Wrapper to manage Triton Server lifecycle with the model."""
def __init__(self, server: 'Server', model: 'Model'):
self.server = server
self.model = model
self._cleaned_up = False

def cleanup(self):
"""Explicitly cleanup server resources.

This method should be called when the model is no longer needed.
It's safe to call multiple times.
"""
if self._cleaned_up:
return

try:
if self.server:
self.server.stop()
self._cleaned_up = True
except Exception as e:
LOGGER.warning("Error stopping Triton server: %s", e)
raise

Comment on lines +55 to +57 (Contributor, high)

Instead of raising the exception, consider logging the error at the error level and then re-raising. This provides more detailed information for debugging purposes. Consider adding exception chaining to preserve the original traceback.

Suggested change:

    except Exception as e:
      LOGGER.error("Error stopping Triton server: %s", e, exc_info=True)  # Log the error with exc_info for traceback
      raise RuntimeError("Error stopping Triton server") from e  # Re-raise with exception chaining

  def __del__(self):
    """Clean up the server when the model is garbage collected.

    Note: __del__ is not guaranteed to be called. Prefer using cleanup()
    explicitly when possible.
    """
    if not self._cleaned_up:
      try:
        if self.server:
          self.server.stop()

Comment on lines +67 to +68 (Contributor, medium)

The __del__ method should set self._cleaned_up = True after stopping the server. This ensures that if cleanup() is called after __del__ has been invoked (e.g., during a complex shutdown sequence), it remains idempotent and does not attempt to stop an already-stopped server, which could lead to errors.

Suggested change:

        if self.server:
          self.server.stop()
          self._cleaned_up = True

        self._cleaned_up = True
      except Exception as e:
        LOGGER.warning("Error stopping Triton server in __del__: %s", e)


class TritonModelHandler(ModelHandler[Any, PredictionResult,
                                      TritonModelWrapper]):
  """Beam ModelHandler for Triton Inference Server.

  This handler supports loading models from a Triton model repository and
  running inference using the Triton Python API.

  Example usage::

    pcoll | RunInference(
        TritonModelHandler(
            model_repository="/workspace/models",
            model_name="my_model",
            input_tensor_name="input",
            output_tensor_name="output"
        )
    )

  Args:
    model_repository: Path to the Triton model repository directory.
    model_name: Name of the model to load from the repository.
    input_tensor_name: Name of the input tensor (default: "INPUT").
    output_tensor_name: Name of the output tensor (default: "OUTPUT").
    parse_output_fn: Optional custom function to parse model outputs.
      Should take (outputs_dict, output_tensor_name) and return the parsed
      result.
  """
  def __init__(
      self,
      model_repository: str,
      model_name: str,
      input_tensor_name: str = "INPUT",
      output_tensor_name: str = "OUTPUT",
      parse_output_fn: Optional[Callable] = None,
  ):
    if tritonserver is None:
      raise ImportError(
          "tritonserver is not installed. "
          "Install it with: pip install tritonserver")

    self._model_repository = model_repository
    self._model_name = model_name
    self._input_tensor_name = input_tensor_name
    self._output_tensor_name = output_tensor_name
    self._parse_output_fn = parse_output_fn

  def load_model(self) -> TritonModelWrapper:
    """Loads and initializes a Triton model for processing.

    Returns:
      TritonModelWrapper containing the server and model instances.

    Raises:
      RuntimeError: If the server fails to start or the model fails to load.
    """
    try:
      server = tritonserver.Server(model_repository=self._model_repository)
      server.start()
    except Exception as e:
      raise RuntimeError(
          f"Failed to start Triton server with repository "
          f"'{self._model_repository}': {e}") from e

Comment on lines +131 to +134 (Contributor, high)

Consider logging the exception with exc_info=True to include the traceback in the logs, which can be helpful for debugging.

Suggested change:

    except Exception as e:
      LOGGER.error("Failed to start Triton server", exc_info=True)
      raise RuntimeError(
          f"Failed to start Triton server with repository "
          f"'{self._model_repository}': {e}") from e

    try:
      model = server.model(self._model_name)
      if model is None:
        raise RuntimeError(
            f"Model '{self._model_name}' not found in repository")
    except Exception as e:
      server.stop()
      raise RuntimeError(
          f"Failed to load model '{self._model_name}': {e}") from e

Comment on lines +141 to +144 (Contributor, high)

Consider logging the exception with exc_info=True to include the traceback in the logs, which can be helpful for debugging.

Suggested change:

    except Exception as e:
      server.stop()
      LOGGER.error("Failed to load model", exc_info=True)
      raise RuntimeError(
          f"Failed to load model '{self._model_name}': {e}") from e

    return TritonModelWrapper(server, model)

  def run_inference(
      self,
      batch: Sequence[Any],
      model: TritonModelWrapper,
      inference_args: Optional[Dict[str, Any]] = None
  ) -> Iterable[PredictionResult]:
    """Runs inferences on a batch of inputs.

    Args:
      batch: A sequence of examples (can be strings, arrays, etc.).
      model: TritonModelWrapper returned by load_model().
      inference_args: Optional dict with 'input_tensor_name' and/or
        'output_tensor_name' to override defaults for this batch.

    Returns:
      An Iterable of PredictionResult objects.

    Raises:
      RuntimeError: If inference fails.
    """
    # Allow per-batch tensor name overrides
    input_name = self._input_tensor_name
    output_name = self._output_tensor_name
    if inference_args:
      input_name = inference_args.get('input_tensor_name', input_name)
      output_name = inference_args.get('output_tensor_name', output_name)

    try:
      responses = model.model.infer(inputs={input_name: batch})
    except Exception as e:
      raise RuntimeError(
          f"Triton inference failed for model '{self._model_name}': {e}") from e

Comment on lines +177 to +179 (Contributor, high)

Consider logging the exception with exc_info=True to include the traceback in the logs, which can be helpful for debugging.

Suggested change:

    except Exception as e:
      LOGGER.error("Triton inference failed", exc_info=True)
      raise RuntimeError(
          f"Triton inference failed for model '{self._model_name}': {e}") from e

    # Parse outputs
    predictions = []
    try:
      for response in responses:
        if output_name not in response.outputs:
          raise RuntimeError(
              f"Output tensor '{output_name}' not found in response. "
              f"Available outputs: {list(response.outputs.keys())}")

        output_tensor = response.outputs[output_name]

        # Use custom parser if provided
        if self._parse_output_fn:
          parsed = self._parse_output_fn(response.outputs, output_name)
        else:
          # Default parsing: try string array, fall back to raw output
          try:
            parsed = [
                json.loads(val)
                for val in output_tensor.to_string_array().tolist()
            ]
          except (json.JSONDecodeError, TypeError, AttributeError):
            # If JSON parsing fails, return raw output
            parsed = output_tensor.to_bytes_array().tolist()

        predictions.extend(parsed if isinstance(parsed, list) else [parsed])

    except Exception as e:
      raise RuntimeError(f"Failed to parse model outputs: {e}") from e

Comment on lines +208 to +209 (Contributor, high)

Consider logging the exception with exc_info=True to include the traceback in the logs, which can be helpful for debugging.

Suggested change:

    except Exception as e:
      LOGGER.error("Failed to parse model outputs", exc_info=True)
      raise RuntimeError(f"Failed to parse model outputs: {e}") from e

    if len(predictions) != len(batch):
      LOGGER.warning(
          "Prediction count (%d) doesn't match "
          "batch size (%d). Truncating or padding.",
          len(predictions),
          len(batch))

Comment on lines +211 to +216 (Contributor, high)

Silently truncating data when the number of predictions does not match the batch size can lead to data loss and hard-to-debug issues. It's safer to raise an exception in this case. The log message is also slightly misleading as it mentions "padding", which is not implemented.

    if len(predictions) != len(batch):
      raise RuntimeError(
          f"Prediction count ({len(predictions)}) doesn't match "
          f"batch size ({len(batch)}).")

Comment on lines +211 to +216 (Contributor, medium)

It might be beneficial to include the actual prediction and batch content in the warning message for debugging purposes. Consider truncating the content if it's too large.

Suggested change:

      LOGGER.warning(
          "Prediction count (%d) doesn't match "
          "batch size (%d). Truncating or padding. "
          "Predictions: %s, Batch: %s",
          len(predictions),
          len(batch),
          str(predictions[:100]),  # Truncate for large content
          str(batch[:100]))  # Truncate for large content

    return [PredictionResult(x, y) for x, y in zip(batch, predictions)]

  def get_metrics_namespace(self) -> str:
    """Returns namespace for metrics."""
    return "BeamML_Triton"