Skip to content

Commit 4f56119

Browse files
committed
Align failed spans with OTel error.type
1 parent 539d104 commit 4f56119

10 files changed

Lines changed: 105 additions & 9 deletions

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ execution, state management, and lifecycle observation.
6161
- OpenTelemetry span attributes now use explicit identity names such as `junjo.executable_runtime_id`, `junjo.executable_structural_id`, and `junjo.enclosing_graph_structural_id` instead of the old generic `junjo.id` and `junjo.parent_id` keys.
6262
- OpenTelemetry and hook payloads now use `executable_definition_id` and `parent_executable_definition_id` instead of the older generic `definition_id` naming on those surfaces.
6363
- Workflow telemetry now records `junjo.workflow.execution_graph_snapshot` to make it explicit that the graph payload is an execution-scoped compiled snapshot containing both runtime and structural identities.
64+
- Failed workflow, subflow, node, concurrent, and hook-error spans now set the standard OpenTelemetry `error.type` attribute in addition to Junjo-specific error metadata.
6465
- `on_state_changed` hook payloads and state-change telemetry context now identify the active executable that performed the mutation, rather than mixing workflow metadata with node or subflow runtime identities.
6566
- Lifecycle observation examples and docs now show hook registration as a separate concern from workflow definition.
6667
- `_NestableWorkflow` remains documented in the generated API reference, but is no longer exported from the top-level `junjo` package for direct consumption.

docs/junjo_ai_studio.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,15 @@ Junjo-Specific Telemetry Attributes
326326

327327
Junjo automatically adds these attributes to OpenTelemetry spans:
328328

329+
When an executable span fails, Junjo also emits the standard OpenTelemetry
330+
error fields alongside the Junjo-specific attributes below:
331+
332+
- ``error.type``: Exception class name for the failed operation
333+
- span status ``Error``
334+
- the standard ``exception`` span event with exception details
335+
336+
Ordinary cancellations stay classified as cancellations rather than errors.
337+
329338
Workflow Spans
330339
--------------
331340

docs/opentelemetry.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,18 @@ Junjo's Custom Span Attributes
165165

166166
Junjo adds workflow-specific attributes to all spans. These work with any OTLP exporter:
167167

168+
Failed workflow, subflow, node, and concurrent-execution spans also follow the
169+
standard OpenTelemetry error contract in addition to the Junjo-specific fields
170+
below:
171+
172+
- ``error.type`` is set to the exception class name on failed spans.
173+
- span status is set to ``Error``.
174+
- the standard ``exception`` span event is recorded via OpenTelemetry's
175+
exception recording support.
176+
177+
Cancelled spans do not set ``error.type`` and are not marked with ``Error``
178+
status unless they actually fail.
179+
168180
Workflow/Subflow Span Attributes
169181
---------------------------------
170182

src/junjo/_lifecycle.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from . import hooks as hook_events
1313
from .telemetry.otel_schema import JUNJO_OTEL_MODULE_NAME, JunjoOtelSpanTypes
14+
from .telemetry.span_lifecycle import mark_span_failed
1415

1516
if TYPE_CHECKING:
1617
from .hooks import Hooks
@@ -673,5 +674,5 @@ def _record_hook_error(
673674
"junjo.parent_executable_structural_id",
674675
event.parent_executable_structural_id,
675676
)
676-
span.set_status(trace.StatusCode.ERROR, str(exc))
677+
mark_span_failed(span, exc)
677678
span.record_exception(exc)

src/junjo/node.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
from ._lifecycle import ActiveExecutableIdentity, active_executable_identity, get_active_executable_identity
99
from .store import StoreT
1010
from .telemetry.otel_schema import JUNJO_OTEL_MODULE_NAME, JunjoOtelSpanTypes
11-
from .telemetry.span_lifecycle import get_span_identifiers, mark_span_cancelled
11+
from .telemetry.span_lifecycle import (
12+
get_span_identifiers,
13+
mark_span_cancelled,
14+
mark_span_failed,
15+
)
1216
from .util import generate_safe_id
1317

1418

@@ -259,7 +263,7 @@ async def execute(self, store: StoreT, parent_id: str) -> None:
259263

260264
except Exception as exc:
261265
print("Error executing node service", exc)
262-
span.set_status(trace.StatusCode.ERROR, str(exc))
266+
mark_span_failed(span, exc)
263267
span.record_exception(exc)
264268
failure = exc
265269
if lifecycle_context is not None:

src/junjo/run_concurrent.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
from .node import Node
1010
from .store import BaseStore
1111
from .telemetry.otel_schema import JUNJO_OTEL_MODULE_NAME, JunjoOtelSpanTypes
12-
from .telemetry.span_lifecycle import get_span_identifiers, mark_span_cancelled
12+
from .telemetry.span_lifecycle import (
13+
get_span_identifiers,
14+
mark_span_cancelled,
15+
mark_span_failed,
16+
)
1317
from .util import generate_safe_id
1418

1519
if TYPE_CHECKING:
@@ -286,7 +290,7 @@ async def execute(self, store: BaseStore, parent_id: str) -> None: # noqa: C901
286290

287291
except Exception as exc:
288292
print(f"Error executing node service: {exc}")
289-
span.set_status(trace.StatusCode.ERROR, str(exc))
293+
mark_span_failed(span, exc)
290294
span.record_exception(exc)
291295
failure = exc
292296
if lifecycle_context is not None:

src/junjo/telemetry/span_lifecycle.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@
66
from opentelemetry.trace import Span
77

88

9+
def mark_span_failed(span: Span, exc: Exception) -> None:
10+
"""Annotate a span as failed using standard OpenTelemetry error fields."""
11+
12+
span.set_attribute("error.type", get_error_type(exc))
13+
span.set_status(trace.StatusCode.ERROR, str(exc))
14+
15+
916
def mark_span_cancelled(span: Span, exc: asyncio.CancelledError) -> None:
1017
"""Annotate a span as cancelled without treating cancellation as an error."""
1118

@@ -14,6 +21,12 @@ def mark_span_cancelled(span: Span, exc: asyncio.CancelledError) -> None:
1421
span.set_attribute("junjo.cancelled_reason", reason)
1522

1623

24+
def get_error_type(exc: BaseException) -> str:
25+
"""Return the stable OpenTelemetry ``error.type`` value for an exception."""
26+
27+
return type(exc).__name__
28+
29+
1730
def get_span_identifiers(span: Span) -> tuple[str, str]:
1831
context = span.get_span_context()
1932
return (

src/junjo/workflow.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,11 @@
2121
from .run_concurrent import RunConcurrent
2222
from .store import BaseStore, ParentStateT, ParentStoreT, StateT, StoreT
2323
from .telemetry.otel_schema import JUNJO_OTEL_MODULE_NAME, JunjoOtelSpanTypes
24-
from .telemetry.span_lifecycle import get_span_identifiers, mark_span_cancelled
24+
from .telemetry.span_lifecycle import (
25+
get_span_identifiers,
26+
mark_span_cancelled,
27+
mark_span_failed,
28+
)
2529
from .util import generate_safe_id
2630

2731
if TYPE_CHECKING:
@@ -370,7 +374,7 @@ async def execute( # noqa: C901
370374

371375
except Exception as exc:
372376
print(f"Error executing workflow: {exc}")
373-
span.set_status(trace.StatusCode.ERROR, str(exc))
377+
mark_span_failed(span, exc)
374378
span.record_exception(exc)
375379
failure = exc
376380

tests/test_hooks.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
1010
InMemorySpanExporter,
1111
)
12+
from opentelemetry.trace import StatusCode
1213

1314
from junjo import BaseState, BaseStore, Graph, Hooks, Node, RunConcurrent, Subflow, Workflow
1415

@@ -33,6 +34,11 @@ async def service(self, store: HookStore) -> None:
3334
return None
3435

3536

37+
class FailingNode(Node[HookStore]):
38+
async def service(self, store: HookStore) -> None:
39+
raise RuntimeError("boom")
40+
41+
3642
class ExampleSubflow(Subflow[HookState, HookStore, HookState, HookStore]):
3743
async def pre_run_actions(self, parent_store: HookStore, subflow_store: HookStore) -> None:
3844
parent_state = await parent_store.get_state()
@@ -482,8 +488,14 @@ def failing_hook(event) -> None:
482488

483489
hook_error_span = hook_error_spans[0]
484490
assert hook_error_span.attributes["junjo.hook.event"] == "workflow_started"
491+
assert hook_error_span.attributes["error.type"] == "RuntimeError"
485492
assert hook_error_span.attributes["junjo.hook.error.type"] == "RuntimeError"
486493
assert hook_error_span.attributes["junjo.hook.error.message"] == "bad hook"
494+
hook_error_exception = next(
495+
event for event in hook_error_span.events if event.name == "exception"
496+
)
497+
assert hook_error_exception.attributes["exception.type"] == "RuntimeError"
498+
assert hook_error_exception.attributes["exception.message"] == "bad hook"
487499
assert hook_error_span.attributes["junjo.run_id"] == result.run_id
488500
assert (
489501
hook_error_span.attributes["junjo.executable_definition_id"]
@@ -495,6 +507,34 @@ def failing_hook(event) -> None:
495507
hook_error_span.attributes["junjo.executable_structural_id"]
496508
== hook_error_span.attributes["junjo.enclosing_graph_structural_id"]
497509
)
510+
assert hook_error_span.status.status_code is StatusCode.ERROR
511+
512+
513+
@pytest.mark.asyncio
514+
async def test_failed_workflow_and_node_spans_emit_standard_error_type(
515+
span_exporter: InMemorySpanExporter,
516+
) -> None:
517+
failing_node = FailingNode()
518+
519+
workflow = Workflow[HookState, HookStore](
520+
name="Failing Workflow",
521+
graph_factory=lambda: Graph(source=failing_node, sinks=[failing_node], edges=[]),
522+
store_factory=lambda: HookStore(initial_state=HookState()),
523+
)
524+
525+
with pytest.raises(RuntimeError, match="boom"):
526+
await workflow.execute()
527+
528+
spans = {span.name: span for span in span_exporter.get_finished_spans()}
529+
workflow_span = spans["Failing Workflow"]
530+
node_span = spans["FailingNode"]
531+
532+
for span in (workflow_span, node_span):
533+
assert span.status.status_code is StatusCode.ERROR
534+
assert span.attributes["error.type"] == "RuntimeError"
535+
exception_event = next(event for event in span.events if event.name == "exception")
536+
assert exception_event.attributes["exception.type"] == "RuntimeError"
537+
assert exception_event.attributes["exception.message"] == "boom"
498538

499539

500540
def test_old_hook_manager_module_is_removed() -> None:

tests/test_run_concurrent_cancellation_telemetry.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,11 +79,19 @@ def create_run_concurrent_graph() -> Graph:
7979

8080
spans = {span.name: span for span in span_exporter.get_finished_spans()}
8181

82+
workflow_span = spans["Telemetry Workflow"]
83+
run_concurrent_span = spans["Concurrent Execution"]
8284
failing_span = spans["FailingNode"]
8385
sibling_span = spans["WaitingSiblingNode"]
8486

85-
assert failing_span.status.status_code is StatusCode.ERROR
86-
assert any(event.name == "exception" for event in failing_span.events)
87+
for span in (workflow_span, run_concurrent_span, failing_span):
88+
assert span.status.status_code is StatusCode.ERROR
89+
assert span.attributes["error.type"] == "RuntimeError"
90+
exception_event = next(event for event in span.events if event.name == "exception")
91+
assert exception_event.attributes["exception.type"] == "RuntimeError"
92+
assert exception_event.attributes["exception.message"] == "boom"
8793

8894
assert sibling_span.attributes["junjo.cancelled"] is True
8995
assert sibling_span.attributes["junjo.cancelled_reason"] == "sibling_failed"
96+
assert "error.type" not in sibling_span.attributes
97+
assert sibling_span.status.status_code is not StatusCode.ERROR

0 commit comments

Comments
 (0)