-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcross_task_validation.json
More file actions
61 lines (61 loc) · 2 KB
/
cross_task_validation.json
File metadata and controls
61 lines (61 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
{
"bundle_kind": "public_cross_task_validation_v1",
"generated_at": "2026-03-21T17:38:07.356450+00:00",
"task": "IFEval",
"metric": "strict_prompt_accuracy",
"rows": [
{
"surface_id": "frozen_authoritative_parent",
"label": "Frozen authoritative parent",
"official_ifeval": 0.780037,
"delta_vs_parent_ifeval": 0.0,
"official_logiqa": 0.303738
},
{
"surface_id": "current_no_trunk_ablation",
"label": "Current no-trunk ablation",
"official_ifeval": 0.780037,
"delta_vs_parent_ifeval": 0.0,
"official_logiqa": 0.303738
},
{
"surface_id": "three_family_promoted_surface",
"label": "Earlier three-family promoted surface",
"official_ifeval": 0.780037,
"delta_vs_parent_ifeval": 0.0,
"official_logiqa": 0.308411
},
{
"surface_id": "single_family_official_bc_support_b1",
"label": "Single-family official BC surface",
"official_ifeval": 0.780037,
"delta_vs_parent_ifeval": 0.0,
"official_logiqa": 0.34891
},
{
"surface_id": "single_family_official_dbb_bc_support_b1",
"label": "Single-family official DBB-BC surface",
"official_ifeval": 0.780037,
"delta_vs_parent_ifeval": 0.0,
"official_logiqa": 0.353583
},
{
"surface_id": "current_scientific_surface",
"label": "Current accepted host surface",
"official_ifeval": 0.780037,
"delta_vs_parent_ifeval": 0.0,
"official_logiqa": 0.392523
}
],
"summary": {
"parent_ifeval": 0.780037,
"current_ifeval": 0.780037,
"max_abs_delta_vs_parent_ifeval": 0.0,
"all_published_surfaces_match_parent_ifeval": true
},
"notes": [
"This bundle is a second-task non-regression check on the public authoritative IFEval path.",
"It shows that the published same-parent progression keeps the parent IFEval score while LogiQA changes.",
"It does not establish positive transfer or broad multi-task capability gains beyond the published boundary."
]
}