-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmemory_evaluation.py
More file actions
113 lines (109 loc) · 3.28 KB
/
memory_evaluation.py
File metadata and controls
113 lines (109 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import argparse
from membase import (
DATASET_MAPPING,
EvaluationRunner,
EvaluationRunnerConfig,
)
from membase.utils import import_function_from_path
def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the evaluation script.

    Returns:
        An ``argparse.ArgumentParser`` configured with all evaluation options
        (search-results path, QA/judge models and batch sizes, optional custom
        context builder / prompt template import paths, dataset type, metrics).
    """
    parser = argparse.ArgumentParser(
        description="A script to evaluate the answers of the search results."
    )
    parser.add_argument(
        "--search-results-path",
        type=str,
        required=True,
        help="Path to the search results."
    )
    parser.add_argument(
        "--qa-model",
        type=str,
        default="gpt-4.1-mini",
        help="Model name or path for question answering."
    )
    parser.add_argument(
        "--judge-model",
        type=str,
        default="gpt-4.1-mini",
        help="Model name or path for judgment (exact match)."
    )
    parser.add_argument(
        "--qa-batch-size",
        type=int,
        default=4,
        help="Batch size for question-answering."
    )
    parser.add_argument(
        "--judge-batch-size",
        type=int,
        default=4,
        help="Batch size for judgment."
    )
    parser.add_argument(
        "--api-config-path",
        type=str,
        default=None,
        help="Path to the API config file."
    )
    parser.add_argument(
        "--context-builder",
        type=str,
        default=None,
        help=(
            "Import path for a custom context builder function that converts a list of "
            "memory entries into a context string. "
            "It accepts 'module.submodule.function' or 'path/to/file.py:function'."
        ),
    )
    parser.add_argument(
        "--prompt-template",
        type=str,
        default=None,
        help=(
            "Import path for a custom prompt template factory that returns a "
            "template with $question and $context placeholders. "
            "It accepts 'module.submodule.function' or 'path/to/file.py:function'."
        ),
    )
    parser.add_argument(
        "--add-question-timestamp",
        action="store_true",
        help="Append the question timestamp to the prompt.",
    )
    parser.add_argument(
        "--dataset-type",
        choices=list(DATASET_MAPPING.keys()),
        # Default to the mapping's first registered dataset type.
        default=next(iter(DATASET_MAPPING)),
        type=str,
        help="The type of the dataset used to evaluate the memory layer."
    )
    parser.add_argument(
        "--metrics",
        type=str,
        nargs="+",
        default=None,
        help="Metric names to compute.",
    )
    return parser


def main() -> None:
    """Parse CLI arguments, assemble the runner config, and run the evaluation."""
    args = _build_parser().parse_args()
    # Resolve optional user-supplied callables from their import-path strings;
    # None means "use the library default".
    context_builder = (
        import_function_from_path(args.context_builder)
        if args.context_builder is not None else None
    )
    prompt_template = (
        import_function_from_path(args.prompt_template)
        if args.prompt_template is not None else None
    )
    runner_config = EvaluationRunnerConfig(
        search_results_path=args.search_results_path,
        dataset_type=args.dataset_type,
        qa_model=args.qa_model,
        judge_model=args.judge_model,
        qa_batch_size=args.qa_batch_size,
        judge_batch_size=args.judge_batch_size,
        api_config_path=args.api_config_path,
        context_builder=context_builder,
        prompt_template=prompt_template,
        add_question_timestamp=args.add_question_timestamp,
        metrics=args.metrics,
    )
    EvaluationRunner(runner_config).run()


if __name__ == "__main__":
    main()