caption quality evaluation#1980
Conversation
Greptile SummaryThis PR adds a two-script evaluation suite (
Confidence Score: 3/5The two new scripts have bugs that would crash on script restarts or silently produce wrong evaluation results; fixes needed before use in a production eval pipeline. All symlink creation sites in build_benchmark_dataset.py use os.path.exists instead of os.path.lexists, causing FileExistsError crashes whenever a broken symlink is left behind by a partial run. In caption_clipscore.py, loading cached summaries with --load-summaries silently substitutes an empty string for any missing (uid, label) entry, then feeds that to the text encoder, yielding a meaningless cosine-similarity score with no warning. These two issues affect both core workflows. Both build_benchmark_dataset.py (symlink guards) and caption_clipscore.py (empty-summary fallback in the cached-load path) need attention before this tooling is relied upon for regression tracking. Important Files Changed
Sequence DiagramsequenceDiagram
participant User
participant build_benchmark_dataset.py
participant video_split_clip_example.py
participant caption_clipscore.py
participant CosmosEmbed1
participant vLLM
User->>build_benchmark_dataset.py: --video-dir, --output-dir, --num-clusters
build_benchmark_dataset.py->>build_benchmark_dataset.py: Sample 3000 videos
build_benchmark_dataset.py->>video_split_clip_example.py: subprocess split + embed
video_split_clip_example.py-->>build_benchmark_dataset.py: clips + embeddings + metas
build_benchmark_dataset.py->>build_benchmark_dataset.py: "K-means K=200"
build_benchmark_dataset.py->>build_benchmark_dataset.py: Select 1 clip per cluster
User->>video_split_clip_example.py: per-model caption run
video_split_clip_example.py-->>User: captions metas
User->>caption_clipscore.py: --embedding-dir, --caption-dirs, --uid-list
caption_clipscore.py->>caption_clipscore.py: Intersect UIDs
alt --load-summaries
caption_clipscore.py->>caption_clipscore.py: Load cached summaries
else --summarizer-model
caption_clipscore.py->>vLLM: Batch summarize captions
vLLM-->>caption_clipscore.py: Summaries
end
caption_clipscore.py->>CosmosEmbed1: get_text_embedding per clip
CosmosEmbed1-->>caption_clipscore.py: text embeddings
caption_clipscore.py->>caption_clipscore.py: cosine_sim per clip
caption_clipscore.py-->>User: results.csv + mean scores
Reviews (1): Last reviewed commit: "eval" | Re-trigger Greptile |
| # Symlink embedding | ||
| src = f"{emb_dir}/{uid}.pickle" | ||
| dst = f"{output_dir}/ce1_embd/{uid}.pickle" | ||
| if os.path.exists(src) and not os.path.exists(dst): | ||
| os.symlink(os.path.abspath(src), dst) | ||
|
|
||
| # Symlink clip | ||
| src = f"{clip_dir}/{uid}.mp4" | ||
| dst = f"{output_dir}/clips/{uid}.mp4" | ||
| if os.path.exists(src) and not os.path.exists(dst): | ||
| os.symlink(os.path.abspath(src), dst) | ||
|
|
||
| # Symlink source video | ||
| src_video = meta.get("source_video", "") | ||
| if src_video and os.path.exists(src_video): | ||
| vid_name = os.path.basename(src_video) | ||
| dst = f"{output_dir}/input/{vid_name}" | ||
| if vid_name not in input_videos_linked and not os.path.exists(dst): | ||
| os.symlink(os.path.abspath(src_video), dst) | ||
| input_videos_linked.add(vid_name) |
There was a problem hiding this comment.
os.path.exists() returns False for broken symlinks, so if a prior run left a broken symlink at dst (e.g. the pipeline output was later cleaned up), the guard passes and os.symlink() immediately raises FileExistsError: [Errno 17] File exists. This affects all three symlink creation blocks (lines 211, 217, 225) and would crash a restart of the script. Replace os.path.exists(dst) with os.path.lexists(dst), which returns True for both valid and broken symlinks.
| # Symlink embedding | |
| src = f"{emb_dir}/{uid}.pickle" | |
| dst = f"{output_dir}/ce1_embd/{uid}.pickle" | |
| if os.path.exists(src) and not os.path.exists(dst): | |
| os.symlink(os.path.abspath(src), dst) | |
| # Symlink clip | |
| src = f"{clip_dir}/{uid}.mp4" | |
| dst = f"{output_dir}/clips/{uid}.mp4" | |
| if os.path.exists(src) and not os.path.exists(dst): | |
| os.symlink(os.path.abspath(src), dst) | |
| # Symlink source video | |
| src_video = meta.get("source_video", "") | |
| if src_video and os.path.exists(src_video): | |
| vid_name = os.path.basename(src_video) | |
| dst = f"{output_dir}/input/{vid_name}" | |
| if vid_name not in input_videos_linked and not os.path.exists(dst): | |
| os.symlink(os.path.abspath(src_video), dst) | |
| input_videos_linked.add(vid_name) | |
| # Symlink embedding | |
| src = f"{emb_dir}/{uid}.pickle" | |
| dst = f"{output_dir}/ce1_embd/{uid}.pickle" | |
| if os.path.exists(src) and not os.path.lexists(dst): | |
| os.symlink(os.path.abspath(src), dst) | |
| # Symlink clip | |
| src = f"{clip_dir}/{uid}.mp4" | |
| dst = f"{output_dir}/clips/{uid}.mp4" | |
| if os.path.exists(src) and not os.path.lexists(dst): | |
| os.symlink(os.path.abspath(src), dst) | |
| # Symlink source video | |
| src_video = meta.get("source_video", "") | |
| if src_video and os.path.exists(src_video): | |
| vid_name = os.path.basename(src_video) | |
| dst = f"{output_dir}/input/{vid_name}" | |
| if vid_name not in input_videos_linked and not os.path.lexists(dst): | |
| os.symlink(os.path.abspath(src_video), dst) | |
| input_videos_linked.add(vid_name) |
| if not os.path.exists(dst): | ||
| os.symlink(os.path.abspath(src), dst) |
There was a problem hiding this comment.
The same broken-symlink race is present in
main(): if dst is a broken symlink from a prior run, os.path.exists(dst) returns False and os.symlink() throws FileExistsError. Use os.path.lexists(dst) here too.
| if not os.path.exists(dst): | |
| os.symlink(os.path.abspath(src), dst) | |
| if not os.path.lexists(dst): | |
| os.symlink(os.path.abspath(src), dst) |
| print(f"\nLoading cached summaries from: {load_summaries}") | ||
| with open(load_summaries) as f: | ||
| summary_cache = json.load(f) | ||
| summaries = [summary_cache.get(uid, {}).get(label, "") for uid, label, _ in tasks] |
There was a problem hiding this comment.
When
--load-summaries is used and a (uid, label) pair is absent from the cache (e.g. when scoring a new model against an old cache that only contains other models), the summary silently becomes "". That empty string is later passed to model.get_text_embedding(""), which produces an arbitrary or near-zero embedding, and the resulting cosine-similarity score is silently wrong. A warning when the summary is missing will surface this immediately rather than producing a quietly incorrect result.
| summaries = [summary_cache.get(uid, {}).get(label, "") for uid, label, _ in tasks] | |
| summaries = [] | |
| missing = 0 | |
| for uid, label, _ in tasks: | |
| s = summary_cache.get(uid, {}).get(label, "") | |
| if not s: | |
| missing += 1 | |
| summaries.append(s) | |
| if missing: | |
| print(f" Warning: {missing} (uid, label) pairs have no cached summary and will produce invalid scores.") |
| for i, (uid, label, _caption) in enumerate(tqdm(tasks, unit="cap")): | ||
| with open(f"{embedding_dir}/{uid}.pickle", "rb") as f: | ||
| arr = pickle.load(f) # noqa: S301 | ||
| vid_emb = torch.from_numpy(arr).squeeze(0) | ||
| text_emb = model.get_text_embedding(summaries[i]).squeeze(0) | ||
| clip_scores.setdefault(uid, {})[label] = _cosine_sim(vid_emb, text_emb) |
There was a problem hiding this comment.
Redundant embedding loads per model label
The loop opens and unpickles {uid}.pickle once per task, meaning the same file is read n_models times for every clip (e.g. 3x for 3 models). With 200 clips and 3 models that is 600 reads instead of 200. Consider caching the video embedding per uid inside the loop to avoid the repeated I/O.
Description
doc link
Usage
# Add snippet demonstrating usageChecklist