Skip to content

Commit e8a1556

Browse files
committed
chore: partial diarization example
1 parent 9d0614a commit e8a1556

39 files changed

Lines changed: 5617 additions & 0 deletions
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.10
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
using Flowthru.Core.Data;
2+
3+
namespace DiarizationExample.Data;
4+
5+
/// <summary>
/// Data catalog for the Diarization pipeline. Inputs are batch audio files
/// (one <see cref="Directory{T}"/> of <c>byte[]</c>); intermediate and output
/// items are flat row schemas keyed by <c>clip_id</c> (= the source file path).
/// </summary>
/// <remarks>
/// The class is <c>partial</c>; this declaration only holds the shared base
/// path and the constructor.
/// </remarks>
public partial class Catalog : CatalogAbstract
{
    /// <summary>Root directory that catalog item paths are built under.</summary>
    private readonly string _basePath;

    /// <summary>
    /// Creates a catalog rooted at <paramref name="basePath"/>.
    /// </summary>
    /// <param name="basePath">
    /// Root data directory. It is stored as given (no normalization) and is
    /// prefixed onto item paths such as <c>/_01_Raw/Datasets</c>.
    /// </param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="basePath"/> is <c>null</c>.
    /// </exception>
    public Catalog(string basePath)
    {
        // A null root would interpolate to an empty string and silently turn
        // every item path into a filesystem-root path ("/_01_Raw/Datasets"),
        // so fail fast at construction instead.
        System.ArgumentNullException.ThrowIfNull(basePath);

        _basePath = basePath;

        // NOTE(review): InitializeCatalogProperties is not defined in this
        // declaration (presumably inherited or generated). It is kept after
        // the assignment so anything it touches can already read _basePath.
        InitializeCatalogProperties();
    }
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
using Flowthru.Core.Data;
2+
3+
namespace DiarizationExample.Data;
4+
5+
public partial class Catalog
{
    /// <summary>
    /// Raw input recordings found under <c>_01_Raw/Datasets/</c>. Every
    /// directory entry is treated as one independent recording, keyed by its
    /// full file path; that key is what later rows carry as <c>clip_id</c>.
    /// </summary>
    /// <remarks>
    /// The pattern lists the common audio extensions. Exact input format is
    /// not critical: the <c>NormalizeAudio</c> step transcodes everything
    /// ffmpeg can decode to 16kHz mono PCM before Whisper or pyannote runs.
    /// <para>
    /// NOTE(review): the pattern uses brace alternation (<c>{a,b}</c>), which
    /// plain <c>System.IO</c> search patterns do not understand. Confirm that
    /// <c>BinaryDirectory</c> uses a matcher that expands braces.
    /// </para>
    /// </remarks>
    public IItem<Directory<byte[]>> AudioClips => CreateItem(
        () => ItemFactory.Enumerable.BinaryDirectory(
            label: "AudioClips",
            directoryPath: $"{_basePath}/_01_Raw/Datasets",
            filePattern: "*.{wav,mp3,m4a,flac,ogg}"));
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
using Flowthru.Core.Data;
2+
3+
namespace DiarizationExample.Data;
4+
5+
public partial class Catalog
{
    /// <summary>
    /// WAV files under <c>_02_Intermediate/normalized</c> holding the audio
    /// after normalization to 16kHz mono PCM.
    /// </summary>
    /// <remarks>
    /// Whisper and pyannote both expect this format. Transcoding once here
    /// lets the downstream steps run in parallel without each repeating the
    /// same conversion.
    /// </remarks>
    public IItem<Directory<byte[]>> NormalizedAudio => CreateItem(
        () => ItemFactory.Enumerable.BinaryDirectory(
            label: "NormalizedAudio",
            directoryPath: $"{_basePath}/_02_Intermediate/normalized",
            filePattern: "*.wav"));
}

examples/archived/DiarizationExample/Data/_02_Intermediate/normalized/.keep

Whitespace-only changes.
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
using DiarizationExample.Data._03_Primary.Schemas;
2+
using Flowthru.Core.Data;
3+
4+
namespace DiarizationExample.Data;
5+
6+
public partial class Catalog
{
    /// <summary>
    /// Whisper transcript segments, one row per (clip_id, start, end) span,
    /// stored at <c>_03_Primary/transcripts.parquet</c>.
    /// </summary>
    /// <remarks>
    /// Kept as Parquet so a re-run can skip transcription when the upstream
    /// audio has not changed.
    /// </remarks>
    public IItem<IEnumerable<TranscriptSegmentSchema>> Transcripts => CreateItem(
        () => ItemFactory.Enumerable.Parquet<TranscriptSegmentSchema>(
            label: "Transcripts",
            filePath: $"{_basePath}/_03_Primary/transcripts.parquet"));

    /// <summary>
    /// pyannote diarization turns, one row per (clip_id, start, end, speaker),
    /// stored at <c>_03_Primary/diarization.parquet</c>.
    /// </summary>
    /// <remarks>
    /// Speaker indices are only meaningful within a single clip
    /// (speaker_0, speaker_1, ...); matching speakers across clips is out of
    /// scope for this example.
    /// </remarks>
    public IItem<IEnumerable<DiarizationSegmentSchema>> DiarizationTurns => CreateItem(
        () => ItemFactory.Enumerable.Parquet<DiarizationSegmentSchema>(
            label: "DiarizationTurns",
            filePath: $"{_basePath}/_03_Primary/diarization.parquet"));
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
using Flowthru.Core.Abstractions;
2+
3+
namespace DiarizationExample.Data._03_Primary.Schemas;
4+
5+
/// <summary>
/// A single pyannote speaker turn belonging to one input clip.
/// </summary>
/// <remarks>
/// <see cref="SpeakerId"/> is scoped to its clip (e.g., <c>speaker_0</c>,
/// <c>speaker_1</c>); identifying the same speaker across clips is out of
/// scope for this example.
/// </remarks>
[FlowthruSchema]
public partial record DiarizationSegmentSchema
{
    /// <summary>Identifier of the clip this turn belongs to; serialized as <c>clip_id</c>.</summary>
    [SerializedLabel("clip_id")]
    public string ClipId { get; init; } = null!;

    /// <summary>
    /// Start of the turn; serialized as <c>start</c>.
    /// NOTE(review): units are not shown here — presumably seconds into the clip; confirm.
    /// </summary>
    [SerializedLabel("start")]
    public double Start { get; init; }

    /// <summary>
    /// End of the turn; serialized as <c>end</c>.
    /// NOTE(review): same unit assumption as <see cref="Start"/> — confirm.
    /// </summary>
    [SerializedLabel("end")]
    public double End { get; init; }

    /// <summary>Clip-local speaker label; serialized as <c>speaker_id</c>.</summary>
    [SerializedLabel("speaker_id")]
    public string SpeakerId { get; init; } = null!;
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
using Flowthru.Core.Abstractions;
2+
3+
namespace DiarizationExample.Data._03_Primary.Schemas;
4+
5+
/// <summary>
/// A single Whisper transcript segment belonging to one input clip.
/// </summary>
/// <remarks>
/// <see cref="ClipId"/> is the source audio's file path (the
/// <c>Directory&lt;byte[]&gt;</c> key). One clip yields many rows.
/// </remarks>
[FlowthruSchema]
public partial record TranscriptSegmentSchema
{
    /// <summary>Source audio file path identifying the clip; serialized as <c>clip_id</c>.</summary>
    [SerializedLabel("clip_id")]
    public string ClipId { get; init; } = null!;

    /// <summary>
    /// Start of the segment; serialized as <c>start</c>.
    /// NOTE(review): units are not shown here — presumably seconds into the clip; confirm.
    /// </summary>
    [SerializedLabel("start")]
    public double Start { get; init; }

    /// <summary>
    /// End of the segment; serialized as <c>end</c>.
    /// NOTE(review): same unit assumption as <see cref="Start"/> — confirm.
    /// </summary>
    [SerializedLabel("end")]
    public double End { get; init; }

    /// <summary>Transcribed text for the segment; serialized as <c>text</c>.</summary>
    [SerializedLabel("text")]
    public string Text { get; init; } = null!;
}

examples/archived/DiarizationExample/Data/_04_Feature/Catalog.Feature.cs

Lines changed: 20 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
using Flowthru.Core.Abstractions;
2+
3+
namespace DiarizationExample.Data._04_Feature.Schemas;
4+
5+
/// <summary>
/// A transcript segment paired with its dominant speaker.
/// </summary>
/// <remarks>
/// Produced by the alignment step: for each transcript span, the speaker is
/// taken from whichever diarization turn covers the largest fraction of
/// that segment.
/// </remarks>
[FlowthruSchema]
public partial record AttributedSegmentSchema
{
    /// <summary>Identifier of the clip this segment belongs to; serialized as <c>clip_id</c>.</summary>
    [SerializedLabel("clip_id")]
    public string ClipId { get; init; } = null!;

    /// <summary>
    /// Start of the segment; serialized as <c>start</c>.
    /// NOTE(review): units are not shown here — presumably seconds into the clip; confirm.
    /// </summary>
    [SerializedLabel("start")]
    public double Start { get; init; }

    /// <summary>
    /// End of the segment; serialized as <c>end</c>.
    /// NOTE(review): same unit assumption as <see cref="Start"/> — confirm.
    /// </summary>
    [SerializedLabel("end")]
    public double End { get; init; }

    /// <summary>Dominant speaker assigned to the segment; serialized as <c>speaker_id</c>.</summary>
    [SerializedLabel("speaker_id")]
    public string SpeakerId { get; init; } = null!;

    /// <summary>Transcribed text for the segment; serialized as <c>text</c>.</summary>
    [SerializedLabel("text")]
    public string Text { get; init; } = null!;
}

0 commit comments

Comments
 (0)