diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 8dd31d0..43f9f84 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -332,6 +332,7 @@ bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_ fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); const int n_segments = whisper_full_n_segments(ctx); + bool speaker_turned = false; for (int i = 0; i < n_segments; ++i) { const char * text = whisper_full_get_segment_text(ctx, i); std::string speaker = ""; @@ -343,6 +344,13 @@ bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_ speaker = estimate_diarization_speaker(pcmf32s, t0, t1); } + if (params.tinydiarize) { + if (speaker_turned) { + speaker.insert(0, "Speaker Change: "); + } + speaker_turned = whisper_full_get_segment_speaker_turn_next(ctx, i); + } + fout << speaker << text << "\n"; } @@ -361,6 +369,7 @@ bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_ fout << "WEBVTT\n\n"; const int n_segments = whisper_full_n_segments(ctx); + bool speaker_turned = false; for (int i = 0; i < n_segments; ++i) { const char * text = whisper_full_get_segment_text(ctx, i); const int64_t t0 = whisper_full_get_segment_t0(ctx, i); @@ -374,6 +383,13 @@ bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_ speaker.append(">"); } + if (params.tinydiarize) { + if (speaker_turned) { + speaker.insert(0, "Speaker Change: "); + } + speaker_turned = whisper_full_get_segment_speaker_turn_next(ctx, i); + } + fout << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n"; fout << speaker << text << "\n\n"; }