Akash Mahajan ggerganov committed on
Commit
eedbf0c
·
unverified ·
1 Parent(s): 982cdf8

whisper : support speaker segmentation (local diarization) of mono audio via tinydiarize (#1058)

Browse files

* add HuggingFace mirror to download ggml model

* support tdrz via simple hack overriding solm tokens

* fix incorrect translate/transcribe token_ids that are not static const

* add apollo 13 sample for tdrz demo

* render [SPEAKER TURN] consistently in all terminal output using vocab.id_to_token

* extend whisper_segment with speaker_turn_next field and save in json output

* fix failing go build

* slipped in some python syntax whoops

* whisper : finalize tinydiarize support (add flag + fixes)

* whisper : tdrz support for word-level timestamps (respect max_len)

* java : try to fix tests after adding tdrz_enable flag

* main : remove TODO leftover

* java : fix params order list after adding "tdrz_enable"

* whisper : fix solm and add nosp token

* main : print tinydiarize help

---------

Co-authored-by: Georgi Gerganov <[email protected]>

Makefile CHANGED
@@ -308,12 +308,16 @@ samples:
308
  @wget --quiet --show-progress -O samples/gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
309
  @wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
310
  @wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
 
311
  @echo "Converting to 16-bit WAV ..."
312
  @ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
313
  @ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
314
  @ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav
 
315
  @ffmpeg -loglevel -0 -y -i samples/mm1.wav -ar 16000 -ac 1 -c:a pcm_s16le samples/mm0.wav
316
  @rm samples/mm1.wav
 
 
317
 
318
  #
319
  # Models
 
308
  @wget --quiet --show-progress -O samples/gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
309
  @wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
310
  @wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
311
+ @wget --quiet --show-progress -O samples/a13.mp3 https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3
312
  @echo "Converting to 16-bit WAV ..."
313
  @ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
314
  @ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
315
  @ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav
316
+ @rm samples/*.ogg
317
  @ffmpeg -loglevel -0 -y -i samples/mm1.wav -ar 16000 -ac 1 -c:a pcm_s16le samples/mm0.wav
318
  @rm samples/mm1.wav
319
+ @ffmpeg -loglevel -0 -y -i samples/a13.mp3 -ar 16000 -ac 1 -c:a pcm_s16le -ss 00:00:00 -to 00:00:30 samples/a13.wav
320
+ @rm samples/a13.mp3
321
 
322
  #
323
  # Models
bindings/go/whisper.go CHANGED
@@ -270,13 +270,13 @@ func (ctx *Context) Whisper_token_lang(lang_id int) Token {
270
  }
271
 
272
  // Task tokens
273
- func Whisper_token_translate() Token {
274
- return Token(C.whisper_token_translate())
275
  }
276
 
277
  // Task tokens
278
- func Whisper_token_transcribe() Token {
279
- return Token(C.whisper_token_transcribe())
280
  }
281
 
282
  // Performance information
 
270
  }
271
 
272
  // Task tokens
273
+ func (ctx *Context) Whisper_token_translate() Token {
274
+ return Token(C.whisper_token_translate((*C.struct_whisper_context)(ctx)))
275
  }
276
 
277
  // Task tokens
278
+ func (ctx *Context) Whisper_token_transcribe() Token {
279
+ return Token(C.whisper_token_transcribe((*C.struct_whisper_context)(ctx)))
280
  }
281
 
282
  // Performance information
bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java CHANGED
@@ -224,8 +224,8 @@ public interface WhisperCppJnaLibrary extends Library {
224
  int whisper_token_lang(Pointer ctx, int lang_id);
225
 
226
  // Task tokens
227
- int whisper_token_translate();
228
- int whisper_token_transcribe();
229
 
230
  // Performance information from the default state.
231
  void whisper_print_timings(Pointer ctx);
 
224
  int whisper_token_lang(Pointer ctx, int lang_id);
225
 
226
  // Task tokens
227
+ int whisper_token_translate (Pointer ctx);
228
+ int whisper_token_transcribe(Pointer ctx);
229
 
230
  // Performance information from the default state.
231
  void whisper_print_timings(Pointer ctx);
bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java CHANGED
@@ -137,6 +137,14 @@ public class WhisperFullParams extends Structure {
137
  /** Overwrite the audio context size (0 = use default). */
138
  public int audio_ctx;
139
 
 
 
 
 
 
 
 
 
140
  /** Tokens to provide to the whisper decoder as an initial prompt.
141
  * These are prepended to any existing text context from a previous call. */
142
  public String initial_prompt;
@@ -302,7 +310,7 @@ public class WhisperFullParams extends Structure {
302
  "no_context", "single_segment",
303
  "print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
304
  "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
305
- "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
306
  "suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
307
  "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
308
  "new_segment_callback", "new_segment_callback_user_data",
 
137
  /** Overwrite the audio context size (0 = use default). */
138
  public int audio_ctx;
139
 
140
+ /** Enable tinydiarize (default = false) */
141
+ public CBool tdrz_enable;
142
+
143
+ /** Enable tinydiarize (default = false) */
144
+ public void tdrzEnable(boolean enable) {
145
+ tdrz_enable = enable ? CBool.TRUE : CBool.FALSE;
146
+ }
147
+
148
  /** Tokens to provide to the whisper decoder as an initial prompt.
149
  * These are prepended to any existing text context from a previous call. */
150
  public String initial_prompt;
 
310
  "no_context", "single_segment",
311
  "print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
312
  "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
313
+ "tdrz_enable", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
314
  "suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
315
  "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
316
  "new_segment_callback", "new_segment_callback_user_data",
examples/main/main.cpp CHANGED
@@ -68,28 +68,32 @@ struct whisper_params {
68
  float entropy_thold = 2.40f;
69
  float logprob_thold = -1.00f;
70
 
71
- bool speed_up = false;
72
- bool translate = false;
73
- bool detect_language= false;
74
- bool diarize = false;
75
- bool split_on_word = false;
76
- bool no_fallback = false;
77
- bool output_txt = false;
78
- bool output_vtt = false;
79
- bool output_srt = false;
80
- bool output_wts = false;
81
- bool output_csv = false;
82
- bool output_jsn = false;
83
- bool output_lrc = false;
84
- bool print_special = false;
85
- bool print_colors = false;
86
- bool print_progress = false;
87
- bool no_timestamps = false;
88
-
89
- std::string language = "en";
 
90
  std::string prompt;
91
  std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
92
- std::string model = "models/ggml-base.en.bin";
 
 
 
93
 
94
  std::vector<std::string> fname_inp = {};
95
  std::vector<std::string> fname_out = {};
@@ -115,41 +119,42 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
115
  whisper_print_usage(argc, argv, params);
116
  exit(0);
117
  }
118
- else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
119
- else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
120
- else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(argv[++i]); }
121
- else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(argv[++i]); }
122
- else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
123
- else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
124
- else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
125
- else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
126
- else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
127
- else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
128
- else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
129
- else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
130
- else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
131
- else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
132
- else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
133
- else if (arg == "-sow" || arg == "--split-on-word") { params.split_on_word = true; }
134
- else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
135
- else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
136
- else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
137
- else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
138
- else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
139
- else if (arg == "-olrc" || arg == "--output-lrc") { params.output_lrc = true; }
140
- else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
141
- else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
142
- else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
143
- else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
144
- else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
145
- else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
146
- else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
147
- else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
148
- else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
149
- else if (arg == "-dl" || arg == "--detect-language"){ params.detect_language= true; }
150
- else if ( arg == "--prompt") { params.prompt = argv[++i]; }
151
- else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
152
- else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
 
153
  else {
154
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
155
  whisper_print_usage(argc, argv, params);
@@ -182,6 +187,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
182
  fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
183
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
184
  fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
 
185
  fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
186
  fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
187
  fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
@@ -297,6 +303,12 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
297
  printf("%s%s", speaker.c_str(), text);
298
  }
299
 
 
 
 
 
 
 
300
  // with timestamps or speakers: each segment on new line
301
  if (!params.no_timestamps || params.diarize) {
302
  printf("\n");
@@ -564,6 +576,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
564
  const int n_segments = whisper_full_n_segments(ctx);
565
  for (int i = 0; i < n_segments; ++i) {
566
  const char * text = whisper_full_get_segment_text(ctx, i);
 
567
  const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
568
  const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
569
 
@@ -576,11 +589,15 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
576
  value_i("from", t0 * 10, false);
577
  value_i("to", t1 * 10, true);
578
  end_obj(false);
579
- value_s("text", text, !params.diarize);
580
 
581
  if (params.diarize && pcmf32s.size() == 2) {
582
  value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
583
  }
 
 
 
 
584
  end_obj(i == (n_segments - 1));
585
  }
586
 
@@ -777,6 +794,12 @@ int main(int argc, char ** argv) {
777
  exit(0);
778
  }
779
 
 
 
 
 
 
 
780
  // whisper init
781
 
782
  struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
@@ -818,11 +841,12 @@ int main(int argc, char ** argv) {
818
  if (params.detect_language) {
819
  params.language = "auto";
820
  }
821
- fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
822
  __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
823
  params.n_threads, params.n_processors,
824
  params.language.c_str(),
825
  params.translate ? "translate" : "transcribe",
 
826
  params.no_timestamps ? 0 : 1);
827
 
828
  fprintf(stderr, "\n");
@@ -853,6 +877,8 @@ int main(int argc, char ** argv) {
853
 
854
  wparams.speed_up = params.speed_up;
855
 
 
 
856
  wparams.initial_prompt = params.prompt.c_str();
857
 
858
  wparams.greedy.best_of = params.best_of;
 
68
  float entropy_thold = 2.40f;
69
  float logprob_thold = -1.00f;
70
 
71
+ bool speed_up = false;
72
+ bool translate = false;
73
+ bool detect_language = false;
74
+ bool diarize = false;
75
+ bool tinydiarize = false;
76
+ bool split_on_word = false;
77
+ bool no_fallback = false;
78
+ bool output_txt = false;
79
+ bool output_vtt = false;
80
+ bool output_srt = false;
81
+ bool output_wts = false;
82
+ bool output_csv = false;
83
+ bool output_jsn = false;
84
+ bool output_lrc = false;
85
+ bool print_special = false;
86
+ bool print_colors = false;
87
+ bool print_progress = false;
88
+ bool no_timestamps = false;
89
+
90
+ std::string language = "en";
91
  std::string prompt;
92
  std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
93
+ std::string model = "models/ggml-base.en.bin";
94
+
95
+ // [TDRZ] speaker turn string
96
+ std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
97
 
98
  std::vector<std::string> fname_inp = {};
99
  std::vector<std::string> fname_out = {};
 
119
  whisper_print_usage(argc, argv, params);
120
  exit(0);
121
  }
122
+ else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
123
+ else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
124
+ else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(argv[++i]); }
125
+ else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(argv[++i]); }
126
+ else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
127
+ else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
128
+ else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
129
+ else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
130
+ else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
131
+ else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
132
+ else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
133
+ else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
134
+ else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
135
+ else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
136
+ else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
137
+ else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
138
+ else if (arg == "-sow" || arg == "--split-on-word") { params.split_on_word = true; }
139
+ else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
140
+ else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
141
+ else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
142
+ else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
143
+ else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
144
+ else if (arg == "-olrc" || arg == "--output-lrc") { params.output_lrc = true; }
145
+ else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
146
+ else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
147
+ else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
148
+ else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
149
+ else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
150
+ else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
151
+ else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
152
+ else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
153
+ else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
154
+ else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; }
155
+ else if ( arg == "--prompt") { params.prompt = argv[++i]; }
156
+ else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
157
+ else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
158
  else {
159
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
160
  whisper_print_usage(argc, argv, params);
 
187
  fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
188
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
189
  fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
190
+ fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
191
  fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
192
  fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
193
  fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
 
303
  printf("%s%s", speaker.c_str(), text);
304
  }
305
 
306
+ if (params.tinydiarize) {
307
+ if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
308
+ printf("%s", params.tdrz_speaker_turn.c_str());
309
+ }
310
+ }
311
+
312
  // with timestamps or speakers: each segment on new line
313
  if (!params.no_timestamps || params.diarize) {
314
  printf("\n");
 
576
  const int n_segments = whisper_full_n_segments(ctx);
577
  for (int i = 0; i < n_segments; ++i) {
578
  const char * text = whisper_full_get_segment_text(ctx, i);
579
+
580
  const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
581
  const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
582
 
 
589
  value_i("from", t0 * 10, false);
590
  value_i("to", t1 * 10, true);
591
  end_obj(false);
592
+ value_s("text", text, !params.diarize && !params.tinydiarize);
593
 
594
  if (params.diarize && pcmf32s.size() == 2) {
595
  value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
596
  }
597
+
598
+ if (params.tinydiarize) {
599
+ value_b("speaker_turn_next", whisper_full_get_segment_speaker_turn_next(ctx, i), true);
600
+ }
601
  end_obj(i == (n_segments - 1));
602
  }
603
 
 
794
  exit(0);
795
  }
796
 
797
+ if (params.diarize && params.tinydiarize) {
798
+ fprintf(stderr, "error: cannot use both --diarize and --tinydiarize\n");
799
+ whisper_print_usage(argc, argv, params);
800
+ exit(0);
801
+ }
802
+
803
  // whisper init
804
 
805
  struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
 
841
  if (params.detect_language) {
842
  params.language = "auto";
843
  }
844
+ fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
845
  __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
846
  params.n_threads, params.n_processors,
847
  params.language.c_str(),
848
  params.translate ? "translate" : "transcribe",
849
+ params.tinydiarize ? "tdrz = 1, " : "",
850
  params.no_timestamps ? 0 : 1);
851
 
852
  fprintf(stderr, "\n");
 
877
 
878
  wparams.speed_up = params.speed_up;
879
 
880
+ wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
881
+
882
  wparams.initial_prompt = params.prompt.c_str();
883
 
884
  wparams.greedy.best_of = params.best_of;
models/download-ggml-model.sh CHANGED
@@ -22,7 +22,7 @@ function get_script_path() {
22
  models_path="$(get_script_path)"
23
 
24
  # Whisper models
25
- models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
26
 
27
  # list available models
28
  function list_models {
@@ -50,6 +50,12 @@ if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
50
  exit 1
51
  fi
52
 
 
 
 
 
 
 
53
  # download ggml model
54
 
55
  printf "Downloading ggml model $model from '$src' ...\n"
 
22
  models_path="$(get_script_path)"
23
 
24
  # Whisper models
25
+ models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small.en-tdrz" "small" "medium.en" "medium" "large-v1" "large" )
26
 
27
  # list available models
28
  function list_models {
 
50
  exit 1
51
  fi
52
 
53
+ # check if model contains `tdrz` and update the src and pfx accordingly
54
+ if [[ $model == *"tdrz"* ]]; then
55
+ src="https://huggingface.co/akashmjn/tinydiarize-whisper.cpp"
56
+ pfx="resolve/main/ggml"
57
+ fi
58
+
59
  # download ggml model
60
 
61
  printf "Downloading ggml model $model from '$src' ...\n"
whisper.cpp CHANGED
@@ -380,16 +380,18 @@ struct whisper_vocab {
380
  std::map<token, id> token_to_id;
381
  std::map<id, token> id_to_token;
382
 
383
- id token_eot = 50256;
384
- id token_sot = 50257;
385
- id token_prev = 50360;
386
- id token_solm = 50361; // ??
387
- id token_not = 50362; // no timestamps
388
- id token_beg = 50363;
389
-
390
- // available tasks
391
- static const id token_translate = 50358;
392
- static const id token_transcribe = 50359;
 
 
393
 
394
  bool is_multilingual() const {
395
  return n_vocab == 51865;
@@ -403,6 +405,8 @@ struct whisper_segment {
403
  std::string text;
404
 
405
  std::vector<whisper_token_data> tokens;
 
 
406
  };
407
 
408
  // medium
@@ -966,8 +970,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
966
  if (vocab.is_multilingual()) {
967
  vocab.token_eot++;
968
  vocab.token_sot++;
969
- vocab.token_prev++;
 
970
  vocab.token_solm++;
 
 
971
  vocab.token_not++;
972
  vocab.token_beg++;
973
  }
@@ -981,8 +988,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
981
  word = "[_EOT_]";
982
  } else if (i == vocab.token_sot) {
983
  word = "[_SOT_]";
 
 
984
  } else if (i == vocab.token_prev) {
985
  word = "[_PREV_]";
 
 
986
  } else if (i == vocab.token_not) {
987
  word = "[_NOT_]";
988
  } else if (i == vocab.token_beg) {
@@ -3208,12 +3219,16 @@ whisper_token whisper_token_sot(struct whisper_context * ctx) {
3208
  return ctx->vocab.token_sot;
3209
  }
3210
 
 
 
 
 
3211
  whisper_token whisper_token_prev(struct whisper_context * ctx) {
3212
  return ctx->vocab.token_prev;
3213
  }
3214
 
3215
- whisper_token whisper_token_solm(struct whisper_context * ctx) {
3216
- return ctx->vocab.token_solm;
3217
  }
3218
 
3219
  whisper_token whisper_token_not(struct whisper_context * ctx) {
@@ -3228,12 +3243,12 @@ whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id) {
3228
  return whisper_token_sot(ctx) + 1 + lang_id;
3229
  }
3230
 
3231
- whisper_token whisper_token_translate(void) {
3232
- return whisper_vocab::token_translate;
3233
  }
3234
 
3235
- whisper_token whisper_token_transcribe(void) {
3236
- return whisper_vocab::token_transcribe;
3237
  }
3238
 
3239
  void whisper_print_timings(struct whisper_context * ctx) {
@@ -3305,51 +3320,53 @@ struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sam
3305
 
3306
  struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
3307
  struct whisper_full_params result = {
3308
- /*.strategy =*/ strategy,
3309
-
3310
- /*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
3311
- /*.n_max_text_ctx =*/ 16384,
3312
- /*.offset_ms =*/ 0,
3313
- /*.duration_ms =*/ 0,
3314
-
3315
- /*.translate =*/ false,
3316
- /*.no_context =*/ true,
3317
- /*.single_segment =*/ false,
3318
- /*.print_special =*/ false,
3319
- /*.print_progress =*/ true,
3320
- /*.print_realtime =*/ false,
3321
- /*.print_timestamps =*/ true,
3322
-
3323
- /*.token_timestamps =*/ false,
3324
- /*.thold_pt =*/ 0.01f,
3325
- /*.thold_ptsum =*/ 0.01f,
3326
- /*.max_len =*/ 0,
3327
- /*.split_on_word =*/ false,
3328
- /*.max_tokens =*/ 0,
3329
-
3330
- /*.speed_up =*/ false,
3331
- /*.audio_ctx =*/ 0,
3332
-
3333
- /*.initial_prompt =*/ nullptr,
3334
- /*.prompt_tokens =*/ nullptr,
3335
- /*.prompt_n_tokens =*/ 0,
3336
-
3337
- /*.language =*/ "en",
3338
- /*.detect_language =*/ false,
3339
-
3340
- /*.suppress_blank =*/ true,
 
 
3341
  /*.suppress_non_speech_tokens =*/ false,
3342
 
3343
- /*.temperature =*/ 0.0f,
3344
- /*.max_initial_ts =*/ 1.0f,
3345
- /*.length_penalty =*/ -1.0f,
3346
 
3347
- /*.temperature_inc =*/ 0.4f,
3348
- /*.entropy_thold =*/ 2.4f,
3349
- /*.logprob_thold =*/ -1.0f,
3350
- /*.no_speech_thold =*/ 0.6f,
3351
 
3352
- /*.greedy =*/ {
3353
  /*.best_of =*/ -1,
3354
  },
3355
 
@@ -3430,6 +3447,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
3430
  state.result_all.back().text = std::move(text);
3431
  state.result_all.back().t1 = token.t0;
3432
  state.result_all.back().tokens.resize(i);
 
3433
 
3434
  state.result_all.push_back({});
3435
  state.result_all.back().t0 = token.t0;
@@ -3441,6 +3459,8 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
3441
  segment.tokens.begin() + i,
3442
  segment.tokens.end());
3443
 
 
 
3444
  acc = 0;
3445
  text = "";
3446
 
@@ -3519,9 +3539,14 @@ static void whisper_process_logits(
3519
  // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L410-L412
3520
  logits[vocab.token_not] = -INFINITY;
3521
 
3522
- // suppress sot and solm tokens
3523
  logits[vocab.token_sot] = -INFINITY;
3524
- logits[vocab.token_solm] = -INFINITY;
 
 
 
 
 
3525
 
3526
  // suppress task tokens
3527
  logits[vocab.token_translate] = -INFINITY;
@@ -4018,9 +4043,9 @@ int whisper_full_with_state(
4018
  state->lang_id = lang_id;
4019
  prompt_init.push_back(whisper_token_lang(ctx, lang_id));
4020
  if (params.translate) {
4021
- prompt_init.push_back(whisper_token_translate());
4022
  } else {
4023
- prompt_init.push_back(whisper_token_transcribe());
4024
  }
4025
  }
4026
 
@@ -4500,23 +4525,27 @@ int whisper_full_with_state(
4500
  prompt_past.push_back(tokens_cur[i].id);
4501
  }
4502
 
4503
- // store the text from this iteration
4504
  if (!tokens_cur.empty() && ctx->model.n_loaded > 0) {
4505
  int i0 = 0;
4506
  auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx));
4507
 
4508
  std::string text;
 
4509
 
4510
  for (int i = 0; i < (int) tokens_cur.size(); i++) {
4511
  //printf("%s: %18s %6.3f %18s %6.3f\n", __func__,
4512
  // ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
4513
  // ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
4514
 
4515
- if (params.print_special == false && tokens_cur[i].id >= whisper_token_eot(ctx)) {
4516
- } else {
4517
  text += whisper_token_to_str(ctx, tokens_cur[i].id);
4518
  }
4519
 
 
 
 
 
 
4520
  if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
4521
  const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
4522
 
@@ -4535,7 +4564,7 @@ int whisper_full_with_state(
4535
 
4536
  //printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
4537
 
4538
- result_all.push_back({ tt0, tt1, text, {} });
4539
  for (int j = i0; j <= i; j++) {
4540
  result_all.back().tokens.push_back(tokens_cur[j]);
4541
  }
@@ -4561,6 +4590,7 @@ int whisper_full_with_state(
4561
  i--;
4562
  t0 = t1;
4563
  i0 = i + 1;
 
4564
  }
4565
  }
4566
 
@@ -4579,7 +4609,7 @@ int whisper_full_with_state(
4579
  }
4580
  }
4581
 
4582
- result_all.push_back({ tt0, tt1, text, {} });
4583
  for (int j = i0; j < (int) tokens_cur.size(); j++) {
4584
  result_all.back().tokens.push_back(tokens_cur[j]);
4585
  }
@@ -4759,6 +4789,10 @@ int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment)
4759
  return ctx->state->result_all[i_segment].t1;
4760
  }
4761
 
 
 
 
 
4762
  const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment) {
4763
  return state->result_all[i_segment].text.c_str();
4764
  }
 
380
  std::map<token, id> token_to_id;
381
  std::map<id, token> id_to_token;
382
 
383
+ // reference: https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L334-L349
384
+ id token_eot = 50256;
385
+ id token_sot = 50257;
386
+ // task tokens (used only for multilingual models)
387
+ id token_translate = 50357;
388
+ id token_transcribe = 50358;
389
+ // other special tokens
390
+ id token_solm = 50359; // [TDRZ] used by tinydiarize models to indicate speaker turn
391
+ id token_prev = 50360;
392
+ id token_nosp = 50361;
393
+ id token_not = 50362; // no timestamps
394
+ id token_beg = 50363; // begin timestamps
395
 
396
  bool is_multilingual() const {
397
  return n_vocab == 51865;
 
405
  std::string text;
406
 
407
  std::vector<whisper_token_data> tokens;
408
+
409
+ bool speaker_turn_next;
410
  };
411
 
412
  // medium
 
970
  if (vocab.is_multilingual()) {
971
  vocab.token_eot++;
972
  vocab.token_sot++;
973
+ vocab.token_translate++;
974
+ vocab.token_transcribe++;
975
  vocab.token_solm++;
976
+ vocab.token_prev++;
977
+ vocab.token_nosp++;
978
  vocab.token_not++;
979
  vocab.token_beg++;
980
  }
 
988
  word = "[_EOT_]";
989
  } else if (i == vocab.token_sot) {
990
  word = "[_SOT_]";
991
+ } else if (i == vocab.token_solm) {
992
+ word = "[_SOLM_]";
993
  } else if (i == vocab.token_prev) {
994
  word = "[_PREV_]";
995
+ } else if (i == vocab.token_nosp) {
996
+ word = "[_NOSP_]";
997
  } else if (i == vocab.token_not) {
998
  word = "[_NOT_]";
999
  } else if (i == vocab.token_beg) {
 
3219
  return ctx->vocab.token_sot;
3220
  }
3221
 
3222
+ whisper_token whisper_token_solm(struct whisper_context * ctx) {
3223
+ return ctx->vocab.token_solm;
3224
+ }
3225
+
3226
  whisper_token whisper_token_prev(struct whisper_context * ctx) {
3227
  return ctx->vocab.token_prev;
3228
  }
3229
 
3230
+ whisper_token whisper_token_nosp(struct whisper_context * ctx) {
3231
+ return ctx->vocab.token_nosp;
3232
  }
3233
 
3234
  whisper_token whisper_token_not(struct whisper_context * ctx) {
 
3243
  return whisper_token_sot(ctx) + 1 + lang_id;
3244
  }
3245
 
3246
+ whisper_token whisper_token_translate(struct whisper_context * ctx) {
3247
+ return ctx->vocab.token_translate;
3248
  }
3249
 
3250
+ whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
3251
+ return ctx->vocab.token_transcribe;
3252
  }
3253
 
3254
  void whisper_print_timings(struct whisper_context * ctx) {
 
3320
 
3321
  struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
3322
  struct whisper_full_params result = {
3323
+ /*.strategy =*/ strategy,
3324
+
3325
+ /*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
3326
+ /*.n_max_text_ctx =*/ 16384,
3327
+ /*.offset_ms =*/ 0,
3328
+ /*.duration_ms =*/ 0,
3329
+
3330
+ /*.translate =*/ false,
3331
+ /*.no_context =*/ true,
3332
+ /*.single_segment =*/ false,
3333
+ /*.print_special =*/ false,
3334
+ /*.print_progress =*/ true,
3335
+ /*.print_realtime =*/ false,
3336
+ /*.print_timestamps =*/ true,
3337
+
3338
+ /*.token_timestamps =*/ false,
3339
+ /*.thold_pt =*/ 0.01f,
3340
+ /*.thold_ptsum =*/ 0.01f,
3341
+ /*.max_len =*/ 0,
3342
+ /*.split_on_word =*/ false,
3343
+ /*.max_tokens =*/ 0,
3344
+
3345
+ /*.speed_up =*/ false,
3346
+ /*.audio_ctx =*/ 0,
3347
+
3348
+ /*.tdrz_enable =*/ false,
3349
+
3350
+ /*.initial_prompt =*/ nullptr,
3351
+ /*.prompt_tokens =*/ nullptr,
3352
+ /*.prompt_n_tokens =*/ 0,
3353
+
3354
+ /*.language =*/ "en",
3355
+ /*.detect_language =*/ false,
3356
+
3357
+ /*.suppress_blank =*/ true,
3358
  /*.suppress_non_speech_tokens =*/ false,
3359
 
3360
+ /*.temperature =*/ 0.0f,
3361
+ /*.max_initial_ts =*/ 1.0f,
3362
+ /*.length_penalty =*/ -1.0f,
3363
 
3364
+ /*.temperature_inc =*/ 0.4f,
3365
+ /*.entropy_thold =*/ 2.4f,
3366
+ /*.logprob_thold =*/ -1.0f,
3367
+ /*.no_speech_thold =*/ 0.6f,
3368
 
3369
+ /*.greedy =*/ {
3370
  /*.best_of =*/ -1,
3371
  },
3372
 
 
3447
  state.result_all.back().text = std::move(text);
3448
  state.result_all.back().t1 = token.t0;
3449
  state.result_all.back().tokens.resize(i);
3450
+ state.result_all.back().speaker_turn_next = false;
3451
 
3452
  state.result_all.push_back({});
3453
  state.result_all.back().t0 = token.t0;
 
3459
  segment.tokens.begin() + i,
3460
  segment.tokens.end());
3461
 
3462
+ state.result_all.back().speaker_turn_next = segment.speaker_turn_next;
3463
+
3464
  acc = 0;
3465
  text = "";
3466
 
 
3539
  // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L410-L412
3540
  logits[vocab.token_not] = -INFINITY;
3541
 
3542
+ // suppress sot and nosp tokens
3543
  logits[vocab.token_sot] = -INFINITY;
3544
+ logits[vocab.token_nosp] = -INFINITY; // TODO: ignore this token for now
3545
+
3546
+ // [TDRZ] when tinydiarize is disabled, suppress solm token
3547
+ if (params.tdrz_enable == false) {
3548
+ logits[vocab.token_solm] = -INFINITY;
3549
+ }
3550
 
3551
  // suppress task tokens
3552
  logits[vocab.token_translate] = -INFINITY;
 
4043
  state->lang_id = lang_id;
4044
  prompt_init.push_back(whisper_token_lang(ctx, lang_id));
4045
  if (params.translate) {
4046
+ prompt_init.push_back(whisper_token_translate(ctx));
4047
  } else {
4048
+ prompt_init.push_back(whisper_token_transcribe(ctx));
4049
  }
4050
  }
4051
 
 
4525
  prompt_past.push_back(tokens_cur[i].id);
4526
  }
4527
 
 
4528
  if (!tokens_cur.empty() && ctx->model.n_loaded > 0) {
4529
  int i0 = 0;
4530
  auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx));
4531
 
4532
  std::string text;
4533
+ bool speaker_turn_next = false;
4534
 
4535
  for (int i = 0; i < (int) tokens_cur.size(); i++) {
4536
  //printf("%s: %18s %6.3f %18s %6.3f\n", __func__,
4537
  // ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
4538
  // ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
4539
 
4540
+ if (params.print_special || tokens_cur[i].id < whisper_token_eot(ctx)) {
 
4541
  text += whisper_token_to_str(ctx, tokens_cur[i].id);
4542
  }
4543
 
4544
+ // [TDRZ] record if speaker turn was predicted after current segment
4545
+ if (params.tdrz_enable && tokens_cur[i].id == whisper_token_solm(ctx)) {
4546
+ speaker_turn_next = true;
4547
+ }
4548
+
4549
  if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
4550
  const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
4551
 
 
4564
 
4565
  //printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
4566
 
4567
+ result_all.push_back({ tt0, tt1, text, {}, speaker_turn_next });
4568
  for (int j = i0; j <= i; j++) {
4569
  result_all.back().tokens.push_back(tokens_cur[j]);
4570
  }
 
4590
  i--;
4591
  t0 = t1;
4592
  i0 = i + 1;
4593
+ speaker_turn_next = false;
4594
  }
4595
  }
4596
 
 
4609
  }
4610
  }
4611
 
4612
+ result_all.push_back({ tt0, tt1, text, {} , speaker_turn_next });
4613
  for (int j = i0; j < (int) tokens_cur.size(); j++) {
4614
  result_all.back().tokens.push_back(tokens_cur[j]);
4615
  }
 
4789
  return ctx->state->result_all[i_segment].t1;
4790
  }
4791
 
4792
+ bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment) {
4793
+ return ctx->state->result_all[i_segment].speaker_turn_next;
4794
+ }
4795
+
4796
  const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment) {
4797
  return state->result_all[i_segment].text.c_str();
4798
  }
whisper.h CHANGED
@@ -277,15 +277,16 @@ extern "C" {
277
  // Special tokens
278
  WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
279
  WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
280
- WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
281
  WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
 
 
282
  WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
283
  WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
284
  WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
285
 
286
  // Task tokens
287
- WHISPER_API whisper_token whisper_token_translate (void);
288
- WHISPER_API whisper_token whisper_token_transcribe(void);
289
 
290
  // Performance information from the default state.
291
  WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
@@ -358,6 +359,9 @@ extern "C" {
358
  bool speed_up; // speed-up the audio by 2x using Phase Vocoder
359
  int audio_ctx; // overwrite the audio context size (0 = use default)
360
 
 
 
 
361
  // tokens to provide to the whisper decoder as initial prompt
362
  // these are prepended to any existing text context from a previous call
363
  const char * initial_prompt;
@@ -460,6 +464,9 @@ extern "C" {
460
  WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment);
461
  WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
462
 
 
 
 
463
  // Get the text of the specified segment
464
  WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment);
465
  WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
@@ -488,9 +495,9 @@ extern "C" {
488
 
489
  // Temporary helpers needed for exposing ggml interface
490
 
491
- WHISPER_API int whisper_bench_memcpy(int n_threads);
492
- WHISPER_API const char * whisper_bench_memcpy_str(int n_threads);
493
- WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
494
  WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
495
 
496
  #ifdef __cplusplus
 
277
  // Special tokens
278
  WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
279
  WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
 
280
  WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
281
+ WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
282
+ WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
283
  WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
284
  WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
285
  WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
286
 
287
  // Task tokens
288
+ WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
289
+ WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
290
 
291
  // Performance information from the default state.
292
  WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
 
359
  bool speed_up; // speed-up the audio by 2x using Phase Vocoder
360
  int audio_ctx; // overwrite the audio context size (0 = use default)
361
 
362
+ // [EXPERIMENTAL] [TDRZ] tinydiarize
363
+ bool tdrz_enable; // enable tinydiarize speaker turn detection
364
+
365
  // tokens to provide to the whisper decoder as initial prompt
366
  // these are prepended to any existing text context from a previous call
367
  const char * initial_prompt;
 
464
  WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment);
465
  WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
466
 
467
+ // Get whether the next segment is predicted as a speaker turn
468
+ WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
469
+
470
  // Get the text of the specified segment
471
  WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment);
472
  WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
 
495
 
496
  // Temporary helpers needed for exposing ggml interface
497
 
498
+ WHISPER_API int whisper_bench_memcpy (int n_threads);
499
+ WHISPER_API const char * whisper_bench_memcpy_str (int n_threads);
500
+ WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
501
  WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
502
 
503
  #ifdef __cplusplus