Spaces:
Running
whisper : support speaker segmentation (local diarization) of mono audio via tinydiarize (#1058)
Browse files
* add HuggingFace mirror to download ggml model
* support tdrz via simple hack overriding solm tokens
* fix incorrect translate/transcribe token_ids that are not static const
* add apollo 13 sample for tdrz demo
* render [SPEAKER TURN] consistently in all terminal output using vocab.id_to_token
* extend whisper_segment with speaker_turn_next field and save in json output
* fix failing go build
* slipped in some python syntax whoops
* whisper : finalize tinydiarize support (add flag + fixes)
* whisper : tdrz support for word-level timestamps (respect max_len)
* java : try to fix tests after adding tdrz_enable flag
* main : remove TODO leftover
* java : fix params order list after adding "tdrz_enable"
* whisper : fix solm and add nosp token
* main : print tinydiarize help
---------
Co-authored-by: Georgi Gerganov <[email protected]>
- Makefile +4 -0
- bindings/go/whisper.go +4 -4
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java +2 -2
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java +9 -1
- examples/main/main.cpp +83 -57
- models/download-ggml-model.sh +7 -1
- whisper.cpp +101 -67
- whisper.h +13 -6
|
@@ -308,12 +308,16 @@ samples:
|
|
| 308 |
@wget --quiet --show-progress -O samples/gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
|
| 309 |
@wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
|
| 310 |
@wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
|
|
|
|
| 311 |
@echo "Converting to 16-bit WAV ..."
|
| 312 |
@ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
|
| 313 |
@ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
|
| 314 |
@ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav
|
|
|
|
| 315 |
@ffmpeg -loglevel -0 -y -i samples/mm1.wav -ar 16000 -ac 1 -c:a pcm_s16le samples/mm0.wav
|
| 316 |
@rm samples/mm1.wav
|
|
|
|
|
|
|
| 317 |
|
| 318 |
#
|
| 319 |
# Models
|
|
|
|
| 308 |
@wget --quiet --show-progress -O samples/gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
|
| 309 |
@wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
|
| 310 |
@wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
|
| 311 |
+
@wget --quiet --show-progress -O samples/a13.mp3 https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3
|
| 312 |
@echo "Converting to 16-bit WAV ..."
|
| 313 |
@ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
|
| 314 |
@ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
|
| 315 |
@ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav
|
| 316 |
+
@rm samples/*.ogg
|
| 317 |
@ffmpeg -loglevel -0 -y -i samples/mm1.wav -ar 16000 -ac 1 -c:a pcm_s16le samples/mm0.wav
|
| 318 |
@rm samples/mm1.wav
|
| 319 |
+
@ffmpeg -loglevel -0 -y -i samples/a13.mp3 -ar 16000 -ac 1 -c:a pcm_s16le -ss 00:00:00 -to 00:00:30 samples/a13.wav
|
| 320 |
+
@rm samples/a13.mp3
|
| 321 |
|
| 322 |
#
|
| 323 |
# Models
|
|
@@ -270,13 +270,13 @@ func (ctx *Context) Whisper_token_lang(lang_id int) Token {
|
|
| 270 |
}
|
| 271 |
|
| 272 |
// Task tokens
|
| 273 |
-
func Whisper_token_translate() Token {
|
| 274 |
-
return Token(C.whisper_token_translate())
|
| 275 |
}
|
| 276 |
|
| 277 |
// Task tokens
|
| 278 |
-
func Whisper_token_transcribe() Token {
|
| 279 |
-
return Token(C.whisper_token_transcribe())
|
| 280 |
}
|
| 281 |
|
| 282 |
// Performance information
|
|
|
|
| 270 |
}
|
| 271 |
|
| 272 |
// Task tokens
|
| 273 |
+
func (ctx *Context) Whisper_token_translate() Token {
|
| 274 |
+
return Token(C.whisper_token_translate((*C.struct_whisper_context)(ctx)))
|
| 275 |
}
|
| 276 |
|
| 277 |
// Task tokens
|
| 278 |
+
func (ctx *Context) Whisper_token_transcribe() Token {
|
| 279 |
+
return Token(C.whisper_token_transcribe((*C.struct_whisper_context)(ctx)))
|
| 280 |
}
|
| 281 |
|
| 282 |
// Performance information
|
|
@@ -224,8 +224,8 @@ public interface WhisperCppJnaLibrary extends Library {
|
|
| 224 |
int whisper_token_lang(Pointer ctx, int lang_id);
|
| 225 |
|
| 226 |
// Task tokens
|
| 227 |
-
int whisper_token_translate();
|
| 228 |
-
int whisper_token_transcribe();
|
| 229 |
|
| 230 |
// Performance information from the default state.
|
| 231 |
void whisper_print_timings(Pointer ctx);
|
|
|
|
| 224 |
int whisper_token_lang(Pointer ctx, int lang_id);
|
| 225 |
|
| 226 |
// Task tokens
|
| 227 |
+
int whisper_token_translate (Pointer ctx);
|
| 228 |
+
int whisper_token_transcribe(Pointer ctx);
|
| 229 |
|
| 230 |
// Performance information from the default state.
|
| 231 |
void whisper_print_timings(Pointer ctx);
|
|
@@ -137,6 +137,14 @@ public class WhisperFullParams extends Structure {
|
|
| 137 |
/** Overwrite the audio context size (0 = use default). */
|
| 138 |
public int audio_ctx;
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
/** Tokens to provide to the whisper decoder as an initial prompt.
|
| 141 |
* These are prepended to any existing text context from a previous call. */
|
| 142 |
public String initial_prompt;
|
|
@@ -302,7 +310,7 @@ public class WhisperFullParams extends Structure {
|
|
| 302 |
"no_context", "single_segment",
|
| 303 |
"print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
|
| 304 |
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
|
| 305 |
-
"initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
| 306 |
"suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
|
| 307 |
"temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
|
| 308 |
"new_segment_callback", "new_segment_callback_user_data",
|
|
|
|
| 137 |
/** Overwrite the audio context size (0 = use default). */
|
| 138 |
public int audio_ctx;
|
| 139 |
|
| 140 |
+
/** Enable tinydiarize (default = false) */
|
| 141 |
+
public CBool tdrz_enable;
|
| 142 |
+
|
| 143 |
+
/** Enable tinydiarize (default = false) */
|
| 144 |
+
public void tdrzEnable(boolean enable) {
|
| 145 |
+
tdrz_enable = enable ? CBool.TRUE : CBool.FALSE;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
/** Tokens to provide to the whisper decoder as an initial prompt.
|
| 149 |
* These are prepended to any existing text context from a previous call. */
|
| 150 |
public String initial_prompt;
|
|
|
|
| 310 |
"no_context", "single_segment",
|
| 311 |
"print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
|
| 312 |
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
|
| 313 |
+
"tdrz_enable", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
| 314 |
"suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
|
| 315 |
"temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
|
| 316 |
"new_segment_callback", "new_segment_callback_user_data",
|
|
@@ -68,28 +68,32 @@ struct whisper_params {
|
|
| 68 |
float entropy_thold = 2.40f;
|
| 69 |
float logprob_thold = -1.00f;
|
| 70 |
|
| 71 |
-
bool speed_up
|
| 72 |
-
bool translate
|
| 73 |
-
bool detect_language= false;
|
| 74 |
-
bool diarize
|
| 75 |
-
bool
|
| 76 |
-
bool
|
| 77 |
-
bool
|
| 78 |
-
bool
|
| 79 |
-
bool
|
| 80 |
-
bool
|
| 81 |
-
bool
|
| 82 |
-
bool
|
| 83 |
-
bool
|
| 84 |
-
bool
|
| 85 |
-
bool
|
| 86 |
-
bool
|
| 87 |
-
bool
|
| 88 |
-
|
| 89 |
-
|
|
|
|
| 90 |
std::string prompt;
|
| 91 |
std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
|
| 92 |
-
std::string model = "models/ggml-base.en.bin";
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
std::vector<std::string> fname_inp = {};
|
| 95 |
std::vector<std::string> fname_out = {};
|
|
@@ -115,41 +119,42 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 115 |
whisper_print_usage(argc, argv, params);
|
| 116 |
exit(0);
|
| 117 |
}
|
| 118 |
-
else if (arg == "-t" || arg == "--threads")
|
| 119 |
-
else if (arg == "-p" || arg == "--processors")
|
| 120 |
-
else if (arg == "-ot" || arg == "--offset-t")
|
| 121 |
-
else if (arg == "-on" || arg == "--offset-n")
|
| 122 |
-
else if (arg == "-d" || arg == "--duration")
|
| 123 |
-
else if (arg == "-mc" || arg == "--max-context")
|
| 124 |
-
else if (arg == "-ml" || arg == "--max-len")
|
| 125 |
-
else if (arg == "-bo" || arg == "--best-of")
|
| 126 |
-
else if (arg == "-bs" || arg == "--beam-size")
|
| 127 |
-
else if (arg == "-wt" || arg == "--word-thold")
|
| 128 |
-
else if (arg == "-et" || arg == "--entropy-thold")
|
| 129 |
-
else if (arg == "-lpt" || arg == "--logprob-thold")
|
| 130 |
-
else if (arg == "-su" || arg == "--speed-up")
|
| 131 |
-
else if (arg == "-tr" || arg == "--translate")
|
| 132 |
-
else if (arg == "-di" || arg == "--diarize")
|
| 133 |
-
else if (arg == "-
|
| 134 |
-
else if (arg == "-
|
| 135 |
-
else if (arg == "-
|
| 136 |
-
else if (arg == "-
|
| 137 |
-
else if (arg == "-
|
| 138 |
-
else if (arg == "-
|
| 139 |
-
else if (arg == "-
|
| 140 |
-
else if (arg == "-
|
| 141 |
-
else if (arg == "-
|
| 142 |
-
else if (arg == "-
|
| 143 |
-
else if (arg == "-
|
| 144 |
-
else if (arg == "-
|
| 145 |
-
else if (arg == "-
|
| 146 |
-
else if (arg == "-
|
| 147 |
-
else if (arg == "-
|
| 148 |
-
else if (arg == "-
|
| 149 |
-
else if (arg == "-
|
| 150 |
-
else if (
|
| 151 |
-
else if (arg == "
|
| 152 |
-
else if (arg == "-
|
|
|
|
| 153 |
else {
|
| 154 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 155 |
whisper_print_usage(argc, argv, params);
|
|
@@ -182,6 +187,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 182 |
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 183 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 184 |
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
|
|
|
| 185 |
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
|
| 186 |
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
| 187 |
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
|
@@ -297,6 +303,12 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
|
|
| 297 |
printf("%s%s", speaker.c_str(), text);
|
| 298 |
}
|
| 299 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
// with timestamps or speakers: each segment on new line
|
| 301 |
if (!params.no_timestamps || params.diarize) {
|
| 302 |
printf("\n");
|
|
@@ -564,6 +576,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
|
|
| 564 |
const int n_segments = whisper_full_n_segments(ctx);
|
| 565 |
for (int i = 0; i < n_segments; ++i) {
|
| 566 |
const char * text = whisper_full_get_segment_text(ctx, i);
|
|
|
|
| 567 |
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
| 568 |
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 569 |
|
|
@@ -576,11 +589,15 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
|
|
| 576 |
value_i("from", t0 * 10, false);
|
| 577 |
value_i("to", t1 * 10, true);
|
| 578 |
end_obj(false);
|
| 579 |
-
value_s("text", text, !params.diarize);
|
| 580 |
|
| 581 |
if (params.diarize && pcmf32s.size() == 2) {
|
| 582 |
value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
|
| 583 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
end_obj(i == (n_segments - 1));
|
| 585 |
}
|
| 586 |
|
|
@@ -777,6 +794,12 @@ int main(int argc, char ** argv) {
|
|
| 777 |
exit(0);
|
| 778 |
}
|
| 779 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 780 |
// whisper init
|
| 781 |
|
| 782 |
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
|
|
@@ -818,11 +841,12 @@ int main(int argc, char ** argv) {
|
|
| 818 |
if (params.detect_language) {
|
| 819 |
params.language = "auto";
|
| 820 |
}
|
| 821 |
-
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
|
| 822 |
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
| 823 |
params.n_threads, params.n_processors,
|
| 824 |
params.language.c_str(),
|
| 825 |
params.translate ? "translate" : "transcribe",
|
|
|
|
| 826 |
params.no_timestamps ? 0 : 1);
|
| 827 |
|
| 828 |
fprintf(stderr, "\n");
|
|
@@ -853,6 +877,8 @@ int main(int argc, char ** argv) {
|
|
| 853 |
|
| 854 |
wparams.speed_up = params.speed_up;
|
| 855 |
|
|
|
|
|
|
|
| 856 |
wparams.initial_prompt = params.prompt.c_str();
|
| 857 |
|
| 858 |
wparams.greedy.best_of = params.best_of;
|
|
|
|
| 68 |
float entropy_thold = 2.40f;
|
| 69 |
float logprob_thold = -1.00f;
|
| 70 |
|
| 71 |
+
bool speed_up = false;
|
| 72 |
+
bool translate = false;
|
| 73 |
+
bool detect_language = false;
|
| 74 |
+
bool diarize = false;
|
| 75 |
+
bool tinydiarize = false;
|
| 76 |
+
bool split_on_word = false;
|
| 77 |
+
bool no_fallback = false;
|
| 78 |
+
bool output_txt = false;
|
| 79 |
+
bool output_vtt = false;
|
| 80 |
+
bool output_srt = false;
|
| 81 |
+
bool output_wts = false;
|
| 82 |
+
bool output_csv = false;
|
| 83 |
+
bool output_jsn = false;
|
| 84 |
+
bool output_lrc = false;
|
| 85 |
+
bool print_special = false;
|
| 86 |
+
bool print_colors = false;
|
| 87 |
+
bool print_progress = false;
|
| 88 |
+
bool no_timestamps = false;
|
| 89 |
+
|
| 90 |
+
std::string language = "en";
|
| 91 |
std::string prompt;
|
| 92 |
std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
|
| 93 |
+
std::string model = "models/ggml-base.en.bin";
|
| 94 |
+
|
| 95 |
+
// [TDRZ] speaker turn string
|
| 96 |
+
std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
|
| 97 |
|
| 98 |
std::vector<std::string> fname_inp = {};
|
| 99 |
std::vector<std::string> fname_out = {};
|
|
|
|
| 119 |
whisper_print_usage(argc, argv, params);
|
| 120 |
exit(0);
|
| 121 |
}
|
| 122 |
+
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
| 123 |
+
else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
|
| 124 |
+
else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(argv[++i]); }
|
| 125 |
+
else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(argv[++i]); }
|
| 126 |
+
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
|
| 127 |
+
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
|
| 128 |
+
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
| 129 |
+
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
|
| 130 |
+
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
|
| 131 |
+
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
| 132 |
+
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
|
| 133 |
+
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
| 134 |
+
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 135 |
+
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 136 |
+
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
| 137 |
+
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
|
| 138 |
+
else if (arg == "-sow" || arg == "--split-on-word") { params.split_on_word = true; }
|
| 139 |
+
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
|
| 140 |
+
else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
|
| 141 |
+
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
|
| 142 |
+
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
| 143 |
+
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
| 144 |
+
else if (arg == "-olrc" || arg == "--output-lrc") { params.output_lrc = true; }
|
| 145 |
+
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
|
| 146 |
+
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
| 147 |
+
else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
|
| 148 |
+
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
|
| 149 |
+
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 150 |
+
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
| 151 |
+
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
|
| 152 |
+
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
|
| 153 |
+
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
| 154 |
+
else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; }
|
| 155 |
+
else if ( arg == "--prompt") { params.prompt = argv[++i]; }
|
| 156 |
+
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
| 157 |
+
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
|
| 158 |
else {
|
| 159 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 160 |
whisper_print_usage(argc, argv, params);
|
|
|
|
| 187 |
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 188 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 189 |
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
| 190 |
+
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
|
| 191 |
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
|
| 192 |
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
| 193 |
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
|
|
|
| 303 |
printf("%s%s", speaker.c_str(), text);
|
| 304 |
}
|
| 305 |
|
| 306 |
+
if (params.tinydiarize) {
|
| 307 |
+
if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
|
| 308 |
+
printf("%s", params.tdrz_speaker_turn.c_str());
|
| 309 |
+
}
|
| 310 |
+
}
|
| 311 |
+
|
| 312 |
// with timestamps or speakers: each segment on new line
|
| 313 |
if (!params.no_timestamps || params.diarize) {
|
| 314 |
printf("\n");
|
|
|
|
| 576 |
const int n_segments = whisper_full_n_segments(ctx);
|
| 577 |
for (int i = 0; i < n_segments; ++i) {
|
| 578 |
const char * text = whisper_full_get_segment_text(ctx, i);
|
| 579 |
+
|
| 580 |
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
| 581 |
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 582 |
|
|
|
|
| 589 |
value_i("from", t0 * 10, false);
|
| 590 |
value_i("to", t1 * 10, true);
|
| 591 |
end_obj(false);
|
| 592 |
+
value_s("text", text, !params.diarize && !params.tinydiarize);
|
| 593 |
|
| 594 |
if (params.diarize && pcmf32s.size() == 2) {
|
| 595 |
value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
|
| 596 |
}
|
| 597 |
+
|
| 598 |
+
if (params.tinydiarize) {
|
| 599 |
+
value_b("speaker_turn_next", whisper_full_get_segment_speaker_turn_next(ctx, i), true);
|
| 600 |
+
}
|
| 601 |
end_obj(i == (n_segments - 1));
|
| 602 |
}
|
| 603 |
|
|
|
|
| 794 |
exit(0);
|
| 795 |
}
|
| 796 |
|
| 797 |
+
if (params.diarize && params.tinydiarize) {
|
| 798 |
+
fprintf(stderr, "error: cannot use both --diarize and --tinydiarize\n");
|
| 799 |
+
whisper_print_usage(argc, argv, params);
|
| 800 |
+
exit(0);
|
| 801 |
+
}
|
| 802 |
+
|
| 803 |
// whisper init
|
| 804 |
|
| 805 |
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
|
|
|
|
| 841 |
if (params.detect_language) {
|
| 842 |
params.language = "auto";
|
| 843 |
}
|
| 844 |
+
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
|
| 845 |
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
| 846 |
params.n_threads, params.n_processors,
|
| 847 |
params.language.c_str(),
|
| 848 |
params.translate ? "translate" : "transcribe",
|
| 849 |
+
params.tinydiarize ? "tdrz = 1, " : "",
|
| 850 |
params.no_timestamps ? 0 : 1);
|
| 851 |
|
| 852 |
fprintf(stderr, "\n");
|
|
|
|
| 877 |
|
| 878 |
wparams.speed_up = params.speed_up;
|
| 879 |
|
| 880 |
+
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
|
| 881 |
+
|
| 882 |
wparams.initial_prompt = params.prompt.c_str();
|
| 883 |
|
| 884 |
wparams.greedy.best_of = params.best_of;
|
|
@@ -22,7 +22,7 @@ function get_script_path() {
|
|
| 22 |
models_path="$(get_script_path)"
|
| 23 |
|
| 24 |
# Whisper models
|
| 25 |
-
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
|
| 26 |
|
| 27 |
# list available models
|
| 28 |
function list_models {
|
|
@@ -50,6 +50,12 @@ if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
|
|
| 50 |
exit 1
|
| 51 |
fi
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
# download ggml model
|
| 54 |
|
| 55 |
printf "Downloading ggml model $model from '$src' ...\n"
|
|
|
|
| 22 |
models_path="$(get_script_path)"
|
| 23 |
|
| 24 |
# Whisper models
|
| 25 |
+
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small.en-tdrz" "small" "medium.en" "medium" "large-v1" "large" )
|
| 26 |
|
| 27 |
# list available models
|
| 28 |
function list_models {
|
|
|
|
| 50 |
exit 1
|
| 51 |
fi
|
| 52 |
|
| 53 |
+
# check if model contains `tdrz` and update the src and pfx accordingly
|
| 54 |
+
if [[ $model == *"tdrz"* ]]; then
|
| 55 |
+
src="https://huggingface.co/akashmjn/tinydiarize-whisper.cpp"
|
| 56 |
+
pfx="resolve/main/ggml"
|
| 57 |
+
fi
|
| 58 |
+
|
| 59 |
# download ggml model
|
| 60 |
|
| 61 |
printf "Downloading ggml model $model from '$src' ...\n"
|
|
@@ -380,16 +380,18 @@ struct whisper_vocab {
|
|
| 380 |
std::map<token, id> token_to_id;
|
| 381 |
std::map<id, token> id_to_token;
|
| 382 |
|
| 383 |
-
|
| 384 |
-
id
|
| 385 |
-
id
|
| 386 |
-
|
| 387 |
-
id
|
| 388 |
-
id
|
| 389 |
-
|
| 390 |
-
//
|
| 391 |
-
|
| 392 |
-
|
|
|
|
|
|
|
| 393 |
|
| 394 |
bool is_multilingual() const {
|
| 395 |
return n_vocab == 51865;
|
|
@@ -403,6 +405,8 @@ struct whisper_segment {
|
|
| 403 |
std::string text;
|
| 404 |
|
| 405 |
std::vector<whisper_token_data> tokens;
|
|
|
|
|
|
|
| 406 |
};
|
| 407 |
|
| 408 |
// medium
|
|
@@ -966,8 +970,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 966 |
if (vocab.is_multilingual()) {
|
| 967 |
vocab.token_eot++;
|
| 968 |
vocab.token_sot++;
|
| 969 |
-
vocab.
|
|
|
|
| 970 |
vocab.token_solm++;
|
|
|
|
|
|
|
| 971 |
vocab.token_not++;
|
| 972 |
vocab.token_beg++;
|
| 973 |
}
|
|
@@ -981,8 +988,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 981 |
word = "[_EOT_]";
|
| 982 |
} else if (i == vocab.token_sot) {
|
| 983 |
word = "[_SOT_]";
|
|
|
|
|
|
|
| 984 |
} else if (i == vocab.token_prev) {
|
| 985 |
word = "[_PREV_]";
|
|
|
|
|
|
|
| 986 |
} else if (i == vocab.token_not) {
|
| 987 |
word = "[_NOT_]";
|
| 988 |
} else if (i == vocab.token_beg) {
|
|
@@ -3208,12 +3219,16 @@ whisper_token whisper_token_sot(struct whisper_context * ctx) {
|
|
| 3208 |
return ctx->vocab.token_sot;
|
| 3209 |
}
|
| 3210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3211 |
whisper_token whisper_token_prev(struct whisper_context * ctx) {
|
| 3212 |
return ctx->vocab.token_prev;
|
| 3213 |
}
|
| 3214 |
|
| 3215 |
-
whisper_token
|
| 3216 |
-
return ctx->vocab.
|
| 3217 |
}
|
| 3218 |
|
| 3219 |
whisper_token whisper_token_not(struct whisper_context * ctx) {
|
|
@@ -3228,12 +3243,12 @@ whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id) {
|
|
| 3228 |
return whisper_token_sot(ctx) + 1 + lang_id;
|
| 3229 |
}
|
| 3230 |
|
| 3231 |
-
whisper_token whisper_token_translate(
|
| 3232 |
-
return
|
| 3233 |
}
|
| 3234 |
|
| 3235 |
-
whisper_token whisper_token_transcribe(
|
| 3236 |
-
return
|
| 3237 |
}
|
| 3238 |
|
| 3239 |
void whisper_print_timings(struct whisper_context * ctx) {
|
|
@@ -3305,51 +3320,53 @@ struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sam
|
|
| 3305 |
|
| 3306 |
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
|
| 3307 |
struct whisper_full_params result = {
|
| 3308 |
-
/*.strategy
|
| 3309 |
-
|
| 3310 |
-
/*.n_threads
|
| 3311 |
-
/*.n_max_text_ctx
|
| 3312 |
-
/*.offset_ms
|
| 3313 |
-
/*.duration_ms
|
| 3314 |
-
|
| 3315 |
-
/*.translate
|
| 3316 |
-
/*.no_context
|
| 3317 |
-
/*.single_segment
|
| 3318 |
-
/*.print_special
|
| 3319 |
-
/*.print_progress
|
| 3320 |
-
/*.print_realtime
|
| 3321 |
-
/*.print_timestamps
|
| 3322 |
-
|
| 3323 |
-
/*.token_timestamps
|
| 3324 |
-
/*.thold_pt
|
| 3325 |
-
/*.thold_ptsum
|
| 3326 |
-
/*.max_len
|
| 3327 |
-
/*.split_on_word
|
| 3328 |
-
/*.max_tokens
|
| 3329 |
-
|
| 3330 |
-
/*.speed_up
|
| 3331 |
-
/*.audio_ctx
|
| 3332 |
-
|
| 3333 |
-
/*.
|
| 3334 |
-
|
| 3335 |
-
/*.
|
| 3336 |
-
|
| 3337 |
-
/*.
|
| 3338 |
-
|
| 3339 |
-
|
| 3340 |
-
/*.
|
|
|
|
|
|
|
| 3341 |
/*.suppress_non_speech_tokens =*/ false,
|
| 3342 |
|
| 3343 |
-
/*.temperature
|
| 3344 |
-
/*.max_initial_ts
|
| 3345 |
-
/*.length_penalty
|
| 3346 |
|
| 3347 |
-
/*.temperature_inc
|
| 3348 |
-
/*.entropy_thold
|
| 3349 |
-
/*.logprob_thold
|
| 3350 |
-
/*.no_speech_thold
|
| 3351 |
|
| 3352 |
-
/*.greedy
|
| 3353 |
/*.best_of =*/ -1,
|
| 3354 |
},
|
| 3355 |
|
|
@@ -3430,6 +3447,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
|
|
| 3430 |
state.result_all.back().text = std::move(text);
|
| 3431 |
state.result_all.back().t1 = token.t0;
|
| 3432 |
state.result_all.back().tokens.resize(i);
|
|
|
|
| 3433 |
|
| 3434 |
state.result_all.push_back({});
|
| 3435 |
state.result_all.back().t0 = token.t0;
|
|
@@ -3441,6 +3459,8 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
|
|
| 3441 |
segment.tokens.begin() + i,
|
| 3442 |
segment.tokens.end());
|
| 3443 |
|
|
|
|
|
|
|
| 3444 |
acc = 0;
|
| 3445 |
text = "";
|
| 3446 |
|
|
@@ -3519,9 +3539,14 @@ static void whisper_process_logits(
|
|
| 3519 |
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L410-L412
|
| 3520 |
logits[vocab.token_not] = -INFINITY;
|
| 3521 |
|
| 3522 |
-
// suppress sot and
|
| 3523 |
logits[vocab.token_sot] = -INFINITY;
|
| 3524 |
-
logits[vocab.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3525 |
|
| 3526 |
// suppress task tokens
|
| 3527 |
logits[vocab.token_translate] = -INFINITY;
|
|
@@ -4018,9 +4043,9 @@ int whisper_full_with_state(
|
|
| 4018 |
state->lang_id = lang_id;
|
| 4019 |
prompt_init.push_back(whisper_token_lang(ctx, lang_id));
|
| 4020 |
if (params.translate) {
|
| 4021 |
-
prompt_init.push_back(whisper_token_translate());
|
| 4022 |
} else {
|
| 4023 |
-
prompt_init.push_back(whisper_token_transcribe());
|
| 4024 |
}
|
| 4025 |
}
|
| 4026 |
|
|
@@ -4500,23 +4525,27 @@ int whisper_full_with_state(
|
|
| 4500 |
prompt_past.push_back(tokens_cur[i].id);
|
| 4501 |
}
|
| 4502 |
|
| 4503 |
-
// store the text from this iteration
|
| 4504 |
if (!tokens_cur.empty() && ctx->model.n_loaded > 0) {
|
| 4505 |
int i0 = 0;
|
| 4506 |
auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx));
|
| 4507 |
|
| 4508 |
std::string text;
|
|
|
|
| 4509 |
|
| 4510 |
for (int i = 0; i < (int) tokens_cur.size(); i++) {
|
| 4511 |
//printf("%s: %18s %6.3f %18s %6.3f\n", __func__,
|
| 4512 |
// ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
|
| 4513 |
// ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
|
| 4514 |
|
| 4515 |
-
if (params.print_special
|
| 4516 |
-
} else {
|
| 4517 |
text += whisper_token_to_str(ctx, tokens_cur[i].id);
|
| 4518 |
}
|
| 4519 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4520 |
if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
|
| 4521 |
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
|
| 4522 |
|
|
@@ -4535,7 +4564,7 @@ int whisper_full_with_state(
|
|
| 4535 |
|
| 4536 |
//printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
|
| 4537 |
|
| 4538 |
-
result_all.push_back({ tt0, tt1, text, {} });
|
| 4539 |
for (int j = i0; j <= i; j++) {
|
| 4540 |
result_all.back().tokens.push_back(tokens_cur[j]);
|
| 4541 |
}
|
|
@@ -4561,6 +4590,7 @@ int whisper_full_with_state(
|
|
| 4561 |
i--;
|
| 4562 |
t0 = t1;
|
| 4563 |
i0 = i + 1;
|
|
|
|
| 4564 |
}
|
| 4565 |
}
|
| 4566 |
|
|
@@ -4579,7 +4609,7 @@ int whisper_full_with_state(
|
|
| 4579 |
}
|
| 4580 |
}
|
| 4581 |
|
| 4582 |
-
result_all.push_back({ tt0, tt1, text, {} });
|
| 4583 |
for (int j = i0; j < (int) tokens_cur.size(); j++) {
|
| 4584 |
result_all.back().tokens.push_back(tokens_cur[j]);
|
| 4585 |
}
|
|
@@ -4759,6 +4789,10 @@ int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment)
|
|
| 4759 |
return ctx->state->result_all[i_segment].t1;
|
| 4760 |
}
|
| 4761 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4762 |
const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment) {
|
| 4763 |
return state->result_all[i_segment].text.c_str();
|
| 4764 |
}
|
|
|
|
| 380 |
std::map<token, id> token_to_id;
|
| 381 |
std::map<id, token> id_to_token;
|
| 382 |
|
| 383 |
+
// reference: https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L334-L349
|
| 384 |
+
id token_eot = 50256;
|
| 385 |
+
id token_sot = 50257;
|
| 386 |
+
// task tokens (used only for multilingual models)
|
| 387 |
+
id token_translate = 50357;
|
| 388 |
+
id token_transcribe = 50358;
|
| 389 |
+
// other special tokens
|
| 390 |
+
id token_solm = 50359; // [TDRZ] used by tinydiarize models to indicate speaker turn
|
| 391 |
+
id token_prev = 50360;
|
| 392 |
+
id token_nosp = 50361;
|
| 393 |
+
id token_not = 50362; // no timestamps
|
| 394 |
+
id token_beg = 50363; // begin timestamps
|
| 395 |
|
| 396 |
bool is_multilingual() const {
|
| 397 |
return n_vocab == 51865;
|
|
|
|
| 405 |
std::string text;
|
| 406 |
|
| 407 |
std::vector<whisper_token_data> tokens;
|
| 408 |
+
|
| 409 |
+
bool speaker_turn_next;
|
| 410 |
};
|
| 411 |
|
| 412 |
// medium
|
|
|
|
| 970 |
if (vocab.is_multilingual()) {
|
| 971 |
vocab.token_eot++;
|
| 972 |
vocab.token_sot++;
|
| 973 |
+
vocab.token_translate++;
|
| 974 |
+
vocab.token_transcribe++;
|
| 975 |
vocab.token_solm++;
|
| 976 |
+
vocab.token_prev++;
|
| 977 |
+
vocab.token_nosp++;
|
| 978 |
vocab.token_not++;
|
| 979 |
vocab.token_beg++;
|
| 980 |
}
|
|
|
|
| 988 |
word = "[_EOT_]";
|
| 989 |
} else if (i == vocab.token_sot) {
|
| 990 |
word = "[_SOT_]";
|
| 991 |
+
} else if (i == vocab.token_solm) {
|
| 992 |
+
word = "[_SOLM_]";
|
| 993 |
} else if (i == vocab.token_prev) {
|
| 994 |
word = "[_PREV_]";
|
| 995 |
+
} else if (i == vocab.token_nosp) {
|
| 996 |
+
word = "[_NOSP_]";
|
| 997 |
} else if (i == vocab.token_not) {
|
| 998 |
word = "[_NOT_]";
|
| 999 |
} else if (i == vocab.token_beg) {
|
|
|
|
| 3219 |
return ctx->vocab.token_sot;
|
| 3220 |
}
|
| 3221 |
|
| 3222 |
+
// Returns the `token_solm` id stored in this context's vocabulary.
// [TDRZ] The vocabulary notes this token is used by tinydiarize models to indicate a speaker turn.
whisper_token whisper_token_solm(struct whisper_context * ctx) {
|
| 3223 |
+
return ctx->vocab.token_solm;
|
| 3224 |
+
}
|
| 3225 |
+
|
| 3226 |
// Returns the `token_prev` id stored in this context's vocabulary.
whisper_token whisper_token_prev(struct whisper_context * ctx) {
|
| 3227 |
return ctx->vocab.token_prev;
|
| 3228 |
}
|
| 3229 |
|
| 3230 |
+
// Returns the `token_nosp` id stored in this context's vocabulary.
whisper_token whisper_token_nosp(struct whisper_context * ctx) {
|
| 3231 |
+
return ctx->vocab.token_nosp;
|
| 3232 |
}
|
| 3233 |
|
| 3234 |
whisper_token whisper_token_not(struct whisper_context * ctx) {
|
|
|
|
| 3243 |
return whisper_token_sot(ctx) + 1 + lang_id;
|
| 3244 |
}
|
| 3245 |
|
| 3246 |
+
// Returns the `token_translate` task-token id stored in this context's vocabulary.
// The id is read per context because it is incremented when the vocabulary is multilingual.
whisper_token whisper_token_translate(struct whisper_context * ctx) {
|
| 3247 |
+
return ctx->vocab.token_translate;
|
| 3248 |
}
|
| 3249 |
|
| 3250 |
+
// Returns the `token_transcribe` task-token id stored in this context's vocabulary.
// The id is read per context because it is incremented when the vocabulary is multilingual.
whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
|
| 3251 |
+
return ctx->vocab.token_transcribe;
|
| 3252 |
}
|
| 3253 |
|
| 3254 |
void whisper_print_timings(struct whisper_context * ctx) {
|
|
|
|
| 3320 |
|
| 3321 |
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
|
| 3322 |
struct whisper_full_params result = {
|
| 3323 |
+
/*.strategy =*/ strategy,
|
| 3324 |
+
|
| 3325 |
+
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
|
| 3326 |
+
/*.n_max_text_ctx =*/ 16384,
|
| 3327 |
+
/*.offset_ms =*/ 0,
|
| 3328 |
+
/*.duration_ms =*/ 0,
|
| 3329 |
+
|
| 3330 |
+
/*.translate =*/ false,
|
| 3331 |
+
/*.no_context =*/ true,
|
| 3332 |
+
/*.single_segment =*/ false,
|
| 3333 |
+
/*.print_special =*/ false,
|
| 3334 |
+
/*.print_progress =*/ true,
|
| 3335 |
+
/*.print_realtime =*/ false,
|
| 3336 |
+
/*.print_timestamps =*/ true,
|
| 3337 |
+
|
| 3338 |
+
/*.token_timestamps =*/ false,
|
| 3339 |
+
/*.thold_pt =*/ 0.01f,
|
| 3340 |
+
/*.thold_ptsum =*/ 0.01f,
|
| 3341 |
+
/*.max_len =*/ 0,
|
| 3342 |
+
/*.split_on_word =*/ false,
|
| 3343 |
+
/*.max_tokens =*/ 0,
|
| 3344 |
+
|
| 3345 |
+
/*.speed_up =*/ false,
|
| 3346 |
+
/*.audio_ctx =*/ 0,
|
| 3347 |
+
|
| 3348 |
+
/*.tdrz_enable =*/ false,
|
| 3349 |
+
|
| 3350 |
+
/*.initial_prompt =*/ nullptr,
|
| 3351 |
+
/*.prompt_tokens =*/ nullptr,
|
| 3352 |
+
/*.prompt_n_tokens =*/ 0,
|
| 3353 |
+
|
| 3354 |
+
/*.language =*/ "en",
|
| 3355 |
+
/*.detect_language =*/ false,
|
| 3356 |
+
|
| 3357 |
+
/*.suppress_blank =*/ true,
|
| 3358 |
/*.suppress_non_speech_tokens =*/ false,
|
| 3359 |
|
| 3360 |
+
/*.temperature =*/ 0.0f,
|
| 3361 |
+
/*.max_initial_ts =*/ 1.0f,
|
| 3362 |
+
/*.length_penalty =*/ -1.0f,
|
| 3363 |
|
| 3364 |
+
/*.temperature_inc =*/ 0.4f,
|
| 3365 |
+
/*.entropy_thold =*/ 2.4f,
|
| 3366 |
+
/*.logprob_thold =*/ -1.0f,
|
| 3367 |
+
/*.no_speech_thold =*/ 0.6f,
|
| 3368 |
|
| 3369 |
+
/*.greedy =*/ {
|
| 3370 |
/*.best_of =*/ -1,
|
| 3371 |
},
|
| 3372 |
|
|
|
|
| 3447 |
state.result_all.back().text = std::move(text);
|
| 3448 |
state.result_all.back().t1 = token.t0;
|
| 3449 |
state.result_all.back().tokens.resize(i);
|
| 3450 |
+
state.result_all.back().speaker_turn_next = false;
|
| 3451 |
|
| 3452 |
state.result_all.push_back({});
|
| 3453 |
state.result_all.back().t0 = token.t0;
|
|
|
|
| 3459 |
segment.tokens.begin() + i,
|
| 3460 |
segment.tokens.end());
|
| 3461 |
|
| 3462 |
+
state.result_all.back().speaker_turn_next = segment.speaker_turn_next;
|
| 3463 |
+
|
| 3464 |
acc = 0;
|
| 3465 |
text = "";
|
| 3466 |
|
|
|
|
| 3539 |
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L410-L412
|
| 3540 |
logits[vocab.token_not] = -INFINITY;
|
| 3541 |
|
| 3542 |
+
// suppress sot and nosp tokens
|
| 3543 |
logits[vocab.token_sot] = -INFINITY;
|
| 3544 |
+
logits[vocab.token_nosp] = -INFINITY; // TODO: ignore this token for now
|
| 3545 |
+
|
| 3546 |
+
// [TDRZ] when tinydiarize is disabled, suppress solm token
|
| 3547 |
+
if (params.tdrz_enable == false) {
|
| 3548 |
+
logits[vocab.token_solm] = -INFINITY;
|
| 3549 |
+
}
|
| 3550 |
|
| 3551 |
// suppress task tokens
|
| 3552 |
logits[vocab.token_translate] = -INFINITY;
|
|
|
|
| 4043 |
state->lang_id = lang_id;
|
| 4044 |
prompt_init.push_back(whisper_token_lang(ctx, lang_id));
|
| 4045 |
if (params.translate) {
|
| 4046 |
+
prompt_init.push_back(whisper_token_translate(ctx));
|
| 4047 |
} else {
|
| 4048 |
+
prompt_init.push_back(whisper_token_transcribe(ctx));
|
| 4049 |
}
|
| 4050 |
}
|
| 4051 |
|
|
|
|
| 4525 |
prompt_past.push_back(tokens_cur[i].id);
|
| 4526 |
}
|
| 4527 |
|
|
|
|
| 4528 |
if (!tokens_cur.empty() && ctx->model.n_loaded > 0) {
|
| 4529 |
int i0 = 0;
|
| 4530 |
auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx));
|
| 4531 |
|
| 4532 |
std::string text;
|
| 4533 |
+
bool speaker_turn_next = false;
|
| 4534 |
|
| 4535 |
for (int i = 0; i < (int) tokens_cur.size(); i++) {
|
| 4536 |
//printf("%s: %18s %6.3f %18s %6.3f\n", __func__,
|
| 4537 |
// ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
|
| 4538 |
// ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
|
| 4539 |
|
| 4540 |
+
if (params.print_special || tokens_cur[i].id < whisper_token_eot(ctx)) {
|
|
|
|
| 4541 |
text += whisper_token_to_str(ctx, tokens_cur[i].id);
|
| 4542 |
}
|
| 4543 |
|
| 4544 |
+
// [TDRZ] record if speaker turn was predicted after current segment
|
| 4545 |
+
if (params.tdrz_enable && tokens_cur[i].id == whisper_token_solm(ctx)) {
|
| 4546 |
+
speaker_turn_next = true;
|
| 4547 |
+
}
|
| 4548 |
+
|
| 4549 |
if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
|
| 4550 |
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
|
| 4551 |
|
|
|
|
| 4564 |
|
| 4565 |
//printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
|
| 4566 |
|
| 4567 |
+
result_all.push_back({ tt0, tt1, text, {}, speaker_turn_next });
|
| 4568 |
for (int j = i0; j <= i; j++) {
|
| 4569 |
result_all.back().tokens.push_back(tokens_cur[j]);
|
| 4570 |
}
|
|
|
|
| 4590 |
i--;
|
| 4591 |
t0 = t1;
|
| 4592 |
i0 = i + 1;
|
| 4593 |
+
speaker_turn_next = false;
|
| 4594 |
}
|
| 4595 |
}
|
| 4596 |
|
|
|
|
| 4609 |
}
|
| 4610 |
}
|
| 4611 |
|
| 4612 |
+
result_all.push_back({ tt0, tt1, text, {} , speaker_turn_next });
|
| 4613 |
for (int j = i0; j < (int) tokens_cur.size(); j++) {
|
| 4614 |
result_all.back().tokens.push_back(tokens_cur[j]);
|
| 4615 |
}
|
|
|
|
| 4789 |
return ctx->state->result_all[i_segment].t1;
|
| 4790 |
}
|
| 4791 |
|
| 4792 |
+
// Returns the `speaker_turn_next` flag recorded for segment `i_segment` in the
// context's default state (`ctx->state->result_all`).
// `i_segment` is used as a direct index; no range check is performed here.
bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment) {
|
| 4793 |
+
return ctx->state->result_all[i_segment].speaker_turn_next;
|
| 4794 |
+
}
|
| 4795 |
+
|
| 4796 |
// Returns the text of segment `i_segment` from `state->result_all` as a C string.
// `i_segment` is used as a direct index; no range check is performed here.
const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment) {
|
| 4797 |
return state->result_all[i_segment].text.c_str();
|
| 4798 |
}
|
|
@@ -277,15 +277,16 @@ extern "C" {
|
|
| 277 |
// Special tokens
|
| 278 |
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
|
| 279 |
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
|
| 280 |
-
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
|
| 281 |
WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
|
|
|
|
|
|
|
| 282 |
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
|
| 283 |
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
|
| 284 |
WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
|
| 285 |
|
| 286 |
// Task tokens
|
| 287 |
-
WHISPER_API whisper_token whisper_token_translate (void);
|
| 288 |
-
WHISPER_API whisper_token whisper_token_transcribe(void);
|
| 289 |
|
| 290 |
// Performance information from the default state.
|
| 291 |
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
|
|
@@ -358,6 +359,9 @@ extern "C" {
|
|
| 358 |
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
| 359 |
int audio_ctx; // overwrite the audio context size (0 = use default)
|
| 360 |
|
|
|
|
|
|
|
|
|
|
| 361 |
// tokens to provide to the whisper decoder as initial prompt
|
| 362 |
// these are prepended to any existing text context from a previous call
|
| 363 |
const char * initial_prompt;
|
|
@@ -460,6 +464,9 @@ extern "C" {
|
|
| 460 |
WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment);
|
| 461 |
WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
|
| 462 |
|
|
|
|
|
|
|
|
|
|
| 463 |
// Get the text of the specified segment
|
| 464 |
WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment);
|
| 465 |
WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
|
|
@@ -488,9 +495,9 @@ extern "C" {
|
|
| 488 |
|
| 489 |
// Temporary helpers needed for exposing ggml interface
|
| 490 |
|
| 491 |
-
WHISPER_API int whisper_bench_memcpy(int n_threads);
|
| 492 |
-
WHISPER_API const char * whisper_bench_memcpy_str(int n_threads);
|
| 493 |
-
WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
|
| 494 |
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
|
| 495 |
|
| 496 |
#ifdef __cplusplus
|
|
|
|
| 277 |
// Special tokens
|
| 278 |
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
|
| 279 |
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
|
|
|
|
| 280 |
WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
|
| 281 |
+
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
|
| 282 |
+
WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
|
| 283 |
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
|
| 284 |
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
|
| 285 |
WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
|
| 286 |
|
| 287 |
// Task tokens
|
| 288 |
+
WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
|
| 289 |
+
WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
|
| 290 |
|
| 291 |
// Performance information from the default state.
|
| 292 |
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
|
|
|
|
| 359 |
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
| 360 |
int audio_ctx; // overwrite the audio context size (0 = use default)
|
| 361 |
|
| 362 |
+
// [EXPERIMENTAL] [TDRZ] tinydiarize
|
| 363 |
+
bool tdrz_enable; // enable tinydiarize speaker turn detection
|
| 364 |
+
|
| 365 |
// tokens to provide to the whisper decoder as initial prompt
|
| 366 |
// these are prepended to any existing text context from a previous call
|
| 367 |
const char * initial_prompt;
|
|
|
|
| 464 |
WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment);
|
| 465 |
WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
|
| 466 |
|
| 467 |
+
// Get whether the next segment is predicted as a speaker turn
|
| 468 |
+
WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
|
| 469 |
+
|
| 470 |
// Get the text of the specified segment
|
| 471 |
WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment);
|
| 472 |
WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
|
|
|
|
| 495 |
|
| 496 |
// Temporary helpers needed for exposing ggml interface
|
| 497 |
|
| 498 |
+
WHISPER_API int whisper_bench_memcpy (int n_threads);
|
| 499 |
+
WHISPER_API const char * whisper_bench_memcpy_str (int n_threads);
|
| 500 |
+
WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
|
| 501 |
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
|
| 502 |
|
| 503 |
#ifdef __cplusplus
|