Spaces:
Running
Running
whisper : minor improvement in decoding strategy (#244)
Browse files
Do not allow for text segments to go beyond end of audio.
This partially mitigates some issues when the last audio window is 1-2
seconds just before the end of the audio file and the decoding spirals
into a repetition of the last transcribed phrase.
- whisper.cpp +8 -4
whisper.cpp
CHANGED
|
@@ -2687,6 +2687,7 @@ int whisper_full(
|
|
| 2687 |
tokens_cur.clear();
|
| 2688 |
|
| 2689 |
bool failed = false;
|
|
|
|
| 2690 |
|
| 2691 |
for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
|
| 2692 |
if (whisper_decode(ctx, prompt.data(), prompt.size(), n_past, params.n_threads) != 0) {
|
|
@@ -2712,13 +2713,13 @@ int whisper_full(
|
|
| 2712 |
const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx));
|
| 2713 |
|
| 2714 |
// do not allow to go back in time
|
| 2715 |
-
if (seek_delta
|
| 2716 |
-
seek_delta > seek_delta_new && result_len < i) {
|
| 2717 |
break;
|
| 2718 |
}
|
| 2719 |
|
| 2720 |
seek_delta = seek_delta_new;
|
| 2721 |
result_len = i + 1;
|
|
|
|
| 2722 |
}
|
| 2723 |
|
| 2724 |
// add it to the context
|
|
@@ -2730,8 +2731,11 @@ int whisper_full(
|
|
| 2730 |
// printf("%s: %10s %6d %6.3f '%s'\n", __func__, tt.c_str(), token.id, token.pt, ctx->vocab.id_to_token[token.id].c_str());
|
| 2731 |
//}
|
| 2732 |
|
| 2733 |
-
// end of
|
| 2734 |
-
if (token.id == whisper_token_eot(ctx) ||
|
|
|
|
|
|
|
|
|
|
| 2735 |
if (result_len == 0) {
|
| 2736 |
if (seek + seek_delta + 100 >= seek_end) {
|
| 2737 |
result_len = i + 1;
|
|
|
|
| 2687 |
tokens_cur.clear();
|
| 2688 |
|
| 2689 |
bool failed = false;
|
| 2690 |
+
bool has_ts = false; // have we already sampled a non-beg timestamp token for the current segment?
|
| 2691 |
|
| 2692 |
for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
|
| 2693 |
if (whisper_decode(ctx, prompt.data(), prompt.size(), n_past, params.n_threads) != 0) {
|
|
|
|
| 2713 |
const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx));
|
| 2714 |
|
| 2715 |
// do not allow to go back in time
|
| 2716 |
+
if (has_ts && seek_delta > seek_delta_new && result_len < i) {
|
|
|
|
| 2717 |
break;
|
| 2718 |
}
|
| 2719 |
|
| 2720 |
seek_delta = seek_delta_new;
|
| 2721 |
result_len = i + 1;
|
| 2722 |
+
has_ts = true;
|
| 2723 |
}
|
| 2724 |
|
| 2725 |
// add it to the context
|
|
|
|
| 2731 |
// printf("%s: %10s %6d %6.3f '%s'\n", __func__, tt.c_str(), token.id, token.pt, ctx->vocab.id_to_token[token.id].c_str());
|
| 2732 |
//}
|
| 2733 |
|
| 2734 |
+
// end of segment
|
| 2735 |
+
if (token.id == whisper_token_eot(ctx) || // end of text token
|
| 2736 |
+
(params.max_tokens > 0 && i > params.max_tokens) || // max tokens per segment reached
|
| 2737 |
+
(has_ts && seek + seek_delta + 100 >= seek_end) // end of audio reached
|
| 2738 |
+
) {
|
| 2739 |
if (result_len == 0) {
|
| 2740 |
if (seek + seek_delta + 100 >= seek_end) {
|
| 2741 |
result_len = i + 1;
|