ggerganov committed on
Commit
36c5e16
·
unverified ·
1 Parent(s): 2f68de6

whisper : minor improvement in decoding strategy (#244)

Browse files

Do not allow text segments to extend beyond the end of the audio.
This partially mitigates some issues that occur when the last audio window
covers only the final 1-2 seconds of the audio file and the decoding spirals
into a repetition of the last transcribed phrase.

Files changed (1) hide show
  1. whisper.cpp +8 -4
whisper.cpp CHANGED
@@ -2687,6 +2687,7 @@ int whisper_full(
2687
  tokens_cur.clear();
2688
 
2689
  bool failed = false;
 
2690
 
2691
  for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
2692
  if (whisper_decode(ctx, prompt.data(), prompt.size(), n_past, params.n_threads) != 0) {
@@ -2712,13 +2713,13 @@ int whisper_full(
2712
  const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx));
2713
 
2714
  // do not allow to go back in time
2715
- if (seek_delta != 100*WHISPER_CHUNK_SIZE &&
2716
- seek_delta > seek_delta_new && result_len < i) {
2717
  break;
2718
  }
2719
 
2720
  seek_delta = seek_delta_new;
2721
  result_len = i + 1;
 
2722
  }
2723
 
2724
  // add it to the context
@@ -2730,8 +2731,11 @@ int whisper_full(
2730
  // printf("%s: %10s %6d %6.3f '%s'\n", __func__, tt.c_str(), token.id, token.pt, ctx->vocab.id_to_token[token.id].c_str());
2731
  //}
2732
 
2733
- // end of text token
2734
- if (token.id == whisper_token_eot(ctx) || (params.max_tokens > 0 && i > params.max_tokens)) {
 
 
 
2735
  if (result_len == 0) {
2736
  if (seek + seek_delta + 100 >= seek_end) {
2737
  result_len = i + 1;
 
2687
  tokens_cur.clear();
2688
 
2689
  bool failed = false;
2690
+ bool has_ts = false; // have we already sampled a non-beg timestamp token for the current segment?
2691
 
2692
  for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
2693
  if (whisper_decode(ctx, prompt.data(), prompt.size(), n_past, params.n_threads) != 0) {
 
2713
  const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx));
2714
 
2715
  // do not allow to go back in time
2716
+ if (has_ts && seek_delta > seek_delta_new && result_len < i) {
 
2717
  break;
2718
  }
2719
 
2720
  seek_delta = seek_delta_new;
2721
  result_len = i + 1;
2722
+ has_ts = true;
2723
  }
2724
 
2725
  // add it to the context
 
2731
  // printf("%s: %10s %6d %6.3f '%s'\n", __func__, tt.c_str(), token.id, token.pt, ctx->vocab.id_to_token[token.id].c_str());
2732
  //}
2733
 
2734
+ // end of segment
2735
+ if (token.id == whisper_token_eot(ctx) || // end of text token
2736
+ (params.max_tokens > 0 && i > params.max_tokens) || // max tokens per segment reached
2737
+ (has_ts && seek + seek_delta + 100 >= seek_end) // end of audio reached
2738
+ ) {
2739
  if (result_len == 0) {
2740
  if (seek + seek_delta + 100 >= seek_end) {
2741
  result_len = i + 1;