stanimirovb committed on
Commit
7ef0c95
·
unverified ·
1 Parent(s): f08dc65

whisper : remove `speed_up` and `phase_vocoder*` functions (#2198)

Browse files

* whisper : fix cast warning

* whisper : remove phase_vocoder functions, ref #2195

* whisper : remove speed_up from whisper_full_params, closes #2195

bindings/go/examples/go-whisper/flags.go CHANGED
@@ -68,10 +68,6 @@ func (flags *Flags) GetOut() string {
68
  return strings.ToLower(flags.Lookup("out").Value.String())
69
  }
70
 
71
- func (flags *Flags) IsSpeedup() bool {
72
- return flags.Lookup("speedup").Value.String() == "true"
73
- }
74
-
75
  func (flags *Flags) IsTokens() bool {
76
  return flags.Lookup("tokens").Value.String() == "true"
77
  }
@@ -111,10 +107,6 @@ func (flags *Flags) SetParams(context whisper.Context) error {
111
  fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
112
  context.SetDuration(duration)
113
  }
114
- if flags.IsSpeedup() {
115
- fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
116
- context.SetSpeedup(true)
117
- }
118
  if threads := flags.GetThreads(); threads != 0 {
119
  fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
120
  context.SetThreads(threads)
@@ -146,7 +138,6 @@ func registerFlags(flag *Flags) {
146
  flag.Duration("offset", 0, "Time offset")
147
  flag.Duration("duration", 0, "Duration of audio to process")
148
  flag.Uint("threads", 0, "Number of threads to use")
149
- flag.Bool("speedup", false, "Enable speedup")
150
  flag.Uint("max-len", 0, "Maximum segment length in characters")
151
  flag.Uint("max-tokens", 0, "Maximum tokens per segment")
152
  flag.Float64("word-thold", 0, "Maximum segment score")
 
68
  return strings.ToLower(flags.Lookup("out").Value.String())
69
  }
70
 
 
 
 
 
71
  func (flags *Flags) IsTokens() bool {
72
  return flags.Lookup("tokens").Value.String() == "true"
73
  }
 
107
  fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
108
  context.SetDuration(duration)
109
  }
 
 
 
 
110
  if threads := flags.GetThreads(); threads != 0 {
111
  fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
112
  context.SetThreads(threads)
 
138
  flag.Duration("offset", 0, "Time offset")
139
  flag.Duration("duration", 0, "Duration of audio to process")
140
  flag.Uint("threads", 0, "Number of threads to use")
 
141
  flag.Uint("max-len", 0, "Maximum segment length in characters")
142
  flag.Uint("max-tokens", 0, "Maximum tokens per segment")
143
  flag.Float64("word-thold", 0, "Maximum segment score")
bindings/go/params.go CHANGED
@@ -47,10 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {
47
  p.print_timestamps = toBool(v)
48
  }
49
 
50
- func (p *Params) SetSpeedup(v bool) {
51
- p.speed_up = toBool(v)
52
- }
53
-
54
  // Set language id
55
  func (p *Params) SetLanguage(lang int) error {
56
  if lang == -1 {
@@ -177,9 +173,6 @@ func (p *Params) String() string {
177
  if p.token_timestamps {
178
  str += " token_timestamps"
179
  }
180
- if p.speed_up {
181
- str += " speed_up"
182
- }
183
 
184
  return str + ">"
185
  }
 
47
  p.print_timestamps = toBool(v)
48
  }
49
 
 
 
 
 
50
  // Set language id
51
  func (p *Params) SetLanguage(lang int) error {
52
  if lang == -1 {
 
173
  if p.token_timestamps {
174
  str += " token_timestamps"
175
  }
 
 
 
176
 
177
  return str + ">"
178
  }
bindings/go/pkg/whisper/context.go CHANGED
@@ -76,11 +76,6 @@ func (context *context) SetTranslate(v bool) {
76
  context.params.SetTranslate(v)
77
  }
78
 
79
- // Set speedup flag
80
- func (context *context) SetSpeedup(v bool) {
81
- context.params.SetSpeedup(v)
82
- }
83
-
84
  func (context *context) SetSplitOnWord(v bool) {
85
  context.params.SetSplitOnWord(v)
86
  }
 
76
  context.params.SetTranslate(v)
77
  }
78
 
 
 
 
 
 
79
  func (context *context) SetSplitOnWord(v bool) {
80
  context.params.SetSplitOnWord(v)
81
  }
bindings/go/pkg/whisper/interface.go CHANGED
@@ -41,7 +41,6 @@ type Context interface {
41
  SetOffset(time.Duration) // Set offset
42
  SetDuration(time.Duration) // Set duration
43
  SetThreads(uint) // Set number of threads to use
44
- SetSpeedup(bool) // Set speedup flag
45
  SetSplitOnWord(bool) // Set split on word flag
46
  SetTokenThreshold(float32) // Set timestamp token probability threshold
47
  SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
 
41
  SetOffset(time.Duration) // Set offset
42
  SetDuration(time.Duration) // Set duration
43
  SetThreads(uint) // Set number of threads to use
 
44
  SetSplitOnWord(bool) // Set split on word flag
45
  SetTokenThreshold(float32) // Set timestamp token probability threshold
46
  SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java CHANGED
@@ -20,7 +20,7 @@ public interface WhisperCppJnaLibrary extends Library {
20
  * @return Whisper context on success, null on failure
21
  */
22
  Pointer whisper_init_from_file(String path_model);
23
-
24
  /**
25
  * Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
26
  * Because this function allocates memory for the params, the caller must call either:
@@ -304,14 +304,6 @@ public interface WhisperCppJnaLibrary extends Library {
304
  /** Language id associated with the provided state */
305
  int whisper_full_lang_id_from_state(Pointer state);
306
 
307
- /**
308
- * Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
309
- * The resulting spectrogram is stored inside the default state of the provided whisper context.
310
- * @return 0 on success
311
- */
312
- int whisper_pcm_to_mel_phase_vocoder(Pointer ctx, final float[] samples, int n_samples, int n_threads);
313
-
314
- int whisper_pcm_to_mel_phase_vocoder_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads);
315
 
316
  /** Get the start time of the specified segment. */
317
  long whisper_full_get_segment_t0(Pointer ctx, int i_segment);
 
20
  * @return Whisper context on success, null on failure
21
  */
22
  Pointer whisper_init_from_file(String path_model);
23
+
24
  /**
25
  * Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
26
  * Because this function allocates memory for the params, the caller must call either:
 
304
  /** Language id associated with the provided state */
305
  int whisper_full_lang_id_from_state(Pointer state);
306
 
 
 
 
 
 
 
 
 
307
 
308
  /** Get the start time of the specified segment. */
309
  long whisper_full_get_segment_t0(Pointer ctx, int i_segment);
bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java CHANGED
@@ -129,14 +129,6 @@ public class WhisperFullParams extends Structure {
129
  /** Maximum tokens per segment (0, default = no limit) */
130
  public int max_tokens;
131
 
132
- /** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
133
- public CBool speed_up;
134
-
135
- /** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
136
- public void speedUp(boolean enable) {
137
- speed_up = enable ? CBool.TRUE : CBool.FALSE;
138
- }
139
-
140
  /** Overwrite the audio context size (0 = use default). */
141
  public int audio_ctx;
142
 
@@ -321,7 +313,7 @@ public class WhisperFullParams extends Structure {
321
  return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
322
  "no_context", "single_segment", "no_timestamps",
323
  "print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
324
- "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
325
  "tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
326
  "suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
327
  "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
 
129
  /** Maximum tokens per segment (0, default = no limit) */
130
  public int max_tokens;
131
 
 
 
 
 
 
 
 
 
132
  /** Overwrite the audio context size (0 = use default). */
133
  public int audio_ctx;
134
 
 
313
  return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
314
  "no_context", "single_segment", "no_timestamps",
315
  "print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
316
+ "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
317
  "tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
318
  "suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
319
  "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
bindings/ruby/ext/ruby_whisper.cpp CHANGED
@@ -311,12 +311,6 @@ static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
311
  static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
312
  BOOL_PARAMS_SETTER(self, split_on_word, value)
313
  }
314
- static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
315
- BOOL_PARAMS_GETTER(self, speed_up)
316
- }
317
- static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
318
- BOOL_PARAMS_SETTER(self, speed_up, value)
319
- }
320
  static VALUE ruby_whisper_params_get_diarize(VALUE self) {
321
  ruby_whisper_params *rwp;
322
  Data_Get_Struct(self, ruby_whisper_params, rwp);
@@ -408,8 +402,6 @@ void Init_whisper() {
408
  rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
409
  rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
410
  rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
411
- rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
412
- rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
413
  rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
414
  rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);
415
 
 
311
  static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
312
  BOOL_PARAMS_SETTER(self, split_on_word, value)
313
  }
 
 
 
 
 
 
314
  static VALUE ruby_whisper_params_get_diarize(VALUE self) {
315
  ruby_whisper_params *rwp;
316
  Data_Get_Struct(self, ruby_whisper_params, rwp);
 
402
  rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
403
  rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
404
  rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
 
 
405
  rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
406
  rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);
407
 
bindings/ruby/tests/test_whisper.rb CHANGED
@@ -117,13 +117,6 @@ class TestWhisper < Test::Unit::TestCase
117
  assert !@params.split_on_word
118
  end
119
 
120
- def test_speed_up
121
- @params.speed_up = true
122
- assert @params.speed_up
123
- @params.speed_up = false
124
- assert !@params.speed_up
125
- end
126
-
127
  def test_whisper
128
  @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
129
  params = Whisper::Params.new
 
117
  assert !@params.split_on_word
118
  end
119
 
 
 
 
 
 
 
 
120
  def test_whisper
121
  @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
122
  params = Whisper::Params.new
examples/addon.node/addon.cpp CHANGED
@@ -25,7 +25,6 @@ struct whisper_params {
25
  float entropy_thold = 2.4f;
26
  float logprob_thold = -1.0f;
27
 
28
- bool speed_up = false;
29
  bool translate = false;
30
  bool diarize = false;
31
  bool output_txt = false;
@@ -232,8 +231,6 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
232
  wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
233
  wparams.audio_ctx = params.audio_ctx;
234
 
235
- wparams.speed_up = params.speed_up;
236
-
237
  wparams.greedy.best_of = params.best_of;
238
  wparams.beam_search.beam_size = params.beam_size;
239
 
 
25
  float entropy_thold = 2.4f;
26
  float logprob_thold = -1.0f;
27
 
 
28
  bool translate = false;
29
  bool diarize = false;
30
  bool output_txt = false;
 
231
  wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
232
  wparams.audio_ctx = params.audio_ctx;
233
 
 
 
234
  wparams.greedy.best_of = params.best_of;
235
  wparams.beam_search.beam_size = params.beam_size;
236
 
examples/command/command.cpp CHANGED
@@ -38,7 +38,6 @@ struct whisper_params {
38
 
39
  grammar_parser::parse_state grammar_parsed;
40
 
41
- bool speed_up = false;
42
  bool translate = false;
43
  bool print_special = false;
44
  bool print_energy = false;
@@ -76,7 +75,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
76
  else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
77
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
78
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
79
- else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
80
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
81
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
82
  else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
@@ -115,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
115
  fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
116
  fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
117
  fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
118
- fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
119
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
120
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
121
  fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
@@ -165,7 +162,6 @@ std::string transcribe(
165
  wparams.n_threads = params.n_threads;
166
 
167
  wparams.audio_ctx = params.audio_ctx;
168
- wparams.speed_up = params.speed_up;
169
 
170
  wparams.temperature = 0.4f;
171
  wparams.temperature_inc = 1.0f;
@@ -371,7 +367,6 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
371
  wparams.n_threads = params.n_threads;
372
 
373
  wparams.audio_ctx = params.audio_ctx;
374
- wparams.speed_up = params.speed_up;
375
 
376
  wparams.prompt_tokens = k_tokens.data();
377
  wparams.prompt_n_tokens = k_tokens.size();
 
38
 
39
  grammar_parser::parse_state grammar_parsed;
40
 
 
41
  bool translate = false;
42
  bool print_special = false;
43
  bool print_energy = false;
 
75
  else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
76
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
77
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
 
78
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
79
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
80
  else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
 
113
  fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
114
  fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
115
  fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
 
116
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
117
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
118
  fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
 
162
  wparams.n_threads = params.n_threads;
163
 
164
  wparams.audio_ctx = params.audio_ctx;
 
165
 
166
  wparams.temperature = 0.4f;
167
  wparams.temperature_inc = 1.0f;
 
367
  wparams.n_threads = params.n_threads;
368
 
369
  wparams.audio_ctx = params.audio_ctx;
 
370
 
371
  wparams.prompt_tokens = k_tokens.data();
372
  wparams.prompt_n_tokens = k_tokens.size();
examples/common.h CHANGED
@@ -185,7 +185,7 @@ private:
185
  // It is assumed that PCM data is normalized to a range from -1 to 1
186
  bool write_audio(const float * data, size_t length) {
187
  for (size_t i = 0; i < length; ++i) {
188
- const int16_t intSample = data[i] * 32767;
189
  file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
190
  dataSize += sizeof(int16_t);
191
  }
 
185
  // It is assumed that PCM data is normalized to a range from -1 to 1
186
  bool write_audio(const float * data, size_t length) {
187
  for (size_t i = 0; i < length; ++i) {
188
+ const int16_t intSample = int16_t(data[i] * 32767);
189
  file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
190
  dataSize += sizeof(int16_t);
191
  }
examples/lsp/lsp.cpp CHANGED
@@ -26,7 +26,6 @@ struct whisper_params {
26
  float vad_thold = 0.6f;
27
  float freq_thold = 100.0f;
28
 
29
- bool speed_up = false;
30
  bool translate = false;
31
  bool print_special = false;
32
  bool print_energy = false;
@@ -70,7 +69,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
70
  else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
71
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
72
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
73
- else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
74
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
75
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
76
  else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
@@ -102,7 +100,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
102
  fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
103
  fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
104
  fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
105
- fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
106
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
107
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
108
  fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
@@ -184,7 +181,6 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js
184
  wparams.n_threads = params.n_threads;
185
 
186
  wparams.audio_ctx = params.audio_ctx;
187
- wparams.speed_up = params.speed_up;
188
  wparams.suppress_non_speech_tokens = true;
189
  // run the transformer and a single decoding pass
190
  if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
@@ -223,7 +219,6 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
223
  wparams.n_threads = params.n_threads;
224
 
225
  wparams.audio_ctx = params.audio_ctx;
226
- wparams.speed_up = params.speed_up;
227
 
228
  // TODO: Do some time testing. Does an overly long prompt slow down processing?
229
  // Set up command sets/precompute prompts
 
26
  float vad_thold = 0.6f;
27
  float freq_thold = 100.0f;
28
 
 
29
  bool translate = false;
30
  bool print_special = false;
31
  bool print_energy = false;
 
69
  else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
70
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
71
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
 
72
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
73
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
74
  else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
 
100
  fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
101
  fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
102
  fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
 
103
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
104
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
105
  fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
 
181
  wparams.n_threads = params.n_threads;
182
 
183
  wparams.audio_ctx = params.audio_ctx;
 
184
  wparams.suppress_non_speech_tokens = true;
185
  // run the transformer and a single decoding pass
186
  if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
 
219
  wparams.n_threads = params.n_threads;
220
 
221
  wparams.audio_ctx = params.audio_ctx;
 
222
 
223
  // TODO: Do some time testing. Does an overly long prompt slow down processing?
224
  // Set up command sets/precompute prompts
examples/main/main.cpp CHANGED
@@ -47,7 +47,6 @@ struct whisper_params {
47
  float temperature = 0.0f;
48
  float temperature_inc = 0.2f;
49
 
50
- bool speed_up = false;
51
  bool debug_mode = false;
52
  bool translate = false;
53
  bool detect_language = false;
@@ -138,7 +137,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
138
  else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
139
  else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(argv[++i]); }
140
  else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
141
- // else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
142
  else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
143
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
144
  else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
@@ -206,7 +204,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
206
  fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
207
  fprintf(stderr, " -tp, --temperature N [%-7.2f] The sampling temperature, between 0 and 1\n", params.temperature);
208
  fprintf(stderr, " -tpi, --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
209
- // fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
210
  fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
211
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
212
  fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
@@ -1106,7 +1103,6 @@ int main(int argc, char ** argv) {
1106
  wparams.split_on_word = params.split_on_word;
1107
  wparams.audio_ctx = params.audio_ctx;
1108
 
1109
- wparams.speed_up = params.speed_up;
1110
  wparams.debug_mode = params.debug_mode;
1111
 
1112
  wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
 
47
  float temperature = 0.0f;
48
  float temperature_inc = 0.2f;
49
 
 
50
  bool debug_mode = false;
51
  bool translate = false;
52
  bool detect_language = false;
 
137
  else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
138
  else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(argv[++i]); }
139
  else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
 
140
  else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
141
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
142
  else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
 
204
  fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
205
  fprintf(stderr, " -tp, --temperature N [%-7.2f] The sampling temperature, between 0 and 1\n", params.temperature);
206
  fprintf(stderr, " -tpi, --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
 
207
  fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
208
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
209
  fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
 
1103
  wparams.split_on_word = params.split_on_word;
1104
  wparams.audio_ctx = params.audio_ctx;
1105
 
 
1106
  wparams.debug_mode = params.debug_mode;
1107
 
1108
  wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
examples/server/server.cpp CHANGED
@@ -61,7 +61,6 @@ struct whisper_params {
61
  float temperature = 0.00f;
62
  float temperature_inc = 0.20f;
63
 
64
- bool speed_up = false;
65
  bool debug_mode = false;
66
  bool translate = false;
67
  bool detect_language = false;
@@ -112,7 +111,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
112
  fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
113
  fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
114
  fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
115
- // fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
116
  fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
117
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
118
  fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
@@ -159,7 +157,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
159
  else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
160
  else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
161
  else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
162
- // else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
163
  else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
164
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
165
  else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
@@ -768,7 +765,6 @@ int main(int argc, char ** argv) {
768
  wparams.split_on_word = params.split_on_word;
769
  wparams.audio_ctx = params.audio_ctx;
770
 
771
- wparams.speed_up = params.speed_up;
772
  wparams.debug_mode = params.debug_mode;
773
 
774
  wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
 
61
  float temperature = 0.00f;
62
  float temperature_inc = 0.20f;
63
 
 
64
  bool debug_mode = false;
65
  bool translate = false;
66
  bool detect_language = false;
 
111
  fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
112
  fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
113
  fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
 
114
  fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
115
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
116
  fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
 
157
  else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
158
  else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
159
  else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
 
160
  else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
161
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
162
  else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
 
765
  wparams.split_on_word = params.split_on_word;
766
  wparams.audio_ctx = params.audio_ctx;
767
 
 
768
  wparams.debug_mode = params.debug_mode;
769
 
770
  wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
examples/stream/stream.cpp CHANGED
@@ -27,7 +27,6 @@ struct whisper_params {
27
  float vad_thold = 0.6f;
28
  float freq_thold = 100.0f;
29
 
30
- bool speed_up = false;
31
  bool translate = false;
32
  bool no_fallback = false;
33
  bool print_special = false;
@@ -62,7 +61,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
62
  else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
63
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
64
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
65
- else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
66
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
67
  else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
68
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
@@ -100,7 +98,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
100
  fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
101
  fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
102
  fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
103
- fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
104
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
105
  fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
106
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
@@ -314,7 +311,6 @@ int main(int argc, char ** argv) {
314
  wparams.n_threads = params.n_threads;
315
 
316
  wparams.audio_ctx = params.audio_ctx;
317
- wparams.speed_up = params.speed_up;
318
 
319
  wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
320
 
 
27
  float vad_thold = 0.6f;
28
  float freq_thold = 100.0f;
29
 
 
30
  bool translate = false;
31
  bool no_fallback = false;
32
  bool print_special = false;
 
61
  else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
62
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
63
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
 
64
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
65
  else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
66
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
 
98
  fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
99
  fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
100
  fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
 
101
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
102
  fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
103
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
 
311
  wparams.n_threads = params.n_threads;
312
 
313
  wparams.audio_ctx = params.audio_ctx;
 
314
 
315
  wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
316
 
examples/talk-llama/talk-llama.cpp CHANGED
@@ -59,7 +59,6 @@ struct whisper_params {
59
  float vad_thold = 0.6f;
60
  float freq_thold = 100.0f;
61
 
62
- bool speed_up = false;
63
  bool translate = false;
64
  bool print_special = false;
65
  bool print_energy = false;
@@ -100,7 +99,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
100
  else if (arg == "-ngl" || arg == "--n-gpu-layers") { params.n_gpu_layers = std::stoi(argv[++i]); }
101
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
102
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
103
- else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
104
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
105
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
106
  else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
@@ -149,7 +147,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
149
  fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7d] number of layers to store in VRAM\n", params.n_gpu_layers);
150
  fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
151
  fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
152
- fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
153
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
154
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
155
  fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
@@ -205,7 +202,6 @@ std::string transcribe(
205
  wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
206
 
207
  wparams.audio_ctx = params.audio_ctx;
208
- wparams.speed_up = params.speed_up;
209
 
210
  if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
211
  return "";
 
59
  float vad_thold = 0.6f;
60
  float freq_thold = 100.0f;
61
 
 
62
  bool translate = false;
63
  bool print_special = false;
64
  bool print_energy = false;
 
99
  else if (arg == "-ngl" || arg == "--n-gpu-layers") { params.n_gpu_layers = std::stoi(argv[++i]); }
100
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
101
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
 
102
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
103
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
104
  else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
 
147
  fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7d] number of layers to store in VRAM\n", params.n_gpu_layers);
148
  fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
149
  fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
 
150
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
151
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
152
  fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
 
202
  wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
203
 
204
  wparams.audio_ctx = params.audio_ctx;
 
205
 
206
  if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
207
  return "";
examples/talk/talk.cpp CHANGED
@@ -26,7 +26,6 @@ struct whisper_params {
26
  float vad_thold = 0.6f;
27
  float freq_thold = 100.0f;
28
 
29
- bool speed_up = false;
30
  bool translate = false;
31
  bool print_special = false;
32
  bool print_energy = false;
@@ -60,7 +59,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
60
  else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
61
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
62
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
63
- else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
64
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
65
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
66
  else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
@@ -96,7 +94,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
96
  fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
97
  fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
98
  fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
99
- fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
100
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
101
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
102
  fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
@@ -132,7 +129,6 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
132
  wparams.n_threads = params.n_threads;
133
 
134
  wparams.audio_ctx = params.audio_ctx;
135
- wparams.speed_up = params.speed_up;
136
 
137
  if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
138
  return "";
 
26
  float vad_thold = 0.6f;
27
  float freq_thold = 100.0f;
28
 
 
29
  bool translate = false;
30
  bool print_special = false;
31
  bool print_energy = false;
 
59
  else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
60
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
61
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
 
62
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
63
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
64
  else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
 
94
  fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
95
  fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
96
  fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
 
97
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
98
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
99
  fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
 
129
  wparams.n_threads = params.n_threads;
130
 
131
  wparams.audio_ctx = params.audio_ctx;
 
132
 
133
  if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
134
  return "";
examples/wchess/wchess.cmd/wchess.cmd.cpp CHANGED
@@ -26,7 +26,6 @@ struct whisper_params {
26
 
27
  float grammar_penalty = 100.0f;
28
 
29
- bool speed_up = false;
30
  bool translate = false;
31
  bool print_special = false;
32
  bool print_energy = false;
@@ -57,7 +56,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
57
  fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
58
  fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
59
  fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
60
- fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
61
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
62
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
63
  fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
@@ -89,7 +87,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
89
  else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
90
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
91
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
92
- else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
93
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
94
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
95
  else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
 
26
 
27
  float grammar_penalty = 100.0f;
28
 
 
29
  bool translate = false;
30
  bool print_special = false;
31
  bool print_energy = false;
 
56
  fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
57
  fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
58
  fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
 
59
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
60
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
61
  fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
 
87
  else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
88
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
89
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
 
90
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
91
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
92
  else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
whisper.cpp CHANGED
@@ -2868,13 +2868,10 @@ struct whisper_global_cache {
2868
  // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
2869
  // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
2870
  float hann_window[WHISPER_N_FFT];
2871
- float hann_window2x[WHISPER_N_FFT * 2];
2872
 
2873
  whisper_global_cache() {
2874
  fill_sin_cos_table();
2875
- #define FILL_HANN_WINDOW(arr) fill_hann_window(sizeof(arr) / sizeof(arr[0]), true, arr)
2876
- FILL_HANN_WINDOW(hann_window);
2877
- FILL_HANN_WINDOW(hann_window2x);
2878
  }
2879
 
2880
  void fill_sin_cos_table() {
@@ -2885,7 +2882,7 @@ struct whisper_global_cache {
2885
  }
2886
  }
2887
 
2888
- void fill_hann_window(int length, bool periodic, float* output) {
2889
  int offset = -1;
2890
  if (periodic) {
2891
  offset = 0;
@@ -3061,15 +3058,8 @@ static bool log_mel_spectrogram(
3061
  const int64_t t_start_us = ggml_time_us();
3062
 
3063
  // Hann window
3064
- const float * hann = nullptr;
3065
- if (frame_size == WHISPER_N_FFT) {
3066
- hann = global_cache.hann_window;
3067
- } else if (frame_size == 2 * WHISPER_N_FFT) {
3068
- hann = global_cache.hann_window2x;
3069
- } else {
3070
- WHISPER_ASSERT(false && "Unsupported frame_size");
3071
- return false;
3072
- }
3073
 
3074
  // Calculate the length of padding
3075
  int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
@@ -3752,30 +3742,6 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
3752
  return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads);
3753
  }
3754
 
3755
- // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
3756
- int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
3757
- if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
3758
- WHISPER_LOG_ERROR("%s: failed to compute mel spectrogram\n", __func__);
3759
- return -1;
3760
- }
3761
-
3762
- return 0;
3763
- }
3764
-
3765
- // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
3766
- int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
3767
- return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads);
3768
- }
3769
-
3770
- // same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2
3771
- // TODO
3772
-
3773
- // same as whisper_pcm_to_mel, but applies HPTSM to speed up the audio x2
3774
- // TODO
3775
-
3776
- // same as whisper_pcm_to_mel, but applies PV (with phase lock) to speed up the audio x2
3777
- // TODO
3778
-
3779
  int whisper_set_mel_with_state(
3780
  struct whisper_context * ctx,
3781
  struct whisper_state * state,
@@ -4676,7 +4642,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
4676
  /*.split_on_word =*/ false,
4677
  /*.max_tokens =*/ 0,
4678
 
4679
- /*.speed_up =*/ false,
4680
  /*.debug_mode =*/ false,
4681
  /*.audio_ctx =*/ 0,
4682
 
@@ -5350,15 +5315,9 @@ int whisper_full_with_state(
5350
 
5351
  if (n_samples > 0) {
5352
  // compute log mel spectrogram
5353
- if (params.speed_up) {
5354
- // TODO: Replace PV with more advanced algorithm
5355
  WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
5356
- return -1;
5357
- } else {
5358
- if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
5359
- WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
5360
- return -2;
5361
- }
5362
  }
5363
  }
5364
 
@@ -5395,7 +5354,7 @@ int whisper_full_with_state(
5395
  // if length of spectrogram is less than 1.0s (100 frames), then return
5396
  // basically don't process anything that is less than 1.0s
5397
  // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
5398
- if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
5399
  WHISPER_LOG_WARN("%s: input is too short - %d ms < 1000 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10);
5400
  return 0;
5401
  }
@@ -6107,8 +6066,8 @@ int whisper_full_with_state(
6107
  const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
6108
 
6109
  if (!text.empty()) {
6110
- const auto tt0 = params.speed_up ? 2*t0 : t0;
6111
- const auto tt1 = params.speed_up ? 2*t1 : t1;
6112
 
6113
  if (params.print_realtime) {
6114
  if (params.print_timestamps) {
@@ -6154,8 +6113,8 @@ int whisper_full_with_state(
6154
  if (!text.empty()) {
6155
  const auto t1 = seek + seek_delta;
6156
 
6157
- const auto tt0 = params.speed_up ? 2*t0 : t0;
6158
- const auto tt1 = params.speed_up ? 2*t1 : t1;
6159
 
6160
  if (params.print_realtime) {
6161
  if (params.print_timestamps) {
 
2868
  // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
2869
  // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
2870
  float hann_window[WHISPER_N_FFT];
 
2871
 
2872
  whisper_global_cache() {
2873
  fill_sin_cos_table();
2874
+ fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window);
 
 
2875
  }
2876
 
2877
  void fill_sin_cos_table() {
 
2882
  }
2883
  }
2884
 
2885
+ void fill_hann_window(int length, bool periodic, float * output) {
2886
  int offset = -1;
2887
  if (periodic) {
2888
  offset = 0;
 
3058
  const int64_t t_start_us = ggml_time_us();
3059
 
3060
  // Hann window
3061
+ WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size");
3062
+ const float * hann = global_cache.hann_window;
 
 
 
 
 
 
 
3063
 
3064
  // Calculate the length of padding
3065
  int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
 
3742
  return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads);
3743
  }
3744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3745
  int whisper_set_mel_with_state(
3746
  struct whisper_context * ctx,
3747
  struct whisper_state * state,
 
4642
  /*.split_on_word =*/ false,
4643
  /*.max_tokens =*/ 0,
4644
 
 
4645
  /*.debug_mode =*/ false,
4646
  /*.audio_ctx =*/ 0,
4647
 
 
5315
 
5316
  if (n_samples > 0) {
5317
  // compute log mel spectrogram
5318
+ if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
 
5319
  WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
5320
+ return -2;
 
 
 
 
 
5321
  }
5322
  }
5323
 
 
5354
  // if length of spectrogram is less than 1.0s (100 frames), then return
5355
  // basically don't process anything that is less than 1.0s
5356
  // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
5357
+ if (seek_end < seek_start + 100) {
5358
  WHISPER_LOG_WARN("%s: input is too short - %d ms < 1000 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10);
5359
  return 0;
5360
  }
 
6066
  const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
6067
 
6068
  if (!text.empty()) {
6069
+ const auto tt0 = t0;
6070
+ const auto tt1 = t1;
6071
 
6072
  if (params.print_realtime) {
6073
  if (params.print_timestamps) {
 
6113
  if (!text.empty()) {
6114
  const auto t1 = seek + seek_delta;
6115
 
6116
+ const auto tt0 = t0;
6117
+ const auto tt1 = t1;
6118
 
6119
  if (params.print_realtime) {
6120
  if (params.print_timestamps) {
whisper.h CHANGED
@@ -266,22 +266,6 @@ extern "C" {
266
  int n_samples,
267
  int n_threads);
268
 
269
- // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
270
- // The resulting spectrogram is stored inside the default state of the provided whisper context.
271
- // Returns 0 on success
272
- WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
273
- struct whisper_context * ctx,
274
- const float * samples,
275
- int n_samples,
276
- int n_threads);
277
-
278
- WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
279
- struct whisper_context * ctx,
280
- struct whisper_state * state,
281
- const float * samples,
282
- int n_samples,
283
- int n_threads);
284
-
285
  // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
286
  // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
287
  // n_mel must be 80
@@ -499,7 +483,6 @@ extern "C" {
499
 
500
  // [EXPERIMENTAL] speed-up techniques
501
  // note: these can significantly reduce the quality of the output
502
- bool speed_up; // speed-up the audio by 2x using Phase Vocoder
503
  bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
504
  int audio_ctx; // overwrite the audio context size (0 = use default)
505
 
 
266
  int n_samples,
267
  int n_threads);
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
270
  // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
271
  // n_mel must be 80
 
483
 
484
  // [EXPERIMENTAL] speed-up techniques
485
  // note: these can significantly reduce the quality of the output
 
486
  bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
487
  int audio_ctx; // overwrite the audio context size (0 = use default)
488