Spaces:
Running
Running
whisper : remove `speed_up` and `phase_vocoder*` functions (#2198)
Browse files
* whisper : fix cast warning
* whisper : remove phase_vocoder functions, ref #2195
* whisper : remove speed_up from whisper_full_params, closes #2195
- bindings/go/examples/go-whisper/flags.go +0 -9
- bindings/go/params.go +0 -7
- bindings/go/pkg/whisper/context.go +0 -5
- bindings/go/pkg/whisper/interface.go +0 -1
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java +1 -9
- bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java +1 -9
- bindings/ruby/ext/ruby_whisper.cpp +0 -8
- bindings/ruby/tests/test_whisper.rb +0 -7
- examples/addon.node/addon.cpp +0 -3
- examples/command/command.cpp +0 -5
- examples/common.h +1 -1
- examples/lsp/lsp.cpp +0 -5
- examples/main/main.cpp +0 -4
- examples/server/server.cpp +0 -4
- examples/stream/stream.cpp +0 -4
- examples/talk-llama/talk-llama.cpp +0 -4
- examples/talk/talk.cpp +0 -4
- examples/wchess/wchess.cmd/wchess.cmd.cpp +0 -3
- whisper.cpp +11 -52
- whisper.h +0 -17
bindings/go/examples/go-whisper/flags.go
CHANGED
|
@@ -68,10 +68,6 @@ func (flags *Flags) GetOut() string {
|
|
| 68 |
return strings.ToLower(flags.Lookup("out").Value.String())
|
| 69 |
}
|
| 70 |
|
| 71 |
-
func (flags *Flags) IsSpeedup() bool {
|
| 72 |
-
return flags.Lookup("speedup").Value.String() == "true"
|
| 73 |
-
}
|
| 74 |
-
|
| 75 |
func (flags *Flags) IsTokens() bool {
|
| 76 |
return flags.Lookup("tokens").Value.String() == "true"
|
| 77 |
}
|
|
@@ -111,10 +107,6 @@ func (flags *Flags) SetParams(context whisper.Context) error {
|
|
| 111 |
fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
|
| 112 |
context.SetDuration(duration)
|
| 113 |
}
|
| 114 |
-
if flags.IsSpeedup() {
|
| 115 |
-
fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
|
| 116 |
-
context.SetSpeedup(true)
|
| 117 |
-
}
|
| 118 |
if threads := flags.GetThreads(); threads != 0 {
|
| 119 |
fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
|
| 120 |
context.SetThreads(threads)
|
|
@@ -146,7 +138,6 @@ func registerFlags(flag *Flags) {
|
|
| 146 |
flag.Duration("offset", 0, "Time offset")
|
| 147 |
flag.Duration("duration", 0, "Duration of audio to process")
|
| 148 |
flag.Uint("threads", 0, "Number of threads to use")
|
| 149 |
-
flag.Bool("speedup", false, "Enable speedup")
|
| 150 |
flag.Uint("max-len", 0, "Maximum segment length in characters")
|
| 151 |
flag.Uint("max-tokens", 0, "Maximum tokens per segment")
|
| 152 |
flag.Float64("word-thold", 0, "Maximum segment score")
|
|
|
|
| 68 |
return strings.ToLower(flags.Lookup("out").Value.String())
|
| 69 |
}
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
func (flags *Flags) IsTokens() bool {
|
| 72 |
return flags.Lookup("tokens").Value.String() == "true"
|
| 73 |
}
|
|
|
|
| 107 |
fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
|
| 108 |
context.SetDuration(duration)
|
| 109 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
if threads := flags.GetThreads(); threads != 0 {
|
| 111 |
fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
|
| 112 |
context.SetThreads(threads)
|
|
|
|
| 138 |
flag.Duration("offset", 0, "Time offset")
|
| 139 |
flag.Duration("duration", 0, "Duration of audio to process")
|
| 140 |
flag.Uint("threads", 0, "Number of threads to use")
|
|
|
|
| 141 |
flag.Uint("max-len", 0, "Maximum segment length in characters")
|
| 142 |
flag.Uint("max-tokens", 0, "Maximum tokens per segment")
|
| 143 |
flag.Float64("word-thold", 0, "Maximum segment score")
|
bindings/go/params.go
CHANGED
|
@@ -47,10 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {
|
|
| 47 |
p.print_timestamps = toBool(v)
|
| 48 |
}
|
| 49 |
|
| 50 |
-
func (p *Params) SetSpeedup(v bool) {
|
| 51 |
-
p.speed_up = toBool(v)
|
| 52 |
-
}
|
| 53 |
-
|
| 54 |
// Set language id
|
| 55 |
func (p *Params) SetLanguage(lang int) error {
|
| 56 |
if lang == -1 {
|
|
@@ -177,9 +173,6 @@ func (p *Params) String() string {
|
|
| 177 |
if p.token_timestamps {
|
| 178 |
str += " token_timestamps"
|
| 179 |
}
|
| 180 |
-
if p.speed_up {
|
| 181 |
-
str += " speed_up"
|
| 182 |
-
}
|
| 183 |
|
| 184 |
return str + ">"
|
| 185 |
}
|
|
|
|
| 47 |
p.print_timestamps = toBool(v)
|
| 48 |
}
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
// Set language id
|
| 51 |
func (p *Params) SetLanguage(lang int) error {
|
| 52 |
if lang == -1 {
|
|
|
|
| 173 |
if p.token_timestamps {
|
| 174 |
str += " token_timestamps"
|
| 175 |
}
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
return str + ">"
|
| 178 |
}
|
bindings/go/pkg/whisper/context.go
CHANGED
|
@@ -76,11 +76,6 @@ func (context *context) SetTranslate(v bool) {
|
|
| 76 |
context.params.SetTranslate(v)
|
| 77 |
}
|
| 78 |
|
| 79 |
-
// Set speedup flag
|
| 80 |
-
func (context *context) SetSpeedup(v bool) {
|
| 81 |
-
context.params.SetSpeedup(v)
|
| 82 |
-
}
|
| 83 |
-
|
| 84 |
func (context *context) SetSplitOnWord(v bool) {
|
| 85 |
context.params.SetSplitOnWord(v)
|
| 86 |
}
|
|
|
|
| 76 |
context.params.SetTranslate(v)
|
| 77 |
}
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
func (context *context) SetSplitOnWord(v bool) {
|
| 80 |
context.params.SetSplitOnWord(v)
|
| 81 |
}
|
bindings/go/pkg/whisper/interface.go
CHANGED
|
@@ -41,7 +41,6 @@ type Context interface {
|
|
| 41 |
SetOffset(time.Duration) // Set offset
|
| 42 |
SetDuration(time.Duration) // Set duration
|
| 43 |
SetThreads(uint) // Set number of threads to use
|
| 44 |
-
SetSpeedup(bool) // Set speedup flag
|
| 45 |
SetSplitOnWord(bool) // Set split on word flag
|
| 46 |
SetTokenThreshold(float32) // Set timestamp token probability threshold
|
| 47 |
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
|
|
|
|
| 41 |
SetOffset(time.Duration) // Set offset
|
| 42 |
SetDuration(time.Duration) // Set duration
|
| 43 |
SetThreads(uint) // Set number of threads to use
|
|
|
|
| 44 |
SetSplitOnWord(bool) // Set split on word flag
|
| 45 |
SetTokenThreshold(float32) // Set timestamp token probability threshold
|
| 46 |
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
|
bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
CHANGED
|
@@ -20,7 +20,7 @@ public interface WhisperCppJnaLibrary extends Library {
|
|
| 20 |
* @return Whisper context on success, null on failure
|
| 21 |
*/
|
| 22 |
Pointer whisper_init_from_file(String path_model);
|
| 23 |
-
|
| 24 |
/**
|
| 25 |
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
|
| 26 |
* Because this function allocates memory for the params, the caller must call either:
|
|
@@ -304,14 +304,6 @@ public interface WhisperCppJnaLibrary extends Library {
|
|
| 304 |
/** Language id associated with the provided state */
|
| 305 |
int whisper_full_lang_id_from_state(Pointer state);
|
| 306 |
|
| 307 |
-
/**
|
| 308 |
-
* Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
|
| 309 |
-
* The resulting spectrogram is stored inside the default state of the provided whisper context.
|
| 310 |
-
* @return 0 on success
|
| 311 |
-
*/
|
| 312 |
-
int whisper_pcm_to_mel_phase_vocoder(Pointer ctx, final float[] samples, int n_samples, int n_threads);
|
| 313 |
-
|
| 314 |
-
int whisper_pcm_to_mel_phase_vocoder_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads);
|
| 315 |
|
| 316 |
/** Get the start time of the specified segment. */
|
| 317 |
long whisper_full_get_segment_t0(Pointer ctx, int i_segment);
|
|
|
|
| 20 |
* @return Whisper context on success, null on failure
|
| 21 |
*/
|
| 22 |
Pointer whisper_init_from_file(String path_model);
|
| 23 |
+
|
| 24 |
/**
|
| 25 |
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
|
| 26 |
* Because this function allocates memory for the params, the caller must call either:
|
|
|
|
| 304 |
/** Language id associated with the provided state */
|
| 305 |
int whisper_full_lang_id_from_state(Pointer state);
|
| 306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
|
| 308 |
/** Get the start time of the specified segment. */
|
| 309 |
long whisper_full_get_segment_t0(Pointer ctx, int i_segment);
|
bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
CHANGED
|
@@ -129,14 +129,6 @@ public class WhisperFullParams extends Structure {
|
|
| 129 |
/** Maximum tokens per segment (0, default = no limit) */
|
| 130 |
public int max_tokens;
|
| 131 |
|
| 132 |
-
/** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
|
| 133 |
-
public CBool speed_up;
|
| 134 |
-
|
| 135 |
-
/** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
|
| 136 |
-
public void speedUp(boolean enable) {
|
| 137 |
-
speed_up = enable ? CBool.TRUE : CBool.FALSE;
|
| 138 |
-
}
|
| 139 |
-
|
| 140 |
/** Overwrite the audio context size (0 = use default). */
|
| 141 |
public int audio_ctx;
|
| 142 |
|
|
@@ -321,7 +313,7 @@ public class WhisperFullParams extends Structure {
|
|
| 321 |
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
|
| 322 |
"no_context", "single_segment", "no_timestamps",
|
| 323 |
"print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
|
| 324 |
-
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
|
| 325 |
"tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
| 326 |
"suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
|
| 327 |
"temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
|
|
|
|
| 129 |
/** Maximum tokens per segment (0, default = no limit) */
|
| 130 |
public int max_tokens;
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
/** Overwrite the audio context size (0 = use default). */
|
| 133 |
public int audio_ctx;
|
| 134 |
|
|
|
|
| 313 |
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
|
| 314 |
"no_context", "single_segment", "no_timestamps",
|
| 315 |
"print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
|
| 316 |
+
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
|
| 317 |
"tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
| 318 |
"suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
|
| 319 |
"temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
|
bindings/ruby/ext/ruby_whisper.cpp
CHANGED
|
@@ -311,12 +311,6 @@ static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
|
|
| 311 |
static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
|
| 312 |
BOOL_PARAMS_SETTER(self, split_on_word, value)
|
| 313 |
}
|
| 314 |
-
static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
|
| 315 |
-
BOOL_PARAMS_GETTER(self, speed_up)
|
| 316 |
-
}
|
| 317 |
-
static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
|
| 318 |
-
BOOL_PARAMS_SETTER(self, speed_up, value)
|
| 319 |
-
}
|
| 320 |
static VALUE ruby_whisper_params_get_diarize(VALUE self) {
|
| 321 |
ruby_whisper_params *rwp;
|
| 322 |
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
|
@@ -408,8 +402,6 @@ void Init_whisper() {
|
|
| 408 |
rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
|
| 409 |
rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
|
| 410 |
rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
|
| 411 |
-
rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
|
| 412 |
-
rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
|
| 413 |
rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
|
| 414 |
rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);
|
| 415 |
|
|
|
|
| 311 |
static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
|
| 312 |
BOOL_PARAMS_SETTER(self, split_on_word, value)
|
| 313 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
static VALUE ruby_whisper_params_get_diarize(VALUE self) {
|
| 315 |
ruby_whisper_params *rwp;
|
| 316 |
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
|
|
|
| 402 |
rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
|
| 403 |
rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
|
| 404 |
rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
|
|
|
|
|
|
|
| 405 |
rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
|
| 406 |
rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);
|
| 407 |
|
bindings/ruby/tests/test_whisper.rb
CHANGED
|
@@ -117,13 +117,6 @@ class TestWhisper < Test::Unit::TestCase
|
|
| 117 |
assert !@params.split_on_word
|
| 118 |
end
|
| 119 |
|
| 120 |
-
def test_speed_up
|
| 121 |
-
@params.speed_up = true
|
| 122 |
-
assert @params.speed_up
|
| 123 |
-
@params.speed_up = false
|
| 124 |
-
assert !@params.speed_up
|
| 125 |
-
end
|
| 126 |
-
|
| 127 |
def test_whisper
|
| 128 |
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
|
| 129 |
params = Whisper::Params.new
|
|
|
|
| 117 |
assert !@params.split_on_word
|
| 118 |
end
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
def test_whisper
|
| 121 |
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
|
| 122 |
params = Whisper::Params.new
|
examples/addon.node/addon.cpp
CHANGED
|
@@ -25,7 +25,6 @@ struct whisper_params {
|
|
| 25 |
float entropy_thold = 2.4f;
|
| 26 |
float logprob_thold = -1.0f;
|
| 27 |
|
| 28 |
-
bool speed_up = false;
|
| 29 |
bool translate = false;
|
| 30 |
bool diarize = false;
|
| 31 |
bool output_txt = false;
|
|
@@ -232,8 +231,6 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
|
|
| 232 |
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
| 233 |
wparams.audio_ctx = params.audio_ctx;
|
| 234 |
|
| 235 |
-
wparams.speed_up = params.speed_up;
|
| 236 |
-
|
| 237 |
wparams.greedy.best_of = params.best_of;
|
| 238 |
wparams.beam_search.beam_size = params.beam_size;
|
| 239 |
|
|
|
|
| 25 |
float entropy_thold = 2.4f;
|
| 26 |
float logprob_thold = -1.0f;
|
| 27 |
|
|
|
|
| 28 |
bool translate = false;
|
| 29 |
bool diarize = false;
|
| 30 |
bool output_txt = false;
|
|
|
|
| 231 |
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
| 232 |
wparams.audio_ctx = params.audio_ctx;
|
| 233 |
|
|
|
|
|
|
|
| 234 |
wparams.greedy.best_of = params.best_of;
|
| 235 |
wparams.beam_search.beam_size = params.beam_size;
|
| 236 |
|
examples/command/command.cpp
CHANGED
|
@@ -38,7 +38,6 @@ struct whisper_params {
|
|
| 38 |
|
| 39 |
grammar_parser::parse_state grammar_parsed;
|
| 40 |
|
| 41 |
-
bool speed_up = false;
|
| 42 |
bool translate = false;
|
| 43 |
bool print_special = false;
|
| 44 |
bool print_energy = false;
|
|
@@ -76,7 +75,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 76 |
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 77 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 78 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
| 79 |
-
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 80 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 81 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 82 |
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
|
@@ -115,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 115 |
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 116 |
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
| 117 |
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
| 118 |
-
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 119 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 120 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 121 |
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
|
@@ -165,7 +162,6 @@ std::string transcribe(
|
|
| 165 |
wparams.n_threads = params.n_threads;
|
| 166 |
|
| 167 |
wparams.audio_ctx = params.audio_ctx;
|
| 168 |
-
wparams.speed_up = params.speed_up;
|
| 169 |
|
| 170 |
wparams.temperature = 0.4f;
|
| 171 |
wparams.temperature_inc = 1.0f;
|
|
@@ -371,7 +367,6 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
|
|
| 371 |
wparams.n_threads = params.n_threads;
|
| 372 |
|
| 373 |
wparams.audio_ctx = params.audio_ctx;
|
| 374 |
-
wparams.speed_up = params.speed_up;
|
| 375 |
|
| 376 |
wparams.prompt_tokens = k_tokens.data();
|
| 377 |
wparams.prompt_n_tokens = k_tokens.size();
|
|
|
|
| 38 |
|
| 39 |
grammar_parser::parse_state grammar_parsed;
|
| 40 |
|
|
|
|
| 41 |
bool translate = false;
|
| 42 |
bool print_special = false;
|
| 43 |
bool print_energy = false;
|
|
|
|
| 75 |
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 76 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 77 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
|
|
|
| 78 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 79 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 80 |
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
|
|
|
| 113 |
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 114 |
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
| 115 |
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
|
|
|
| 116 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 117 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 118 |
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
|
|
|
| 162 |
wparams.n_threads = params.n_threads;
|
| 163 |
|
| 164 |
wparams.audio_ctx = params.audio_ctx;
|
|
|
|
| 165 |
|
| 166 |
wparams.temperature = 0.4f;
|
| 167 |
wparams.temperature_inc = 1.0f;
|
|
|
|
| 367 |
wparams.n_threads = params.n_threads;
|
| 368 |
|
| 369 |
wparams.audio_ctx = params.audio_ctx;
|
|
|
|
| 370 |
|
| 371 |
wparams.prompt_tokens = k_tokens.data();
|
| 372 |
wparams.prompt_n_tokens = k_tokens.size();
|
examples/common.h
CHANGED
|
@@ -185,7 +185,7 @@ private:
|
|
| 185 |
// It is assumed that PCM data is normalized to a range from -1 to 1
|
| 186 |
bool write_audio(const float * data, size_t length) {
|
| 187 |
for (size_t i = 0; i < length; ++i) {
|
| 188 |
-
const int16_t intSample = data[i] * 32767;
|
| 189 |
file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
|
| 190 |
dataSize += sizeof(int16_t);
|
| 191 |
}
|
|
|
|
| 185 |
// It is assumed that PCM data is normalized to a range from -1 to 1
|
| 186 |
bool write_audio(const float * data, size_t length) {
|
| 187 |
for (size_t i = 0; i < length; ++i) {
|
| 188 |
+
const int16_t intSample = int16_t(data[i] * 32767);
|
| 189 |
file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
|
| 190 |
dataSize += sizeof(int16_t);
|
| 191 |
}
|
examples/lsp/lsp.cpp
CHANGED
|
@@ -26,7 +26,6 @@ struct whisper_params {
|
|
| 26 |
float vad_thold = 0.6f;
|
| 27 |
float freq_thold = 100.0f;
|
| 28 |
|
| 29 |
-
bool speed_up = false;
|
| 30 |
bool translate = false;
|
| 31 |
bool print_special = false;
|
| 32 |
bool print_energy = false;
|
|
@@ -70,7 +69,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 70 |
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 71 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 72 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
| 73 |
-
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 74 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 75 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 76 |
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
|
@@ -102,7 +100,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 102 |
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 103 |
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
| 104 |
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
| 105 |
-
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 106 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 107 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 108 |
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
|
@@ -184,7 +181,6 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js
|
|
| 184 |
wparams.n_threads = params.n_threads;
|
| 185 |
|
| 186 |
wparams.audio_ctx = params.audio_ctx;
|
| 187 |
-
wparams.speed_up = params.speed_up;
|
| 188 |
wparams.suppress_non_speech_tokens = true;
|
| 189 |
// run the transformer and a single decoding pass
|
| 190 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
|
@@ -223,7 +219,6 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
|
|
| 223 |
wparams.n_threads = params.n_threads;
|
| 224 |
|
| 225 |
wparams.audio_ctx = params.audio_ctx;
|
| 226 |
-
wparams.speed_up = params.speed_up;
|
| 227 |
|
| 228 |
// TODO: Do some time testing. Does an overly long prompt slow down processing?
|
| 229 |
// Set up command sets/precompute prompts
|
|
|
|
| 26 |
float vad_thold = 0.6f;
|
| 27 |
float freq_thold = 100.0f;
|
| 28 |
|
|
|
|
| 29 |
bool translate = false;
|
| 30 |
bool print_special = false;
|
| 31 |
bool print_energy = false;
|
|
|
|
| 69 |
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 70 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 71 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
|
|
|
| 72 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 73 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 74 |
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
|
|
|
| 100 |
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 101 |
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
| 102 |
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
|
|
|
| 103 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 104 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 105 |
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
|
|
|
| 181 |
wparams.n_threads = params.n_threads;
|
| 182 |
|
| 183 |
wparams.audio_ctx = params.audio_ctx;
|
|
|
|
| 184 |
wparams.suppress_non_speech_tokens = true;
|
| 185 |
// run the transformer and a single decoding pass
|
| 186 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
|
|
|
| 219 |
wparams.n_threads = params.n_threads;
|
| 220 |
|
| 221 |
wparams.audio_ctx = params.audio_ctx;
|
|
|
|
| 222 |
|
| 223 |
// TODO: Do some time testing. Does an overly long prompt slow down processing?
|
| 224 |
// Set up command sets/precompute prompts
|
examples/main/main.cpp
CHANGED
|
@@ -47,7 +47,6 @@ struct whisper_params {
|
|
| 47 |
float temperature = 0.0f;
|
| 48 |
float temperature_inc = 0.2f;
|
| 49 |
|
| 50 |
-
bool speed_up = false;
|
| 51 |
bool debug_mode = false;
|
| 52 |
bool translate = false;
|
| 53 |
bool detect_language = false;
|
|
@@ -138,7 +137,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 138 |
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
| 139 |
else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(argv[++i]); }
|
| 140 |
else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
|
| 141 |
-
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 142 |
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
|
| 143 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 144 |
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
|
@@ -206,7 +204,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 206 |
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
| 207 |
fprintf(stderr, " -tp, --temperature N [%-7.2f] The sampling temperature, between 0 and 1\n", params.temperature);
|
| 208 |
fprintf(stderr, " -tpi, --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
|
| 209 |
-
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 210 |
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
|
| 211 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 212 |
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
|
@@ -1106,7 +1103,6 @@ int main(int argc, char ** argv) {
|
|
| 1106 |
wparams.split_on_word = params.split_on_word;
|
| 1107 |
wparams.audio_ctx = params.audio_ctx;
|
| 1108 |
|
| 1109 |
-
wparams.speed_up = params.speed_up;
|
| 1110 |
wparams.debug_mode = params.debug_mode;
|
| 1111 |
|
| 1112 |
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
|
|
|
|
| 47 |
float temperature = 0.0f;
|
| 48 |
float temperature_inc = 0.2f;
|
| 49 |
|
|
|
|
| 50 |
bool debug_mode = false;
|
| 51 |
bool translate = false;
|
| 52 |
bool detect_language = false;
|
|
|
|
| 137 |
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
| 138 |
else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(argv[++i]); }
|
| 139 |
else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
|
|
|
|
| 140 |
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
|
| 141 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 142 |
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
|
|
|
| 204 |
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
| 205 |
fprintf(stderr, " -tp, --temperature N [%-7.2f] The sampling temperature, between 0 and 1\n", params.temperature);
|
| 206 |
fprintf(stderr, " -tpi, --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
|
|
|
|
| 207 |
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
|
| 208 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 209 |
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
|
|
|
| 1103 |
wparams.split_on_word = params.split_on_word;
|
| 1104 |
wparams.audio_ctx = params.audio_ctx;
|
| 1105 |
|
|
|
|
| 1106 |
wparams.debug_mode = params.debug_mode;
|
| 1107 |
|
| 1108 |
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
|
examples/server/server.cpp
CHANGED
|
@@ -61,7 +61,6 @@ struct whisper_params {
|
|
| 61 |
float temperature = 0.00f;
|
| 62 |
float temperature_inc = 0.20f;
|
| 63 |
|
| 64 |
-
bool speed_up = false;
|
| 65 |
bool debug_mode = false;
|
| 66 |
bool translate = false;
|
| 67 |
bool detect_language = false;
|
|
@@ -112,7 +111,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 112 |
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
| 113 |
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
| 114 |
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
| 115 |
-
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 116 |
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
|
| 117 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 118 |
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
|
@@ -159,7 +157,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
|
|
| 159 |
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
| 160 |
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
|
| 161 |
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
| 162 |
-
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 163 |
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
|
| 164 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 165 |
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
|
@@ -768,7 +765,6 @@ int main(int argc, char ** argv) {
|
|
| 768 |
wparams.split_on_word = params.split_on_word;
|
| 769 |
wparams.audio_ctx = params.audio_ctx;
|
| 770 |
|
| 771 |
-
wparams.speed_up = params.speed_up;
|
| 772 |
wparams.debug_mode = params.debug_mode;
|
| 773 |
|
| 774 |
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
|
|
|
|
| 61 |
float temperature = 0.00f;
|
| 62 |
float temperature_inc = 0.20f;
|
| 63 |
|
|
|
|
| 64 |
bool debug_mode = false;
|
| 65 |
bool translate = false;
|
| 66 |
bool detect_language = false;
|
|
|
|
| 111 |
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
| 112 |
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
| 113 |
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
|
|
|
| 114 |
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
|
| 115 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 116 |
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
|
|
|
| 157 |
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
| 158 |
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
|
| 159 |
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
|
|
|
| 160 |
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
|
| 161 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 162 |
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
|
|
|
| 765 |
wparams.split_on_word = params.split_on_word;
|
| 766 |
wparams.audio_ctx = params.audio_ctx;
|
| 767 |
|
|
|
|
| 768 |
wparams.debug_mode = params.debug_mode;
|
| 769 |
|
| 770 |
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
|
examples/stream/stream.cpp
CHANGED
|
@@ -27,7 +27,6 @@ struct whisper_params {
|
|
| 27 |
float vad_thold = 0.6f;
|
| 28 |
float freq_thold = 100.0f;
|
| 29 |
|
| 30 |
-
bool speed_up = false;
|
| 31 |
bool translate = false;
|
| 32 |
bool no_fallback = false;
|
| 33 |
bool print_special = false;
|
|
@@ -62,7 +61,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 62 |
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 63 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 64 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
| 65 |
-
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 66 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 67 |
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
|
| 68 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
|
@@ -100,7 +98,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 100 |
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 101 |
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
| 102 |
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
| 103 |
-
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 104 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 105 |
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
|
| 106 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
|
@@ -314,7 +311,6 @@ int main(int argc, char ** argv) {
|
|
| 314 |
wparams.n_threads = params.n_threads;
|
| 315 |
|
| 316 |
wparams.audio_ctx = params.audio_ctx;
|
| 317 |
-
wparams.speed_up = params.speed_up;
|
| 318 |
|
| 319 |
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
|
| 320 |
|
|
|
|
| 27 |
float vad_thold = 0.6f;
|
| 28 |
float freq_thold = 100.0f;
|
| 29 |
|
|
|
|
| 30 |
bool translate = false;
|
| 31 |
bool no_fallback = false;
|
| 32 |
bool print_special = false;
|
|
|
|
| 61 |
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 62 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 63 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
|
|
|
| 64 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 65 |
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
|
| 66 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
|
|
|
| 98 |
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 99 |
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
| 100 |
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
|
|
|
| 101 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 102 |
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
|
| 103 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
|
|
|
| 311 |
wparams.n_threads = params.n_threads;
|
| 312 |
|
| 313 |
wparams.audio_ctx = params.audio_ctx;
|
|
|
|
| 314 |
|
| 315 |
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
|
| 316 |
|
examples/talk-llama/talk-llama.cpp
CHANGED
|
@@ -59,7 +59,6 @@ struct whisper_params {
|
|
| 59 |
float vad_thold = 0.6f;
|
| 60 |
float freq_thold = 100.0f;
|
| 61 |
|
| 62 |
-
bool speed_up = false;
|
| 63 |
bool translate = false;
|
| 64 |
bool print_special = false;
|
| 65 |
bool print_energy = false;
|
|
@@ -100,7 +99,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 100 |
else if (arg == "-ngl" || arg == "--n-gpu-layers") { params.n_gpu_layers = std::stoi(argv[++i]); }
|
| 101 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 102 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
| 103 |
-
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 104 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 105 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 106 |
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
|
@@ -149,7 +147,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 149 |
fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7d] number of layers to store in VRAM\n", params.n_gpu_layers);
|
| 150 |
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
| 151 |
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
| 152 |
-
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 153 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 154 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 155 |
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
|
@@ -205,7 +202,6 @@ std::string transcribe(
|
|
| 205 |
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
|
| 206 |
|
| 207 |
wparams.audio_ctx = params.audio_ctx;
|
| 208 |
-
wparams.speed_up = params.speed_up;
|
| 209 |
|
| 210 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
| 211 |
return "";
|
|
|
|
| 59 |
float vad_thold = 0.6f;
|
| 60 |
float freq_thold = 100.0f;
|
| 61 |
|
|
|
|
| 62 |
bool translate = false;
|
| 63 |
bool print_special = false;
|
| 64 |
bool print_energy = false;
|
|
|
|
| 99 |
else if (arg == "-ngl" || arg == "--n-gpu-layers") { params.n_gpu_layers = std::stoi(argv[++i]); }
|
| 100 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 101 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
|
|
|
| 102 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 103 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 104 |
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
|
|
|
| 147 |
fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7d] number of layers to store in VRAM\n", params.n_gpu_layers);
|
| 148 |
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
| 149 |
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
|
|
|
| 150 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 151 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 152 |
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
|
|
|
| 202 |
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
|
| 203 |
|
| 204 |
wparams.audio_ctx = params.audio_ctx;
|
|
|
|
| 205 |
|
| 206 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
| 207 |
return "";
|
examples/talk/talk.cpp
CHANGED
|
@@ -26,7 +26,6 @@ struct whisper_params {
|
|
| 26 |
float vad_thold = 0.6f;
|
| 27 |
float freq_thold = 100.0f;
|
| 28 |
|
| 29 |
-
bool speed_up = false;
|
| 30 |
bool translate = false;
|
| 31 |
bool print_special = false;
|
| 32 |
bool print_energy = false;
|
|
@@ -60,7 +59,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 60 |
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 61 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 62 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
| 63 |
-
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 64 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 65 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 66 |
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
|
@@ -96,7 +94,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 96 |
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 97 |
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
| 98 |
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
| 99 |
-
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 100 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 101 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 102 |
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
|
@@ -132,7 +129,6 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
|
|
| 132 |
wparams.n_threads = params.n_threads;
|
| 133 |
|
| 134 |
wparams.audio_ctx = params.audio_ctx;
|
| 135 |
-
wparams.speed_up = params.speed_up;
|
| 136 |
|
| 137 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
| 138 |
return "";
|
|
|
|
| 26 |
float vad_thold = 0.6f;
|
| 27 |
float freq_thold = 100.0f;
|
| 28 |
|
|
|
|
| 29 |
bool translate = false;
|
| 30 |
bool print_special = false;
|
| 31 |
bool print_energy = false;
|
|
|
|
| 59 |
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 60 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 61 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
|
|
|
| 62 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 63 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 64 |
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
|
|
|
| 94 |
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 95 |
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
| 96 |
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
|
|
|
| 97 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 98 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 99 |
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
|
|
|
| 129 |
wparams.n_threads = params.n_threads;
|
| 130 |
|
| 131 |
wparams.audio_ctx = params.audio_ctx;
|
|
|
|
| 132 |
|
| 133 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
| 134 |
return "";
|
examples/wchess/wchess.cmd/wchess.cmd.cpp
CHANGED
|
@@ -26,7 +26,6 @@ struct whisper_params {
|
|
| 26 |
|
| 27 |
float grammar_penalty = 100.0f;
|
| 28 |
|
| 29 |
-
bool speed_up = false;
|
| 30 |
bool translate = false;
|
| 31 |
bool print_special = false;
|
| 32 |
bool print_energy = false;
|
|
@@ -57,7 +56,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 57 |
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 58 |
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
| 59 |
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
| 60 |
-
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 61 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 62 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 63 |
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
|
@@ -89,7 +87,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 89 |
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 90 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 91 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
| 92 |
-
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 93 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 94 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 95 |
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
|
|
|
| 26 |
|
| 27 |
float grammar_penalty = 100.0f;
|
| 28 |
|
|
|
|
| 29 |
bool translate = false;
|
| 30 |
bool print_special = false;
|
| 31 |
bool print_energy = false;
|
|
|
|
| 56 |
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 57 |
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
| 58 |
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
|
|
|
| 59 |
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 60 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 61 |
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
|
|
|
| 87 |
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 88 |
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
| 89 |
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
|
|
|
| 90 |
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 91 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 92 |
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
whisper.cpp
CHANGED
|
@@ -2868,13 +2868,10 @@ struct whisper_global_cache {
|
|
| 2868 |
// ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
|
| 2869 |
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
|
| 2870 |
float hann_window[WHISPER_N_FFT];
|
| 2871 |
-
float hann_window2x[WHISPER_N_FFT * 2];
|
| 2872 |
|
| 2873 |
whisper_global_cache() {
|
| 2874 |
fill_sin_cos_table();
|
| 2875 |
-
|
| 2876 |
-
FILL_HANN_WINDOW(hann_window);
|
| 2877 |
-
FILL_HANN_WINDOW(hann_window2x);
|
| 2878 |
}
|
| 2879 |
|
| 2880 |
void fill_sin_cos_table() {
|
|
@@ -2885,7 +2882,7 @@ struct whisper_global_cache {
|
|
| 2885 |
}
|
| 2886 |
}
|
| 2887 |
|
| 2888 |
-
void fill_hann_window(int length, bool periodic, float* output) {
|
| 2889 |
int offset = -1;
|
| 2890 |
if (periodic) {
|
| 2891 |
offset = 0;
|
|
@@ -3061,15 +3058,8 @@ static bool log_mel_spectrogram(
|
|
| 3061 |
const int64_t t_start_us = ggml_time_us();
|
| 3062 |
|
| 3063 |
// Hann window
|
| 3064 |
-
|
| 3065 |
-
|
| 3066 |
-
hann = global_cache.hann_window;
|
| 3067 |
-
} else if (frame_size == 2 * WHISPER_N_FFT) {
|
| 3068 |
-
hann = global_cache.hann_window2x;
|
| 3069 |
-
} else {
|
| 3070 |
-
WHISPER_ASSERT(false && "Unsupported frame_size");
|
| 3071 |
-
return false;
|
| 3072 |
-
}
|
| 3073 |
|
| 3074 |
// Calculate the length of padding
|
| 3075 |
int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
|
|
@@ -3752,30 +3742,6 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
|
|
| 3752 |
return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads);
|
| 3753 |
}
|
| 3754 |
|
| 3755 |
-
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
|
| 3756 |
-
int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
|
| 3757 |
-
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
|
| 3758 |
-
WHISPER_LOG_ERROR("%s: failed to compute mel spectrogram\n", __func__);
|
| 3759 |
-
return -1;
|
| 3760 |
-
}
|
| 3761 |
-
|
| 3762 |
-
return 0;
|
| 3763 |
-
}
|
| 3764 |
-
|
| 3765 |
-
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
|
| 3766 |
-
int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
|
| 3767 |
-
return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads);
|
| 3768 |
-
}
|
| 3769 |
-
|
| 3770 |
-
// same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2
|
| 3771 |
-
// TODO
|
| 3772 |
-
|
| 3773 |
-
// same as whisper_pcm_to_mel, but applies HPTSM to speed up the audio x2
|
| 3774 |
-
// TODO
|
| 3775 |
-
|
| 3776 |
-
// same as whisper_pcm_to_mel, but applies PV (with phase lock) to speed up the audio x2
|
| 3777 |
-
// TODO
|
| 3778 |
-
|
| 3779 |
int whisper_set_mel_with_state(
|
| 3780 |
struct whisper_context * ctx,
|
| 3781 |
struct whisper_state * state,
|
|
@@ -4676,7 +4642,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
| 4676 |
/*.split_on_word =*/ false,
|
| 4677 |
/*.max_tokens =*/ 0,
|
| 4678 |
|
| 4679 |
-
/*.speed_up =*/ false,
|
| 4680 |
/*.debug_mode =*/ false,
|
| 4681 |
/*.audio_ctx =*/ 0,
|
| 4682 |
|
|
@@ -5350,15 +5315,9 @@ int whisper_full_with_state(
|
|
| 5350 |
|
| 5351 |
if (n_samples > 0) {
|
| 5352 |
// compute log mel spectrogram
|
| 5353 |
-
if (params.speed_up) {
|
| 5354 |
-
// TODO: Replace PV with more advanced algorithm
|
| 5355 |
WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
|
| 5356 |
-
return -1;
|
| 5357 |
-
} else {
|
| 5358 |
-
if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
|
| 5359 |
-
WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
|
| 5360 |
-
return -2;
|
| 5361 |
-
}
|
| 5362 |
}
|
| 5363 |
}
|
| 5364 |
|
|
@@ -5395,7 +5354,7 @@ int whisper_full_with_state(
|
|
| 5395 |
// if length of spectrogram is less than 1.0s (100 frames), then return
|
| 5396 |
// basically don't process anything that is less than 1.0s
|
| 5397 |
// see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
|
| 5398 |
-
if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
|
| 5399 |
WHISPER_LOG_WARN("%s: input is too short - %d ms < 1000 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10);
|
| 5400 |
return 0;
|
| 5401 |
}
|
|
@@ -6107,8 +6066,8 @@ int whisper_full_with_state(
|
|
| 6107 |
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
|
| 6108 |
|
| 6109 |
if (!text.empty()) {
|
| 6110 |
-
const auto tt0 = params.speed_up ? 2*t0 : t0;
|
| 6111 |
-
const auto tt1 = params.speed_up ? 2*t1 : t1;
|
| 6112 |
|
| 6113 |
if (params.print_realtime) {
|
| 6114 |
if (params.print_timestamps) {
|
|
@@ -6154,8 +6113,8 @@ int whisper_full_with_state(
|
|
| 6154 |
if (!text.empty()) {
|
| 6155 |
const auto t1 = seek + seek_delta;
|
| 6156 |
|
| 6157 |
-
const auto tt0 = params.speed_up ? 2*t0 : t0;
|
| 6158 |
-
const auto tt1 = params.speed_up ? 2*t1 : t1;
|
| 6159 |
|
| 6160 |
if (params.print_realtime) {
|
| 6161 |
if (params.print_timestamps) {
|
|
|
|
| 2868 |
// ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
|
| 2869 |
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
|
| 2870 |
float hann_window[WHISPER_N_FFT];
|
|
|
|
| 2871 |
|
| 2872 |
whisper_global_cache() {
|
| 2873 |
fill_sin_cos_table();
|
| 2874 |
+
fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window);
|
|
|
|
|
|
|
| 2875 |
}
|
| 2876 |
|
| 2877 |
void fill_sin_cos_table() {
|
|
|
|
| 2882 |
}
|
| 2883 |
}
|
| 2884 |
|
| 2885 |
+
void fill_hann_window(int length, bool periodic, float * output) {
|
| 2886 |
int offset = -1;
|
| 2887 |
if (periodic) {
|
| 2888 |
offset = 0;
|
|
|
|
| 3058 |
const int64_t t_start_us = ggml_time_us();
|
| 3059 |
|
| 3060 |
// Hann window
|
| 3061 |
+
WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size");
|
| 3062 |
+
const float * hann = global_cache.hann_window;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3063 |
|
| 3064 |
// Calculate the length of padding
|
| 3065 |
int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
|
|
|
|
| 3742 |
return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads);
|
| 3743 |
}
|
| 3744 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3745 |
int whisper_set_mel_with_state(
|
| 3746 |
struct whisper_context * ctx,
|
| 3747 |
struct whisper_state * state,
|
|
|
|
| 4642 |
/*.split_on_word =*/ false,
|
| 4643 |
/*.max_tokens =*/ 0,
|
| 4644 |
|
|
|
|
| 4645 |
/*.debug_mode =*/ false,
|
| 4646 |
/*.audio_ctx =*/ 0,
|
| 4647 |
|
|
|
|
| 5315 |
|
| 5316 |
if (n_samples > 0) {
|
| 5317 |
// compute log mel spectrogram
|
| 5318 |
+
if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
|
|
|
|
| 5319 |
WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
|
| 5320 |
+
return -2;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5321 |
}
|
| 5322 |
}
|
| 5323 |
|
|
|
|
| 5354 |
// if length of spectrogram is less than 1.0s (100 frames), then return
|
| 5355 |
// basically don't process anything that is less than 1.0s
|
| 5356 |
// see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
|
| 5357 |
+
if (seek_end < seek_start + 100) {
|
| 5358 |
WHISPER_LOG_WARN("%s: input is too short - %d ms < 1000 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10);
|
| 5359 |
return 0;
|
| 5360 |
}
|
|
|
|
| 6066 |
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
|
| 6067 |
|
| 6068 |
if (!text.empty()) {
|
| 6069 |
+
const auto tt0 = t0;
|
| 6070 |
+
const auto tt1 = t1;
|
| 6071 |
|
| 6072 |
if (params.print_realtime) {
|
| 6073 |
if (params.print_timestamps) {
|
|
|
|
| 6113 |
if (!text.empty()) {
|
| 6114 |
const auto t1 = seek + seek_delta;
|
| 6115 |
|
| 6116 |
+
const auto tt0 = t0;
|
| 6117 |
+
const auto tt1 = t1;
|
| 6118 |
|
| 6119 |
if (params.print_realtime) {
|
| 6120 |
if (params.print_timestamps) {
|
whisper.h
CHANGED
|
@@ -266,22 +266,6 @@ extern "C" {
|
|
| 266 |
int n_samples,
|
| 267 |
int n_threads);
|
| 268 |
|
| 269 |
-
// Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
|
| 270 |
-
// The resulting spectrogram is stored inside the default state of the provided whisper context.
|
| 271 |
-
// Returns 0 on success
|
| 272 |
-
WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
|
| 273 |
-
struct whisper_context * ctx,
|
| 274 |
-
const float * samples,
|
| 275 |
-
int n_samples,
|
| 276 |
-
int n_threads);
|
| 277 |
-
|
| 278 |
-
WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
|
| 279 |
-
struct whisper_context * ctx,
|
| 280 |
-
struct whisper_state * state,
|
| 281 |
-
const float * samples,
|
| 282 |
-
int n_samples,
|
| 283 |
-
int n_threads);
|
| 284 |
-
|
| 285 |
// This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
|
| 286 |
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
| 287 |
// n_mel must be 80
|
|
@@ -499,7 +483,6 @@ extern "C" {
|
|
| 499 |
|
| 500 |
// [EXPERIMENTAL] speed-up techniques
|
| 501 |
// note: these can significantly reduce the quality of the output
|
| 502 |
-
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
| 503 |
bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
|
| 504 |
int audio_ctx; // overwrite the audio context size (0 = use default)
|
| 505 |
|
|
|
|
| 266 |
int n_samples,
|
| 267 |
int n_threads);
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
// This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
|
| 270 |
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
| 271 |
// n_mel must be 80
|
|
|
|
| 483 |
|
| 484 |
// [EXPERIMENTAL] speed-up techniques
|
| 485 |
// note: these can significantly reduce the quality of the output
|
|
|
|
| 486 |
bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
|
| 487 |
int audio_ctx; // overwrite the audio context size (0 = use default)
|
| 488 |
|