Spaces:
Running
Running
stream : add "audio_ctx" parameter
Used to overwrite the audio context size of the Encoder.
For example, setting "audio_ctx = 512" will make it run about 3 times
faster, processing about 10s of audio, instead of 30s.
The transcription quality drops, but this can be used for real-time
streaming purposes where performance is important.
- examples/stream/stream.cpp +6 -1
- whisper.cpp +16 -6
- whisper.h +2 -3
examples/stream/stream.cpp
CHANGED
|
@@ -40,6 +40,7 @@ struct whisper_params {
|
|
| 40 |
int32_t step_ms = 3000;
|
| 41 |
int32_t length_ms = 10000;
|
| 42 |
int32_t capture_id = -1;
|
|
|
|
| 43 |
|
| 44 |
bool speed_up = false;
|
| 45 |
bool verbose = false;
|
|
@@ -69,6 +70,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 69 |
params.length_ms = std::stoi(argv[++i]);
|
| 70 |
} else if (arg == "-c" || arg == "--capture") {
|
| 71 |
params.capture_id = std::stoi(argv[++i]);
|
|
|
|
|
|
|
| 72 |
} else if (arg == "-su" || arg == "--speed-up") {
|
| 73 |
params.speed_up = true;
|
| 74 |
} else if (arg == "-v" || arg == "--verbose") {
|
|
@@ -116,6 +119,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
|
| 116 |
fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms);
|
| 117 |
fprintf(stderr, " --length N audio length in milliseconds (default: %d)\n", params.length_ms);
|
| 118 |
fprintf(stderr, " -c ID, --capture ID capture device ID (default: -1)\n");
|
|
|
|
| 119 |
fprintf(stderr, " -su, --speed-up speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
|
| 120 |
fprintf(stderr, " -v, --verbose verbose output\n");
|
| 121 |
fprintf(stderr, " --translate translate from source language to english\n");
|
|
@@ -322,7 +326,6 @@ int main(int argc, char ** argv) {
|
|
| 322 |
{
|
| 323 |
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 324 |
|
| 325 |
-
wparams.max_tokens = 32;
|
| 326 |
wparams.print_progress = false;
|
| 327 |
wparams.print_special_tokens = params.print_special_tokens;
|
| 328 |
wparams.print_realtime = false;
|
|
@@ -330,9 +333,11 @@ int main(int argc, char ** argv) {
|
|
| 330 |
wparams.translate = params.translate;
|
| 331 |
wparams.no_context = params.no_context;
|
| 332 |
wparams.single_segment = true;
|
|
|
|
| 333 |
wparams.language = params.language.c_str();
|
| 334 |
wparams.n_threads = params.n_threads;
|
| 335 |
|
|
|
|
| 336 |
wparams.speed_up = params.speed_up;
|
| 337 |
|
| 338 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
|
|
|
| 40 |
int32_t step_ms = 3000;
|
| 41 |
int32_t length_ms = 10000;
|
| 42 |
int32_t capture_id = -1;
|
| 43 |
+
int32_t audio_ctx = 0;
|
| 44 |
|
| 45 |
bool speed_up = false;
|
| 46 |
bool verbose = false;
|
|
|
|
| 70 |
params.length_ms = std::stoi(argv[++i]);
|
| 71 |
} else if (arg == "-c" || arg == "--capture") {
|
| 72 |
params.capture_id = std::stoi(argv[++i]);
|
| 73 |
+
} else if (arg == "-ac" || arg == "--audio_ctx") {
|
| 74 |
+
params.audio_ctx = std::stoi(argv[++i]);
|
| 75 |
} else if (arg == "-su" || arg == "--speed-up") {
|
| 76 |
params.speed_up = true;
|
| 77 |
} else if (arg == "-v" || arg == "--verbose") {
|
|
|
|
| 119 |
fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms);
|
| 120 |
fprintf(stderr, " --length N audio length in milliseconds (default: %d)\n", params.length_ms);
|
| 121 |
fprintf(stderr, " -c ID, --capture ID capture device ID (default: -1)\n");
|
| 122 |
+
fprintf(stderr, " -ac N, --audio_ctx N audio context size (default: %d, 0 - all)\n", params.audio_ctx);
|
| 123 |
fprintf(stderr, " -su, --speed-up speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
|
| 124 |
fprintf(stderr, " -v, --verbose verbose output\n");
|
| 125 |
fprintf(stderr, " --translate translate from source language to english\n");
|
|
|
|
| 326 |
{
|
| 327 |
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 328 |
|
|
|
|
| 329 |
wparams.print_progress = false;
|
| 330 |
wparams.print_special_tokens = params.print_special_tokens;
|
| 331 |
wparams.print_realtime = false;
|
|
|
|
| 333 |
wparams.translate = params.translate;
|
| 334 |
wparams.no_context = params.no_context;
|
| 335 |
wparams.single_segment = true;
|
| 336 |
+
wparams.max_tokens = 32;
|
| 337 |
wparams.language = params.language.c_str();
|
| 338 |
wparams.n_threads = params.n_threads;
|
| 339 |
|
| 340 |
+
wparams.audio_ctx = params.audio_ctx;
|
| 341 |
wparams.speed_up = params.speed_up;
|
| 342 |
|
| 343 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
whisper.cpp
CHANGED
|
@@ -424,6 +424,9 @@ struct whisper_context {
|
|
| 424 |
int64_t t_last;
|
| 425 |
whisper_token tid_last;
|
| 426 |
std::vector<float> energy; // PCM signal energy
|
|
|
|
|
|
|
|
|
|
| 427 |
};
|
| 428 |
|
| 429 |
// load the model from a ggml file
|
|
@@ -974,9 +977,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
|
|
| 974 |
|
| 975 |
model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
| 976 |
model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
| 977 |
-
|
| 978 |
-
//memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
|
| 979 |
-
//memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
|
| 980 |
}
|
| 981 |
|
| 982 |
const size_t memory_size =
|
|
@@ -1079,7 +1079,7 @@ static bool whisper_encode(
|
|
| 1079 |
const auto & mel_inp = wctx.mel;
|
| 1080 |
const auto & hparams = model.hparams;
|
| 1081 |
|
| 1082 |
-
const int n_ctx = WHISPER_EXPERIMENT_AUDIO_CTX;
|
| 1083 |
const int n_state = hparams.n_audio_state;
|
| 1084 |
const int n_head = hparams.n_audio_head;
|
| 1085 |
const int n_layer = hparams.n_audio_layer;
|
|
@@ -1133,6 +1133,8 @@ static bool whisper_encode(
|
|
| 1133 |
cur = ggml_gelu(ctx0, cur);
|
| 1134 |
}
|
| 1135 |
|
|
|
|
|
|
|
| 1136 |
//static int iter = -1;
|
| 1137 |
//const int n_iter = 1500/n_ctx;
|
| 1138 |
|
|
@@ -1151,6 +1153,10 @@ static bool whisper_encode(
|
|
| 1151 |
struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
|
| 1152 |
|
| 1153 |
cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1154 |
|
| 1155 |
struct ggml_tensor * inpL = cur;
|
| 1156 |
|
|
@@ -1494,8 +1500,7 @@ static bool whisper_decode(
|
|
| 1494 |
const int n_layer = hparams.n_text_layer;
|
| 1495 |
|
| 1496 |
const int N = n_tokens;
|
| 1497 |
-
|
| 1498 |
-
const int M = WHISPER_EXPERIMENT_AUDIO_CTX;
|
| 1499 |
|
| 1500 |
struct ggml_init_params params = {
|
| 1501 |
.mem_size = wctx.buf_compute.size(),
|
|
@@ -2405,6 +2410,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
| 2405 |
/*.max_tokens =*/ 0,
|
| 2406 |
|
| 2407 |
/*.speed_up =*/ false,
|
|
|
|
| 2408 |
|
| 2409 |
/*.language =*/ "en",
|
| 2410 |
|
|
@@ -2447,6 +2453,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
| 2447 |
/*.max_tokens =*/ 0,
|
| 2448 |
|
| 2449 |
/*.speed_up =*/ false,
|
|
|
|
| 2450 |
|
| 2451 |
/*.language =*/ "en",
|
| 2452 |
|
|
@@ -2577,6 +2584,9 @@ int whisper_full(
|
|
| 2577 |
prompt_past.clear();
|
| 2578 |
}
|
| 2579 |
|
|
|
|
|
|
|
|
|
|
| 2580 |
// these tokens determine the task that will be performed
|
| 2581 |
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
|
| 2582 |
if (whisper_is_multilingual(ctx)) {
|
|
|
|
| 424 |
int64_t t_last;
|
| 425 |
whisper_token tid_last;
|
| 426 |
std::vector<float> energy; // PCM signal energy
|
| 427 |
+
|
| 428 |
+
// [EXPERIMENTAL] speed-up techniques
|
| 429 |
+
int32_t exp_n_audio_ctx; // 0 - use default
|
| 430 |
};
|
| 431 |
|
| 432 |
// load the model from a ggml file
|
|
|
|
| 977 |
|
| 978 |
model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
| 979 |
model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
|
|
|
|
|
|
|
|
|
| 980 |
}
|
| 981 |
|
| 982 |
const size_t memory_size =
|
|
|
|
| 1079 |
const auto & mel_inp = wctx.mel;
|
| 1080 |
const auto & hparams = model.hparams;
|
| 1081 |
|
| 1082 |
+
const int n_ctx = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
|
| 1083 |
const int n_state = hparams.n_audio_state;
|
| 1084 |
const int n_head = hparams.n_audio_head;
|
| 1085 |
const int n_layer = hparams.n_audio_layer;
|
|
|
|
| 1133 |
cur = ggml_gelu(ctx0, cur);
|
| 1134 |
}
|
| 1135 |
|
| 1136 |
+
// ===================================================================
|
| 1137 |
+
// NOTE: experimenting with partial evaluation of the encoder (ignore)
|
| 1138 |
//static int iter = -1;
|
| 1139 |
//const int n_iter = 1500/n_ctx;
|
| 1140 |
|
|
|
|
| 1153 |
struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
|
| 1154 |
|
| 1155 |
cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
|
| 1156 |
+
// ===================================================================
|
| 1157 |
+
|
| 1158 |
+
// original:
|
| 1159 |
+
//cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
|
| 1160 |
|
| 1161 |
struct ggml_tensor * inpL = cur;
|
| 1162 |
|
|
|
|
| 1500 |
const int n_layer = hparams.n_text_layer;
|
| 1501 |
|
| 1502 |
const int N = n_tokens;
|
| 1503 |
+
const int M = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
|
|
|
|
| 1504 |
|
| 1505 |
struct ggml_init_params params = {
|
| 1506 |
.mem_size = wctx.buf_compute.size(),
|
|
|
|
| 2410 |
/*.max_tokens =*/ 0,
|
| 2411 |
|
| 2412 |
/*.speed_up =*/ false,
|
| 2413 |
+
/*.audio_ctx =*/ 0,
|
| 2414 |
|
| 2415 |
/*.language =*/ "en",
|
| 2416 |
|
|
|
|
| 2453 |
/*.max_tokens =*/ 0,
|
| 2454 |
|
| 2455 |
/*.speed_up =*/ false,
|
| 2456 |
+
/*.audio_ctx =*/ 0,
|
| 2457 |
|
| 2458 |
/*.language =*/ "en",
|
| 2459 |
|
|
|
|
| 2584 |
prompt_past.clear();
|
| 2585 |
}
|
| 2586 |
|
| 2587 |
+
// overwrite audio_ctx
|
| 2588 |
+
ctx->exp_n_audio_ctx = params.audio_ctx;
|
| 2589 |
+
|
| 2590 |
// these tokens determine the task that will be performed
|
| 2591 |
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
|
| 2592 |
if (whisper_is_multilingual(ctx)) {
|
whisper.h
CHANGED
|
@@ -24,8 +24,6 @@
|
|
| 24 |
#define WHISPER_HOP_LENGTH 160
|
| 25 |
#define WHISPER_CHUNK_SIZE 30
|
| 26 |
|
| 27 |
-
#define WHISPER_EXPERIMENT_AUDIO_CTX 512
|
| 28 |
-
|
| 29 |
#ifdef __cplusplus
|
| 30 |
extern "C" {
|
| 31 |
#endif
|
|
@@ -207,7 +205,8 @@ extern "C" {
|
|
| 207 |
int max_tokens; // max tokens per segment (0 = no limit)
|
| 208 |
|
| 209 |
// [EXPERIMENTAL] speed-up techniques
|
| 210 |
-
bool speed_up;
|
|
|
|
| 211 |
|
| 212 |
const char * language;
|
| 213 |
|
|
|
|
| 24 |
#define WHISPER_HOP_LENGTH 160
|
| 25 |
#define WHISPER_CHUNK_SIZE 30
|
| 26 |
|
|
|
|
|
|
|
| 27 |
#ifdef __cplusplus
|
| 28 |
extern "C" {
|
| 29 |
#endif
|
|
|
|
| 205 |
int max_tokens; // max tokens per segment (0 = no limit)
|
| 206 |
|
| 207 |
// [EXPERIMENTAL] speed-up techniques
|
| 208 |
+
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
| 209 |
+
int audio_ctx; // overwrite the audio context size (0 = use default)
|
| 210 |
|
| 211 |
const char * language;
|
| 212 |
|