ggerganov committed
Commit 6adc1fe · Parent: e48ba5c

stream : add "audio_ctx" parameter

Used to overwrite the audio context size of the Encoder.
For example, setting "audio_ctx = 512" will make the Encoder run about
3 times faster, processing about 10s of audio instead of 30s.

The transcription quality drops, but this can be used for real-time
streaming purposes where performance is important.
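
For library users, here is a minimal sketch of how the new parameter is meant to be set through the C API, using the whisper_init/whisper_free loader calls as they exist at this point in the API. The model path and the pcmf32 buffer are placeholders, not part of this commit:

    #include "whisper.h"

    // minimal sketch: run whisper_full with a reduced Encoder context
    int transcribe_fast(const float * pcmf32, int n_samples) {
        struct whisper_context * ctx = whisper_init("models/ggml-base.en.bin"); // placeholder model path

        struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
        wparams.audio_ctx = 512; // overwrite the Encoder context: ~3x faster, ~10s of audio covered

        const int ret = whisper_full(ctx, wparams, pcmf32, n_samples);

        whisper_free(ctx);
        return ret;
    }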

Files changed (3)
  1. examples/stream/stream.cpp +6 -1
  2. whisper.cpp +16 -6
  3. whisper.h +2 -3
examples/stream/stream.cpp CHANGED

@@ -40,6 +40,7 @@ struct whisper_params {
     int32_t step_ms    = 3000;
     int32_t length_ms  = 10000;
     int32_t capture_id = -1;
+    int32_t audio_ctx  = 0;
 
     bool speed_up = false;
     bool verbose  = false;
@@ -69,6 +70,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
             params.length_ms = std::stoi(argv[++i]);
         } else if (arg == "-c" || arg == "--capture") {
             params.capture_id = std::stoi(argv[++i]);
+        } else if (arg == "-ac" || arg == "--audio_ctx") {
+            params.audio_ctx = std::stoi(argv[++i]);
         } else if (arg == "-su" || arg == "--speed-up") {
             params.speed_up = true;
         } else if (arg == "-v" || arg == "--verbose") {
@@ -116,6 +119,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
     fprintf(stderr, "  --step N              audio step size in milliseconds (default: %d)\n", params.step_ms);
     fprintf(stderr, "  --length N            audio length in milliseconds (default: %d)\n", params.length_ms);
     fprintf(stderr, "  -c ID, --capture ID   capture device ID (default: -1)\n");
+    fprintf(stderr, "  -ac N, --audio_ctx N  audio context size (default: %d, 0 - all)\n", params.audio_ctx);
     fprintf(stderr, "  -su, --speed-up       speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
     fprintf(stderr, "  -v, --verbose         verbose output\n");
     fprintf(stderr, "  --translate           translate from source language to english\n");
@@ -322,7 +326,6 @@ int main(int argc, char ** argv) {
         {
             whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 
-            wparams.max_tokens = 32;
             wparams.print_progress       = false;
             wparams.print_special_tokens = params.print_special_tokens;
             wparams.print_realtime       = false;
@@ -330,9 +333,11 @@ int main(int argc, char ** argv) {
             wparams.translate      = params.translate;
             wparams.no_context     = params.no_context;
             wparams.single_segment = true;
+            wparams.max_tokens     = 32;
             wparams.language       = params.language.c_str();
             wparams.n_threads      = params.n_threads;
 
+            wparams.audio_ctx = params.audio_ctx;
             wparams.speed_up  = params.speed_up;
 
             if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
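
With the new flag wired into the stream example above, a typical real-time invocation could look like the line below. Only -ac/--audio_ctx comes from this commit; the model path, thread count and step/length values are the stream example's usual options and are shown purely for illustration:

    # reduced Encoder context of 512 positions, i.e. roughly a 10s window
    ./stream -m ./models/ggml-base.en.bin -t 4 --step 3000 --length 10000 -ac 512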
whisper.cpp CHANGED

@@ -424,6 +424,9 @@ struct whisper_context {
     int64_t t_last;
     whisper_token tid_last;
     std::vector<float> energy; // PCM signal energy
+
+    // [EXPERIMENTAL] speed-up techniques
+    int32_t exp_n_audio_ctx; // 0 - use default
 };
 
 // load the model from a ggml file
@@ -974,9 +977,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
 
         model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
         model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-        //memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
-        //memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
     }
 
     const size_t memory_size =
@@ -1079,7 +1079,7 @@ static bool whisper_encode(
     const auto & mel_inp = wctx.mel;
     const auto & hparams = model.hparams;
 
-    const int n_ctx   = WHISPER_EXPERIMENT_AUDIO_CTX;
+    const int n_ctx   = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
     const int n_state = hparams.n_audio_state;
     const int n_head  = hparams.n_audio_head;
     const int n_layer = hparams.n_audio_layer;
@@ -1133,6 +1133,8 @@ static bool whisper_encode(
         cur = ggml_gelu(ctx0, cur);
     }
 
+    // ===================================================================
+    // NOTE: experimenting with partial evaluation of the encoder (ignore)
     //static int iter = -1;
     //const int n_iter = 1500/n_ctx;
 
@@ -1151,6 +1153,10 @@ static bool whisper_encode(
     struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
 
     cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
+    // ===================================================================
+
+    // original:
+    //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
 
     struct ggml_tensor * inpL = cur;
 
@@ -1494,8 +1500,7 @@ static bool whisper_decode(
     const int n_layer = hparams.n_text_layer;
 
     const int N = n_tokens;
-    //const int M = hparams.n_audio_ctx;
-    const int M = WHISPER_EXPERIMENT_AUDIO_CTX;
+    const int M = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
 
     struct ggml_init_params params = {
         .mem_size   = wctx.buf_compute.size(),
@@ -2405,6 +2410,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
                 /*.max_tokens =*/ 0,
 
                 /*.speed_up =*/ false,
+                /*.audio_ctx =*/ 0,
 
                 /*.language =*/ "en",
 
@@ -2447,6 +2453,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
                 /*.max_tokens =*/ 0,
 
                 /*.speed_up =*/ false,
+                /*.audio_ctx =*/ 0,
 
                 /*.language =*/ "en",
 
@@ -2577,6 +2584,9 @@ int whisper_full(
         prompt_past.clear();
    }
 
+    // overwrite audio_ctx
+    ctx->exp_n_audio_ctx = params.audio_ctx;
+
     // these tokens determine the task that will be performed
     std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
     if (whisper_is_multilingual(ctx)) {
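
The "about 10s of audio instead of 30s" figure from the commit message follows from the constants involved: at 16 kHz with WHISPER_HOP_LENGTH = 160 each mel frame covers 10 ms, the Encoder's second convolution halves the frame count, so each audio-context position stands for 20 ms, and the default hparams.n_audio_ctx of 1500 positions covers the full 30 s window. A small sketch of that arithmetic (not code from this commit):

    // hedged sketch: seconds of audio covered by a given Encoder context size
    // assumes 16 kHz input, hop length 160 (10 ms per mel frame) and the
    // Encoder's 2x-strided convolution, i.e. 20 ms per context position
    static float audio_ctx_to_seconds(int audio_ctx) {
        const float sec_per_pos = 2.0f * 160.0f / 16000.0f; // = 0.02 s
        return audio_ctx * sec_per_pos;
    }

    // audio_ctx_to_seconds(1500) == 30.0   (default, hparams.n_audio_ctx)
    // audio_ctx_to_seconds(512)  ~= 10.24  (the "about 10s" case above)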
whisper.h CHANGED

@@ -24,8 +24,6 @@
 #define WHISPER_HOP_LENGTH  160
 #define WHISPER_CHUNK_SIZE  30
 
-#define WHISPER_EXPERIMENT_AUDIO_CTX 512
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -207,7 +205,8 @@ extern "C" {
         int max_tokens;     // max tokens per segment (0 = no limit)
 
         // [EXPERIMENTAL] speed-up techniques
-        bool speed_up; // speed-up the audio by 2x using Phase Vocoder
+        bool speed_up;  // speed-up the audio by 2x using Phase Vocoder
+        int  audio_ctx; // overwrite the audio context size (0 = use default)
 
         const char * language;
 
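
One caveat for callers of the new field, not enforced anywhere in this commit: whisper_encode takes an n_ctx-sized view of the positional embedding and sizes its mel input from the same value, both of which assume the model's native context of 1500 positions, so larger values are not meaningful. A hedged caller-side guard (the variable name is illustrative):

    // hedged caller-side guard, not part of this commit: keep the requested
    // Encoder context within the model's native maximum (1500 positions = 30 s)
    int requested_ctx = 512; // illustrative value, e.g. taken from a CLI flag
    wparams.audio_ctx = requested_ctx > 1500 ? 1500 : requested_ctx;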