ggerganov committed on
Commit
097e96a
·
unverified ·
1 Parent(s): 4b609e3

ref #22 : add "duration" option

Browse files

Can be used to partially process a recording

Files changed (3) hide show
  1. examples/main/main.cpp +5 -0
  2. whisper.cpp +7 -4
  3. whisper.h +2 -1
examples/main/main.cpp CHANGED
@@ -53,6 +53,7 @@ struct whisper_params {
53
  int32_t n_processors = 1;
54
  int32_t offset_t_ms = 0;
55
  int32_t offset_n = 0;
 
56
  int32_t max_context = -1;
57
  int32_t max_len = 0;
58
 
@@ -95,6 +96,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
95
  params.offset_t_ms = std::stoi(argv[++i]);
96
  } else if (arg == "-on" || arg == "--offset-n") {
97
  params.offset_n = std::stoi(argv[++i]);
 
 
98
  } else if (arg == "-mc" || arg == "--max-context") {
99
  params.max_context = std::stoi(argv[++i]);
100
  } else if (arg == "-ml" || arg == "--max-len") {
@@ -154,6 +157,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
154
  fprintf(stderr, " -p N, --processors N number of processors to use during computation (default: %d)\n", params.n_processors);
155
  fprintf(stderr, " -ot N, --offset-t N time offset in milliseconds (default: %d)\n", params.offset_t_ms);
156
  fprintf(stderr, " -on N, --offset-n N segment index offset (default: %d)\n", params.offset_n);
 
157
  fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
158
  fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
159
  fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
@@ -532,6 +536,7 @@ int main(int argc, char ** argv) {
532
  wparams.n_threads = params.n_threads;
533
  wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
534
  wparams.offset_ms = params.offset_t_ms;
 
535
 
536
  wparams.token_timestamps = params.output_wts || params.max_len > 0;
537
  wparams.thold_pt = params.word_thold;
 
53
  int32_t n_processors = 1;
54
  int32_t offset_t_ms = 0;
55
  int32_t offset_n = 0;
56
+ int32_t duration_ms = 0;
57
  int32_t max_context = -1;
58
  int32_t max_len = 0;
59
 
 
96
  params.offset_t_ms = std::stoi(argv[++i]);
97
  } else if (arg == "-on" || arg == "--offset-n") {
98
  params.offset_n = std::stoi(argv[++i]);
99
+ } else if (arg == "-d" || arg == "--duration") {
100
+ params.duration_ms = std::stoi(argv[++i]);
101
  } else if (arg == "-mc" || arg == "--max-context") {
102
  params.max_context = std::stoi(argv[++i]);
103
  } else if (arg == "-ml" || arg == "--max-len") {
 
157
  fprintf(stderr, " -p N, --processors N number of processors to use during computation (default: %d)\n", params.n_processors);
158
  fprintf(stderr, " -ot N, --offset-t N time offset in milliseconds (default: %d)\n", params.offset_t_ms);
159
  fprintf(stderr, " -on N, --offset-n N segment index offset (default: %d)\n", params.offset_n);
160
+ fprintf(stderr, " -d N, --duration N duration of audio to process in milliseconds (default: %d)\n", params.duration_ms);
161
  fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
162
  fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
163
  fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
 
536
  wparams.n_threads = params.n_threads;
537
  wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
538
  wparams.offset_ms = params.offset_t_ms;
539
+ wparams.duration_ms = params.duration_ms;
540
 
541
  wparams.token_timestamps = params.output_wts || params.max_len > 0;
542
  wparams.thold_pt = params.word_thold;
whisper.cpp CHANGED
@@ -2339,6 +2339,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
2339
  /*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
2340
  /*.n_max_text_ctx =*/ 16384,
2341
  /*.offset_ms =*/ 0,
 
2342
 
2343
  /*.translate =*/ false,
2344
  /*.no_context =*/ false,
@@ -2376,6 +2377,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
2376
  /*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
2377
  /*.n_max_text_ctx =*/ 16384,
2378
  /*.offset_ms =*/ 0,
 
2379
 
2380
  /*.translate =*/ false,
2381
  /*.no_context =*/ false,
@@ -2496,11 +2498,12 @@ int whisper_full(
2496
  }
2497
 
2498
  const int seek_start = params.offset_ms/10;
 
2499
 
2500
  // if length of spectrogram is less than 1s (100 samples), then return
2501
  // basically don't process anything that is less than 1s
2502
  // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
2503
- if (whisper_n_len(ctx) < 100 + seek_start) {
2504
  return 0;
2505
  }
2506
 
@@ -2533,7 +2536,7 @@ int whisper_full(
2533
  // main loop
2534
  int seek = seek_start;
2535
  while (true) {
2536
- int progress_cur = (100*seek)/whisper_n_len(ctx);
2537
  while (progress_cur >= progress_prev + progress_step) {
2538
  progress_prev += progress_step;
2539
  if (params.print_progress) {
@@ -2541,7 +2544,7 @@ int whisper_full(
2541
  }
2542
  }
2543
 
2544
- if (seek + 100 >= whisper_n_len(ctx)) {
2545
  break;
2546
  }
2547
 
@@ -2622,7 +2625,7 @@ int whisper_full(
2622
  // end of text token
2623
  if (token.id == whisper_token_eot(ctx)) {
2624
  if (result_len == 0) {
2625
- if (seek + seek_delta + 100 >= whisper_n_len(ctx)) {
2626
  result_len = i + 1;
2627
  } else {
2628
  // TODO: figure out how to resolve this
 
2339
  /*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
2340
  /*.n_max_text_ctx =*/ 16384,
2341
  /*.offset_ms =*/ 0,
2342
+ /*.duration_ms =*/ 0,
2343
 
2344
  /*.translate =*/ false,
2345
  /*.no_context =*/ false,
 
2377
  /*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
2378
  /*.n_max_text_ctx =*/ 16384,
2379
  /*.offset_ms =*/ 0,
2380
+ /*.duration_ms =*/ 0,
2381
 
2382
  /*.translate =*/ false,
2383
  /*.no_context =*/ false,
 
2498
  }
2499
 
2500
  const int seek_start = params.offset_ms/10;
2501
+ const int seek_end = seek_start + (params.duration_ms == 0 ? whisper_n_len(ctx) : params.duration_ms/10);
2502
 
2503
  // if length of spectrogram is less than 1s (100 samples), then return
2504
  // basically don't process anything that is less than 1s
2505
  // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
2506
+ if (seek_end < 100 + seek_start) {
2507
  return 0;
2508
  }
2509
 
 
2536
  // main loop
2537
  int seek = seek_start;
2538
  while (true) {
2539
+ const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
2540
  while (progress_cur >= progress_prev + progress_step) {
2541
  progress_prev += progress_step;
2542
  if (params.print_progress) {
 
2544
  }
2545
  }
2546
 
2547
+ if (seek + 100 >= seek_end) {
2548
  break;
2549
  }
2550
 
 
2625
  // end of text token
2626
  if (token.id == whisper_token_eot(ctx)) {
2627
  if (result_len == 0) {
2628
+ if (seek + seek_delta + 100 >= seek_end) {
2629
  result_len = i + 1;
2630
  } else {
2631
  // TODO: figure out how to resolve this
whisper.h CHANGED
@@ -186,7 +186,8 @@ extern "C" {
186
 
187
  int n_threads;
188
  int n_max_text_ctx;
189
- int offset_ms;
 
190
 
191
  bool translate;
192
  bool no_context;
 
186
 
187
  int n_threads;
188
  int n_max_text_ctx;
189
+ int offset_ms; // start offset in ms
190
+ int duration_ms; // audio duration to process in ms
191
 
192
  bool translate;
193
  bool no_context;