Spaces:
Running
Running
ref #22 : add "duration" option
Browse files
Can be used to partially process a recording
- examples/main/main.cpp +5 -0
- whisper.cpp +7 -4
- whisper.h +2 -1
examples/main/main.cpp
CHANGED
|
@@ -53,6 +53,7 @@ struct whisper_params {
|
|
| 53 |
int32_t n_processors = 1;
|
| 54 |
int32_t offset_t_ms = 0;
|
| 55 |
int32_t offset_n = 0;
|
|
|
|
| 56 |
int32_t max_context = -1;
|
| 57 |
int32_t max_len = 0;
|
| 58 |
|
|
@@ -95,6 +96,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 95 |
params.offset_t_ms = std::stoi(argv[++i]);
|
| 96 |
} else if (arg == "-on" || arg == "--offset-n") {
|
| 97 |
params.offset_n = std::stoi(argv[++i]);
|
|
|
|
|
|
|
| 98 |
} else if (arg == "-mc" || arg == "--max-context") {
|
| 99 |
params.max_context = std::stoi(argv[++i]);
|
| 100 |
} else if (arg == "-ml" || arg == "--max-len") {
|
|
@@ -154,6 +157,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
|
| 154 |
fprintf(stderr, " -p N, --processors N number of processors to use during computation (default: %d)\n", params.n_processors);
|
| 155 |
fprintf(stderr, " -ot N, --offset-t N time offset in milliseconds (default: %d)\n", params.offset_t_ms);
|
| 156 |
fprintf(stderr, " -on N, --offset-n N segment index offset (default: %d)\n", params.offset_n);
|
|
|
|
| 157 |
fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
|
| 158 |
fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
|
| 159 |
fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
|
|
@@ -532,6 +536,7 @@ int main(int argc, char ** argv) {
|
|
| 532 |
wparams.n_threads = params.n_threads;
|
| 533 |
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
| 534 |
wparams.offset_ms = params.offset_t_ms;
|
|
|
|
| 535 |
|
| 536 |
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
| 537 |
wparams.thold_pt = params.word_thold;
|
|
|
|
| 53 |
int32_t n_processors = 1;
|
| 54 |
int32_t offset_t_ms = 0;
|
| 55 |
int32_t offset_n = 0;
|
| 56 |
+
int32_t duration_ms = 0;
|
| 57 |
int32_t max_context = -1;
|
| 58 |
int32_t max_len = 0;
|
| 59 |
|
|
|
|
| 96 |
params.offset_t_ms = std::stoi(argv[++i]);
|
| 97 |
} else if (arg == "-on" || arg == "--offset-n") {
|
| 98 |
params.offset_n = std::stoi(argv[++i]);
|
| 99 |
+
} else if (arg == "-d" || arg == "--duration") {
|
| 100 |
+
params.duration_ms = std::stoi(argv[++i]);
|
| 101 |
} else if (arg == "-mc" || arg == "--max-context") {
|
| 102 |
params.max_context = std::stoi(argv[++i]);
|
| 103 |
} else if (arg == "-ml" || arg == "--max-len") {
|
|
|
|
| 157 |
fprintf(stderr, " -p N, --processors N number of processors to use during computation (default: %d)\n", params.n_processors);
|
| 158 |
fprintf(stderr, " -ot N, --offset-t N time offset in milliseconds (default: %d)\n", params.offset_t_ms);
|
| 159 |
fprintf(stderr, " -on N, --offset-n N segment index offset (default: %d)\n", params.offset_n);
|
| 160 |
+
fprintf(stderr, " -d N, --duration N duration of audio to process in milliseconds (default: %d)\n", params.duration_ms);
|
| 161 |
fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
|
| 162 |
fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
|
| 163 |
fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
|
|
|
|
| 536 |
wparams.n_threads = params.n_threads;
|
| 537 |
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
| 538 |
wparams.offset_ms = params.offset_t_ms;
|
| 539 |
+
wparams.duration_ms = params.duration_ms;
|
| 540 |
|
| 541 |
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
| 542 |
wparams.thold_pt = params.word_thold;
|
whisper.cpp
CHANGED
|
@@ -2339,6 +2339,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
| 2339 |
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
|
| 2340 |
/*.n_max_text_ctx =*/ 16384,
|
| 2341 |
/*.offset_ms =*/ 0,
|
|
|
|
| 2342 |
|
| 2343 |
/*.translate =*/ false,
|
| 2344 |
/*.no_context =*/ false,
|
|
@@ -2376,6 +2377,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
| 2376 |
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
|
| 2377 |
/*.n_max_text_ctx =*/ 16384,
|
| 2378 |
/*.offset_ms =*/ 0,
|
|
|
|
| 2379 |
|
| 2380 |
/*.translate =*/ false,
|
| 2381 |
/*.no_context =*/ false,
|
|
@@ -2496,11 +2498,12 @@ int whisper_full(
|
|
| 2496 |
}
|
| 2497 |
|
| 2498 |
const int seek_start = params.offset_ms/10;
|
|
|
|
| 2499 |
|
| 2500 |
// if length of spectrogram is less than 1s (100 samples), then return
|
| 2501 |
// basically don't process anything that is less than 1s
|
| 2502 |
// see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
|
| 2503 |
-
if (whisper_n_len(ctx) < 100 + seek_start) {
|
| 2504 |
return 0;
|
| 2505 |
}
|
| 2506 |
|
|
@@ -2533,7 +2536,7 @@ int whisper_full(
|
|
| 2533 |
// main loop
|
| 2534 |
int seek = seek_start;
|
| 2535 |
while (true) {
|
| 2536 |
-
int progress_cur = (100*seek)/whisper_n_len(ctx);
|
| 2537 |
while (progress_cur >= progress_prev + progress_step) {
|
| 2538 |
progress_prev += progress_step;
|
| 2539 |
if (params.print_progress) {
|
|
@@ -2541,7 +2544,7 @@ int whisper_full(
|
|
| 2541 |
}
|
| 2542 |
}
|
| 2543 |
|
| 2544 |
-
if (seek + 100 >= whisper_n_len(ctx)) {
|
| 2545 |
break;
|
| 2546 |
}
|
| 2547 |
|
|
@@ -2622,7 +2625,7 @@ int whisper_full(
|
|
| 2622 |
// end of text token
|
| 2623 |
if (token.id == whisper_token_eot(ctx)) {
|
| 2624 |
if (result_len == 0) {
|
| 2625 |
-
if (seek + seek_delta + 100 >= whisper_n_len(ctx)) {
|
| 2626 |
result_len = i + 1;
|
| 2627 |
} else {
|
| 2628 |
// TODO: figure out how to resolve this
|
|
|
|
| 2339 |
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
|
| 2340 |
/*.n_max_text_ctx =*/ 16384,
|
| 2341 |
/*.offset_ms =*/ 0,
|
| 2342 |
+
/*.duration_ms =*/ 0,
|
| 2343 |
|
| 2344 |
/*.translate =*/ false,
|
| 2345 |
/*.no_context =*/ false,
|
|
|
|
| 2377 |
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
|
| 2378 |
/*.n_max_text_ctx =*/ 16384,
|
| 2379 |
/*.offset_ms =*/ 0,
|
| 2380 |
+
/*.duration_ms =*/ 0,
|
| 2381 |
|
| 2382 |
/*.translate =*/ false,
|
| 2383 |
/*.no_context =*/ false,
|
|
|
|
| 2498 |
}
|
| 2499 |
|
| 2500 |
const int seek_start = params.offset_ms/10;
|
| 2501 |
+
const int seek_end = seek_start + (params.duration_ms == 0 ? whisper_n_len(ctx) : params.duration_ms/10);
|
| 2502 |
|
| 2503 |
// if length of spectrogram is less than 1s (100 samples), then return
|
| 2504 |
// basically don't process anything that is less than 1s
|
| 2505 |
// see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
|
| 2506 |
+
if (seek_end < 100 + seek_start) {
|
| 2507 |
return 0;
|
| 2508 |
}
|
| 2509 |
|
|
|
|
| 2536 |
// main loop
|
| 2537 |
int seek = seek_start;
|
| 2538 |
while (true) {
|
| 2539 |
+
const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
|
| 2540 |
while (progress_cur >= progress_prev + progress_step) {
|
| 2541 |
progress_prev += progress_step;
|
| 2542 |
if (params.print_progress) {
|
|
|
|
| 2544 |
}
|
| 2545 |
}
|
| 2546 |
|
| 2547 |
+
if (seek + 100 >= seek_end) {
|
| 2548 |
break;
|
| 2549 |
}
|
| 2550 |
|
|
|
|
| 2625 |
// end of text token
|
| 2626 |
if (token.id == whisper_token_eot(ctx)) {
|
| 2627 |
if (result_len == 0) {
|
| 2628 |
+
if (seek + seek_delta + 100 >= seek_end) {
|
| 2629 |
result_len = i + 1;
|
| 2630 |
} else {
|
| 2631 |
// TODO: figure out how to resolve this
|
whisper.h
CHANGED
|
@@ -186,7 +186,8 @@ extern "C" {
|
|
| 186 |
|
| 187 |
int n_threads;
|
| 188 |
int n_max_text_ctx;
|
| 189 |
-
int offset_ms;
|
|
|
|
| 190 |
|
| 191 |
bool translate;
|
| 192 |
bool no_context;
|
|
|
|
| 186 |
|
| 187 |
int n_threads;
|
| 188 |
int n_max_text_ctx;
|
| 189 |
+
int offset_ms; // start offset in ms
|
| 190 |
+
int duration_ms; // audio duration to process in ms
|
| 191 |
|
| 192 |
bool translate;
|
| 193 |
bool no_context;
|