Spaces:
Sleeping
Sleeping
main : add an option to accept optional output filenames (#424)
Browse files
* Add an option to accept optional output filenames
* Format the file
Co-authored-by: Chia-Hsiang Cheng <[email protected]>
- examples/main/main.cpp +38 -34
examples/main/main.cpp
CHANGED
|
@@ -84,6 +84,7 @@ struct whisper_params {
|
|
| 84 |
std::string model = "models/ggml-base.en.bin";
|
| 85 |
|
| 86 |
std::vector<std::string> fname_inp = {};
|
|
|
|
| 87 |
};
|
| 88 |
|
| 89 |
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
|
@@ -121,6 +122,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 121 |
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
| 122 |
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
| 123 |
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
|
|
|
| 124 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 125 |
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
| 126 |
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
|
|
@@ -144,35 +146,36 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 144 |
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
| 145 |
fprintf(stderr, "\n");
|
| 146 |
fprintf(stderr, "options:\n");
|
| 147 |
-
fprintf(stderr, " -h,
|
| 148 |
-
fprintf(stderr, " -t N,
|
| 149 |
-
fprintf(stderr, " -p N,
|
| 150 |
-
fprintf(stderr, " -ot N,
|
| 151 |
-
fprintf(stderr, " -on N,
|
| 152 |
-
fprintf(stderr, " -d N,
|
| 153 |
-
fprintf(stderr, " -mc N,
|
| 154 |
-
fprintf(stderr, " -ml N,
|
| 155 |
-
fprintf(stderr, " -bo N,
|
| 156 |
-
fprintf(stderr, " -bs N,
|
| 157 |
-
fprintf(stderr, " -wt N,
|
| 158 |
-
fprintf(stderr, " -et N,
|
| 159 |
-
fprintf(stderr, " -lpt N,
|
| 160 |
-
fprintf(stderr, " -su,
|
| 161 |
-
fprintf(stderr, " -tr,
|
| 162 |
-
fprintf(stderr, " -di,
|
| 163 |
-
fprintf(stderr, " -otxt,
|
| 164 |
-
fprintf(stderr, " -ovtt,
|
| 165 |
-
fprintf(stderr, " -osrt,
|
| 166 |
-
fprintf(stderr, " -owts,
|
| 167 |
-
fprintf(stderr, " -ocsv,
|
| 168 |
-
fprintf(stderr, " -
|
| 169 |
-
fprintf(stderr, " -
|
| 170 |
-
fprintf(stderr, " -
|
| 171 |
-
fprintf(stderr, " -
|
| 172 |
-
fprintf(stderr, " -
|
| 173 |
-
fprintf(stderr, "
|
| 174 |
-
fprintf(stderr, "
|
| 175 |
-
fprintf(stderr, " -
|
|
|
|
| 176 |
fprintf(stderr, "\n");
|
| 177 |
}
|
| 178 |
|
|
@@ -514,6 +517,7 @@ int main(int argc, char ** argv) {
|
|
| 514 |
|
| 515 |
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
| 516 |
const auto fname_inp = params.fname_inp[f];
|
|
|
|
| 517 |
|
| 518 |
std::vector<float> pcmf32; // mono-channel F32 PCM
|
| 519 |
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
|
@@ -692,31 +696,31 @@ int main(int argc, char ** argv) {
|
|
| 692 |
|
| 693 |
// output to text file
|
| 694 |
if (params.output_txt) {
|
| 695 |
-
const auto fname_txt =
|
| 696 |
output_txt(ctx, fname_txt.c_str());
|
| 697 |
}
|
| 698 |
|
| 699 |
// output to VTT file
|
| 700 |
if (params.output_vtt) {
|
| 701 |
-
const auto fname_vtt =
|
| 702 |
output_vtt(ctx, fname_vtt.c_str());
|
| 703 |
}
|
| 704 |
|
| 705 |
// output to SRT file
|
| 706 |
if (params.output_srt) {
|
| 707 |
-
const auto fname_srt =
|
| 708 |
output_srt(ctx, fname_srt.c_str(), params);
|
| 709 |
}
|
| 710 |
|
| 711 |
// output to WTS file
|
| 712 |
if (params.output_wts) {
|
| 713 |
-
const auto fname_wts =
|
| 714 |
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
|
| 715 |
}
|
| 716 |
|
| 717 |
// output to CSV file
|
| 718 |
if (params.output_csv) {
|
| 719 |
-
const auto fname_csv =
|
| 720 |
output_csv(ctx, fname_csv.c_str());
|
| 721 |
}
|
| 722 |
|
|
|
|
| 84 |
std::string model = "models/ggml-base.en.bin";
|
| 85 |
|
| 86 |
std::vector<std::string> fname_inp = {};
|
| 87 |
+
std::vector<std::string> fname_outp = {};
|
| 88 |
};
|
| 89 |
|
| 90 |
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
|
|
|
| 122 |
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
| 123 |
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
| 124 |
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
| 125 |
+
else if (arg == "-of" || arg == "--output-file") { params.fname_outp.emplace_back(argv[++i]); }
|
| 126 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 127 |
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
| 128 |
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
|
|
|
|
| 146 |
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
| 147 |
fprintf(stderr, "\n");
|
| 148 |
fprintf(stderr, "options:\n");
|
| 149 |
+
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
| 150 |
+
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
| 151 |
+
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
|
| 152 |
+
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
|
| 153 |
+
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
|
| 154 |
+
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
| 155 |
+
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
| 156 |
+
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
| 157 |
+
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
| 158 |
+
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
| 159 |
+
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
| 160 |
+
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
| 161 |
+
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
| 162 |
+
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 163 |
+
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 164 |
+
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
| 165 |
+
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
| 166 |
+
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
| 167 |
+
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
| 168 |
+
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
| 169 |
+
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
| 170 |
+
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
|
| 171 |
+
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 172 |
+
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
| 173 |
+
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
| 174 |
+
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
|
| 175 |
+
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
| 176 |
+
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
|
| 177 |
+
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
| 178 |
+
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
| 179 |
fprintf(stderr, "\n");
|
| 180 |
}
|
| 181 |
|
|
|
|
| 517 |
|
| 518 |
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
| 519 |
const auto fname_inp = params.fname_inp[f];
|
| 520 |
+
const auto fname_outp = f < params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
|
| 521 |
|
| 522 |
std::vector<float> pcmf32; // mono-channel F32 PCM
|
| 523 |
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
|
|
|
| 696 |
|
| 697 |
// output to text file
|
| 698 |
if (params.output_txt) {
|
| 699 |
+
const auto fname_txt = fname_outp + ".txt";
|
| 700 |
output_txt(ctx, fname_txt.c_str());
|
| 701 |
}
|
| 702 |
|
| 703 |
// output to VTT file
|
| 704 |
if (params.output_vtt) {
|
| 705 |
+
const auto fname_vtt = fname_outp + ".vtt";
|
| 706 |
output_vtt(ctx, fname_vtt.c_str());
|
| 707 |
}
|
| 708 |
|
| 709 |
// output to SRT file
|
| 710 |
if (params.output_srt) {
|
| 711 |
+
const auto fname_srt = fname_outp + ".srt";
|
| 712 |
output_srt(ctx, fname_srt.c_str(), params);
|
| 713 |
}
|
| 714 |
|
| 715 |
// output to WTS file
|
| 716 |
if (params.output_wts) {
|
| 717 |
+
const auto fname_wts = fname_outp + ".wts";
|
| 718 |
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
|
| 719 |
}
|
| 720 |
|
| 721 |
// output to CSV file
|
| 722 |
if (params.output_csv) {
|
| 723 |
+
const auto fname_csv = fname_outp + ".csv";
|
| 724 |
output_csv(ctx, fname_csv.c_str());
|
| 725 |
}
|
| 726 |
|