Chia-Hsiang Cheng garyhsiang committed on
Commit
7936381
·
unverified ·
1 Parent(s): 284c9d9

main : add an option to accept optional output filenames (#424)

Browse files

* Add an option to accept optional output filenames

* Format the file

Co-authored-by: Chia-Hsiang Cheng <[email protected]>

Files changed (1) hide show
  1. examples/main/main.cpp +38 -34
examples/main/main.cpp CHANGED
@@ -84,6 +84,7 @@ struct whisper_params {
84
  std::string model = "models/ggml-base.en.bin";
85
 
86
  std::vector<std::string> fname_inp = {};
 
87
  };
88
 
89
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -121,6 +122,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
121
  else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
122
  else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
123
  else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
 
124
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
125
  else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
126
  else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
@@ -144,35 +146,36 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
144
  fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
145
  fprintf(stderr, "\n");
146
  fprintf(stderr, "options:\n");
147
- fprintf(stderr, " -h, --help [default] show this help message and exit\n");
148
- fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
149
- fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
150
- fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
151
- fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
152
- fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
153
- fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
154
- fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
155
- fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
156
- fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
157
- fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
158
- fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
159
- fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
160
- fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
161
- fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
162
- fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
163
- fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
164
- fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
165
- fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
166
- fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
167
- fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
168
- fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
169
- fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
170
- fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
171
- fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
172
- fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
173
- fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
174
- fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
175
- fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
 
176
  fprintf(stderr, "\n");
177
  }
178
 
@@ -514,6 +517,7 @@ int main(int argc, char ** argv) {
514
 
515
  for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
516
  const auto fname_inp = params.fname_inp[f];
 
517
 
518
  std::vector<float> pcmf32; // mono-channel F32 PCM
519
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@@ -692,31 +696,31 @@ int main(int argc, char ** argv) {
692
 
693
  // output to text file
694
  if (params.output_txt) {
695
- const auto fname_txt = fname_inp + ".txt";
696
  output_txt(ctx, fname_txt.c_str());
697
  }
698
 
699
  // output to VTT file
700
  if (params.output_vtt) {
701
- const auto fname_vtt = fname_inp + ".vtt";
702
  output_vtt(ctx, fname_vtt.c_str());
703
  }
704
 
705
  // output to SRT file
706
  if (params.output_srt) {
707
- const auto fname_srt = fname_inp + ".srt";
708
  output_srt(ctx, fname_srt.c_str(), params);
709
  }
710
 
711
  // output to WTS file
712
  if (params.output_wts) {
713
- const auto fname_wts = fname_inp + ".wts";
714
  output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
715
  }
716
 
717
  // output to CSV file
718
  if (params.output_csv) {
719
- const auto fname_csv = fname_inp + ".csv";
720
  output_csv(ctx, fname_csv.c_str());
721
  }
722
 
 
84
  std::string model = "models/ggml-base.en.bin";
85
 
86
  std::vector<std::string> fname_inp = {};
87
+ std::vector<std::string> fname_outp = {};
88
  };
89
 
90
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
 
122
  else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
123
  else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
124
  else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
125
+ else if (arg == "-of" || arg == "--output-file") { params.fname_outp.emplace_back(argv[++i]); }
126
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
127
  else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
128
  else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
 
146
  fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
147
  fprintf(stderr, "\n");
148
  fprintf(stderr, "options:\n");
149
+ fprintf(stderr, " -h, --help [default] show this help message and exit\n");
150
+ fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
151
+ fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
152
+ fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
153
+ fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
154
+ fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
155
+ fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
156
+ fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
157
+ fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
158
+ fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
159
+ fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
160
+ fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
161
+ fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
162
+ fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
163
+ fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
164
+ fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
165
+ fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
166
+ fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
167
+ fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
168
+ fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
169
+ fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
170
+ fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
171
+ fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
172
+ fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
173
+ fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
174
+ fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
175
+ fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
176
+ fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
177
+ fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
178
+ fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
179
  fprintf(stderr, "\n");
180
  }
181
 
 
517
 
518
  for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
519
  const auto fname_inp = params.fname_inp[f];
520
+ const auto fname_outp = f < params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
521
 
522
  std::vector<float> pcmf32; // mono-channel F32 PCM
523
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
 
696
 
697
  // output to text file
698
  if (params.output_txt) {
699
+ const auto fname_txt = fname_outp + ".txt";
700
  output_txt(ctx, fname_txt.c_str());
701
  }
702
 
703
  // output to VTT file
704
  if (params.output_vtt) {
705
+ const auto fname_vtt = fname_outp + ".vtt";
706
  output_vtt(ctx, fname_vtt.c_str());
707
  }
708
 
709
  // output to SRT file
710
  if (params.output_srt) {
711
+ const auto fname_srt = fname_outp + ".srt";
712
  output_srt(ctx, fname_srt.c_str(), params);
713
  }
714
 
715
  // output to WTS file
716
  if (params.output_wts) {
717
+ const auto fname_wts = fname_outp + ".wts";
718
  output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
719
  }
720
 
721
  // output to CSV file
722
  if (params.output_csv) {
723
+ const auto fname_csv = fname_outp + ".csv";
724
  output_csv(ctx, fname_csv.c_str());
725
  }
726