tuxpoldo ggerganov committed on
Commit
3381725
·
unverified ·
1 Parent(s): 2800fe4

main : provide option for creating JSON output (#615)

Browse files

* examples : provide option for exporting also as JSON file (ggerganov/whisper.cpp#614)

* main : remove leftovers

---------

Co-authored-by: Georgi Gerganov <[email protected]>

Files changed (4) hide show
  1. examples/main/README.md +1 -0
  2. examples/main/main.cpp +132 -0
  3. whisper.cpp +66 -1
  4. whisper.h +15 -0
examples/main/README.md CHANGED
@@ -31,6 +31,7 @@ options:
31
  -osrt, --output-srt [false ] output result in a srt file
32
  -owts, --output-words [false ] output script for generating karaoke video
33
  -ocsv, --output-csv [false ] output result in a CSV file
 
34
  -of FNAME, --output-file FNAME [ ] output file path (without file extension)
35
  -ps, --print-special [false ] print special tokens
36
  -pc, --print-colors [false ] print colors
 
31
  -osrt, --output-srt [false ] output result in a srt file
32
  -owts, --output-words [false ] output script for generating karaoke video
33
  -ocsv, --output-csv [false ] output result in a CSV file
34
+ -oj, --output-json [false ] output result in a JSON file
35
  -of FNAME, --output-file FNAME [ ] output file path (without file extension)
36
  -ps, --print-special [false ] print special tokens
37
  -pc, --print-colors [false ] print colors
examples/main/main.cpp CHANGED
@@ -73,6 +73,7 @@ struct whisper_params {
73
  bool output_srt = false;
74
  bool output_wts = false;
75
  bool output_csv = false;
 
76
  bool print_special = false;
77
  bool print_colors = false;
78
  bool print_progress = false;
@@ -130,6 +131,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
130
  else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
131
  else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
132
  else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
 
133
  else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
134
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
135
  else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
@@ -178,6 +180,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
178
  fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
179
  fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
180
  fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
 
181
  fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
182
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
183
  fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
@@ -368,6 +371,129 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
368
  return true;
369
  }
370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  // karaoke video generation
372
  // outputs a bash script that uses ffmpeg to generate a video with the subtitles
373
  // TODO: font parameter adjustments
@@ -662,6 +788,12 @@ int main(int argc, char ** argv) {
662
  const auto fname_csv = fname_out + ".csv";
663
  output_csv(ctx, fname_csv.c_str());
664
  }
 
 
 
 
 
 
665
  }
666
  }
667
 
 
73
  bool output_srt = false;
74
  bool output_wts = false;
75
  bool output_csv = false;
76
+ bool output_jsn = false;
77
  bool print_special = false;
78
  bool print_colors = false;
79
  bool print_progress = false;
 
131
  else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
132
  else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
133
  else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
134
+ else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
135
  else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
136
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
137
  else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
 
180
  fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
181
  fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
182
  fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
183
+ fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false");
184
  fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
185
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
186
  fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
 
371
  return true;
372
  }
373
 
374
+ bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
375
+ std::ofstream fout(fname);
376
+ int indent = 0;
377
+
378
+ auto doindent = [&]() {
379
+ for (int i = 0; i < indent; i++) fout << "\t";
380
+ };
381
+
382
+ auto start_arr = [&](const char *name) {
383
+ doindent();
384
+ fout << "\"" << name << "\": [\n";
385
+ indent++;
386
+ };
387
+
388
+ auto end_arr = [&](bool end = false) {
389
+ indent--;
390
+ doindent();
391
+ fout << (end ? "]\n" : "},\n");
392
+ };
393
+
394
+ auto start_obj = [&](const char *name = nullptr) {
395
+ doindent();
396
+ if (name) {
397
+ fout << "\"" << name << "\": {\n";
398
+ } else {
399
+ fout << "{\n";
400
+ }
401
+ indent++;
402
+ };
403
+
404
+ auto end_obj = [&](bool end = false) {
405
+ indent--;
406
+ doindent();
407
+ fout << (end ? "}\n" : "},\n");
408
+ };
409
+
410
+ auto start_value = [&](const char *name) {
411
+ doindent();
412
+ fout << "\"" << name << "\": ";
413
+ };
414
+
415
+ auto value_s = [&](const char *name, const char *val, bool end = false) {
416
+ start_value(name);
417
+ fout << "\"" << val << (end ? "\"\n" : "\",\n");
418
+ };
419
+
420
+ auto end_value = [&](bool end = false) {
421
+ fout << (end ? "\n" : ",\n");
422
+ };
423
+
424
+ auto value_i = [&](const char *name, const int64_t val, bool end = false) {
425
+ start_value(name);
426
+ fout << val;
427
+ end_value(end);
428
+ };
429
+
430
+ auto value_b = [&](const char *name, const bool val, bool end = false) {
431
+ start_value(name);
432
+ fout << (val ? "true" : "false");
433
+ end_value(end);
434
+ };
435
+
436
+ if (!fout.is_open()) {
437
+ fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
438
+ return false;
439
+ }
440
+
441
+ fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
442
+ start_obj();
443
+ value_s("systeminfo", whisper_print_system_info());
444
+ start_obj("model");
445
+ value_s("type", whisper_model_type_readable(ctx));
446
+ value_b("multilingual", whisper_is_multilingual(ctx));
447
+ value_i("vocab", whisper_model_n_vocab(ctx));
448
+ start_obj("audio");
449
+ value_i("ctx", whisper_model_n_audio_ctx(ctx));
450
+ value_i("state", whisper_model_n_audio_state(ctx));
451
+ value_i("head", whisper_model_n_audio_head(ctx));
452
+ value_i("layer", whisper_model_n_audio_layer(ctx), true);
453
+ end_obj();
454
+ start_obj("text");
455
+ value_i("ctx", whisper_model_n_text_ctx(ctx));
456
+ value_i("state", whisper_model_n_text_state(ctx));
457
+ value_i("head", whisper_model_n_text_head(ctx));
458
+ value_i("leyer", whisper_model_n_text_layer(ctx), true);
459
+ end_obj();
460
+ value_i("mels", whisper_model_n_mels(ctx));
461
+ value_i("f16", whisper_model_f16(ctx), true);
462
+ end_obj();
463
+ start_obj("params");
464
+ value_s("model", params.model.c_str());
465
+ value_s("language", params.language.c_str());
466
+ value_b("translate", params.translate, true);
467
+ end_obj();
468
+ start_obj("result");
469
+ value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
470
+ end_obj();
471
+ start_arr("transcription");
472
+
473
+ const int n_segments = whisper_full_n_segments(ctx);
474
+ for (int i = 0; i < n_segments; ++i) {
475
+ const char * text = whisper_full_get_segment_text(ctx, i);
476
+ const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
477
+ const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
478
+
479
+ start_obj();
480
+ start_obj("timestanps");
481
+ value_s("from", to_timestamp(t0, true).c_str());
482
+ value_s("to", to_timestamp(t1, true).c_str(), true);
483
+ end_obj();
484
+ start_obj("offsets");
485
+ value_i("from", t0 * 10);
486
+ value_i("to", t1 * 10, true);
487
+ end_obj();
488
+ value_s("text", text, true);
489
+ end_obj(i == (n_segments - 1));
490
+ }
491
+
492
+ end_arr(true);
493
+ end_obj(true);
494
+ return true;
495
+ }
496
+
497
  // karaoke video generation
498
  // outputs a bash script that uses ffmpeg to generate a video with the subtitles
499
  // TODO: font parameter adjustments
 
788
  const auto fname_csv = fname_out + ".csv";
789
  output_csv(ctx, fname_csv.c_str());
790
  }
791
+
792
+ // output to JSON file
793
+ if (params.output_jsn) {
794
+ const auto fname_jsn = fname_out + ".json";
795
+ output_json(ctx, fname_jsn.c_str(), params);
796
+ }
797
  }
798
  }
799
 
whisper.cpp CHANGED
@@ -1408,7 +1408,7 @@ static bool whisper_encode_internal(
1408
  //}
1409
 
1410
  static int iter = 0;
1411
-
1412
  const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
1413
  const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
1414
 
@@ -2919,6 +2919,71 @@ int whisper_lang_auto_detect(
2919
  return whisper_lang_auto_detect_with_state(ctx, ctx->state, offset_ms, n_threads, lang_probs);
2920
  }
2921
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2922
  int whisper_n_len_from_state(struct whisper_state * state) {
2923
  return state->mel.n_len;
2924
  }
 
1408
  //}
1409
 
1410
  static int iter = 0;
1411
+
1412
  const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
1413
  const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
1414
 
 
2919
  return whisper_lang_auto_detect_with_state(ctx, ctx->state, offset_ms, n_threads, lang_probs);
2920
  }
2921
 
2922
+ int whisper_model_n_vocab(struct whisper_context * ctx) {
2923
+ return ctx->model.hparams.n_vocab;
2924
+ }
2925
+
2926
+ int whisper_model_n_audio_ctx(struct whisper_context * ctx) {
2927
+ return ctx->model.hparams.n_audio_ctx;
2928
+ }
2929
+
2930
+ int whisper_model_n_audio_state(struct whisper_context * ctx) {
2931
+ return ctx->model.hparams.n_audio_state;
2932
+ }
2933
+
2934
+ int whisper_model_n_audio_head(struct whisper_context * ctx) {
2935
+ return ctx->model.hparams.n_audio_head;
2936
+ }
2937
+
2938
+ int whisper_model_n_audio_layer(struct whisper_context * ctx) {
2939
+ return ctx->model.hparams.n_audio_layer;
2940
+ }
2941
+
2942
+ int whisper_model_n_text_ctx(struct whisper_context * ctx) {
2943
+ return ctx->model.hparams.n_text_ctx;
2944
+ }
2945
+
2946
+ int whisper_model_n_text_state(struct whisper_context * ctx) {
2947
+ return ctx->model.hparams.n_text_state;
2948
+ }
2949
+
2950
+ int whisper_model_n_text_head(struct whisper_context * ctx) {
2951
+ return ctx->model.hparams.n_text_head;
2952
+ }
2953
+
2954
+ int whisper_model_n_text_layer(struct whisper_context * ctx) {
2955
+ return ctx->model.hparams.n_text_layer;
2956
+ }
2957
+
2958
+ int whisper_model_n_mels(struct whisper_context * ctx) {
2959
+ return ctx->model.hparams.n_mels;
2960
+ }
2961
+
2962
+ int whisper_model_f16(struct whisper_context * ctx) {
2963
+ return ctx->model.hparams.f16;
2964
+ }
2965
+
2966
+ int whisper_model_type(struct whisper_context * ctx) {
2967
+ return ctx->model.type;
2968
+ }
2969
+
2970
+ const char *whisper_model_type_readable(struct whisper_context * ctx) {
2971
+ switch (ctx->model.type) {
2972
+ case e_model::MODEL_TINY:
2973
+ return "tiny";
2974
+ case e_model::MODEL_BASE:
2975
+ return "base";
2976
+ case e_model::MODEL_SMALL:
2977
+ return "small";
2978
+ case e_model::MODEL_MEDIUM:
2979
+ return "medium";
2980
+ case e_model::MODEL_LARGE:
2981
+ return "large";
2982
+ default:
2983
+ return "unknown";
2984
+ }
2985
+ }
2986
+
2987
  int whisper_n_len_from_state(struct whisper_state * state) {
2988
  return state->mel.n_len;
2989
  }
whisper.h CHANGED
@@ -248,6 +248,19 @@ extern "C" {
248
  WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
249
  WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  // Token logits obtained from the last call to whisper_decode()
252
  // The logits for the last token are stored in the last row
253
  // Rows: n_tokens
@@ -257,6 +270,8 @@ extern "C" {
257
 
258
  // Token Id -> String. Uses the vocabulary in the provided context
259
  WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
 
 
260
 
261
  // Special tokens
262
  WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
 
248
  WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
249
  WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
250
 
251
+ WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx);
252
+ WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx);
253
+ WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
254
+ WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
255
+ WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
256
+ WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx);
257
+ WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
258
+ WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
259
+ WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
260
+ WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
261
+ WHISPER_API int whisper_model_f16 (struct whisper_context * ctx);
262
+ WHISPER_API int whisper_model_type (struct whisper_context * ctx);
263
+
264
  // Token logits obtained from the last call to whisper_decode()
265
  // The logits for the last token are stored in the last row
266
  // Rows: n_tokens
 
270
 
271
  // Token Id -> String. Uses the vocabulary in the provided context
272
  WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
273
+ WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
274
+
275
 
276
  // Special tokens
277
  WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);