ggerganov committed on
Commit
ed1850d
·
unverified ·
1 Parent(s): 082b647

main : add some comments for the word-level timestamp algorithm

Browse files
Files changed (1) hide show
  1. examples/main/main.cpp +235 -227
examples/main/main.cpp CHANGED
@@ -321,124 +321,125 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
321
  }
322
 
323
  // word-level timestamps (experimental)
324
- // TODO: probably still has bugs, needs refactoring, etc..
325
- // TODO: auto threshold
326
  // TODO: extra pass to detect unused speech and assign to tokens
327
  // TODO: font parameter adjustments
 
328
  bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, const std::vector<float> & pcmf32) {
329
- if (params.output_wts) {
330
- std::vector<float> pcm_avg(pcmf32.size(), 0);
331
 
332
- // average the fabs of the signal
333
- {
334
- const int hw = 32;
335
 
336
- for (int i = 0; i < pcmf32.size(); i++) {
337
- float sum = 0;
338
- for (int j = -hw; j <= hw; j++) {
339
- if (i + j >= 0 && i + j < pcmf32.size()) {
340
- sum += fabs(pcmf32[i + j]);
341
- }
342
  }
343
- pcm_avg[i] = sum/(2*hw + 1);
344
  }
 
345
  }
 
346
 
347
- struct token_info {
348
- int64_t t0 = -1;
349
- int64_t t1 = -1;
350
 
351
- int64_t tt0 = -1;
352
- int64_t tt1 = -1;
353
 
354
- whisper_token id;
355
- whisper_token tid;
356
 
357
- float p = 0.0f;
358
- float pt = 0.0f;
359
- float ptsum = 0.0f;
360
 
361
- std::string text;
362
- float vlen = 0.0f; // voice length of this token
363
- };
364
 
365
- int64_t t_beg = 0;
366
- int64_t t_last = 0;
367
 
368
- whisper_token tid_last = 0;
369
 
370
- std::ofstream fout(fname);
371
 
372
- fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
373
 
374
- fout << "!/bin/bash" << "\n";
375
- fout << "\n";
376
 
377
- fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE << ":rate=25:color=black -vf \"";
378
 
379
- bool is_first = true;
380
 
381
- for (int i = 0; i < whisper_full_n_segments(ctx); i++) {
382
- const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
383
- const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
384
 
385
- const char *text = whisper_full_get_segment_text(ctx, i);
386
 
387
- const int s0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100));
388
- const int s1 = std::min((int) pcmf32.size(), (int) (t1*WHISPER_SAMPLE_RATE/100));
389
 
390
- const int n = whisper_full_n_tokens(ctx, i);
391
 
392
- std::vector<token_info> tokens(n);
393
 
394
- if (n <= 1) {
395
- continue;
396
- }
397
 
398
- for (int j = 0; j < n; ++j) {
399
- struct whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
400
 
401
- if (j == 0) {
402
- if (token.id == whisper_token_beg(ctx)) {
403
- tokens[j ].t0 = t0;
404
- tokens[j ].t1 = t0;
405
- tokens[j + 1].t0 = t0;
406
 
407
- t_beg = t0;
408
- t_last = t0;
409
- tid_last = whisper_token_beg(ctx);
410
- } else {
411
- tokens[j ].t0 = t_last;
412
- }
413
  }
 
414
 
415
- const int64_t tt = t_beg + 2*(token.tid - whisper_token_beg(ctx));
416
 
417
- tokens[j].id = token.id;
418
- tokens[j].tid = token.tid;
419
- tokens[j].p = token.p;
420
- tokens[j].pt = token.pt;
421
- tokens[j].ptsum = token.ptsum;
422
 
423
- tokens[j].text = whisper_token_to_str(ctx, token.id);
424
- //tokens[j].vlen = tokens[j].pt;
425
- tokens[j].vlen = voice_length(tokens[j].text);
426
 
427
- if (token.pt > params.word_thold && token.ptsum > 0.01 && token.tid > tid_last && tt <= t1) {
428
- if (j > 0) {
429
- tokens[j - 1].t1 = tt;
430
- }
431
- tokens[j].t0 = tt;
432
- tid_last = token.tid;
433
  }
 
 
434
  }
 
435
 
436
- tokens[n - 2].t1 = t1;
437
- tokens[n - 1].t0 = t1;
438
- tokens[n - 1].t1 = t1;
439
 
440
- t_last = t1;
441
 
 
 
 
442
  int p0 = 0;
443
  int p1 = 0;
444
  while (true) {
@@ -460,10 +461,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
460
 
461
  const double dt = tokens[p1].t1 - tokens[p0].t0;
462
 
 
463
  for (int j = p0 + 1; j <= p1; j++) {
464
  const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum;
465
- //const double ct = tokens[j - 1].t0 + (dt*(j - p0))/(p1 - p0 + 1);
466
- //const double ct = tokens[p0].t0 + (dt*(j - p0))/(p1 - p0 + 1);
467
 
468
  tokens[j - 1].t1 = ct;
469
  tokens[j ].t0 = ct;
@@ -476,95 +476,100 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
476
  break;
477
  }
478
  }
 
479
 
480
- for (int j = 0; j < n - 1; j++) {
481
- if (tokens[j].t1 < 0) {
482
- tokens[j + 1].t0 = tokens[j].t1;
483
- }
 
484
 
485
- if (j > 0) {
486
- if (tokens[j - 1].t1 > tokens[j].t0) {
487
- tokens[j].t0 = tokens[j - 1].t1;
488
- tokens[j].t1 = std::max(tokens[j].t0, tokens[j].t1);
489
- }
490
  }
491
-
492
- tokens[j].tt0 = tokens[j].t0;
493
- tokens[j].tt1 = tokens[j].t1;
494
  }
495
 
496
- // VAD
497
- {
498
- const int hw = WHISPER_SAMPLE_RATE/8;
499
 
500
- for (int j = 0; j < n; j++) {
501
- if (tokens[j].id >= whisper_token_eot(ctx)) {
502
- continue;
503
- }
 
 
 
 
 
504
 
505
- const int64_t t0 = tokens[j].t0;
506
- const int64_t t1 = tokens[j].t1;
507
 
508
- int s0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100));
509
- int s1 = std::min((int) pcmf32.size() - 1, (int) (t1*WHISPER_SAMPLE_RATE/100));
510
 
511
- const int ss0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100) - hw);
512
- const int ss1 = std::min((int) pcmf32.size() - 1, (int) (t1*WHISPER_SAMPLE_RATE/100) + hw);
513
 
514
- const int n = ss1 - ss0;
515
 
516
- float sum = 0.0f;
517
 
518
- for (int k = ss0; k < ss1; k++) {
519
- sum += pcm_avg[k];
520
- }
521
 
522
- const float thold = 0.5*sum/n;
523
-
524
- {
525
- int k = s0;
526
- if (pcm_avg[k] > thold && j > 0) {
527
- while (k > 0 && pcm_avg[k] > thold) {
528
- k--;
529
- }
530
- tokens[j].t0 = (int64_t) (100*k/WHISPER_SAMPLE_RATE);
531
- if (tokens[j].t0 < tokens[j - 1].t1) {
532
- tokens[j].t0 = tokens[j - 1].t1;
533
- } else {
534
- s0 = k;
535
- }
536
  } else {
537
- while (pcm_avg[k] < thold && k < s1) {
538
- k++;
539
- }
540
  s0 = k;
541
- tokens[j].t0 = 100*k/WHISPER_SAMPLE_RATE;
542
  }
 
 
 
 
 
 
543
  }
 
544
 
545
- {
546
- int k = s1;
547
- if (pcm_avg[k] > thold) {
548
- while (k < (int) pcmf32.size() - 1 && pcm_avg[k] > thold) {
549
- k++;
550
- }
551
- tokens[j].t1 = 100*k/WHISPER_SAMPLE_RATE;
552
- if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
553
- tokens[j].t1 = tokens[j + 1].t0;
554
- } else {
555
- s1 = k;
556
- }
557
  } else {
558
- while (pcm_avg[k] < thold && k > s0) {
559
- k--;
560
- }
561
  s1 = k;
562
- tokens[j].t1 = 100*k/WHISPER_SAMPLE_RATE;
563
  }
 
 
 
 
 
 
564
  }
565
  }
566
  }
 
567
 
 
 
568
  const int t_expand = 0;
569
 
570
  for (int j = 0; j < n; j++) {
@@ -575,118 +580,121 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
575
  tokens[j].t1 = tokens[j].t1 + t_expand;
576
  }
577
  }
 
578
 
579
- for (int j = 0; j < n; ++j) {
580
- const auto & token = tokens[j];
581
- const auto tt = token.pt > params.word_thold && token.ptsum > 0.01 ? whisper_token_to_str(ctx, token.tid) : "[?]";
582
- printf("%s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s'\n", __func__,
583
- tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, token.text.c_str());
 
 
584
 
585
- if (tokens[j].id >= whisper_token_eot(ctx)) {
586
- continue;
587
- }
588
 
589
- //printf("[%s --> %s] %s\n", to_timestamp(token.t0).c_str(), to_timestamp(token.t1).c_str(), whisper_token_to_str(ctx, token.id));
590
 
591
- //fout << "# " << to_timestamp(token.t0) << " --> " << to_timestamp(token.t1) << " " << whisper_token_to_str(ctx, token.id) << "\n";
592
- }
593
 
594
- static const int line_wrap = 60;
595
- static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
 
596
 
597
- if (!is_first) {
598
- fout << ",";
599
- }
600
 
601
- // background text
602
- fout << "drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t," << t0/100.0 << "," << t0/100.0 << ")'";
603
 
604
- is_first = false;
605
 
606
- for (int j = 0; j < n; ++j) {
607
- const auto & token = tokens[j];
608
 
609
- if (tokens[j].id >= whisper_token_eot(ctx)) {
610
- continue;
611
- }
612
 
613
- std::string txt_bg;
614
- std::string txt_fg; // highlight token
615
- std::string txt_ul; // underline
616
 
617
- txt_bg = "> ";
618
- txt_fg = "> ";
619
- txt_ul = "\\ \\ ";
620
 
621
- {
622
- int ncnt = 0;
623
- for (int k = 0; k < n; ++k) {
624
- const auto & token2 = tokens[k];
625
 
626
- if (tokens[k].id >= whisper_token_eot(ctx)) {
627
- continue;
628
- }
629
 
630
- const std::string txt = whisper_token_to_str(ctx, token2.id);
631
 
632
- txt_bg += txt;
633
 
634
- if (k == j) {
635
- for (int l = 0; l < (int) txt.size(); ++l) {
636
- txt_fg += txt[l];
637
- txt_ul += "_";
638
- }
639
- txt_fg += "|";
640
- } else {
641
- for (int l = 0; l < (int) txt.size(); ++l) {
642
- txt_fg += "\\ ";
643
- txt_ul += "\\ ";
644
- }
645
  }
646
-
647
- ncnt += txt.size();
648
-
649
- if (ncnt > line_wrap) {
650
- if (k < j) {
651
- txt_bg = "> ";
652
- txt_fg = "> ";
653
- txt_ul = "\\ \\ ";
654
- ncnt = 0;
655
- } else {
656
- break;
657
- }
658
  }
659
  }
660
 
661
- ::replace_all(txt_bg, "'", "’");
662
- ::replace_all(txt_bg, "\"", "\\\"");
663
- ::replace_all(txt_fg, "'", "’");
664
- ::replace_all(txt_fg, "\"", "\\\"");
 
 
 
 
 
 
 
 
665
  }
666
 
667
- // background text
668
- fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << token.tt0/100.0 << "," << token.tt1/100.0 << ")'";
 
 
 
 
 
 
669
 
670
- // foreground text
671
- fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text='" << txt_fg << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
672
 
673
- // underline
674
- fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2+16:text='" << txt_ul << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
675
- }
676
  }
 
677
 
678
- fout << "\" -c:v libx264 -pix_fmt yuv420p -y " << fname_inp << ".mp4" << "\n";
679
 
680
- fout << "\n\n";
681
- fout << "echo \"Your video has been saved to " << fname_inp << ".mp4\"" << "\n";
682
- fout << "\n";
683
- fout << "echo \" ffplay " << fname_inp << ".mp4\"\n";
684
- fout << "\n";
685
 
686
- fout.close();
687
 
688
- fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
689
- }
690
 
691
  return true;
692
  }
 
321
  }
322
 
323
  // word-level timestamps (experimental)
324
+ // TODO: make ffmpeg output optional
 
325
  // TODO: extra pass to detect unused speech and assign to tokens
326
  // TODO: font parameter adjustments
327
+ // TODO: move to whisper.h/whisper.cpp and add parameter to select max line-length of subtitles
328
  bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, const std::vector<float> & pcmf32) {
329
+ std::vector<float> pcm_avg(pcmf32.size(), 0);
 
330
 
331
+ // average the fabs of the signal
332
+ {
333
+ const int hw = 32;
334
 
335
+ for (int i = 0; i < pcmf32.size(); i++) {
336
+ float sum = 0;
337
+ for (int j = -hw; j <= hw; j++) {
338
+ if (i + j >= 0 && i + j < pcmf32.size()) {
339
+ sum += fabs(pcmf32[i + j]);
 
340
  }
 
341
  }
342
+ pcm_avg[i] = sum/(2*hw + 1);
343
  }
344
+ }
345
 
346
+ struct token_info {
347
+ int64_t t0 = -1;
348
+ int64_t t1 = -1;
349
 
350
+ int64_t tt0 = -1;
351
+ int64_t tt1 = -1;
352
 
353
+ whisper_token id;
354
+ whisper_token tid;
355
 
356
+ float p = 0.0f;
357
+ float pt = 0.0f;
358
+ float ptsum = 0.0f;
359
 
360
+ std::string text;
361
+ float vlen = 0.0f; // voice length of this token
362
+ };
363
 
364
+ int64_t t_beg = 0;
365
+ int64_t t_last = 0;
366
 
367
+ whisper_token tid_last = 0;
368
 
369
+ std::ofstream fout(fname);
370
 
371
+ fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
372
 
373
+ fout << "!/bin/bash" << "\n";
374
+ fout << "\n";
375
 
376
+ fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE << ":rate=25:color=black -vf \"";
377
 
378
+ bool is_first = true;
379
 
380
+ for (int i = 0; i < whisper_full_n_segments(ctx); i++) {
381
+ const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
382
+ const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
383
 
384
+ const char *text = whisper_full_get_segment_text(ctx, i);
385
 
386
+ const int s0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100));
387
+ const int s1 = std::min((int) pcmf32.size(), (int) (t1*WHISPER_SAMPLE_RATE/100));
388
 
389
+ const int n = whisper_full_n_tokens(ctx, i);
390
 
391
+ std::vector<token_info> tokens(n);
392
 
393
+ if (n <= 1) {
394
+ continue;
395
+ }
396
 
397
+ for (int j = 0; j < n; ++j) {
398
+ struct whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
399
 
400
+ if (j == 0) {
401
+ if (token.id == whisper_token_beg(ctx)) {
402
+ tokens[j ].t0 = t0;
403
+ tokens[j ].t1 = t0;
404
+ tokens[j + 1].t0 = t0;
405
 
406
+ t_beg = t0;
407
+ t_last = t0;
408
+ tid_last = whisper_token_beg(ctx);
409
+ } else {
410
+ tokens[j ].t0 = t_last;
 
411
  }
412
+ }
413
 
414
+ const int64_t tt = t_beg + 2*(token.tid - whisper_token_beg(ctx));
415
 
416
+ tokens[j].id = token.id;
417
+ tokens[j].tid = token.tid;
418
+ tokens[j].p = token.p;
419
+ tokens[j].pt = token.pt;
420
+ tokens[j].ptsum = token.ptsum;
421
 
422
+ tokens[j].text = whisper_token_to_str(ctx, token.id);
423
+ tokens[j].vlen = voice_length(tokens[j].text);
 
424
 
425
+ if (token.pt > params.word_thold && token.ptsum > 0.01 && token.tid > tid_last && tt <= t1) {
426
+ if (j > 0) {
427
+ tokens[j - 1].t1 = tt;
 
 
 
428
  }
429
+ tokens[j].t0 = tt;
430
+ tid_last = token.tid;
431
  }
432
+ }
433
 
434
+ tokens[n - 2].t1 = t1;
435
+ tokens[n - 1].t0 = t1;
436
+ tokens[n - 1].t1 = t1;
437
 
438
+ t_last = t1;
439
 
440
+ // find intervals of tokens with unknown timestamps
441
+ // fill the timestamps by proportionally splitting the interval based on the token voice lengths
442
+ {
443
  int p0 = 0;
444
  int p1 = 0;
445
  while (true) {
 
461
 
462
  const double dt = tokens[p1].t1 - tokens[p0].t0;
463
 
464
+ // split the time proportionally to the voice length
465
  for (int j = p0 + 1; j <= p1; j++) {
466
  const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum;
 
 
467
 
468
  tokens[j - 1].t1 = ct;
469
  tokens[j ].t0 = ct;
 
476
  break;
477
  }
478
  }
479
+ }
480
 
481
+ // fix up (just in case)
482
+ for (int j = 0; j < n - 1; j++) {
483
+ if (tokens[j].t1 < 0) {
484
+ tokens[j + 1].t0 = tokens[j].t1;
485
+ }
486
 
487
+ if (j > 0) {
488
+ if (tokens[j - 1].t1 > tokens[j].t0) {
489
+ tokens[j].t0 = tokens[j - 1].t1;
490
+ tokens[j].t1 = std::max(tokens[j].t0, tokens[j].t1);
 
491
  }
 
 
 
492
  }
493
 
494
+ tokens[j].tt0 = tokens[j].t0;
495
+ tokens[j].tt1 = tokens[j].t1;
496
+ }
497
 
498
+ // VAD
499
+ // expand or contract tokens based on voice activity
500
+ {
501
+ const int hw = WHISPER_SAMPLE_RATE/8;
502
+
503
+ for (int j = 0; j < n; j++) {
504
+ if (tokens[j].id >= whisper_token_eot(ctx)) {
505
+ continue;
506
+ }
507
 
508
+ const int64_t t0 = tokens[j].t0;
509
+ const int64_t t1 = tokens[j].t1;
510
 
511
+ int s0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100));
512
+ int s1 = std::min((int) pcmf32.size() - 1, (int) (t1*WHISPER_SAMPLE_RATE/100));
513
 
514
+ const int ss0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100) - hw);
515
+ const int ss1 = std::min((int) pcmf32.size() - 1, (int) (t1*WHISPER_SAMPLE_RATE/100) + hw);
516
 
517
+ const int n = ss1 - ss0;
518
 
519
+ float sum = 0.0f;
520
 
521
+ for (int k = ss0; k < ss1; k++) {
522
+ sum += pcm_avg[k];
523
+ }
524
 
525
+ const float thold = 0.5*sum/n;
526
+
527
+ {
528
+ int k = s0;
529
+ if (pcm_avg[k] > thold && j > 0) {
530
+ while (k > 0 && pcm_avg[k] > thold) {
531
+ k--;
532
+ }
533
+ tokens[j].t0 = (int64_t) (100*k/WHISPER_SAMPLE_RATE);
534
+ if (tokens[j].t0 < tokens[j - 1].t1) {
535
+ tokens[j].t0 = tokens[j - 1].t1;
 
 
 
536
  } else {
 
 
 
537
  s0 = k;
 
538
  }
539
+ } else {
540
+ while (pcm_avg[k] < thold && k < s1) {
541
+ k++;
542
+ }
543
+ s0 = k;
544
+ tokens[j].t0 = 100*k/WHISPER_SAMPLE_RATE;
545
  }
546
+ }
547
 
548
+ {
549
+ int k = s1;
550
+ if (pcm_avg[k] > thold) {
551
+ while (k < (int) pcmf32.size() - 1 && pcm_avg[k] > thold) {
552
+ k++;
553
+ }
554
+ tokens[j].t1 = 100*k/WHISPER_SAMPLE_RATE;
555
+ if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
556
+ tokens[j].t1 = tokens[j + 1].t0;
 
 
 
557
  } else {
 
 
 
558
  s1 = k;
 
559
  }
560
+ } else {
561
+ while (pcm_avg[k] < thold && k > s0) {
562
+ k--;
563
+ }
564
+ s1 = k;
565
+ tokens[j].t1 = 100*k/WHISPER_SAMPLE_RATE;
566
  }
567
  }
568
  }
569
+ }
570
 
571
+ // fixed token expand (optional)
572
+ {
573
  const int t_expand = 0;
574
 
575
  for (int j = 0; j < n; j++) {
 
580
  tokens[j].t1 = tokens[j].t1 + t_expand;
581
  }
582
  }
583
+ }
584
 
585
+ // debug info
586
+ // TODO: toggle via parameter
587
+ for (int j = 0; j < n; ++j) {
588
+ const auto & token = tokens[j];
589
+ const auto tt = token.pt > params.word_thold && token.ptsum > 0.01 ? whisper_token_to_str(ctx, token.tid) : "[?]";
590
+ printf("%s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s'\n", __func__,
591
+ tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, token.text.c_str());
592
 
593
+ if (tokens[j].id >= whisper_token_eot(ctx)) {
594
+ continue;
595
+ }
596
 
597
+ //printf("[%s --> %s] %s\n", to_timestamp(token.t0).c_str(), to_timestamp(token.t1).c_str(), whisper_token_to_str(ctx, token.id));
598
 
599
+ //fout << "# " << to_timestamp(token.t0) << " --> " << to_timestamp(token.t1) << " " << whisper_token_to_str(ctx, token.id) << "\n";
600
+ }
601
 
602
+ // TODO: become parameters
603
+ static const int line_wrap = 60;
604
+ static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
605
 
606
+ if (!is_first) {
607
+ fout << ",";
608
+ }
609
 
610
+ // background text
611
+ fout << "drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t," << t0/100.0 << "," << t0/100.0 << ")'";
612
 
613
+ is_first = false;
614
 
615
+ for (int j = 0; j < n; ++j) {
616
+ const auto & token = tokens[j];
617
 
618
+ if (tokens[j].id >= whisper_token_eot(ctx)) {
619
+ continue;
620
+ }
621
 
622
+ std::string txt_bg;
623
+ std::string txt_fg; // highlight token
624
+ std::string txt_ul; // underline
625
 
626
+ txt_bg = "> ";
627
+ txt_fg = "> ";
628
+ txt_ul = "\\ \\ ";
629
 
630
+ {
631
+ int ncnt = 0;
632
+ for (int k = 0; k < n; ++k) {
633
+ const auto & token2 = tokens[k];
634
 
635
+ if (tokens[k].id >= whisper_token_eot(ctx)) {
636
+ continue;
637
+ }
638
 
639
+ const std::string txt = whisper_token_to_str(ctx, token2.id);
640
 
641
+ txt_bg += txt;
642
 
643
+ if (k == j) {
644
+ for (int l = 0; l < (int) txt.size(); ++l) {
645
+ txt_fg += txt[l];
646
+ txt_ul += "_";
 
 
 
 
 
 
 
647
  }
648
+ txt_fg += "|";
649
+ } else {
650
+ for (int l = 0; l < (int) txt.size(); ++l) {
651
+ txt_fg += "\\ ";
652
+ txt_ul += "\\ ";
 
 
 
 
 
 
 
653
  }
654
  }
655
 
656
+ ncnt += txt.size();
657
+
658
+ if (ncnt > line_wrap) {
659
+ if (k < j) {
660
+ txt_bg = "> ";
661
+ txt_fg = "> ";
662
+ txt_ul = "\\ \\ ";
663
+ ncnt = 0;
664
+ } else {
665
+ break;
666
+ }
667
+ }
668
  }
669
 
670
+ ::replace_all(txt_bg, "'", "’");
671
+ ::replace_all(txt_bg, "\"", "\\\"");
672
+ ::replace_all(txt_fg, "'", "’");
673
+ ::replace_all(txt_fg, "\"", "\\\"");
674
+ }
675
+
676
+ // background text
677
+ fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << token.tt0/100.0 << "," << token.tt1/100.0 << ")'";
678
 
679
+ // foreground text
680
+ fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text='" << txt_fg << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
681
 
682
+ // underline
683
+ fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2+16:text='" << txt_ul << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
 
684
  }
685
+ }
686
 
687
+ fout << "\" -c:v libx264 -pix_fmt yuv420p -y " << fname_inp << ".mp4" << "\n";
688
 
689
+ fout << "\n\n";
690
+ fout << "echo \"Your video has been saved to " << fname_inp << ".mp4\"" << "\n";
691
+ fout << "\n";
692
+ fout << "echo \" ffplay " << fname_inp << ".mp4\"\n";
693
+ fout << "\n";
694
 
695
+ fout.close();
696
 
697
+ fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
 
698
 
699
  return true;
700
  }