ggerganov committed on
Commit
02b3e34
·
unverified ·
1 Parent(s): d472548

main : make whisper_print_segment_callback() more readable (close #371)

Browse files
Files changed (1) hide show
  1. examples/main/main.cpp +53 -61
examples/main/main.cpp CHANGED
@@ -176,90 +176,82 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
176
 
177
  const int n_segments = whisper_full_n_segments(ctx);
178
 
 
 
 
 
 
179
  // print the last n_new segments
180
  const int s0 = n_segments - n_new;
 
181
  if (s0 == 0) {
182
  printf("\n");
183
  }
184
 
185
  for (int i = s0; i < n_segments; i++) {
186
- if (params.no_timestamps) {
187
- if (params.print_colors) {
188
- for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
189
- if (params.print_special == false) {
190
- const whisper_token id = whisper_full_get_token_id(ctx, i, j);
191
- if (id >= whisper_token_eot(ctx)) {
192
- continue;
193
- }
194
- }
195
 
196
- const char * text = whisper_full_get_token_text(ctx, i, j);
197
- const float p = whisper_full_get_token_p (ctx, i, j);
 
198
 
199
- const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
200
 
201
- printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
202
- }
203
- } else {
204
- const char * text = whisper_full_get_segment_text(ctx, i);
205
- printf("%s", text);
206
- }
207
- fflush(stdout);
208
- } else {
209
- const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
210
- const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
211
 
212
- std::string speaker;
 
213
 
214
- if (params.diarize && pcmf32s.size() == 2) {
215
- const int64_t n_samples = pcmf32s[0].size();
216
 
217
- const int64_t is0 = timestamp_to_sample(t0, n_samples);
218
- const int64_t is1 = timestamp_to_sample(t1, n_samples);
 
 
219
 
220
- double energy0 = 0.0f;
221
- double energy1 = 0.0f;
 
 
 
 
 
222
 
223
- for (int64_t j = is0; j < is1; j++) {
224
- energy0 += fabs(pcmf32s[0][j]);
225
- energy1 += fabs(pcmf32s[1][j]);
226
- }
227
 
228
- if (energy0 > 1.1*energy1) {
229
- speaker = "(speaker 0)";
230
- } else if (energy1 > 1.1*energy0) {
231
- speaker = "(speaker 1)";
232
- } else {
233
- speaker = "(speaker ?)";
 
234
  }
235
 
236
- //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
237
- }
238
-
239
- if (params.print_colors) {
240
- printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
241
- for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
242
- if (params.print_special == false) {
243
- const whisper_token id = whisper_full_get_token_id(ctx, i, j);
244
- if (id >= whisper_token_eot(ctx)) {
245
- continue;
246
- }
247
- }
248
 
249
- const char * text = whisper_full_get_token_text(ctx, i, j);
250
- const float p = whisper_full_get_token_p (ctx, i, j);
251
 
252
- const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
 
 
 
253
 
254
- printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
255
- }
256
- printf("\n");
257
- } else {
258
- const char * text = whisper_full_get_segment_text(ctx, i);
259
 
260
- printf("[%s --> %s] %s%s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), speaker.c_str(), text);
261
- }
 
262
  }
 
 
263
  }
264
  }
265
 
 
176
 
177
  const int n_segments = whisper_full_n_segments(ctx);
178
 
179
+ std::string speaker = "";
180
+
181
+ int64_t t0;
182
+ int64_t t1;
183
+
184
  // print the last n_new segments
185
  const int s0 = n_segments - n_new;
186
+
187
  if (s0 == 0) {
188
  printf("\n");
189
  }
190
 
191
  for (int i = s0; i < n_segments; i++) {
192
+ if (!params.no_timestamps || params.diarize) {
193
+ t0 = whisper_full_get_segment_t0(ctx, i);
194
+ t1 = whisper_full_get_segment_t1(ctx, i);
195
+ }
 
 
 
 
 
196
 
197
+ if (!params.no_timestamps) {
198
+ printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
199
+ }
200
 
201
+ if (params.diarize && pcmf32s.size() == 2) {
202
 
203
+ const int64_t n_samples = pcmf32s[0].size();
 
 
 
 
 
 
 
 
 
204
 
205
+ const int64_t is0 = timestamp_to_sample(t0, n_samples);
206
+ const int64_t is1 = timestamp_to_sample(t1, n_samples);
207
 
208
+ double energy0 = 0.0f;
209
+ double energy1 = 0.0f;
210
 
211
+ for (int64_t j = is0; j < is1; j++) {
212
+ energy0 += fabs(pcmf32s[0][j]);
213
+ energy1 += fabs(pcmf32s[1][j]);
214
+ }
215
 
216
+ if (energy0 > 1.1*energy1) {
217
+ speaker = "(speaker 0)";
218
+ } else if (energy1 > 1.1*energy0) {
219
+ speaker = "(speaker 1)";
220
+ } else {
221
+ speaker = "(speaker ?)";
222
+ }
223
 
224
+ //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
225
+ }
 
 
226
 
227
+ if (params.print_colors) {
228
+ for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
229
+ if (params.print_special == false) {
230
+ const whisper_token id = whisper_full_get_token_id(ctx, i, j);
231
+ if (id >= whisper_token_eot(ctx)) {
232
+ continue;
233
+ }
234
  }
235
 
236
+ const char * text = whisper_full_get_token_text(ctx, i, j);
237
+ const float p = whisper_full_get_token_p (ctx, i, j);
 
 
 
 
 
 
 
 
 
 
238
 
239
+ const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
 
240
 
241
+ printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
242
+ }
243
+ } else {
244
+ const char * text = whisper_full_get_segment_text(ctx, i);
245
 
246
+ printf("%s%s", speaker.c_str(), text);
247
+ }
 
 
 
248
 
249
+ // with timestamps or speakers: each segment on new line
250
+ if (!params.no_timestamps || params.diarize) {
251
+ printf("\n");
252
  }
253
+
254
+ fflush(stdout);
255
  }
256
  }
257