Commit abbf5f2 (unverified) by ggerganov
Parent(s): 93140af

whisper : fix bench regression + fix performance when using CPU BLAS (#1275)

* whisper : fix bench regression

* ggml : use sched_yield when using BLAS + add comment
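
The two fixes are independent. In whisper.cpp, all graph evaluation now goes through a small helper built on the ggml_graph_plan()/ggml_graph_compute() API, so the work buffer is sized once and reused across calls instead of being re-allocated inside the ggml context on every evaluation (which is what ggml_graph_compute_with_ctx does). In ggml.c, the thread-synchronization busy-wait now calls sched_yield(), but only in Accelerate/OpenBLAS builds, where the spinning worker threads otherwise compete for cores with the threads spawned by the BLAS library.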

Files changed (2):
  1. ggml.c +11 -3
  2. whisper.cpp +25 -8
ggml.c CHANGED
@@ -17283,10 +17283,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         } else {
             // wait for other threads to finish
             const int last = node_n;
-            do {
-                //sched_yield();
+            while (true) {
+                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+                //       depending on the workload and the operating system.
+                //       since it is not clear what is the best approach, it should potentially become user-configurable
+                //       ref: https://github.com/ggerganov/ggml/issues/291
+                #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                sched_yield();
+                #endif
+
                 node_n = atomic_load(&state->shared->node_n);
-            } while (node_n == last);
+                if (node_n != last) break;
+            };
         }
 
         // check if we should stop
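
The old loop spun on the atomic with sched_yield() commented out; the new one yields the time slice on every iteration, but only when ggml is compiled against Accelerate or OpenBLAS. As the added TODO notes, yielding can help or hurt depending on the workload and the OS (see ggml issue #291). Below is a minimal standalone sketch of the same wait pattern; the names are hypothetical and std::this_thread::yield() stands in for sched_yield():

    #include <atomic>
    #include <cstdio>
    #include <thread>

    // shared progress counter, analogous to ggml's state->shared->node_n
    static std::atomic<int> node_n{-1};

    // spin until node_n moves past `last`, yielding the time slice on each
    // iteration so other threads (e.g. a BLAS thread pool) can be scheduled
    static void wait_for_next_node(int last) {
        while (true) {
            std::this_thread::yield(); // ggml guards this behind GGML_USE_ACCELERATE/GGML_USE_OPENBLAS

            if (node_n.load(std::memory_order_acquire) != last) {
                break;
            }
        }
    }

    int main() {
        std::thread worker([] { node_n.store(0, std::memory_order_release); });
        wait_for_next_node(-1);
        std::printf("node_n advanced to %d\n", node_n.load());
        worker.join();
        return 0;
    }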
whisper.cpp CHANGED
@@ -118,6 +118,21 @@ static void byteswap_tensor(ggml_tensor * tensor) {
 #define WHISPER_USE_SCRATCH
 #define WHISPER_MAX_SCRATCH_BUFFERS 16
 
+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 // available whisper models
 enum e_model {
     MODEL_UNKNOWN,
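
The helper wraps the plan-based compute API: ggml_graph_plan() reports how much scratch memory the graph needs, the caller provides it via plan.work_data, and ggml_graph_compute() runs the graph. A self-contained usage sketch, assuming the ggml API of this era (the 64x64 matmul graph and thread count are made up for illustration):

    #include "ggml.h"

    #include <cstdint>
    #include <vector>

    // the helper added above, reproduced so the sketch compiles on its own
    static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

        if (plan.work_size > 0) {
            buf.resize(plan.work_size);
            plan.work_data = buf.data();
        }

        ggml_graph_compute(graph, &plan);
    }

    int main() {
        // fixed-size buffer backing the ggml context and its tensors
        std::vector<uint8_t> ctx_buf(16u*1024*1024);

        struct ggml_init_params params = { ctx_buf.size(), ctx_buf.data(), false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);

        struct ggml_cgraph gf = {};
        ggml_build_forward_expand(&gf, ggml_mul_mat(ctx, a, b));

        // the work vector can be reused across many evaluations; it only
        // grows when a graph needs more scratch space than seen so far
        std::vector<uint8_t> work;
        ggml_graph_compute_helper(work, &gf, 4);

        ggml_free(ctx);
        return 0;
    }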
@@ -666,6 +681,7 @@ struct whisper_state {
 
     // memory buffers used by encode / decode contexts
     std::vector<uint8_t> buf_compute;
+    std::vector<uint8_t> buf_work;
     std::vector<uint8_t> buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS];
 
     int buf_last = 0;
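
buf_work sits in whisper_state next to the other per-state compute buffers: it grows to the largest plan.work_size requested so far and is then reused by every encoder and decoder graph evaluation of that state.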
@@ -1830,8 +1846,8 @@ static bool whisper_encode_internal(
     {
         struct ggml_cgraph gf = {};
 
-        ggml_build_forward_expand  (&gf, cur);
-        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+        ggml_build_forward_expand(&gf, cur);
+        ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
 
         //ggml_graph_print(&gf);
     }
@@ -1916,7 +1932,7 @@ static bool whisper_encode_internal(
         ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
     }
 
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
     //ggml_graph_print(&gf);
 }
 
@@ -2329,8 +2345,8 @@ static bool whisper_decode_internal(
 
     // run the computation
     {
-        ggml_build_forward_expand  (&gf, logits);
-        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+        ggml_build_forward_expand(&gf, logits);
+        ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
     }
 
     // extract logits for all N tokens
@@ -5225,7 +5241,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
     // b: N*N*sizeof(float)
     // c: N*N*sizeof(float)
     // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
+    std::vector<uint8_t> buf (3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead());
+    std::vector<uint8_t> work(1llu*N_max*N_max*sizeof(float) + 1*ggml_tensor_overhead());
 
     // put a bunch of random data in the buffer
     for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
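
Previously the benchmark carved everything, including the F16 work area, out of a single context buffer sized for four N_max x N_max float blocks plus a hard-coded 512 bytes of per-tensor overhead. With the plan API the work area no longer lives in the context, so the context buffer shrinks to the three tensors a, b and c, the work buffer becomes a separate vector, and the guessed 512 bytes are replaced by ggml_tensor_overhead().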
@@ -5280,12 +5297,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
     double tsum = 0.0;
 
     // heat-up
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    ggml_graph_compute_helper(work, &gf, n_threads);
 
     for (int i = 0; i < n_max; ++i) {
         const int64_t t0 = ggml_time_us();
 
-        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+        ggml_graph_compute_helper(work, &gf, n_threads);
 
         const int64_t t1 = ggml_time_us();
 
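Both the heat-up run and the timed iterations go through the helper, so the work vector is grown once during the heat-up and the timed loop reuses the same allocation.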
 