ggerganov committed
Commit a660ed9 · Parent(s): eb271b2

bench : add memcpy and ggml_mul_mat benchmarks

Files changed (4)
  1. Makefile +2 -2
  2. examples/bench/bench.cpp +157 -7
  3. extra/bench-all.sh +12 -1
  4. ggml.c +11 -4
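
At a glance: the bench tool gains a -w (--what) selector with two new benchmark modes. A hedged usage sketch based on the flags added below (the "make bench" target and the thread counts are assumptions, not part of this diff):

    make bench
    ./bench -w 0 -m models/ggml-base.en.bin   # whisper encoder (default mode)
    ./bench -w 1 -t 1                         # memcpy bandwidth
    ./bench -w 2 -t 4                         # ggml_mul_mat throughput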
Makefile CHANGED
@@ -133,8 +133,8 @@ ifdef WHISPER_OPENBLAS
     LDFLAGS  += -lopenblas
 endif
 ifdef WHISPER_GPROF
-    CFLAGS   += -pg
-    CXXFLAGS += -pg
+    CFLAGS   += -pg
+    CXXFLAGS += -pg
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 endif
examples/bench/bench.cpp CHANGED
@@ -1,12 +1,16 @@
+#include "ggml.h"
 #include "whisper.h"
 
 #include <cstdio>
+#include <cstring>
 #include <string>
 #include <thread>
+#include <vector>
 
 // command-line parameters
 struct whisper_params {
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t what = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat
 
     std::string model = "models/ggml-base.en.bin";
 };
@@ -23,6 +27,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
     }
     else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
     else if (arg == "-m" || arg == "--model")   { params.model     = argv[++i]; }
+    else if (arg == "-w" || arg == "--what")    { params.what      = atoi(argv[++i]); }
     else {
         fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
         whisper_print_usage(argc, argv, params);
@@ -41,16 +46,14 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
     fprintf(stderr, "  -h,       --help        [default] show this help message and exit\n");
     fprintf(stderr, "  -t N,     --threads N   [%-7d] number of threads to use during computation\n", params.n_threads);
     fprintf(stderr, "  -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
+    fprintf(stderr, "  -w N,     --what N      [%-7d] what to benchmark:\n", params.what);
+    fprintf(stderr, "                          %-7s  0 - whisper encoder\n", "");
+    fprintf(stderr, "                          %-7s  1 - memcpy\n", "");
+    fprintf(stderr, "                          %-7s  2 - ggml_mul_mat\n", "");
     fprintf(stderr, "\n");
 }
 
-int main(int argc, char ** argv) {
-    whisper_params params;
-
-    if (whisper_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
+int bench_whisper_encoder(const whisper_params & params) {
     // whisper init
 
     struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
@@ -92,3 +95,150 @@ int main(int argc, char ** argv) {
 
     return 0;
 }
+
+int bench_memcpy(const whisper_params & params) {
+    size_t n   = 50;
+    size_t arr = params.what > 0 ? 1024 : params.what; // trick to avoid compiler optimizations
+
+    // 1 GB array
+    const size_t size = arr*1024llu*1024llu;
+
+    char * src = (char *) malloc(size);
+    char * dst = (char *) malloc(size);
+
+    for (size_t i = 0; i < size; i++) src[i] = i;
+
+    memcpy(dst, src, size); // heat-up
+
+    double tsum = 0.0;
+
+    for (size_t i = 0; i < n; i++) {
+        const int64_t t0 = ggml_time_us();
+
+        memcpy(dst, src, size);
+
+        const int64_t t1 = ggml_time_us();
+
+        tsum += (t1 - t0)*1e-6;
+
+        src[0] = rand();
+    }
+
+    fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
+
+    // needed to prevent the compiler from optimizing the memcpy away
+    {
+        double sum = 0.0;
+
+        for (size_t i = 0; i < size; i++) sum += dst[i];
+
+        fprintf(stderr, "sum:    %s\n", sum == -536870910.00 ? "ok" : "error");
+    }
+
+    free(src);
+    free(dst);
+
+    return 0;
+}
+
+int bench_ggml_mul_mat(const whisper_params & params) {
+    const int n_max = 128;
+
+    const std::vector<size_t> sizes = {
+        64, 128, 256, 512, 1024, 2048, 4096,
+    };
+
+    const size_t N_max = sizes.back();
+
+    // a: N*N*sizeof(float)
+    // b: N*N*sizeof(float)
+    // c: N*N*sizeof(float)
+    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
+    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
+
+    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
+
+    for (int j = 0; j < (int) sizes.size(); j++) {
+        int n_fp16 = 0;
+        int n_fp32 = 0;
+
+        // GFLOPS
+        double s_fp16 = 0.0;
+        double s_fp32 = 0.0;
+
+        const size_t N = sizes[j];
+
+        for (int k = 0; k < 2; ++k) {
+            const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+            double & s = k == 0 ? s_fp16 : s_fp32;
+            int    & n = k == 0 ? n_fp16 : n_fp32;
+
+            struct ggml_init_params gparams = {
+                /*.mem_size   =*/ buf.size(),
+                /*.mem_buffer =*/ buf.data(),
+            };
+
+            struct ggml_context * ctx0 = ggml_init(gparams);
+
+            struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype,         N, N);
+            struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
+
+            struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
+
+            struct ggml_cgraph gf = ggml_build_forward(c);
+
+            gf.n_threads = params.n_threads;
+
+            double tsum = 0.0;
+
+            // heat-up
+            ggml_graph_compute(ctx0, &gf);
+
+            for (int i = 0; i < n_max; ++i) {
+                const int64_t t0 = ggml_time_us();
+
+                ggml_graph_compute(ctx0, &gf);
+
+                const int64_t t1 = ggml_time_us();
+
+                tsum += (t1 - t0)*1e-6;
+                n++;
+
+                if (tsum > 1.0 && n >= 3) {
+                    break;
+                }
+            }
+
+            ggml_free(ctx0);
+
+            s = ((2.0*N*N*N*n)/tsum)*1e-9;
+        }
+
+        fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
+                N, N, s_fp16, n_fp16, s_fp32, n_fp32);
+    }
+
+    return 0;
+}
+
+int main(int argc, char ** argv) {
+    whisper_params params;
+
+    if (whisper_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    ggml_time_init();
+
+    int ret = -1;
+
+    switch (params.what) {
+        case 0: ret = bench_whisper_encoder(params); break;
+        case 1: ret = bench_memcpy(params);          break;
+        case 2: ret = bench_ggml_mul_mat(params);    break;
+        default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
+    }
+
+    return ret;
+}
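
Two formulas drive the numbers printed by the new modes; restated here as a minimal sketch (variable names match the diff):

    // memcpy: n copies of a size-byte buffer in tsum seconds, reported in GiB/s
    double gb_per_s = (double) (n*size)/(tsum*1024.0*1024.0*1024.0);

    // ggml_mul_mat: an NxN by NxN product costs ~2*N^3 flops
    // (N^3 multiplies plus N^3 additions), so n runs in tsum seconds yield:
    double gflops = ((2.0*N*N*N*n)/tsum)*1e-9;

For example, at N = 512 a single product is 2*512^3 ≈ 0.27 GFLOP, so a machine sustaining 100 GFLOPS completes one run in roughly 2.7 ms.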
extra/bench-all.sh CHANGED
@@ -12,6 +12,18 @@ fi
 
 models=( "tiny" "base" "small" "medium" "large" )
 
+printf "\n"
+printf "Running memcpy benchmark with 1 thread\n"
+printf "\n"
+
+./bench -w 1 -t 1 2>&1
+
+printf "\n"
+printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
+printf "\n"
+
+./bench -w 2 -t $n_threads 2>&1
+
 printf "\n"
 printf "Running benchmark for all models\n"
 printf "This can take a while!\n"
@@ -56,4 +68,3 @@ for model in "${models[@]}"; do
 
     printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
 done
-
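
Note that $n_threads is defined earlier in bench-all.sh, outside this hunk; taking it from the script's first argument is an assumption about the surrounding code, not something visible in this diff. A hedged invocation from the repository root:

    make bench
    ./extra/bench-all.sh 8   # 8 = thread count passed through as $n_threads (assumed)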
 
ggml.c CHANGED
@@ -4373,7 +4373,9 @@ static void ggml_compute_forward_mul_mat_f32(
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
-        if (params->ith != 0) return;
+        if (params->ith != 0) {
+            return;
+        }
 
         if (params->type == GGML_TASK_INIT) {
             return;
@@ -4616,7 +4618,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
-        if (params->ith != 0) return;
+        if (params->ith != 0) {
+            return;
+        }
 
         if (params->type == GGML_TASK_INIT) {
             return;
@@ -7054,7 +7058,7 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
 #ifdef __APPLE__
 
 //#include <os/lock.h>
-
+//
 //typedef os_unfair_lock ggml_lock_t;
 //
 //#define ggml_lock_init(x) UNUSED(x)
@@ -7161,6 +7165,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             if (state->params.ith < state->params.nth) {
                 ggml_compute_forward(&state->params, state->node);
             }
+
             state->node = NULL;
         } else {
             break;
@@ -7205,6 +7210,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 .node   = NULL,
                 .shared = &state_shared,
             };
+
             int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
             assert(rc == 0);
             UNUSED(rc);
@@ -7273,7 +7279,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     node->src1->type == GGML_TYPE_F32) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                     if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
-                        node->n_tasks = 1;
+                        node->n_tasks = 1; // TODO: this actually is doing nothing
+                                           //       the threads are still spinning
                         cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
                     } else {
                         cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
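
For context, the rebraced params->ith != 0 guard and the new n_tasks TODO describe the same limitation: when BLAS handles a matrix multiplication, only thread 0 has work, yet the other workers keep spinning in the thread pool. A simplified sketch of the pattern (abridged from ggml.c, not a verbatim excerpt):

    // per-thread forward pass for mul_mat when BLAS is usable:
    if (params->ith != 0) {
        return; // workers 1..nth-1 bail out; thread 0 runs the whole product via BLAS
    }
    // ... thread 0 computes the full sgemm ...

Setting node->n_tasks = 1 only records that a single task is needed; it does not park the other threads, which is exactly what the TODO calls out.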