ggerganov commited on
Commit
bef3486
·
unverified ·
1 Parent(s): a660ed9

bench : fix Windows linkage by moving ggml benches in whisper lib ..

Browse files
Files changed (3) hide show
  1. examples/bench/bench.cpp +4 -135
  2. whisper.cpp +140 -0
  3. whisper.h +7 -0
examples/bench/bench.cpp CHANGED
@@ -1,11 +1,8 @@
1
- #include "ggml.h"
2
  #include "whisper.h"
3
 
4
  #include <cstdio>
5
- #include <cstring>
6
  #include <string>
7
  #include <thread>
8
- #include <vector>
9
 
10
  // command-line parameters
11
  struct whisper_params {
@@ -53,7 +50,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
53
  fprintf(stderr, "\n");
54
  }
55
 
56
- int bench_whisper_encoder(const whisper_params & params) {
57
  // whisper init
58
 
59
  struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
@@ -96,132 +93,6 @@ int bench_whisper_encoder(const whisper_params & params) {
96
  return 0;
97
  }
98
 
99
- int bench_memcpy(const whisper_params & params) {
100
- size_t n = 50;
101
- size_t arr = params.what > 0 ? 1024 : params.what; // trick to avoid compiler optimizations
102
-
103
- // 1 GB array
104
- const size_t size = arr*1024llu*1024llu;
105
-
106
- char * src = (char *) malloc(size);
107
- char * dst = (char *) malloc(size);
108
-
109
- for (size_t i = 0; i < size; i++) src[i] = i;
110
-
111
- memcpy(dst, src, size); // heat-up
112
-
113
- double tsum = 0.0;
114
-
115
- for (size_t i = 0; i < n; i++) {
116
- const int64_t t0 = ggml_time_us();
117
-
118
- memcpy(dst, src, size);
119
-
120
- const int64_t t1 = ggml_time_us();
121
-
122
- tsum += (t1 - t0)*1e-6;
123
-
124
- src[0] = rand();
125
- }
126
-
127
- fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
128
-
129
- // needed to prevent the compiler from optimizing the memcpy away
130
- {
131
- double sum = 0.0;
132
-
133
- for (size_t i = 0; i < size; i++) sum += dst[i];
134
-
135
- fprintf(stderr, "sum: %s\n", sum == -536870910.00 ? "ok" : "error");
136
- }
137
-
138
- free(src);
139
- free(dst);
140
-
141
- return 0;
142
- }
143
-
144
- int bench_ggml_mul_mat(const whisper_params & params) {
145
- const int n_max = 128;
146
-
147
- const std::vector<size_t> sizes = {
148
- 64, 128, 256, 512, 1024, 2048, 4096,
149
- };
150
-
151
- const size_t N_max = sizes.back();
152
-
153
- // a: N*N*sizeof(float)
154
- // b: N*N*sizeof(float)
155
- // c: N*N*sizeof(float)
156
- // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
157
- std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
158
-
159
- for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
160
-
161
- for (int j = 0; j < (int) sizes.size(); j++) {
162
- int n_fp16 = 0;
163
- int n_fp32 = 0;
164
-
165
- // GFLOPS/s
166
- double s_fp16 = 0.0;
167
- double s_fp32 = 0.0;
168
-
169
- const size_t N = sizes[j];
170
-
171
- for (int k = 0; k < 2; ++k) {
172
- const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
173
-
174
- double & s = k == 0 ? s_fp16 : s_fp32;
175
- int & n = k == 0 ? n_fp16 : n_fp32;
176
-
177
- struct ggml_init_params gparams = {
178
- /*.mem_size =*/ buf.size(),
179
- /*.mem_buffer =*/ buf.data(),
180
- };
181
-
182
- struct ggml_context * ctx0 = ggml_init(gparams);
183
-
184
- struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
185
- struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
186
-
187
- struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
188
-
189
- struct ggml_cgraph gf = ggml_build_forward(c);
190
-
191
- gf.n_threads = params.n_threads;
192
-
193
- double tsum = 0.0;
194
-
195
- // heat-up
196
- ggml_graph_compute(ctx0, &gf);
197
-
198
- for (int i = 0; i < n_max; ++i) {
199
- const int64_t t0 = ggml_time_us();
200
-
201
- ggml_graph_compute(ctx0, &gf);
202
-
203
- const int64_t t1 = ggml_time_us();
204
-
205
- tsum += (t1 - t0)*1e-6;
206
- n++;
207
-
208
- if (tsum > 1.0 && n >= 3) {
209
- break;
210
- }
211
- }
212
-
213
- ggml_free(ctx0);
214
-
215
- s = ((2.0*N*N*N*n)/tsum)*1e-9;
216
- }
217
-
218
- fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
219
- N, N, s_fp16, n_fp16, s_fp32, n_fp32);
220
- }
221
-
222
- return 0;
223
- }
224
-
225
  int main(int argc, char ** argv) {
226
  whisper_params params;
227
 
@@ -229,14 +100,12 @@ int main(int argc, char ** argv) {
229
  return 1;
230
  }
231
 
232
- ggml_time_init();
233
-
234
  int ret = -1;
235
 
236
  switch (params.what) {
237
- case 0: ret = bench_whisper_encoder(params); break;
238
- case 1: ret = bench_memcpy(params); break;
239
- case 2: ret = bench_ggml_mul_mat(params); break;
240
  default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
241
  }
242
 
 
 
1
  #include "whisper.h"
2
 
3
  #include <cstdio>
 
4
  #include <string>
5
  #include <thread>
 
6
 
7
  // command-line parameters
8
  struct whisper_params {
 
50
  fprintf(stderr, "\n");
51
  }
52
 
53
+ int whisper_bench_encoder(const whisper_params & params) {
54
  // whisper init
55
 
56
  struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
 
93
  return 0;
94
  }
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  int main(int argc, char ** argv) {
97
  whisper_params params;
98
 
 
100
  return 1;
101
  }
102
 
 
 
103
  int ret = -1;
104
 
105
  switch (params.what) {
106
+ case 0: ret = whisper_bench_encoder(params); break;
107
+ case 1: ret = whisper_bench_memcpy(params.n_threads); break;
108
+ case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
109
  default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
110
  }
111
 
whisper.cpp CHANGED
@@ -3801,6 +3801,7 @@ int whisper_full(
3801
 
3802
  if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
3803
  const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
 
3804
  if (!text.empty()) {
3805
  const auto tt0 = params.speed_up ? 2*t0 : t0;
3806
  const auto tt1 = params.speed_up ? 2*t1 : t1;
@@ -4059,6 +4060,145 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
4059
 
4060
  // =================================================================================================
4061
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4062
  //
4063
  // Experimental stuff below
4064
  //
 
3801
 
3802
  if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
3803
  const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
3804
+
3805
  if (!text.empty()) {
3806
  const auto tt0 = params.speed_up ? 2*t0 : t0;
3807
  const auto tt1 = params.speed_up ? 2*t1 : t1;
 
4060
 
4061
  // =================================================================================================
4062
 
4063
+ //
4064
+ // Temporary interface needed for exposing ggml interface
4065
+ // Will be removed in the future when ggml becomes a separate library
4066
+ //
4067
+
4068
+ WHISPER_API int whisper_bench_memcpy(int n_threads) {
4069
+ ggml_time_init();
4070
+
4071
+ size_t n = 50;
4072
+ size_t arr = n_threads > 0 ? 1024 : n_threads; // trick to avoid compiler optimizations
4073
+
4074
+ // 1 GB array
4075
+ const size_t size = arr*1024llu*1024llu;
4076
+
4077
+ char * src = (char *) malloc(size);
4078
+ char * dst = (char *) malloc(size);
4079
+
4080
+ for (size_t i = 0; i < size; i++) src[i] = i;
4081
+
4082
+ memcpy(dst, src, size); // heat-up
4083
+
4084
+ double tsum = 0.0;
4085
+
4086
+ for (size_t i = 0; i < n; i++) {
4087
+ const int64_t t0 = ggml_time_us();
4088
+
4089
+ memcpy(dst, src, size);
4090
+
4091
+ const int64_t t1 = ggml_time_us();
4092
+
4093
+ tsum += (t1 - t0)*1e-6;
4094
+
4095
+ src[0] = rand();
4096
+ }
4097
+
4098
+ fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
4099
+
4100
+ // needed to prevent the compiler from optimizing the memcpy away
4101
+ {
4102
+ double sum = 0.0;
4103
+
4104
+ for (size_t i = 0; i < size; i++) sum += dst[i];
4105
+
4106
+ fprintf(stderr, "sum: %s\n", sum == -536870910.00 ? "ok" : "error");
4107
+ }
4108
+
4109
+ free(src);
4110
+ free(dst);
4111
+
4112
+ return 0;
4113
+ }
4114
+
4115
+ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
4116
+ ggml_time_init();
4117
+
4118
+ const int n_max = 128;
4119
+
4120
+ const std::vector<size_t> sizes = {
4121
+ 64, 128, 256, 512, 1024, 2048, 4096,
4122
+ };
4123
+
4124
+ const size_t N_max = sizes.back();
4125
+
4126
+ // a: N*N*sizeof(float)
4127
+ // b: N*N*sizeof(float)
4128
+ // c: N*N*sizeof(float)
4129
+ // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
4130
+ std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
4131
+
4132
+ for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
4133
+
4134
+ for (int j = 0; j < (int) sizes.size(); j++) {
4135
+ int n_fp16 = 0;
4136
+ int n_fp32 = 0;
4137
+
4138
+ // GFLOPS/s
4139
+ double s_fp16 = 0.0;
4140
+ double s_fp32 = 0.0;
4141
+
4142
+ const size_t N = sizes[j];
4143
+
4144
+ for (int k = 0; k < 2; ++k) {
4145
+ const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
4146
+
4147
+ double & s = k == 0 ? s_fp16 : s_fp32;
4148
+ int & n = k == 0 ? n_fp16 : n_fp32;
4149
+
4150
+ struct ggml_init_params gparams = {
4151
+ /*.mem_size =*/ buf.size(),
4152
+ /*.mem_buffer =*/ buf.data(),
4153
+ };
4154
+
4155
+ struct ggml_context * ctx0 = ggml_init(gparams);
4156
+
4157
+ struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
4158
+ struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
4159
+
4160
+ struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
4161
+
4162
+ struct ggml_cgraph gf = ggml_build_forward(c);
4163
+
4164
+ gf.n_threads = n_threads;
4165
+
4166
+ double tsum = 0.0;
4167
+
4168
+ // heat-up
4169
+ ggml_graph_compute(ctx0, &gf);
4170
+
4171
+ for (int i = 0; i < n_max; ++i) {
4172
+ const int64_t t0 = ggml_time_us();
4173
+
4174
+ ggml_graph_compute(ctx0, &gf);
4175
+
4176
+ const int64_t t1 = ggml_time_us();
4177
+
4178
+ tsum += (t1 - t0)*1e-6;
4179
+ n++;
4180
+
4181
+ if (tsum > 1.0 && n >= 3) {
4182
+ break;
4183
+ }
4184
+ }
4185
+
4186
+ ggml_free(ctx0);
4187
+
4188
+ s = ((2.0*N*N*N*n)/tsum)*1e-9;
4189
+ }
4190
+
4191
+ fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
4192
+ N, N, s_fp16, n_fp16, s_fp32, n_fp32);
4193
+ }
4194
+
4195
+ return 0;
4196
+ }
4197
+
4198
+ // =================================================================================================
4199
+
4200
+ // =================================================================================================
4201
+
4202
  //
4203
  // Experimental stuff below
4204
  //
whisper.h CHANGED
@@ -350,6 +350,13 @@ extern "C" {
350
  // Get the probability of the specified token in the specified segment.
351
  WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
352
 
 
 
 
 
 
 
 
353
  #ifdef __cplusplus
354
  }
355
  #endif
 
350
  // Get the probability of the specified token in the specified segment.
351
  WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
352
 
353
+ ////////////////////////////////////////////////////////////////////////////
354
+
355
+ // Temporary helpers needed for exposing ggml interface
356
+
357
+ WHISPER_API int whisper_bench_memcpy(int n_threads);
358
+ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
359
+
360
  #ifdef __cplusplus
361
  }
362
  #endif