Spaces:
Sleeping
Sleeping
bench : fix Windows linkage by moving ggml benches in whisper lib ..
Browse files
- examples/bench/bench.cpp +4 -135
- whisper.cpp +140 -0
- whisper.h +7 -0
examples/bench/bench.cpp
CHANGED
|
@@ -1,11 +1,8 @@
|
|
| 1 |
-
#include "ggml.h"
|
| 2 |
#include "whisper.h"
|
| 3 |
|
| 4 |
#include <cstdio>
|
| 5 |
-
#include <cstring>
|
| 6 |
#include <string>
|
| 7 |
#include <thread>
|
| 8 |
-
#include <vector>
|
| 9 |
|
| 10 |
// command-line parameters
|
| 11 |
struct whisper_params {
|
|
@@ -53,7 +50,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 53 |
fprintf(stderr, "\n");
|
| 54 |
}
|
| 55 |
|
| 56 |
-
int
|
| 57 |
// whisper init
|
| 58 |
|
| 59 |
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
|
|
@@ -96,132 +93,6 @@ int bench_whisper_encoder(const whisper_params & params) {
|
|
| 96 |
return 0;
|
| 97 |
}
|
| 98 |
|
| 99 |
-
int bench_memcpy(const whisper_params & params) {
|
| 100 |
-
size_t n = 50;
|
| 101 |
-
size_t arr = params.what > 0 ? 1024 : params.what; // trick to avoid compiler optimizations
|
| 102 |
-
|
| 103 |
-
// 1 GB array
|
| 104 |
-
const size_t size = arr*1024llu*1024llu;
|
| 105 |
-
|
| 106 |
-
char * src = (char *) malloc(size);
|
| 107 |
-
char * dst = (char *) malloc(size);
|
| 108 |
-
|
| 109 |
-
for (size_t i = 0; i < size; i++) src[i] = i;
|
| 110 |
-
|
| 111 |
-
memcpy(dst, src, size); // heat-up
|
| 112 |
-
|
| 113 |
-
double tsum = 0.0;
|
| 114 |
-
|
| 115 |
-
for (size_t i = 0; i < n; i++) {
|
| 116 |
-
const int64_t t0 = ggml_time_us();
|
| 117 |
-
|
| 118 |
-
memcpy(dst, src, size);
|
| 119 |
-
|
| 120 |
-
const int64_t t1 = ggml_time_us();
|
| 121 |
-
|
| 122 |
-
tsum += (t1 - t0)*1e-6;
|
| 123 |
-
|
| 124 |
-
src[0] = rand();
|
| 125 |
-
}
|
| 126 |
-
|
| 127 |
-
fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
|
| 128 |
-
|
| 129 |
-
// needed to prevent the compiler from optimizing the memcpy away
|
| 130 |
-
{
|
| 131 |
-
double sum = 0.0;
|
| 132 |
-
|
| 133 |
-
for (size_t i = 0; i < size; i++) sum += dst[i];
|
| 134 |
-
|
| 135 |
-
fprintf(stderr, "sum: %s\n", sum == -536870910.00 ? "ok" : "error");
|
| 136 |
-
}
|
| 137 |
-
|
| 138 |
-
free(src);
|
| 139 |
-
free(dst);
|
| 140 |
-
|
| 141 |
-
return 0;
|
| 142 |
-
}
|
| 143 |
-
|
| 144 |
-
int bench_ggml_mul_mat(const whisper_params & params) {
|
| 145 |
-
const int n_max = 128;
|
| 146 |
-
|
| 147 |
-
const std::vector<size_t> sizes = {
|
| 148 |
-
64, 128, 256, 512, 1024, 2048, 4096,
|
| 149 |
-
};
|
| 150 |
-
|
| 151 |
-
const size_t N_max = sizes.back();
|
| 152 |
-
|
| 153 |
-
// a: N*N*sizeof(float)
|
| 154 |
-
// b: N*N*sizeof(float)
|
| 155 |
-
// c: N*N*sizeof(float)
|
| 156 |
-
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
| 157 |
-
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
|
| 158 |
-
|
| 159 |
-
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
| 160 |
-
|
| 161 |
-
for (int j = 0; j < (int) sizes.size(); j++) {
|
| 162 |
-
int n_fp16 = 0;
|
| 163 |
-
int n_fp32 = 0;
|
| 164 |
-
|
| 165 |
-
// GFLOPS
|
| 166 |
-
double s_fp16 = 0.0;
|
| 167 |
-
double s_fp32 = 0.0;
|
| 168 |
-
|
| 169 |
-
const size_t N = sizes[j];
|
| 170 |
-
|
| 171 |
-
for (int k = 0; k < 2; ++k) {
|
| 172 |
-
const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
| 173 |
-
|
| 174 |
-
double & s = k == 0 ? s_fp16 : s_fp32;
|
| 175 |
-
int & n = k == 0 ? n_fp16 : n_fp32;
|
| 176 |
-
|
| 177 |
-
struct ggml_init_params gparams = {
|
| 178 |
-
/*.mem_size =*/ buf.size(),
|
| 179 |
-
/*.mem_buffer =*/ buf.data(),
|
| 180 |
-
};
|
| 181 |
-
|
| 182 |
-
struct ggml_context * ctx0 = ggml_init(gparams);
|
| 183 |
-
|
| 184 |
-
struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
|
| 185 |
-
struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
|
| 186 |
-
|
| 187 |
-
struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
|
| 188 |
-
|
| 189 |
-
struct ggml_cgraph gf = ggml_build_forward(c);
|
| 190 |
-
|
| 191 |
-
gf.n_threads = params.n_threads;
|
| 192 |
-
|
| 193 |
-
double tsum = 0.0;
|
| 194 |
-
|
| 195 |
-
// heat-up
|
| 196 |
-
ggml_graph_compute(ctx0, &gf);
|
| 197 |
-
|
| 198 |
-
for (int i = 0; i < n_max; ++i) {
|
| 199 |
-
const int64_t t0 = ggml_time_us();
|
| 200 |
-
|
| 201 |
-
ggml_graph_compute(ctx0, &gf);
|
| 202 |
-
|
| 203 |
-
const int64_t t1 = ggml_time_us();
|
| 204 |
-
|
| 205 |
-
tsum += (t1 - t0)*1e-6;
|
| 206 |
-
n++;
|
| 207 |
-
|
| 208 |
-
if (tsum > 1.0 && n >= 3) {
|
| 209 |
-
break;
|
| 210 |
-
}
|
| 211 |
-
}
|
| 212 |
-
|
| 213 |
-
ggml_free(ctx0);
|
| 214 |
-
|
| 215 |
-
s = ((2.0*N*N*N*n)/tsum)*1e-9;
|
| 216 |
-
}
|
| 217 |
-
|
| 218 |
-
fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
|
| 219 |
-
N, N, s_fp16, n_fp16, s_fp32, n_fp32);
|
| 220 |
-
}
|
| 221 |
-
|
| 222 |
-
return 0;
|
| 223 |
-
}
|
| 224 |
-
|
| 225 |
int main(int argc, char ** argv) {
|
| 226 |
whisper_params params;
|
| 227 |
|
|
@@ -229,14 +100,12 @@ int main(int argc, char ** argv) {
|
|
| 229 |
return 1;
|
| 230 |
}
|
| 231 |
|
| 232 |
-
ggml_time_init();
|
| 233 |
-
|
| 234 |
int ret = -1;
|
| 235 |
|
| 236 |
switch (params.what) {
|
| 237 |
-
case 0: ret =
|
| 238 |
-
case 1: ret =
|
| 239 |
-
case 2: ret =
|
| 240 |
default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
|
| 241 |
}
|
| 242 |
|
|
|
|
|
|
|
| 1 |
#include "whisper.h"
|
| 2 |
|
| 3 |
#include <cstdio>
|
|
|
|
| 4 |
#include <string>
|
| 5 |
#include <thread>
|
|
|
|
| 6 |
|
| 7 |
// command-line parameters
|
| 8 |
struct whisper_params {
|
|
|
|
| 50 |
fprintf(stderr, "\n");
|
| 51 |
}
|
| 52 |
|
| 53 |
+
int whisper_bench_encoder(const whisper_params & params) {
|
| 54 |
// whisper init
|
| 55 |
|
| 56 |
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
|
|
|
|
| 93 |
return 0;
|
| 94 |
}
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
int main(int argc, char ** argv) {
|
| 97 |
whisper_params params;
|
| 98 |
|
|
|
|
| 100 |
return 1;
|
| 101 |
}
|
| 102 |
|
|
|
|
|
|
|
| 103 |
int ret = -1;
|
| 104 |
|
| 105 |
switch (params.what) {
|
| 106 |
+
case 0: ret = whisper_bench_encoder(params); break;
|
| 107 |
+
case 1: ret = whisper_bench_memcpy(params.n_threads); break;
|
| 108 |
+
case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
|
| 109 |
default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
|
| 110 |
}
|
| 111 |
|
whisper.cpp
CHANGED
|
@@ -3801,6 +3801,7 @@ int whisper_full(
|
|
| 3801 |
|
| 3802 |
if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
|
| 3803 |
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
|
|
|
|
| 3804 |
if (!text.empty()) {
|
| 3805 |
const auto tt0 = params.speed_up ? 2*t0 : t0;
|
| 3806 |
const auto tt1 = params.speed_up ? 2*t1 : t1;
|
|
@@ -4059,6 +4060,145 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
|
|
| 4059 |
|
| 4060 |
// =================================================================================================
|
| 4061 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4062 |
//
|
| 4063 |
// Experimental stuff below
|
| 4064 |
//
|
|
|
|
| 3801 |
|
| 3802 |
if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
|
| 3803 |
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
|
| 3804 |
+
|
| 3805 |
if (!text.empty()) {
|
| 3806 |
const auto tt0 = params.speed_up ? 2*t0 : t0;
|
| 3807 |
const auto tt1 = params.speed_up ? 2*t1 : t1;
|
|
|
|
| 4060 |
|
| 4061 |
// =================================================================================================
|
| 4062 |
|
| 4063 |
+
//
|
| 4064 |
+
// Temporary interface needed for exposing ggml interface
|
| 4065 |
+
// Will be removed in the future when ggml becomes a separate library
|
| 4066 |
+
//
|
| 4067 |
+
|
| 4068 |
+
WHISPER_API int whisper_bench_memcpy(int n_threads) {
|
| 4069 |
+
ggml_time_init();
|
| 4070 |
+
|
| 4071 |
+
size_t n = 50;
|
| 4072 |
+
size_t arr = n_threads > 0 ? 1024 : n_threads; // trick to avoid compiler optimizations
|
| 4073 |
+
|
| 4074 |
+
// 1 GB array
|
| 4075 |
+
const size_t size = arr*1024llu*1024llu;
|
| 4076 |
+
|
| 4077 |
+
char * src = (char *) malloc(size);
|
| 4078 |
+
char * dst = (char *) malloc(size);
|
| 4079 |
+
|
| 4080 |
+
for (size_t i = 0; i < size; i++) src[i] = i;
|
| 4081 |
+
|
| 4082 |
+
memcpy(dst, src, size); // heat-up
|
| 4083 |
+
|
| 4084 |
+
double tsum = 0.0;
|
| 4085 |
+
|
| 4086 |
+
for (size_t i = 0; i < n; i++) {
|
| 4087 |
+
const int64_t t0 = ggml_time_us();
|
| 4088 |
+
|
| 4089 |
+
memcpy(dst, src, size);
|
| 4090 |
+
|
| 4091 |
+
const int64_t t1 = ggml_time_us();
|
| 4092 |
+
|
| 4093 |
+
tsum += (t1 - t0)*1e-6;
|
| 4094 |
+
|
| 4095 |
+
src[0] = rand();
|
| 4096 |
+
}
|
| 4097 |
+
|
| 4098 |
+
fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
|
| 4099 |
+
|
| 4100 |
+
// needed to prevent the compiler from optimizing the memcpy away
|
| 4101 |
+
{
|
| 4102 |
+
double sum = 0.0;
|
| 4103 |
+
|
| 4104 |
+
for (size_t i = 0; i < size; i++) sum += dst[i];
|
| 4105 |
+
|
| 4106 |
+
fprintf(stderr, "sum: %s\n", sum == -536870910.00 ? "ok" : "error");
|
| 4107 |
+
}
|
| 4108 |
+
|
| 4109 |
+
free(src);
|
| 4110 |
+
free(dst);
|
| 4111 |
+
|
| 4112 |
+
return 0;
|
| 4113 |
+
}
|
| 4114 |
+
|
| 4115 |
+
WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
|
| 4116 |
+
ggml_time_init();
|
| 4117 |
+
|
| 4118 |
+
const int n_max = 128;
|
| 4119 |
+
|
| 4120 |
+
const std::vector<size_t> sizes = {
|
| 4121 |
+
64, 128, 256, 512, 1024, 2048, 4096,
|
| 4122 |
+
};
|
| 4123 |
+
|
| 4124 |
+
const size_t N_max = sizes.back();
|
| 4125 |
+
|
| 4126 |
+
// a: N*N*sizeof(float)
|
| 4127 |
+
// b: N*N*sizeof(float)
|
| 4128 |
+
// c: N*N*sizeof(float)
|
| 4129 |
+
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
| 4130 |
+
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
|
| 4131 |
+
|
| 4132 |
+
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
| 4133 |
+
|
| 4134 |
+
for (int j = 0; j < (int) sizes.size(); j++) {
|
| 4135 |
+
int n_fp16 = 0;
|
| 4136 |
+
int n_fp32 = 0;
|
| 4137 |
+
|
| 4138 |
+
// GFLOPS
|
| 4139 |
+
double s_fp16 = 0.0;
|
| 4140 |
+
double s_fp32 = 0.0;
|
| 4141 |
+
|
| 4142 |
+
const size_t N = sizes[j];
|
| 4143 |
+
|
| 4144 |
+
for (int k = 0; k < 2; ++k) {
|
| 4145 |
+
const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
| 4146 |
+
|
| 4147 |
+
double & s = k == 0 ? s_fp16 : s_fp32;
|
| 4148 |
+
int & n = k == 0 ? n_fp16 : n_fp32;
|
| 4149 |
+
|
| 4150 |
+
struct ggml_init_params gparams = {
|
| 4151 |
+
/*.mem_size =*/ buf.size(),
|
| 4152 |
+
/*.mem_buffer =*/ buf.data(),
|
| 4153 |
+
};
|
| 4154 |
+
|
| 4155 |
+
struct ggml_context * ctx0 = ggml_init(gparams);
|
| 4156 |
+
|
| 4157 |
+
struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
|
| 4158 |
+
struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
|
| 4159 |
+
|
| 4160 |
+
struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
|
| 4161 |
+
|
| 4162 |
+
struct ggml_cgraph gf = ggml_build_forward(c);
|
| 4163 |
+
|
| 4164 |
+
gf.n_threads = n_threads;
|
| 4165 |
+
|
| 4166 |
+
double tsum = 0.0;
|
| 4167 |
+
|
| 4168 |
+
// heat-up
|
| 4169 |
+
ggml_graph_compute(ctx0, &gf);
|
| 4170 |
+
|
| 4171 |
+
for (int i = 0; i < n_max; ++i) {
|
| 4172 |
+
const int64_t t0 = ggml_time_us();
|
| 4173 |
+
|
| 4174 |
+
ggml_graph_compute(ctx0, &gf);
|
| 4175 |
+
|
| 4176 |
+
const int64_t t1 = ggml_time_us();
|
| 4177 |
+
|
| 4178 |
+
tsum += (t1 - t0)*1e-6;
|
| 4179 |
+
n++;
|
| 4180 |
+
|
| 4181 |
+
if (tsum > 1.0 && n >= 3) {
|
| 4182 |
+
break;
|
| 4183 |
+
}
|
| 4184 |
+
}
|
| 4185 |
+
|
| 4186 |
+
ggml_free(ctx0);
|
| 4187 |
+
|
| 4188 |
+
s = ((2.0*N*N*N*n)/tsum)*1e-9;
|
| 4189 |
+
}
|
| 4190 |
+
|
| 4191 |
+
fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
|
| 4192 |
+
N, N, s_fp16, n_fp16, s_fp32, n_fp32);
|
| 4193 |
+
}
|
| 4194 |
+
|
| 4195 |
+
return 0;
|
| 4196 |
+
}
|
| 4197 |
+
|
| 4198 |
+
// =================================================================================================
|
| 4199 |
+
|
| 4200 |
+
// =================================================================================================
|
| 4201 |
+
|
| 4202 |
//
|
| 4203 |
// Experimental stuff below
|
| 4204 |
//
|
whisper.h
CHANGED
|
@@ -350,6 +350,13 @@ extern "C" {
|
|
| 350 |
// Get the probability of the specified token in the specified segment.
|
| 351 |
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
|
| 352 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
#ifdef __cplusplus
|
| 354 |
}
|
| 355 |
#endif
|
|
|
|
| 350 |
// Get the probability of the specified token in the specified segment.
|
| 351 |
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
|
| 352 |
|
| 353 |
+
////////////////////////////////////////////////////////////////////////////
|
| 354 |
+
|
| 355 |
+
// Temporary helpers needed for exposing ggml interface
|
| 356 |
+
|
| 357 |
+
WHISPER_API int whisper_bench_memcpy(int n_threads);
|
| 358 |
+
WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
|
| 359 |
+
|
| 360 |
#ifdef __cplusplus
|
| 361 |
}
|
| 362 |
#endif
|