bench : add memcpy and ggml_mul_mat benchmarks
- Makefile +2 -2
- examples/bench/bench.cpp +157 -7
- extra/bench-all.sh +12 -1
- ggml.c +11 -4
Makefile
CHANGED

@@ -133,8 +133,8 @@ ifdef WHISPER_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 ifdef WHISPER_GPROF
-	CFLAGS
-	CXXFLAGS
+	CFLAGS += -pg
+	CXXFLAGS += -pg
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 endif
examples/bench/bench.cpp
CHANGED

@@ -1,12 +1,16 @@
+#include "ggml.h"
 #include "whisper.h"
 
 #include <cstdio>
+#include <cstring>
 #include <string>
 #include <thread>
+#include <vector>
 
 // command-line parameters
 struct whisper_params {
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t what = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat
 
     std::string model = "models/ggml-base.en.bin";
 };
@@ -23,6 +27,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
     }
     else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
     else if (arg == "-m" || arg == "--model")   { params.model = argv[++i]; }
+    else if (arg == "-w" || arg == "--what")    { params.what = atoi(argv[++i]); }
     else {
         fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
         whisper_print_usage(argc, argv, params);
@@ -41,16 +46,14 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
     fprintf(stderr, "  -h,       --help        [default] show this help message and exit\n");
     fprintf(stderr, "  -t N,     --threads N   [%-7d] number of threads to use during computation\n", params.n_threads);
     fprintf(stderr, "  -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
+    fprintf(stderr, "  -w N,     --what N      [%-7d] what to benchmark:\n", params.what);
+    fprintf(stderr, "                          %-7s  0 - whisper encoder\n", "");
+    fprintf(stderr, "                          %-7s  1 - memcpy\n", "");
+    fprintf(stderr, "                          %-7s  2 - ggml_mul_mat\n", "");
     fprintf(stderr, "\n");
 }
 
-int main(int argc, char ** argv) {
-    whisper_params params;
-
-    if (whisper_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
+int bench_whisper_encoder(const whisper_params & params) {
     // whisper init
 
     struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
@@ -92,3 +95,150 @@ int main(int argc, char ** argv) {
 
     return 0;
 }
+
+int bench_memcpy(const whisper_params & params) {
+    size_t n   = 50;
+    size_t arr = params.what > 0 ? 1024 : params.what; // trick to avoid compiler optimizations
+
+    // 1 GB array
+    const size_t size = arr*1024llu*1024llu;
+
+    char * src = (char *) malloc(size);
+    char * dst = (char *) malloc(size);
+
+    for (size_t i = 0; i < size; i++) src[i] = i;
+
+    memcpy(dst, src, size); // heat-up
+
+    double tsum = 0.0;
+
+    for (size_t i = 0; i < n; i++) {
+        const int64_t t0 = ggml_time_us();
+
+        memcpy(dst, src, size);
+
+        const int64_t t1 = ggml_time_us();
+
+        tsum += (t1 - t0)*1e-6;
+
+        src[0] = rand();
+    }
+
+    fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
+
+    // needed to prevent the compiler from optimizing the memcpy away
+    {
+        double sum = 0.0;
+
+        for (size_t i = 0; i < size; i++) sum += dst[i];
+
+        fprintf(stderr, "sum: %s\n", sum == -536870910.00 ? "ok" : "error");
+    }
+
+    free(src);
+    free(dst);
+
+    return 0;
+}
+
+int bench_ggml_mul_mat(const whisper_params & params) {
+    const int n_max = 128;
+
+    const std::vector<size_t> sizes = {
+        64, 128, 256, 512, 1024, 2048, 4096,
+    };
+
+    const size_t N_max = sizes.back();
+
+    // a: N*N*sizeof(float)
+    // b: N*N*sizeof(float)
+    // c: N*N*sizeof(float)
+    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
+    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
+
+    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
+
+    for (int j = 0; j < (int) sizes.size(); j++) {
+        int n_fp16 = 0;
+        int n_fp32 = 0;
+
+        // GFLOPS/s
+        double s_fp16 = 0.0;
+        double s_fp32 = 0.0;
+
+        const size_t N = sizes[j];
+
+        for (int k = 0; k < 2; ++k) {
+            const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+            double & s = k == 0 ? s_fp16 : s_fp32;
+            int    & n = k == 0 ? n_fp16 : n_fp32;
+
+            struct ggml_init_params gparams = {
+                /*.mem_size   =*/ buf.size(),
+                /*.mem_buffer =*/ buf.data(),
+            };
+
+            struct ggml_context * ctx0 = ggml_init(gparams);
+
+            struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype,         N, N);
+            struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
+
+            struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
+
+            struct ggml_cgraph gf = ggml_build_forward(c);
+
+            gf.n_threads = params.n_threads;
+
+            double tsum = 0.0;
+
+            // heat-up
+            ggml_graph_compute(ctx0, &gf);
+
+            for (int i = 0; i < n_max; ++i) {
+                const int64_t t0 = ggml_time_us();
+
+                ggml_graph_compute(ctx0, &gf);
+
+                const int64_t t1 = ggml_time_us();
+
+                tsum += (t1 - t0)*1e-6;
+                n++;
+
+                if (tsum > 1.0 && n >= 3) {
+                    break;
+                }
+            }
+
+            ggml_free(ctx0);
+
+            s = ((2.0*N*N*N*n)/tsum)*1e-9;
+        }
+
+        fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
+                N, N, s_fp16, n_fp16, s_fp32, n_fp32);
+    }
+
+    return 0;
+}
+
+int main(int argc, char ** argv) {
+    whisper_params params;
+
+    if (whisper_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    ggml_time_init();
+
+    int ret = -1;
+
+    switch (params.what) {
+        case 0: ret = bench_whisper_encoder(params); break;
+        case 1: ret = bench_memcpy(params);          break;
+        case 2: ret = bench_ggml_mul_mat(params);    break;
+        default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
+    }
+
+    return ret;
+}
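For reference, the numbers printed by the two new benchmarks are plain throughput ratios: bench_memcpy divides the total bytes copied by the measured wall time to get GB/s, and bench_ggml_mul_mat counts one N x N by N x N product as 2*N^3 floating-point operations (N^2 output elements, each a length-N dot product with N multiplies and N adds) before dividing by the elapsed seconds, which is where s = ((2.0*N*N*N*n)/tsum)*1e-9 comes from. The standalone sketch below reproduces just that arithmetic with made-up timing values; it is illustrative only and not part of the commit.

// Illustrative sketch of the throughput math used by bench.cpp,
// with hypothetical timings so the formulas can be checked in isolation.
#include <cstddef>
#include <cstdio>

int main() {
    // memcpy: n copies of a size-byte buffer in tsum seconds -> GB/s
    const std::size_t n    = 50;
    const std::size_t size = 1024llu*1024llu*1024llu; // 1 GB, as in bench_memcpy
    const double      tsum = 2.5;                     // hypothetical total seconds

    printf("memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));

    // ggml_mul_mat: one N x N * N x N product costs about 2*N^3 flops
    // (N^2 output elements, each a length-N dot product: N mul + N add)
    const std::size_t N     = 512;
    const int         runs  = 8;
    const double      t_mat = 0.25;                   // hypothetical total seconds

    printf("ggml_mul_mat: %5zu x %5zu: %8.1f GFLOPS (%3d runs)\n",
            N, N, ((2.0*N*N*N*runs)/t_mat)*1e-9, runs);

    return 0;
}

The F16 and F32 passes in bench_ggml_mul_mat use the same formula and only change the type of the a tensor, so the two GFLOPS columns are directly comparable.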
extra/bench-all.sh
CHANGED

@@ -12,6 +12,18 @@ fi
 
 models=( "tiny" "base" "small" "medium" "large" )
 
+printf "\n"
+printf "Running memcpy benchmark with 1 thread\n"
+printf "\n"
+
+./bench -w 1 -t 1 2>&1
+
+printf "\n"
+printf "Running ggml_mul_mat benchmark with " $n_threads " threads\n"
+printf "\n"
+
+./bench -w 2 -t $n_threads 2>&1
+
 printf "\n"
 printf "Running benchmark for all models\n"
 printf "This can take a while!\n"
@@ -56,4 +68,3 @@ for model in "${models[@]}"; do
 
     printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
 done
-
ggml.c
CHANGED

@@ -4373,7 +4373,9 @@ static void ggml_compute_forward_mul_mat_f32(
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
-        if (params->ith != 0) return;
+        if (params->ith != 0) {
+            return;
+        }
 
         if (params->type == GGML_TASK_INIT) {
             return;
@@ -4616,7 +4618,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
-        if (params->ith != 0) return;
+        if (params->ith != 0) {
+            return;
+        }
 
         if (params->type == GGML_TASK_INIT) {
             return;
@@ -7054,7 +7058,7 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
 #ifdef __APPLE__
 
 //#include <os/lock.h>
-
+//
 //typedef os_unfair_lock ggml_lock_t;
 //
 //#define ggml_lock_init(x) UNUSED(x)
@@ -7161,6 +7165,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         if (state->params.ith < state->params.nth) {
            ggml_compute_forward(&state->params, state->node);
        }
+
        state->node = NULL;
    } else {
        break;
@@ -7205,6 +7210,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
            .node   = NULL,
            .shared = &state_shared,
        };
+
        int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
        assert(rc == 0);
        UNUSED(rc);
@@ -7273,7 +7279,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                node->src1->type == GGML_TYPE_F32) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
-                    node->n_tasks = 1;
+                    node->n_tasks = 1; // TODO: this actually is doing nothing
+                                       //       the threads are still spinning
                    cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
                } else {
                    cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
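The ggml.c hunks are mostly hygiene around the BLAS fast path: in ggml_compute_forward_mul_mat_f32 and ggml_compute_forward_mul_mat_f16_f32 the if (params->ith != 0) guard becomes an explicit braced early return, and the TODO added next to node->n_tasks = 1 records that lowering the task count does not actually park the other workers; they keep spinning while thread 0 hands the multiplication to BLAS. Below is a minimal sketch of that guard pattern as used with backends like Accelerate or OpenBLAS that parallelize the GEMM internally; the names compute_params and run_blas_mul_mat are hypothetical stand-ins, not the real ggml internals.

// Hypothetical sketch: only worker 0 runs a node that is delegated to BLAS;
// every other worker returns immediately instead of repeating the same call.
#include <cstdio>

struct compute_params {
    int ith; // index of the current worker thread
    int nth; // total number of worker threads
};

static void run_blas_mul_mat(const compute_params & params) {
    if (params.ith != 0) {
        return; // nothing for the other threads to do on the BLAS path
    }

    // ... worker 0 would call into sgemm here ...
    printf("worker %d of %d runs the BLAS matmul\n", params.ith, params.nth);
}

int main() {
    const int nth = 4;
    for (int ith = 0; ith < nth; ++ith) {
        run_blas_mul_mat({ ith, nth });
    }
    return 0;
}

Since the heavy lifting happens inside the single BLAS call, one would expect the -t setting to matter less for bench_ggml_mul_mat when whisper.cpp is built with WHISPER_OPENBLAS or Accelerate.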