Spaces:
Running
Running
whisper : fix bench regression + fix performance when using CPU BLAS (#1275)
Browse files
* whisper : fix bench regression
* ggml : use sched_yield when using BLAS + add comment
- ggml.c +11 -3
- whisper.cpp +25 -8
ggml.c
CHANGED
|
@@ -17283,10 +17283,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
| 17283 |
} else {
|
| 17284 |
// wait for other threads to finish
|
| 17285 |
const int last = node_n;
|
| 17286 |
-
|
| 17287 |
-
//sched_yield
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17288 |
node_n = atomic_load(&state->shared->node_n);
|
| 17289 |
-
|
|
|
|
| 17290 |
}
|
| 17291 |
|
| 17292 |
// check if we should stop
|
|
|
|
| 17283 |
} else {
|
| 17284 |
// wait for other threads to finish
|
| 17285 |
const int last = node_n;
|
| 17286 |
+
while (true) {
|
| 17287 |
+
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
| 17288 |
+
// depending on the workload and the operating system.
|
| 17289 |
+
// since it is not clear what is the best approach, it should potentially become user-configurable
|
| 17290 |
+
// ref: https://github.com/ggerganov/ggml/issues/291
|
| 17291 |
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
| 17292 |
+
sched_yield();
|
| 17293 |
+
#endif
|
| 17294 |
+
|
| 17295 |
node_n = atomic_load(&state->shared->node_n);
|
| 17296 |
+
if (node_n != last) break;
|
| 17297 |
+
};
|
| 17298 |
}
|
| 17299 |
|
| 17300 |
// check if we should stop
|
whisper.cpp
CHANGED
|
@@ -118,6 +118,21 @@ static void byteswap_tensor(ggml_tensor * tensor) {
|
|
| 118 |
#define WHISPER_USE_SCRATCH
|
| 119 |
#define WHISPER_MAX_SCRATCH_BUFFERS 16
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
// available whisper models
|
| 122 |
enum e_model {
|
| 123 |
MODEL_UNKNOWN,
|
|
@@ -666,6 +681,7 @@ struct whisper_state {
|
|
| 666 |
|
| 667 |
// memory buffers used by encode / decode contexts
|
| 668 |
std::vector<uint8_t> buf_compute;
|
|
|
|
| 669 |
std::vector<uint8_t> buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS];
|
| 670 |
|
| 671 |
int buf_last = 0;
|
|
@@ -1830,8 +1846,8 @@ static bool whisper_encode_internal(
|
|
| 1830 |
{
|
| 1831 |
struct ggml_cgraph gf = {};
|
| 1832 |
|
| 1833 |
-
ggml_build_forward_expand
|
| 1834 |
-
|
| 1835 |
|
| 1836 |
//ggml_graph_print(&gf);
|
| 1837 |
}
|
|
@@ -1916,7 +1932,7 @@ static bool whisper_encode_internal(
|
|
| 1916 |
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
|
| 1917 |
}
|
| 1918 |
|
| 1919 |
-
|
| 1920 |
//ggml_graph_print(&gf);
|
| 1921 |
}
|
| 1922 |
|
|
@@ -2329,8 +2345,8 @@ static bool whisper_decode_internal(
|
|
| 2329 |
|
| 2330 |
// run the computation
|
| 2331 |
{
|
| 2332 |
-
ggml_build_forward_expand
|
| 2333 |
-
|
| 2334 |
}
|
| 2335 |
|
| 2336 |
// extract logits for all N tokens
|
|
@@ -5225,7 +5241,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
| 5225 |
// b: N*N*sizeof(float)
|
| 5226 |
// c: N*N*sizeof(float)
|
| 5227 |
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
| 5228 |
-
std::vector<
|
|
|
|
| 5229 |
|
| 5230 |
// put a bunch of random data in the buffer
|
| 5231 |
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
|
@@ -5280,12 +5297,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
| 5280 |
double tsum = 0.0;
|
| 5281 |
|
| 5282 |
// heat-up
|
| 5283 |
-
|
| 5284 |
|
| 5285 |
for (int i = 0; i < n_max; ++i) {
|
| 5286 |
const int64_t t0 = ggml_time_us();
|
| 5287 |
|
| 5288 |
-
|
| 5289 |
|
| 5290 |
const int64_t t1 = ggml_time_us();
|
| 5291 |
|
|
|
|
| 118 |
#define WHISPER_USE_SCRATCH
|
| 119 |
#define WHISPER_MAX_SCRATCH_BUFFERS 16
|
| 120 |
|
| 121 |
+
//
|
| 122 |
+
// ggml helpers
|
| 123 |
+
//
|
| 124 |
+
|
| 125 |
+
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
| 126 |
+
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
| 127 |
+
|
| 128 |
+
if (plan.work_size > 0) {
|
| 129 |
+
buf.resize(plan.work_size);
|
| 130 |
+
plan.work_data = buf.data();
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
ggml_graph_compute(graph, &plan);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
// available whisper models
|
| 137 |
enum e_model {
|
| 138 |
MODEL_UNKNOWN,
|
|
|
|
| 681 |
|
| 682 |
// memory buffers used by encode / decode contexts
|
| 683 |
std::vector<uint8_t> buf_compute;
|
| 684 |
+
std::vector<uint8_t> buf_work;
|
| 685 |
std::vector<uint8_t> buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS];
|
| 686 |
|
| 687 |
int buf_last = 0;
|
|
|
|
| 1846 |
{
|
| 1847 |
struct ggml_cgraph gf = {};
|
| 1848 |
|
| 1849 |
+
ggml_build_forward_expand(&gf, cur);
|
| 1850 |
+
ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
|
| 1851 |
|
| 1852 |
//ggml_graph_print(&gf);
|
| 1853 |
}
|
|
|
|
| 1932 |
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
|
| 1933 |
}
|
| 1934 |
|
| 1935 |
+
ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
|
| 1936 |
//ggml_graph_print(&gf);
|
| 1937 |
}
|
| 1938 |
|
|
|
|
| 2345 |
|
| 2346 |
// run the computation
|
| 2347 |
{
|
| 2348 |
+
ggml_build_forward_expand(&gf, logits);
|
| 2349 |
+
ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
|
| 2350 |
}
|
| 2351 |
|
| 2352 |
// extract logits for all N tokens
|
|
|
|
| 5241 |
// b: N*N*sizeof(float)
|
| 5242 |
// c: N*N*sizeof(float)
|
| 5243 |
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
| 5244 |
+
std::vector<uint8_t> buf (3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead());
|
| 5245 |
+
std::vector<uint8_t> work(1llu*N_max*N_max*sizeof(float) + 1*ggml_tensor_overhead());
|
| 5246 |
|
| 5247 |
// put a bunch of random data in the buffer
|
| 5248 |
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
|
|
|
| 5297 |
double tsum = 0.0;
|
| 5298 |
|
| 5299 |
// heat-up
|
| 5300 |
+
ggml_graph_compute_helper(work, &gf, n_threads);
|
| 5301 |
|
| 5302 |
for (int i = 0; i < n_max; ++i) {
|
| 5303 |
const int64_t t0 = ggml_time_us();
|
| 5304 |
|
| 5305 |
+
ggml_graph_compute_helper(work, &gf, n_threads);
|
| 5306 |
|
| 5307 |
const int64_t t1 = ggml_time_us();
|
| 5308 |
|