ggerganov committed
Commit b9f5e40 · 1 Parent(s): b1eba61

whisper : use backend registry (#0)

Files changed (1):
  1. src/whisper.cpp +51 -127
src/whisper.cpp CHANGED
@@ -1,43 +1,19 @@
 #include "whisper.h"
 
-#ifdef WHISPER_USE_COREML
-#include "coreml/whisper-encoder.h"
-#endif
-
 #include "ggml-cpu.h"
 
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
 
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_SYCL
-#include "ggml-sycl.h"
-#endif
-
-#ifdef GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
-
-#ifdef GGML_USE_BLAS
-#include "ggml-blas.h"
+#ifdef WHISPER_USE_COREML
+#include "coreml/whisper-encoder.h"
 #endif
 
 #ifdef WHISPER_USE_OPENVINO
 #include "openvino/whisper-openvino-encoder.h"
 #endif
 
-#ifdef GGML_USE_CANN
-#include "ggml-cann.h"
-#endif
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-
 #include <atomic>
 #include <algorithm>
 #include <cassert>
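The per-backend #ifdef include blocks are no longer needed: backends are discovered at runtime through the ggml backend registry. As a rough, unofficial sketch of what that registry exposes (using only the ggml-backend.h enumeration API, not code from this commit):

    // sketch: list the backends ggml has registered at startup
    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
            ggml_backend_reg_t reg = ggml_backend_reg_get(i);
            printf("backend %zu: %s\n", i, ggml_backend_reg_name(reg));
        }
    }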
@@ -195,14 +171,13 @@ static bool ggml_graph_compute_helper(
 
     for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) {
         ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
-        if (ggml_backend_is_cpu(backend)) {
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
-        }
-#ifdef GGML_USE_BLAS
-        if (ggml_backend_is_blas(backend)) {
-            ggml_backend_blas_set_n_threads(backend, n_threads);
+        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+        ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+
+        auto * fn_set_n_threads = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+        if (fn_set_n_threads) {
+            fn_set_n_threads(backend, n_threads);
         }
-#endif
     }
 
     bool t = ggml_backend_sched_graph_compute(sched, graph) == GGML_STATUS_SUCCESS;
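The thread count is now set through an optional proc address looked up by name, so this helper no longer needs compile-time knowledge of the CPU and BLAS backends. A minimal sketch of the same pattern as a standalone helper (the function name is ours; the ggml calls are the ones used above):

    // set the thread count on any backend that exposes the optional
    // "ggml_backend_set_n_threads" entry point; others are skipped
    static void set_n_threads_if_supported(ggml_backend_t backend, int n_threads) {
        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
        ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
        if (!reg) {
            return;
        }
        auto * fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (fn) {
            fn(backend, n_threads);
        }
    }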
@@ -1256,67 +1231,23 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
 }
 
 static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
-    ggml_backend_t result = NULL;
-
     ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
 
-#ifdef GGML_USE_CUDA
     if (params.use_gpu) {
-        WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
-        result = ggml_backend_cuda_init(params.gpu_device);
-        if (!result) {
-            WHISPER_LOG_ERROR("%s: ggml_backend_cuda_init() failed\n", __func__);
-        }
-    }
-#endif
-
-#ifdef GGML_USE_METAL
-    if (params.use_gpu) {
-        WHISPER_LOG_INFO("%s: using Metal backend\n", __func__);
-        result = ggml_backend_metal_init();
-        if (!result) {
-            WHISPER_LOG_ERROR("%s: ggml_backend_metal_init() failed\n", __func__);
-        } else if (!ggml_backend_metal_supports_family(result, 7)) {
-            WHISPER_LOG_ERROR("%s: Metal GPU does not support family 7 - falling back to CPU\n", __func__);
-            ggml_backend_free(result);
-            result = NULL;
-        }
-    }
-#endif
-
-#ifdef GGML_USE_SYCL
-    if (params.use_gpu) {
-        WHISPER_LOG_INFO("%s: using SYCL backend\n", __func__);
-        result = ggml_backend_sycl_init(params.gpu_device);
-        if (!result) {
-            WHISPER_LOG_ERROR("%s: ggml_backend_sycl_init() failed\n", __func__);
-        }
-    }
-#endif
-
-#ifdef GGML_USE_VULKAN
-    if (params.use_gpu) {
-        WHISPER_LOG_INFO("%s: using Vulkan backend\n", __func__);
-        result = ggml_backend_vk_init(params.gpu_device);
-        if (!result) {
-            WHISPER_LOG_ERROR("%s: ggml_backend_vk_init() failed\n", __func__);
-        }
-    }
-#endif
-
-#ifdef GGML_USE_CANN
-    if (params.use_gpu) {
-        WHISPER_LOG_INFO("%s: using CANN backend\n", __func__);
-        result = ggml_backend_cann_init(params.gpu_device);
-        if (!result) {
-            WHISPER_LOG_ERROR("%s: ggml_backend_cann_init() failed\n", __func__);
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                WHISPER_LOG_INFO("%s: using %s backend\n", __func__, ggml_backend_dev_name(dev));
+                ggml_backend_t result = ggml_backend_dev_init(dev, nullptr);
+                if (!result) {
+                    WHISPER_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+                }
+                return result;
+            }
         }
     }
-#endif
 
-    GGML_UNUSED(params);
-
-    return result;
+    return nullptr;
 }
 
 static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_params & params) {
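whisper_backend_init_gpu now returns the first device the registry reports as GGML_BACKEND_DEVICE_TYPE_GPU, instead of trying each compiled-in backend in turn. A hedged sketch (ours, not part of the commit; assumes <cstdio>) for inspecting what the registry exposes:

    // print every device with its name, description and type
    static void print_devices(void) {
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            const char * type = "CPU";
            switch (ggml_backend_dev_type(dev)) {
                case GGML_BACKEND_DEVICE_TYPE_GPU:   type = "GPU";   break;
                case GGML_BACKEND_DEVICE_TYPE_ACCEL: type = "ACCEL"; break;
                default: break;
            }
            printf("device %zu: %s (%s) [%s]\n", i,
                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), type);
        }
    }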
@@ -1328,17 +1259,19 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa
         result.push_back(backend_gpu);
     }
 
-#ifdef GGML_USE_BLAS
-    {
-        WHISPER_LOG_INFO("%s: using BLAS backend\n", __func__);
-        ggml_backend_t backend_blas = ggml_backend_blas_init();
-        if (!backend_blas) {
-            WHISPER_LOG_ERROR("%s: ggml_backend_blas_init() failed\n", __func__);
-        } else {
-            result.push_back(backend_blas);
+    // ACCEL backends
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+            WHISPER_LOG_INFO("%s: using %s backend\n", __func__, ggml_backend_dev_name(dev));
+            ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+            if (!backend) {
+                WHISPER_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
+                continue;
+            }
+            result.push_back(backend);
         }
     }
-#endif
 
     GGML_UNUSED(params);
 
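BLAS stops being a compile-time special case here: a ggml build with BLAS enabled surfaces it as an ACCEL device, and the loop above picks it up like any other accelerator. A hypothetical runtime replacement for the old #ifdef GGML_USE_BLAS check (helper name is ours) might look like:

    // true if the registry exposes at least one ACCEL device (e.g. BLAS)
    static bool have_accel_device(void) {
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            if (ggml_backend_dev_type(ggml_backend_dev_get(i)) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
                return true;
            }
        }
        return false;
    }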
@@ -1348,33 +1281,20 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa
 }
 
 static ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_context_params & params) {
-    ggml_backend_buffer_type_t result = nullptr;
-
-    params.use_gpu || (result = ggml_backend_cpu_buffer_type());
-
-#ifdef GGML_USE_CUDA
-    result || (result = ggml_backend_cuda_buffer_type(params.gpu_device));
-#endif
-
-#ifdef GGML_USE_METAL
-    result || (result = ggml_backend_metal_buffer_type());
-#endif
-
-#ifdef GGML_USE_SYCL
-    result || (result = ggml_backend_sycl_buffer_type(params.gpu_device));
-#endif
-
-#ifdef GGML_USE_VULKAN
-    result || (result = ggml_backend_vk_buffer_type(params.gpu_device));
-#endif
-
-#ifdef GGML_USE_CANN
-    result || (result == ggml_backend_cann_buffer_type(params.gpu_device));
-#endif
+    if (!params.use_gpu) {
+        return ggml_backend_cpu_buffer_type();
+    }
 
-    result || (result = ggml_backend_cpu_buffer_type());
+    // if we have a GPU device - use it
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            WHISPER_LOG_INFO("%s: using device %s (%s)\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
+            return ggml_backend_dev_buffer_type(dev);
+        }
+    }
 
-    return result;
+    return ggml_backend_cpu_buffer_type();
 }
 
 // load the model from a ggml file
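The chain of result || (result = ...) fallbacks becomes an early CPU return plus a single scan for a GPU device. Callers consume the returned buffer type as before; a speculative usage sketch (the size is arbitrary, and whisper_default_buffer_type is the internal function above, not public API):

    // allocate a backend buffer from the selected buffer type
    ggml_backend_buffer_type_t buft = whisper_default_buffer_type(params);
    printf("buft: %s, alignment: %zu\n",
            ggml_backend_buft_name(buft), ggml_backend_buft_get_alignment(buft));
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 16u*1024*1024);
    // ... upload model tensors into buf ...
    ggml_backend_buffer_free(buf);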
@@ -3668,8 +3588,7 @@ struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_
     WHISPER_LOG_INFO("%s: flash attn = %d\n", __func__, params.flash_attn);
     WHISPER_LOG_INFO("%s: gpu_device = %d\n", __func__, params.gpu_device);
     WHISPER_LOG_INFO("%s: dtw        = %d\n", __func__, params.dtw_token_timestamps);
-
-    // TODO: temporary call to force backend registry initialization
+    WHISPER_LOG_INFO("%s: devices    = %zu\n", __func__, ggml_backend_dev_count());
     WHISPER_LOG_INFO("%s: backends   = %zu\n", __func__, ggml_backend_reg_count());
 
     whisper_context * ctx = new whisper_context;
@@ -7427,6 +7346,11 @@ static void whisper_log_internal(ggml_log_level level, const char * format, ...)
 static void whisper_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
     (void) level;
     (void) user_data;
+#ifndef WHISPER_DEBUG
+    if (level == GGML_LOG_LEVEL_DEBUG) {
+        return;
+    }
+#endif
     fputs(text, stderr);
     fflush(stderr);
 }
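Since the default callback now drops GGML_LOG_LEVEL_DEBUG messages unless WHISPER_DEBUG is defined, an application that wants the debug output can install its own callback through the public whisper_log_set(); a minimal sketch (the callback name is ours):

    // forward everything, including debug messages, with a level prefix
    static void verbose_log_cb(ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        fprintf(stderr, "[%d] %s", (int) level, text);
    }

    // during application startup:
    whisper_log_set(verbose_log_cb, nullptr);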
 