Diego Devesa ggerganov committed on
Commit
2e6437e
·
unverified ·
1 Parent(s): 0447b9d

whisper : support GGML_BACKEND_DL (#2843)

Browse files

* whisper : support GGML_BACKEND_DL

* fix DTW crash

* whisper.objc : fix build - add ggml-cpp.h

---------

Co-authored-by: Georgi Gerganov <[email protected]>

examples/bench/bench.cpp CHANGED
@@ -50,11 +50,11 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
50
  fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
51
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
52
  fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
53
- fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
54
- fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
55
  fprintf(stderr, " %-7s 0 - whisper\n", "");
56
  fprintf(stderr, " %-7s 1 - memcpy\n", "");
57
  fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
 
 
58
  fprintf(stderr, "\n");
59
  }
60
 
 
50
  fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
51
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
52
  fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
 
 
53
  fprintf(stderr, " %-7s 0 - whisper\n", "");
54
  fprintf(stderr, " %-7s 1 - memcpy\n", "");
55
  fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
56
+ fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
57
+ fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
58
  fprintf(stderr, "\n");
59
  }
60
 
examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj CHANGED
@@ -81,6 +81,7 @@
81
  18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
82
  18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
83
  18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
 
84
  18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = "<group>"; };
85
  18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = "<group>"; };
86
  18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = "<group>"; };
@@ -135,6 +136,7 @@
135
  18627C7829052BDF00BD2A04 /* whisper.objc */ = {
136
  isa = PBXGroup;
137
  children = (
 
138
  433188B92D3A18A400E3FE79 /* gguf.h */,
139
  433188B72D3A187C00E3FE79 /* gguf.cpp */,
140
  18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,
 
81
  18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
82
  18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
83
  18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
84
+ 18B07DCB2D70411100B3B87C /* ggml-cpp.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpp.h"; path = "../../../ggml/include/ggml-cpp.h"; sourceTree = "<group>"; };
85
  18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = "<group>"; };
86
  18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = "<group>"; };
87
  18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = "<group>"; };
 
136
  18627C7829052BDF00BD2A04 /* whisper.objc */ = {
137
  isa = PBXGroup;
138
  children = (
139
+ 18B07DCB2D70411100B3B87C /* ggml-cpp.h */,
140
  433188B92D3A18A400E3FE79 /* gguf.h */,
141
  433188B72D3A187C00E3FE79 /* gguf.cpp */,
142
  18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,
ggml/src/CMakeLists.txt CHANGED
@@ -226,6 +226,9 @@ add_library(ggml-base
226
  gguf.cpp)
227
 
228
  target_include_directories(ggml-base PRIVATE .)
 
 
 
229
 
230
  add_library(ggml
231
  ggml-backend-reg.cpp)
 
226
  gguf.cpp)
227
 
228
  target_include_directories(ggml-base PRIVATE .)
229
+ if (GGML_BACKEND_DL)
230
+ target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
231
+ endif()
232
 
233
  add_library(ggml
234
  ggml-backend-reg.cpp)
src/whisper.cpp CHANGED
@@ -1,8 +1,7 @@
1
  #include "whisper.h"
2
 
3
- #include "ggml-cpu.h"
4
-
5
  #include "ggml.h"
 
6
  #include "ggml-alloc.h"
7
  #include "ggml-backend.h"
8
 
@@ -19,19 +18,20 @@
19
  #include <cassert>
20
  #define _USE_MATH_DEFINES
21
  #include <cmath>
22
- #include <cstdio>
23
  #include <cstdarg>
 
24
  #include <cstring>
25
  #include <fstream>
 
26
  #include <map>
 
 
 
27
  #include <set>
28
  #include <string>
29
  #include <thread>
30
  #include <vector>
31
- #include <regex>
32
- #include <random>
33
- #include <functional>
34
- #include <codecvt>
35
 
36
  // dummy
37
 
@@ -149,21 +149,25 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
149
 
150
  static bool ggml_graph_compute_helper(
151
  struct ggml_cgraph * graph,
152
- std::vector<uint8_t> & buf,
153
  int n_threads,
154
  ggml_abort_callback abort_callback,
155
  void * abort_callback_data) {
156
- struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
157
 
158
- plan.abort_callback = abort_callback;
159
- plan.abort_callback_data = abort_callback_data;
160
 
161
- if (plan.work_size > 0) {
162
- buf.resize(plan.work_size);
163
- plan.work_data = buf.data();
 
 
 
 
 
 
 
164
  }
165
 
166
- return ggml_graph_compute(graph, &plan);
167
  }
168
 
169
  static bool ggml_graph_compute_helper(
@@ -187,6 +191,61 @@ static bool ggml_graph_compute_helper(
187
  return t;
188
  }
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
191
  // the idea is to represent the original matrix multiplication:
192
  //
@@ -1237,6 +1296,8 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
1237
  static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
1238
  ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
1239
 
 
 
1240
  ggml_backend_dev_t dev = nullptr;
1241
 
1242
  int cnt = 0;
@@ -1294,7 +1355,7 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa
1294
 
1295
  GGML_UNUSED(params);
1296
 
1297
- result.push_back(ggml_backend_cpu_init());
1298
 
1299
  return result;
1300
  }
@@ -4206,22 +4267,28 @@ static int whisper_has_openvino(void) {
4206
  const char * whisper_print_system_info(void) {
4207
  static std::string s;
4208
 
 
 
4209
  s = "";
4210
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
4211
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
4212
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
4213
- s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
4214
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
4215
- s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
4216
- s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
4217
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
4218
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
4219
- s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
4220
- s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
4221
- s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
4222
  s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
4223
  s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
4224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4225
  return s.c_str();
4226
  }
4227
 
@@ -6653,6 +6720,8 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
6653
  }
6654
 
6655
  WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
 
 
6656
  static std::string s;
6657
  s = "";
6658
  char strbuf[256];
@@ -6672,7 +6741,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
6672
  // c: N*N*sizeof(float)
6673
  // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
6674
  std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead() + ggml_graph_overhead());
6675
- std::vector<uint8_t> work;
6676
 
6677
  // put a bunch of random data in the buffer
6678
  for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@@ -6729,12 +6797,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
6729
  double tsum = 0.0;
6730
 
6731
  // heat-up
6732
- ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
6733
 
6734
  for (int i = 0; i < n_max; ++i) {
6735
  const int64_t t0 = ggml_time_us();
6736
 
6737
- ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
6738
 
6739
  const int64_t t1 = ggml_time_us();
6740
 
@@ -7111,18 +7179,18 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
7111
  struct ggml_tensor * cost = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, N + 1, M + 1);
7112
  struct ggml_tensor * trace = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, N + 1, M + 1);
7113
 
7114
- cost = ggml_set_f32(cost, INFINITY);
7115
- trace = ggml_set_f32(trace, -1);
7116
- ggml_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
7117
 
7118
  // dtw
7119
  // supposedly can be optmized by computing diagonals in parallel ?
7120
  // Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
7121
  for (int64_t j = 1; j < M + 1; ++j) {
7122
  for (int64_t i = 1; i < N + 1; ++i) {
7123
- float c0 = ggml_get_f32_nd(cost, i - 1, j - 1, 0, 0);
7124
- float c1 = ggml_get_f32_nd(cost, i - 1, j, 0, 0);
7125
- float c2 = ggml_get_f32_nd(cost, i, j - 1, 0, 0);
7126
 
7127
  float c;
7128
  int32_t t;
@@ -7137,9 +7205,9 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
7137
  t = 2;
7138
  }
7139
 
7140
- c = ggml_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
7141
- ggml_set_f32_nd(cost, i, j, 0, 0, c);
7142
- ggml_set_i32_nd(trace, i, j, 0, 0, t);
7143
  }
7144
  }
7145
 
@@ -7148,19 +7216,19 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
7148
  struct ggml_tensor * bt = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, BT_MAX_ROWS, 2);
7149
  // trace[0, :] = 2;
7150
  for (int64_t i = 0; i < M + 1; ++i)
7151
- ggml_set_i32_nd(trace, 0, i, 0, 0, 2);
7152
  //trace[:, 0] = 1;
7153
  for (int64_t i = 0; i < N + 1; ++i)
7154
- ggml_set_i32_nd(trace, i, 0, 0, 0, 1);
7155
  int bt_row_idx = BT_MAX_ROWS - 1;
7156
  int64_t i = N;
7157
  int64_t j = M;
7158
  while (i > 0 || j > 0) {
7159
- ggml_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
7160
- ggml_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
7161
  --bt_row_idx;
7162
 
7163
- int32_t t = ggml_get_i32_nd(trace, i, j, 0, 0);
7164
  if (t == 0) {
7165
  --i;
7166
  --j;
@@ -7181,8 +7249,8 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
7181
  ggml_tensor * r = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 2, result_n_cols);
7182
  for (int64_t i = 0; i < 2; ++i) {
7183
  for (int64_t j = 0; j < result_n_cols; ++j) {
7184
- int32_t v = ggml_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
7185
- ggml_set_i32_nd(r, i, j, 0, 0, v);
7186
  }
7187
  }
7188
 
@@ -7217,11 +7285,11 @@ static void median_filter(struct ggml_tensor * dst , const struct ggml_tensor *
7217
  idx = 2*(a->ne[2] - 1) - idx;
7218
  }
7219
 
7220
- filter.push_back(ggml_get_f32_nd(a, i, j, idx, 0));
7221
  }
7222
  std::sort(filter.begin(), filter.end());
7223
  const float v = filter[filter.size()/2];
7224
- ggml_set_f32_nd(dst, i, j, k, 0, v);
7225
  filter.clear();
7226
  }
7227
  }
@@ -7343,7 +7411,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
7343
  // Compute
7344
  struct ggml_cgraph * gf = ggml_new_graph(gctx);
7345
  ggml_build_forward_expand(gf, w);
7346
- ggml_graph_compute_with_ctx(gctx, gf, n_threads);
 
 
7347
 
7348
  ggml_tensor * alignment = dtw_and_backtrace(gctx, w);
7349
 
@@ -7352,9 +7422,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
7352
  auto seg_i = state->result_all.begin() + i_segment;
7353
  auto tok_i = seg_i->tokens.begin();
7354
  for (int i = 0; i < alignment->ne[1]; ++i) {
7355
- int32_t v = ggml_get_i32_nd(alignment, 0, i, 0, 0);
7356
  if (v != last_v) {
7357
- int32_t time_index = ggml_get_i32_nd(alignment, 1, i, 0, 0);
7358
  int64_t timestamp = (time_index * 2) + seek; // Each index on DTW result = 20mS audio
7359
  last_v = v;
7360
 
 
1
  #include "whisper.h"
2
 
 
 
3
  #include "ggml.h"
4
+ #include "ggml-cpp.h"
5
  #include "ggml-alloc.h"
6
  #include "ggml-backend.h"
7
 
 
18
  #include <cassert>
19
  #define _USE_MATH_DEFINES
20
  #include <cmath>
21
+ #include <codecvt>
22
  #include <cstdarg>
23
+ #include <cstdio>
24
  #include <cstring>
25
  #include <fstream>
26
+ #include <functional>
27
  #include <map>
28
+ #include <mutex>
29
+ #include <random>
30
+ #include <regex>
31
  #include <set>
32
  #include <string>
33
  #include <thread>
34
  #include <vector>
 
 
 
 
35
 
36
  // dummy
37
 
 
149
 
150
  static bool ggml_graph_compute_helper(
151
  struct ggml_cgraph * graph,
 
152
  int n_threads,
153
  ggml_abort_callback abort_callback,
154
  void * abort_callback_data) {
 
155
 
156
+ ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
 
157
 
158
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
159
+
160
+ auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
161
+ if (set_abort_callback_fn) {
162
+ set_abort_callback_fn(backend.get(), abort_callback, abort_callback_data);
163
+ }
164
+
165
+ auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
166
+ if (ggml_backend_set_n_threads_fn) {
167
+ ggml_backend_set_n_threads_fn(backend.get(), n_threads);
168
  }
169
 
170
+ return ggml_backend_graph_compute(backend.get(), graph) == GGML_STATUS_SUCCESS;
171
  }
172
 
173
  static bool ggml_graph_compute_helper(
 
191
  return t;
192
  }
193
 
194
// Load the ggml backends, at most once.
//
// When GGML_BACKEND_DL is defined, ggml_backend_load_all() is invoked through
// std::call_once, so repeated calls only trigger it the first time.
// Without GGML_BACKEND_DL this function does nothing.
static void whisper_load_backends() {
#ifdef GGML_BACKEND_DL
    static std::once_flag backends_loaded;
    std::call_once(backends_loaded, ggml_backend_load_all);
#endif
}
202
+
203
+ // TODO: move these functions to ggml-base with support for ggml-backend?
204
+
205
+ static ggml_tensor * whisper_set_f32(struct ggml_tensor * t, float v) {
206
+ GGML_ASSERT(t->type == GGML_TYPE_F32);
207
+ GGML_ASSERT(ggml_is_contiguous(t));
208
+ size_t nels = ggml_nelements(t);
209
+ for (int64_t i = 0; i < nels; ++i) {
210
+ ((float *) t->data)[i] = v;
211
+ }
212
+ return t;
213
+ }
214
+
215
+ static ggml_tensor * whisper_set_i32(struct ggml_tensor * t, int32_t v) {
216
+ GGML_ASSERT(t->type == GGML_TYPE_I32);
217
+ GGML_ASSERT(ggml_is_contiguous(t));
218
+ size_t nels = ggml_nelements(t);
219
+ for (int64_t i = 0; i < nels; ++i) {
220
+ ((int32_t *) t->data)[i] = v;
221
+ }
222
+ return t;
223
+ }
224
+
225
+ static float whisper_get_f32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
226
+ GGML_ASSERT(t->type == GGML_TYPE_F32);
227
+ void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
228
+ return *(float *) data;
229
+ }
230
+
231
+ static void whisper_set_f32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float v) {
232
+ GGML_ASSERT(t->type == GGML_TYPE_F32);
233
+ void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
234
+ *(float *) data = v;
235
+ }
236
+
237
+ static int32_t whisper_get_i32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
238
+ GGML_ASSERT(t->type == GGML_TYPE_I32);
239
+ void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
240
+ return *(int32_t *) data;
241
+ }
242
+
243
+ static void whisper_set_i32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, int32_t v) {
244
+ GGML_ASSERT(t->type == GGML_TYPE_I32);
245
+ void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
246
+ *(int32_t *) data = v;
247
+ }
248
+
249
  // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
250
  // the idea is to represent the original matrix multiplication:
251
  //
 
1296
  static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
1297
  ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
1298
 
1299
+ whisper_load_backends();
1300
+
1301
  ggml_backend_dev_t dev = nullptr;
1302
 
1303
  int cnt = 0;
 
1355
 
1356
  GGML_UNUSED(params);
1357
 
1358
+ result.push_back(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));
1359
 
1360
  return result;
1361
  }
 
4267
  const char * whisper_print_system_info(void) {
4268
  static std::string s;
4269
 
4270
+ whisper_load_backends();
4271
+
4272
  s = "";
4273
+ s += "WHISPER : ";
 
 
 
 
 
 
 
 
 
 
 
4274
  s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
4275
  s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
4276
 
4277
+ for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
4278
+ auto * reg = ggml_backend_reg_get(i);
4279
+ auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
4280
+ if (get_features_fn) {
4281
+ ggml_backend_feature * features = get_features_fn(reg);
4282
+ s += ggml_backend_reg_name(reg);
4283
+ s += " : ";
4284
+ for (; features->name; features++) {
4285
+ s += features->name;
4286
+ s += " = ";
4287
+ s += features->value;
4288
+ s += " | ";
4289
+ }
4290
+ }
4291
+ }
4292
  return s.c_str();
4293
  }
4294
 
 
6720
  }
6721
 
6722
  WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
6723
+ whisper_load_backends();
6724
+
6725
  static std::string s;
6726
  s = "";
6727
  char strbuf[256];
 
6741
  // c: N*N*sizeof(float)
6742
  // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
6743
  std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead() + ggml_graph_overhead());
 
6744
 
6745
  // put a bunch of random data in the buffer
6746
  for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
 
6797
  double tsum = 0.0;
6798
 
6799
  // heat-up
6800
+ ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
6801
 
6802
  for (int i = 0; i < n_max; ++i) {
6803
  const int64_t t0 = ggml_time_us();
6804
 
6805
+ ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
6806
 
6807
  const int64_t t1 = ggml_time_us();
6808
 
 
7179
  struct ggml_tensor * cost = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, N + 1, M + 1);
7180
  struct ggml_tensor * trace = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, N + 1, M + 1);
7181
 
7182
+ cost = whisper_set_f32(cost, INFINITY);
7183
+ trace = whisper_set_i32(trace, -1);
7184
+ whisper_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
7185
 
7186
  // dtw
7187
  // supposedly can be optmized by computing diagonals in parallel ?
7188
  // Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
7189
  for (int64_t j = 1; j < M + 1; ++j) {
7190
  for (int64_t i = 1; i < N + 1; ++i) {
7191
+ float c0 = whisper_get_f32_nd(cost, i - 1, j - 1, 0, 0);
7192
+ float c1 = whisper_get_f32_nd(cost, i - 1, j, 0, 0);
7193
+ float c2 = whisper_get_f32_nd(cost, i, j - 1, 0, 0);
7194
 
7195
  float c;
7196
  int32_t t;
 
7205
  t = 2;
7206
  }
7207
 
7208
+ c = whisper_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
7209
+ whisper_set_f32_nd(cost, i, j, 0, 0, c);
7210
+ whisper_set_i32_nd(trace, i, j, 0, 0, t);
7211
  }
7212
  }
7213
 
 
7216
  struct ggml_tensor * bt = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, BT_MAX_ROWS, 2);
7217
  // trace[0, :] = 2;
7218
  for (int64_t i = 0; i < M + 1; ++i)
7219
+ whisper_set_i32_nd(trace, 0, i, 0, 0, 2);
7220
  //trace[:, 0] = 1;
7221
  for (int64_t i = 0; i < N + 1; ++i)
7222
+ whisper_set_i32_nd(trace, i, 0, 0, 0, 1);
7223
  int bt_row_idx = BT_MAX_ROWS - 1;
7224
  int64_t i = N;
7225
  int64_t j = M;
7226
  while (i > 0 || j > 0) {
7227
+ whisper_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
7228
+ whisper_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
7229
  --bt_row_idx;
7230
 
7231
+ int32_t t = whisper_get_i32_nd(trace, i, j, 0, 0);
7232
  if (t == 0) {
7233
  --i;
7234
  --j;
 
7249
  ggml_tensor * r = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 2, result_n_cols);
7250
  for (int64_t i = 0; i < 2; ++i) {
7251
  for (int64_t j = 0; j < result_n_cols; ++j) {
7252
+ int32_t v = whisper_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
7253
+ whisper_set_i32_nd(r, i, j, 0, 0, v);
7254
  }
7255
  }
7256
 
 
7285
  idx = 2*(a->ne[2] - 1) - idx;
7286
  }
7287
 
7288
+ filter.push_back(whisper_get_f32_nd(a, i, j, idx, 0));
7289
  }
7290
  std::sort(filter.begin(), filter.end());
7291
  const float v = filter[filter.size()/2];
7292
+ whisper_set_f32_nd(dst, i, j, k, 0, v);
7293
  filter.clear();
7294
  }
7295
  }
 
7411
  // Compute
7412
  struct ggml_cgraph * gf = ggml_new_graph(gctx);
7413
  ggml_build_forward_expand(gf, w);
7414
+
7415
+ ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
7416
+ ggml_backend_graph_compute(backend.get(), gf);
7417
 
7418
  ggml_tensor * alignment = dtw_and_backtrace(gctx, w);
7419
 
 
7422
  auto seg_i = state->result_all.begin() + i_segment;
7423
  auto tok_i = seg_i->tokens.begin();
7424
  for (int i = 0; i < alignment->ne[1]; ++i) {
7425
+ int32_t v = whisper_get_i32_nd(alignment, 0, i, 0, 0);
7426
  if (v != last_v) {
7427
+ int32_t time_index = whisper_get_i32_nd(alignment, 1, i, 0, 0);
7428
  int64_t timestamp = (time_index * 2) + seek; // Each index on DTW result = 20mS audio
7429
  last_v = v;
7430