Spaces:
Running
Running
whisper : support GGML_BACKEND_DL (#2843)
Browse files
* whisper : support GGML_BACKEND_DL
* fix DTW crash
* whisper.objc : fix build - add ggml-cpp.h
---------
Co-authored-by: Georgi Gerganov <[email protected]>
examples/bench/bench.cpp
CHANGED
|
@@ -50,11 +50,11 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 50 |
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
| 51 |
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
| 52 |
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
|
| 53 |
-
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
|
| 54 |
-
fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
|
| 55 |
fprintf(stderr, " %-7s 0 - whisper\n", "");
|
| 56 |
fprintf(stderr, " %-7s 1 - memcpy\n", "");
|
| 57 |
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
|
|
|
|
|
|
|
| 58 |
fprintf(stderr, "\n");
|
| 59 |
}
|
| 60 |
|
|
|
|
| 50 |
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
| 51 |
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
| 52 |
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
|
|
|
|
|
|
|
| 53 |
fprintf(stderr, " %-7s 0 - whisper\n", "");
|
| 54 |
fprintf(stderr, " %-7s 1 - memcpy\n", "");
|
| 55 |
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
|
| 56 |
+
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
|
| 57 |
+
fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
|
| 58 |
fprintf(stderr, "\n");
|
| 59 |
}
|
| 60 |
|
examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
CHANGED
|
@@ -81,6 +81,7 @@
|
|
| 81 |
18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
|
| 82 |
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
|
| 83 |
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
|
|
|
|
| 84 |
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = "<group>"; };
|
| 85 |
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = "<group>"; };
|
| 86 |
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = "<group>"; };
|
|
@@ -135,6 +136,7 @@
|
|
| 135 |
18627C7829052BDF00BD2A04 /* whisper.objc */ = {
|
| 136 |
isa = PBXGroup;
|
| 137 |
children = (
|
|
|
|
| 138 |
433188B92D3A18A400E3FE79 /* gguf.h */,
|
| 139 |
433188B72D3A187C00E3FE79 /* gguf.cpp */,
|
| 140 |
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,
|
|
|
|
| 81 |
18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
|
| 82 |
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
|
| 83 |
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
|
| 84 |
+
18B07DCB2D70411100B3B87C /* ggml-cpp.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpp.h"; path = "../../../ggml/include/ggml-cpp.h"; sourceTree = "<group>"; };
|
| 85 |
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = "<group>"; };
|
| 86 |
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = "<group>"; };
|
| 87 |
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = "<group>"; };
|
|
|
|
| 136 |
18627C7829052BDF00BD2A04 /* whisper.objc */ = {
|
| 137 |
isa = PBXGroup;
|
| 138 |
children = (
|
| 139 |
+
18B07DCB2D70411100B3B87C /* ggml-cpp.h */,
|
| 140 |
433188B92D3A18A400E3FE79 /* gguf.h */,
|
| 141 |
433188B72D3A187C00E3FE79 /* gguf.cpp */,
|
| 142 |
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,
|
ggml/src/CMakeLists.txt
CHANGED
|
@@ -226,6 +226,9 @@ add_library(ggml-base
|
|
| 226 |
gguf.cpp)
|
| 227 |
|
| 228 |
target_include_directories(ggml-base PRIVATE .)
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
add_library(ggml
|
| 231 |
ggml-backend-reg.cpp)
|
|
|
|
| 226 |
gguf.cpp)
|
| 227 |
|
| 228 |
target_include_directories(ggml-base PRIVATE .)
|
| 229 |
+
if (GGML_BACKEND_DL)
|
| 230 |
+
target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
|
| 231 |
+
endif()
|
| 232 |
|
| 233 |
add_library(ggml
|
| 234 |
ggml-backend-reg.cpp)
|
src/whisper.cpp
CHANGED
|
@@ -1,8 +1,7 @@
|
|
| 1 |
#include "whisper.h"
|
| 2 |
|
| 3 |
-
#include "ggml-cpu.h"
|
| 4 |
-
|
| 5 |
#include "ggml.h"
|
|
|
|
| 6 |
#include "ggml-alloc.h"
|
| 7 |
#include "ggml-backend.h"
|
| 8 |
|
|
@@ -19,19 +18,20 @@
|
|
| 19 |
#include <cassert>
|
| 20 |
#define _USE_MATH_DEFINES
|
| 21 |
#include <cmath>
|
| 22 |
-
#include <
|
| 23 |
#include <cstdarg>
|
|
|
|
| 24 |
#include <cstring>
|
| 25 |
#include <fstream>
|
|
|
|
| 26 |
#include <map>
|
|
|
|
|
|
|
|
|
|
| 27 |
#include <set>
|
| 28 |
#include <string>
|
| 29 |
#include <thread>
|
| 30 |
#include <vector>
|
| 31 |
-
#include <regex>
|
| 32 |
-
#include <random>
|
| 33 |
-
#include <functional>
|
| 34 |
-
#include <codecvt>
|
| 35 |
|
| 36 |
// dummy
|
| 37 |
|
|
@@ -149,21 +149,25 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
|
|
| 149 |
|
| 150 |
static bool ggml_graph_compute_helper(
|
| 151 |
struct ggml_cgraph * graph,
|
| 152 |
-
std::vector<uint8_t> & buf,
|
| 153 |
int n_threads,
|
| 154 |
ggml_abort_callback abort_callback,
|
| 155 |
void * abort_callback_data) {
|
| 156 |
-
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
|
| 157 |
|
| 158 |
-
|
| 159 |
-
plan.abort_callback_data = abort_callback_data;
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
}
|
| 165 |
|
| 166 |
-
return
|
| 167 |
}
|
| 168 |
|
| 169 |
static bool ggml_graph_compute_helper(
|
|
@@ -187,6 +191,61 @@ static bool ggml_graph_compute_helper(
|
|
| 187 |
return t;
|
| 188 |
}
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
|
| 191 |
// the idea is to represent the original matrix multiplication:
|
| 192 |
//
|
|
@@ -1237,6 +1296,8 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
|
|
| 1237 |
static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
|
| 1238 |
ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
|
| 1239 |
|
|
|
|
|
|
|
| 1240 |
ggml_backend_dev_t dev = nullptr;
|
| 1241 |
|
| 1242 |
int cnt = 0;
|
|
@@ -1294,7 +1355,7 @@ static std::vector<ggml_backend_t> whisper_backend_init(const whisper_context_pa
|
|
| 1294 |
|
| 1295 |
GGML_UNUSED(params);
|
| 1296 |
|
| 1297 |
-
result.push_back(
|
| 1298 |
|
| 1299 |
return result;
|
| 1300 |
}
|
|
@@ -4206,22 +4267,28 @@ static int whisper_has_openvino(void) {
|
|
| 4206 |
const char * whisper_print_system_info(void) {
|
| 4207 |
static std::string s;
|
| 4208 |
|
|
|
|
|
|
|
| 4209 |
s = "";
|
| 4210 |
-
s += "
|
| 4211 |
-
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
| 4212 |
-
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
| 4213 |
-
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
| 4214 |
-
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
| 4215 |
-
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
| 4216 |
-
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
| 4217 |
-
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
| 4218 |
-
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
| 4219 |
-
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
| 4220 |
-
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
| 4221 |
-
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
| 4222 |
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
|
| 4223 |
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
|
| 4224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4225 |
return s.c_str();
|
| 4226 |
}
|
| 4227 |
|
|
@@ -6653,6 +6720,8 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
|
|
| 6653 |
}
|
| 6654 |
|
| 6655 |
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
|
|
|
|
|
| 6656 |
static std::string s;
|
| 6657 |
s = "";
|
| 6658 |
char strbuf[256];
|
|
@@ -6672,7 +6741,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
| 6672 |
// c: N*N*sizeof(float)
|
| 6673 |
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
| 6674 |
std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead() + ggml_graph_overhead());
|
| 6675 |
-
std::vector<uint8_t> work;
|
| 6676 |
|
| 6677 |
// put a bunch of random data in the buffer
|
| 6678 |
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
|
@@ -6729,12 +6797,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|
| 6729 |
double tsum = 0.0;
|
| 6730 |
|
| 6731 |
// heat-up
|
| 6732 |
-
ggml_graph_compute_helper(gf,
|
| 6733 |
|
| 6734 |
for (int i = 0; i < n_max; ++i) {
|
| 6735 |
const int64_t t0 = ggml_time_us();
|
| 6736 |
|
| 6737 |
-
ggml_graph_compute_helper(gf,
|
| 6738 |
|
| 6739 |
const int64_t t1 = ggml_time_us();
|
| 6740 |
|
|
@@ -7111,18 +7179,18 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
|
|
| 7111 |
struct ggml_tensor * cost = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, N + 1, M + 1);
|
| 7112 |
struct ggml_tensor * trace = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, N + 1, M + 1);
|
| 7113 |
|
| 7114 |
-
cost =
|
| 7115 |
-
trace =
|
| 7116 |
-
|
| 7117 |
|
| 7118 |
// dtw
|
| 7119 |
// supposedly can be optimized by computing diagonals in parallel ?
|
| 7120 |
// Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
|
| 7121 |
for (int64_t j = 1; j < M + 1; ++j) {
|
| 7122 |
for (int64_t i = 1; i < N + 1; ++i) {
|
| 7123 |
-
float c0 =
|
| 7124 |
-
float c1 =
|
| 7125 |
-
float c2 =
|
| 7126 |
|
| 7127 |
float c;
|
| 7128 |
int32_t t;
|
|
@@ -7137,9 +7205,9 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
|
|
| 7137 |
t = 2;
|
| 7138 |
}
|
| 7139 |
|
| 7140 |
-
c =
|
| 7141 |
-
|
| 7142 |
-
|
| 7143 |
}
|
| 7144 |
}
|
| 7145 |
|
|
@@ -7148,19 +7216,19 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
|
|
| 7148 |
struct ggml_tensor * bt = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, BT_MAX_ROWS, 2);
|
| 7149 |
// trace[0, :] = 2;
|
| 7150 |
for (int64_t i = 0; i < M + 1; ++i)
|
| 7151 |
-
|
| 7152 |
//trace[:, 0] = 1;
|
| 7153 |
for (int64_t i = 0; i < N + 1; ++i)
|
| 7154 |
-
|
| 7155 |
int bt_row_idx = BT_MAX_ROWS - 1;
|
| 7156 |
int64_t i = N;
|
| 7157 |
int64_t j = M;
|
| 7158 |
while (i > 0 || j > 0) {
|
| 7159 |
-
|
| 7160 |
-
|
| 7161 |
--bt_row_idx;
|
| 7162 |
|
| 7163 |
-
int32_t t =
|
| 7164 |
if (t == 0) {
|
| 7165 |
--i;
|
| 7166 |
--j;
|
|
@@ -7181,8 +7249,8 @@ static ggml_tensor * dtw_and_backtrace(ggml_context * ctx, ggml_tensor * x) {
|
|
| 7181 |
ggml_tensor * r = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 2, result_n_cols);
|
| 7182 |
for (int64_t i = 0; i < 2; ++i) {
|
| 7183 |
for (int64_t j = 0; j < result_n_cols; ++j) {
|
| 7184 |
-
int32_t v =
|
| 7185 |
-
|
| 7186 |
}
|
| 7187 |
}
|
| 7188 |
|
|
@@ -7217,11 +7285,11 @@ static void median_filter(struct ggml_tensor * dst , const struct ggml_tensor *
|
|
| 7217 |
idx = 2*(a->ne[2] - 1) - idx;
|
| 7218 |
}
|
| 7219 |
|
| 7220 |
-
filter.push_back(
|
| 7221 |
}
|
| 7222 |
std::sort(filter.begin(), filter.end());
|
| 7223 |
const float v = filter[filter.size()/2];
|
| 7224 |
-
|
| 7225 |
filter.clear();
|
| 7226 |
}
|
| 7227 |
}
|
|
@@ -7343,7 +7411,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
|
|
| 7343 |
// Compute
|
| 7344 |
struct ggml_cgraph * gf = ggml_new_graph(gctx);
|
| 7345 |
ggml_build_forward_expand(gf, w);
|
| 7346 |
-
|
|
|
|
|
|
|
| 7347 |
|
| 7348 |
ggml_tensor * alignment = dtw_and_backtrace(gctx, w);
|
| 7349 |
|
|
@@ -7352,9 +7422,9 @@ static void whisper_exp_compute_token_level_timestamps_dtw(
|
|
| 7352 |
auto seg_i = state->result_all.begin() + i_segment;
|
| 7353 |
auto tok_i = seg_i->tokens.begin();
|
| 7354 |
for (int i = 0; i < alignment->ne[1]; ++i) {
|
| 7355 |
-
int32_t v =
|
| 7356 |
if (v != last_v) {
|
| 7357 |
-
int32_t time_index =
|
| 7358 |
int64_t timestamp = (time_index * 2) + seek; // Each index on DTW result = 20mS audio
|
| 7359 |
last_v = v;
|
| 7360 |
|
|
|
|
| 1 |
#include "whisper.h"
|
| 2 |
|
|
|
|
|
|
|
| 3 |
#include "ggml.h"
|
| 4 |
+
#include "ggml-cpp.h"
|
| 5 |
#include "ggml-alloc.h"
|
| 6 |
#include "ggml-backend.h"
|
| 7 |
|
|
|
|
| 18 |
#include <cassert>
|
| 19 |
#define _USE_MATH_DEFINES
|
| 20 |
#include <cmath>
|
| 21 |
+
#include <codecvt>
|
| 22 |
#include <cstdarg>
|
| 23 |
+
#include <cstdio>
|
| 24 |
#include <cstring>
|
| 25 |
#include <fstream>
|
| 26 |
+
#include <functional>
|
| 27 |
#include <map>
|
| 28 |
+
#include <mutex>
|
| 29 |
+
#include <random>
|
| 30 |
+
#include <regex>
|
| 31 |
#include <set>
|
| 32 |
#include <string>
|
| 33 |
#include <thread>
|
| 34 |
#include <vector>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
// dummy
|
| 37 |
|
|
|
|
| 149 |
|
| 150 |
static bool ggml_graph_compute_helper(
|
| 151 |
struct ggml_cgraph * graph,
|
|
|
|
| 152 |
int n_threads,
|
| 153 |
ggml_abort_callback abort_callback,
|
| 154 |
void * abort_callback_data) {
|
|
|
|
| 155 |
|
| 156 |
+
ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
|
|
|
|
| 157 |
|
| 158 |
+
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
|
| 159 |
+
|
| 160 |
+
auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
|
| 161 |
+
if (set_abort_callback_fn) {
|
| 162 |
+
set_abort_callback_fn(backend.get(), abort_callback, abort_callback_data);
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
|
| 166 |
+
if (ggml_backend_set_n_threads_fn) {
|
| 167 |
+
ggml_backend_set_n_threads_fn(backend.get(), n_threads);
|
| 168 |
}
|
| 169 |
|
| 170 |
+
return ggml_backend_graph_compute(backend.get(), graph) == GGML_STATUS_SUCCESS;
|
| 171 |
}
|
| 172 |
|
| 173 |
static bool ggml_graph_compute_helper(
|
|
|
|
| 191 |
return t;
|
| 192 |
}
|
| 193 |
|
| 194 |
+
// Load all ggml backends, at most once no matter how often this is called.
// Does nothing unless the build defines GGML_BACKEND_DL.
static void whisper_load_backends() {
#ifdef GGML_BACKEND_DL
    static std::once_flag backends_loaded;
    std::call_once(backends_loaded, ggml_backend_load_all);
#endif
}
|
| 202 |
+
|
| 203 |
+
// TODO: move these functions to ggml-base with support for ggml-backend?
|
| 204 |
+
|
| 205 |
+
// Fill every element of an F32 tensor with `v` and return the same tensor.
//
// Asserts that `t` has type GGML_TYPE_F32 and is contiguous, then writes
// directly through t->data.
// NOTE(review): assumes t->data is host-addressable memory - confirm for
// tensors that live in non-CPU backend buffers.
static ggml_tensor * whisper_set_f32(struct ggml_tensor * t, float v) {
    GGML_ASSERT(t->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(t));

    // keep the element count signed so it matches the loop index type
    // (avoids a signed/unsigned comparison)
    const int64_t nels = ggml_nelements(t);

    float * dst = (float *) t->data;
    for (int64_t i = 0; i < nels; ++i) {
        dst[i] = v;
    }
    return t;
}
|
| 214 |
+
|
| 215 |
+
// Fill every element of an I32 tensor with `v` and return the same tensor.
//
// Asserts that `t` has type GGML_TYPE_I32 and is contiguous, then writes
// directly through t->data.
// NOTE(review): assumes t->data is host-addressable memory - confirm for
// tensors that live in non-CPU backend buffers.
static ggml_tensor * whisper_set_i32(struct ggml_tensor * t, int32_t v) {
    GGML_ASSERT(t->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(t));

    // keep the element count signed so it matches the loop index type
    // (avoids a signed/unsigned comparison)
    const int64_t nels = ggml_nelements(t);

    int32_t * dst = (int32_t *) t->data;
    for (int64_t i = 0; i < nels; ++i) {
        dst[i] = v;
    }
    return t;
}
|
| 224 |
+
|
| 225 |
+
// Read one float from an F32 tensor at index (i0, i1, i2, i3).
// The element address is t->data plus each index scaled by the matching
// byte stride t->nb[k]. No bounds checking is performed.
static float whisper_get_f32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
    GGML_ASSERT(t->type == GGML_TYPE_F32);

    const char * base = (const char *) t->data;
    const char * elem = base + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];

    return *(const float *) elem;
}
|
| 230 |
+
|
| 231 |
+
// Store `v` into an F32 tensor at index (i0, i1, i2, i3).
// The element address is t->data plus each index scaled by the matching
// byte stride t->nb[k]. No bounds checking is performed.
static void whisper_set_f32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float v) {
    GGML_ASSERT(t->type == GGML_TYPE_F32);

    char * base = (char *) t->data;
    char * elem = base + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];

    *(float *) elem = v;
}
|
| 236 |
+
|
| 237 |
+
// Read one int32 from an I32 tensor at index (i0, i1, i2, i3).
// The element address is t->data plus each index scaled by the matching
// byte stride t->nb[k]. No bounds checking is performed.
static int32_t whisper_get_i32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
    GGML_ASSERT(t->type == GGML_TYPE_I32);

    const char * base = (const char *) t->data;
    const char * elem = base + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];

    return *(const int32_t *) elem;
}
|
| 242 |
+
|
| 243 |
+
// Store `v` into an I32 tensor at index (i0, i1, i2, i3).
// The element address is t->data plus each index scaled by the matching
// byte stride t->nb[k]. No bounds checking is performed.
static void whisper_set_i32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, int32_t v) {
    GGML_ASSERT(t->type == GGML_TYPE_I32);

    char * base = (char *) t->data;
    char * elem = base + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];

    *(int32_t *) elem = v;
}
|
| 248 |
+
|
| 249 |
// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
|
| 250 |
// the idea is to represent the original matrix multiplication:
|
| 251 |
//
|
|
|
|
| 1296 |
static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
|
| 1297 |
ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
|
| 1298 |
|
| 1299 |
+
whisper_load_backends();
|
| 1300 |
+
|
| 1301 |
ggml_backend_dev_t dev = nullptr;
|
| 1302 |
|
| 1303 |
int cnt = 0;
|
|
|
|
| 1355 |
|
| 1356 |
GGML_UNUSED(params);
|
| 1357 |
|
| 1358 |
+
result.push_back(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));
|
| 1359 |
|
| 1360 |
return result;
|
| 1361 |
}
|
|
|
|
| 4267 |
const char * whisper_print_system_info(void) {
|
| 4268 |
static std::string s;
|
| 4269 |
|
| 4270 |
+
whisper_load_backends();
|
| 4271 |
+
|
| 4272 |
s = "";
|
| 4273 |
+
s += "WHISPER : ";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4274 |
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
|
| 4275 |
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
|
| 4276 |
|
| 4277 |
+
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
| 4278 |
+
auto * reg = ggml_backend_reg_get(i);
|
| 4279 |
+
auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
|
| 4280 |
+
if (get_features_fn) {
|
| 4281 |
+
ggml_backend_feature * features = get_features_fn(reg);
|
| 4282 |
+
s += ggml_backend_reg_name(reg);
|
| 4283 |
+
s += " : ";
|
| 4284 |
+
for (; features->name; features++) {
|
| 4285 |
+
s += features->name;
|
| 4286 |
+
s += " = ";
|
| 4287 |
+
s += features->value;
|
| 4288 |
+
s += " | ";
|
| 4289 |
+
}
|
| 4290 |
+
}
|
| 4291 |
+
}
|
| 4292 |
return s.c_str();
|
| 4293 |
}
|
| 4294 |
|
|
|
|
| 6720 |
}
|
| 6721 |
|
| 6722 |
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
| 6723 |
+
whisper_load_backends();
|
| 6724 |
+
|
| 6725 |
static std::string s;
|
| 6726 |
s = "";
|
| 6727 |
char strbuf[256];
|
|
|
|
| 6741 |
// c: N*N*sizeof(float)
|
| 6742 |
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
| 6743 |
std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead() + ggml_graph_overhead());
|
|
|
|
| 6744 |
|
| 6745 |
// put a bunch of random data in the buffer
|
| 6746 |
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
|
|
|
| 6797 |
double tsum = 0.0;
|
| 6798 |
|
| 6799 |
// heat-up
|
| 6800 |
+
ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
|
| 6801 |
|
| 6802 |
for (int i = 0; i < n_max; ++i) {
|
| 6803 |
const int64_t t0 = ggml_time_us();
|
| 6804 |
|
| 6805 |
+
ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
|
| 6806 |
|
| 6807 |
const int64_t t1 = ggml_time_us();
|
| 6808 |
|
|
|
|
| 7179 |
struct ggml_tensor * cost = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, N + 1, M + 1);
|
| 7180 |
struct ggml_tensor * trace = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, N + 1, M + 1);
|
| 7181 |
|
| 7182 |
+
cost = whisper_set_f32(cost, INFINITY);
|
| 7183 |
+
trace = whisper_set_i32(trace, -1);
|
| 7184 |
+
whisper_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
|
| 7185 |
|
| 7186 |
// dtw
|
| 7187 |
// supposedly can be optimized by computing diagonals in parallel ?
|
| 7188 |
// Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
|
| 7189 |
for (int64_t j = 1; j < M + 1; ++j) {
|
| 7190 |
for (int64_t i = 1; i < N + 1; ++i) {
|
| 7191 |
+
float c0 = whisper_get_f32_nd(cost, i - 1, j - 1, 0, 0);
|
| 7192 |
+
float c1 = whisper_get_f32_nd(cost, i - 1, j, 0, 0);
|
| 7193 |
+
float c2 = whisper_get_f32_nd(cost, i, j - 1, 0, 0);
|
| 7194 |
|
| 7195 |
float c;
|
| 7196 |
int32_t t;
|
|
|
|
| 7205 |
t = 2;
|
| 7206 |
}
|
| 7207 |
|
| 7208 |
+
c = whisper_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
|
| 7209 |
+
whisper_set_f32_nd(cost, i, j, 0, 0, c);
|
| 7210 |
+
whisper_set_i32_nd(trace, i, j, 0, 0, t);
|
| 7211 |
}
|
| 7212 |
}
|
| 7213 |
|
|
|
|
| 7216 |
struct ggml_tensor * bt = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, BT_MAX_ROWS, 2);
|
| 7217 |
// trace[0, :] = 2;
|
| 7218 |
for (int64_t i = 0; i < M + 1; ++i)
|
| 7219 |
+
whisper_set_i32_nd(trace, 0, i, 0, 0, 2);
|
| 7220 |
//trace[:, 0] = 1;
|
| 7221 |
for (int64_t i = 0; i < N + 1; ++i)
|
| 7222 |
+
whisper_set_i32_nd(trace, i, 0, 0, 0, 1);
|
| 7223 |
int bt_row_idx = BT_MAX_ROWS - 1;
|
| 7224 |
int64_t i = N;
|
| 7225 |
int64_t j = M;
|
| 7226 |
while (i > 0 || j > 0) {
|
| 7227 |
+
whisper_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
|
| 7228 |
+
whisper_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
|
| 7229 |
--bt_row_idx;
|
| 7230 |
|
| 7231 |
+
int32_t t = whisper_get_i32_nd(trace, i, j, 0, 0);
|
| 7232 |
if (t == 0) {
|
| 7233 |
--i;
|
| 7234 |
--j;
|
|
|
|
| 7249 |
ggml_tensor * r = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 2, result_n_cols);
|
| 7250 |
for (int64_t i = 0; i < 2; ++i) {
|
| 7251 |
for (int64_t j = 0; j < result_n_cols; ++j) {
|
| 7252 |
+
int32_t v = whisper_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
|
| 7253 |
+
whisper_set_i32_nd(r, i, j, 0, 0, v);
|
| 7254 |
}
|
| 7255 |
}
|
| 7256 |
|
|
|
|
| 7285 |
idx = 2*(a->ne[2] - 1) - idx;
|
| 7286 |
}
|
| 7287 |
|
| 7288 |
+
filter.push_back(whisper_get_f32_nd(a, i, j, idx, 0));
|
| 7289 |
}
|
| 7290 |
std::sort(filter.begin(), filter.end());
|
| 7291 |
const float v = filter[filter.size()/2];
|
| 7292 |
+
whisper_set_f32_nd(dst, i, j, k, 0, v);
|
| 7293 |
filter.clear();
|
| 7294 |
}
|
| 7295 |
}
|
|
|
|
| 7411 |
// Compute
|
| 7412 |
struct ggml_cgraph * gf = ggml_new_graph(gctx);
|
| 7413 |
ggml_build_forward_expand(gf, w);
|
| 7414 |
+
|
| 7415 |
+
ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
|
| 7416 |
+
ggml_backend_graph_compute(backend.get(), gf);
|
| 7417 |
|
| 7418 |
ggml_tensor * alignment = dtw_and_backtrace(gctx, w);
|
| 7419 |
|
|
|
|
| 7422 |
auto seg_i = state->result_all.begin() + i_segment;
|
| 7423 |
auto tok_i = seg_i->tokens.begin();
|
| 7424 |
for (int i = 0; i < alignment->ne[1]; ++i) {
|
| 7425 |
+
int32_t v = whisper_get_i32_nd(alignment, 0, i, 0, 0);
|
| 7426 |
if (v != last_v) {
|
| 7427 |
+
int32_t time_index = whisper_get_i32_nd(alignment, 1, i, 0, 0);
|
| 7428 |
int64_t timestamp = (time_index * 2) + seek; // Each index on DTW result = 20mS audio
|
| 7429 |
last_v = v;
|
| 7430 |
|