ggerganov committed
Commit d91d7d9 · unverified · 1 Parent(s): c3d7603

whisper : add GPU support via cuBLAS (#834)


* make : add WHISPER_CUBLAS
* make : fix CUBLAS build
* whisper : disable Flash Attention + adjust memory buffers
* whisper : remove old commented code
* readme : add cuBLAS instructions
* cmake : add WHISPER_CUBLAS option
* gitignore : ignore build-cublas

.gitignore CHANGED
@@ -12,6 +12,7 @@ build-em/
 build-debug/
 build-release/
 build-static/
+build-cublas/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
CMakeLists.txt CHANGED
@@ -51,7 +51,7 @@ option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)
 option(WHISPER_BUILD_TESTS    "whisper: build tests"    ${WHISPER_STANDALONE})
 option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
 
-option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF)
+option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
 
 if (APPLE)
     option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
@@ -62,7 +62,8 @@ if (APPLE)
     option(WHISPER_COREML                "whisper: enable Core ML framework"  OFF)
     option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
 else()
-    option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
+    option(WHISPER_OPENBLAS "whisper: support for OpenBLAS" OFF)
+    option(WHISPER_CUBLAS   "whisper: support for cuBLAS"   OFF)
 endif()
 
 option(WHISPER_PERF "whisper: enable perf timings" OFF)
@@ -127,7 +128,7 @@ if (APPLE)
     endif()
 endif()
 
-if (WHISPER_SUPPORT_OPENBLAS)
+if (WHISPER_OPENBLAS)
     find_library(OPENBLAS_LIB
         NAMES openblas libopenblas
         )
@@ -141,6 +142,31 @@ if (WHISPER_SUPPORT_OPENBLAS)
     endif()
 endif()
 
+if (WHISPER_CUBLAS)
+    cmake_minimum_required(VERSION 3.17)
+
+    find_package(CUDAToolkit)
+
+    if (CUDAToolkit_FOUND)
+        message(STATUS "cuBLAS found")
+
+        enable_language(CUDA)
+
+        set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+
+        add_compile_definitions(GGML_USE_CUBLAS)
+
+        if (WHISPER_STATIC)
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+        else()
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+        endif()
+
+    else()
+        message(WARNING "cuBLAS not found")
+    endif()
+endif()
+
 # compiler flags
 
 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@@ -247,6 +273,7 @@ set(TARGET whisper)
 add_library(${TARGET}
     ggml.h
     ggml.c
+    ${GGML_CUDA_SOURCES}
     whisper.h
     whisper.cpp
     )
@@ -279,6 +306,12 @@ if (BUILD_SHARED_LIBS)
         )
 endif()
 
+if (GGML_CUDA_SOURCES)
+    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
+    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
+    set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+endif()
+
 if (EMSCRIPTEN)
     set_target_properties(${TARGET} PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()
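Note: the `WHISPER_CUBLAS` path bumps `cmake_minimum_required` to 3.17 because that is the first CMake release to ship the `FindCUDAToolkit` module and its `CUDA::cublas` imported targets used above. A minimal configure/build sketch, assuming an out-of-tree `build` directory:

```
mkdir build && cd build
cmake -DWHISPER_CUBLAS=1 ..
cmake --build . --config Release
```

If the CUDA toolkit is not found, configuration only emits the "cuBLAS not found" warning and the build proceeds CPU-only.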
Makefile CHANGED
@@ -1,3 +1,5 @@
+default: main bench
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -157,6 +159,18 @@ ifdef WHISPER_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 
+ifdef WHISPER_CUBLAS
+	CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	LDFLAGS     += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+	WHISPER_OBJ += ggml-cuda.o
+	NVCC        = nvcc
+	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=native
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
+endif
+
 ifdef WHISPER_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
@@ -200,20 +214,18 @@ $(info I CC: $(CCV))
 $(info I CXX: $(CXXV))
 $(info )
 
-default: main bench
-
 #
 # Build library
 #
 
-ggml.o: ggml.c ggml.h
-	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
+ggml.o: ggml.c ggml.h ggml-cuda.h
+	$(CC)  $(CFLAGS)   -c $< -o $@
 
-whisper.o: whisper.cpp whisper.h ggml.h
-	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
+whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 ifndef WHISPER_COREML
-WHISPER_OBJ = whisper.o
+WHISPER_OBJ += whisper.o
 else
 whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
 	$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
@@ -221,7 +233,7 @@ whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
 whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
 	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
 
-WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
+WHISPER_OBJ += whisper.o whisper-encoder.o whisper-encoder-impl.o
 endif
 
 libwhisper.a: ggml.o $(WHISPER_OBJ)
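The new rules probe the common toolkit locations `/usr/local/cuda` and `/opt/cuda`, plus whatever `$(CUDA_PATH)` points at, so a non-standard install can be handled by exporting `CUDA_PATH`. A sketch (the path is illustrative):

```
make clean
CUDA_PATH=/opt/cuda-12 WHISPER_CUBLAS=1 make -j
```

`ggml-cuda.o` is compiled with `nvcc -arch=native`, i.e. for the compute capability of the GPU present on the build machine, while `--forward-unknown-to-host-compiler` passes the remaining `CXXFLAGS` through to the host compiler.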
README.md CHANGED
@@ -18,6 +18,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - Low memory usage (Flash Attention)
 - Zero memory allocations at runtime
 - Runs on the CPU
+- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
 - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
 
 Supported platforms:
@@ -254,7 +255,7 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
 # using Makefile
 make clean
 WHISPER_COREML=1 make -j
-
+
 # using CMake
 cd build
 cmake -DWHISPER_COREML=1 ..
@@ -271,20 +272,33 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
 whisper_init_state: first run on a device may take a while ...
 whisper_init_state: Core ML model loaded
 
-system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | COREML = 1 |
+system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | COREML = 1 |
 
 ...
 ```
 
 The first run on a device is slow, since the ANE service compiles the Core ML model to some device-specific format.
 Next runs are faster.
-
+
 For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
-
+
+## NVIDIA GPU support via cuBLAS
+
+With NVIDIA cards, the Encoder processing can be offloaded to the GPU to a large extent through cuBLAS.
+First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
+
+Now build `whisper.cpp` with cuBLAS support:
+
+```
+make clean
+WHISPER_CUBLAS=1 make -j
+```
+
+Run all the examples as usual.
+
 ## Limitations
 
 - Inference only
-- No GPU support (yet)
 
 ## Another example
@@ -429,7 +443,7 @@ system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1
 
 main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
 
-[00:00:00.000 --> 00:00:00.320]
+[00:00:00.000 --> 00:00:00.320]
 [00:00:00.320 --> 00:00:00.370]   And
 [00:00:00.370 --> 00:00:00.690]   so
 [00:00:00.690 --> 00:00:00.850]   my
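As a quick sanity check of a cuBLAS build, run one of the examples and inspect the `system_info` banner: with `GGML_USE_CUBLAS` compiled in, it should report `BLAS = 1`. A sketch, assuming the usual model and sample files from the README have been downloaded:

```
./main -m models/ggml-base.en.bin -f samples/jfk.wav
```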
examples/CMakeLists.txt CHANGED
@@ -4,7 +4,7 @@ find_package(Threads REQUIRED)
 
 # third-party
 
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # SDL2
     find_package(SDL2 REQUIRED)
 
@@ -27,7 +27,7 @@ include(DefaultTargetOptions)
 
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
     # common-sdl
 
     set(TARGET common-sdl)
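Since this commit renames the option, CMake builds that still pass the old `-DWHISPER_SUPPORT_SDL2=ON` will silently skip the SDL2 examples (`stream`, `command`, `talk`, `talk-llama`, whose gates are updated below); the new spelling is:

```
cmake -DWHISPER_SDL2=ON ..
```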
examples/command/CMakeLists.txt CHANGED
@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
 # command
 set(TARGET command)
 add_executable(${TARGET} command.cpp)
examples/stream/CMakeLists.txt CHANGED
@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
 # stream
 set(TARGET stream)
 add_executable(${TARGET} stream.cpp)
examples/talk-llama/CMakeLists.txt CHANGED
@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
 # talk-llama
 set(TARGET talk-llama)
 #add_executable(${TARGET} talk-llama.cpp llama.cpp)
examples/talk/CMakeLists.txt CHANGED
@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
 # talk
 set(TARGET talk)
 #add_executable(${TARGET} talk.cpp gpt-2.cpp)
whisper.cpp CHANGED
@@ -102,7 +102,7 @@ static void byteswap_tensor(ggml_tensor * tensor) {
 #define WHISPER_PRINT_DEBUG(...)
 #endif
 
-#define WHISPER_USE_FLASH_ATTN
+//#define WHISPER_USE_FLASH_ATTN
 //#define WHISPER_USE_FLASH_FF
 #define WHISPER_MAX_DECODERS 16
 
@@ -224,11 +224,11 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
 static const size_t MB = 1ull*1024*1024;
 
 static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_TINY,     14ull*MB },
-    { MODEL_BASE,     18ull*MB },
-    { MODEL_SMALL,    28ull*MB },
-    { MODEL_MEDIUM,   36ull*MB },
-    { MODEL_LARGE,    44ull*MB },
+    { MODEL_TINY,     62ull*MB },
+    { MODEL_BASE,     80ull*MB },
+    { MODEL_SMALL,   120ull*MB },
+    { MODEL_MEDIUM,  158ull*MB },
+    { MODEL_LARGE,   198ull*MB },
 };
 
 static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
@@ -280,11 +280,11 @@ static const std::map<e_model, size_t> MEM_REQ_KV_CROSS = {
 };
 
 static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
-    { MODEL_TINY,      6ull*MB },
-    { MODEL_BASE,      8ull*MB },
-    { MODEL_SMALL,    13ull*MB },
-    { MODEL_MEDIUM,   22ull*MB },
-    { MODEL_LARGE,    33ull*MB },
+    { MODEL_TINY,     30ull*MB },
+    { MODEL_BASE,     38ull*MB },
+    { MODEL_SMALL,    56ull*MB },
+    { MODEL_MEDIUM,   74ull*MB },
+    { MODEL_LARGE,    94ull*MB },
 };
 
 static const std::map<e_model, size_t> MEM_REQ_DECODE = {
@@ -1554,26 +1554,17 @@ static bool whisper_encode_internal(
 
         struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
 
-        //struct ggml_tensor * V_trans =
-        //    ggml_permute(ctx0,
-        //            ggml_cpy(ctx0,
-        //                Vcur,
-        //                ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
-        //            1, 2, 0, 3);
-
-        //struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
         struct ggml_tensor * V =
             ggml_cpy(ctx0,
                     ggml_permute(ctx0,
                         ggml_reshape_3d(ctx0,
                             Vcur,
                             n_state/n_head, n_head, n_ctx),
-                        0, 2, 1, 3),
-                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
+                        1, 2, 0, 3),
+                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)
                     );
 
-        struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
+        struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 #endif
 
         struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
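Two things happen in tandem in this file. First, with `WHISPER_USE_FLASH_ATTN` commented out, the encoder falls back to the explicit attention path, which materializes the full `KQ` matrix and a copy of `V`; that is why `MEM_REQ_SCRATCH0` and `MEM_REQ_ENCODE` grow several-fold (e.g. tiny: 14 MB → 62 MB scratch, 6 MB → 30 MB encode). Second, `Vcur` is now permuted with axes `(1, 2, 0, 3)` and copied into an `(n_ctx, n_state/n_head, n_head)` tensor, so it lands in memory already transposed and `ggml_mul_mat(ctx0, V, KQ_soft_max)` no longer needs the runtime `ggml_transpose` — presumably so the multiply maps onto a plain GEMM that cuBLAS can take over. The quantity computed per head is unchanged; in standard attention notation it is still

$$\mathrm{KQV} \;=\; \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{n_{\text{state}}/n_{\text{head}}}}\right) V$$

Only the storage order of `V` changes: the permute `(0, 2, 1, 3)` → `(1, 2, 0, 3)` swaps which axis is contiguous, trading a per-multiply transpose for a one-time layout copy.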