Spaces:
Running
Running
whisper : add Core ML support (#566)
Browse files
* coreml : use Core ML encoder inference
* coreml : simplify whisper_encode + log messages
* whisper : resolve rebase conflicts
* coreml : add scripts for CoreML model generation
* bench-all : recognize COREML flag
- .gitignore +4 -1
- CMakeLists.txt +59 -9
- Makefile +32 -16
- coreml/whisper-decoder-impl.h +146 -0
- coreml/whisper-decoder-impl.m +201 -0
- coreml/whisper-encoder-impl.h +142 -0
- coreml/whisper-encoder-impl.m +197 -0
- coreml/whisper-encoder.h +22 -0
- coreml/whisper-encoder.mm +67 -0
- extra/bench-all.sh +4 -0
- models/convert-whisper-to-coreml.py +334 -0
- models/download-coreml-model.sh +82 -0
- models/generate-coreml-interface.sh +29 -0
- models/generate-coreml-model.sh +25 -0
- whisper.cpp +60 -0
.gitignore
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
*.o
|
| 2 |
*.a
|
| 3 |
.cache/
|
|
|
|
| 4 |
.test/
|
| 5 |
.vs/
|
| 6 |
.vscode/
|
|
@@ -35,4 +36,6 @@ examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
|
|
| 35 |
|
| 36 |
extra/bench-gg.txt
|
| 37 |
|
| 38 |
-
|
|
|
|
|
|
|
|
|
| 1 |
*.o
|
| 2 |
*.a
|
| 3 |
.cache/
|
| 4 |
+
.coreml/
|
| 5 |
.test/
|
| 6 |
.vs/
|
| 7 |
.vscode/
|
|
|
|
| 36 |
|
| 37 |
extra/bench-gg.txt
|
| 38 |
|
| 39 |
+
models/*.mlmodel
|
| 40 |
+
models/*.mlmodelc
|
| 41 |
+
models/*.mlpackage
|
CMakeLists.txt
CHANGED
|
@@ -58,6 +58,8 @@ if (APPLE)
|
|
| 58 |
option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
|
| 59 |
option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
|
| 60 |
option(WHISPER_NO_FMA "whisper: disable FMA" OFF)
|
|
|
|
|
|
|
| 61 |
else()
|
| 62 |
option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
|
| 63 |
endif()
|
|
@@ -90,16 +92,33 @@ endif()
|
|
| 90 |
|
| 91 |
find_package(Threads REQUIRED)
|
| 92 |
|
| 93 |
-
# on APPLE
|
| 94 |
-
if (APPLE
|
| 95 |
-
|
| 96 |
-
if (
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
endif()
|
| 104 |
endif()
|
| 105 |
|
|
@@ -187,6 +206,33 @@ if (WHISPER_PERF)
|
|
| 187 |
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
|
| 188 |
endif()
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
#
|
| 191 |
# whisper - this is the main library of the project
|
| 192 |
#
|
|
@@ -206,6 +252,10 @@ target_include_directories(${TARGET} PUBLIC
|
|
| 206 |
.
|
| 207 |
)
|
| 208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
if (MSVC)
|
| 210 |
target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
|
| 211 |
|
|
|
|
| 58 |
option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
|
| 59 |
option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
|
| 60 |
option(WHISPER_NO_FMA "whisper: disable FMA" OFF)
|
| 61 |
+
|
| 62 |
+
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
|
| 63 |
else()
|
| 64 |
option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
|
| 65 |
endif()
|
|
|
|
| 92 |
|
| 93 |
find_package(Threads REQUIRED)
|
| 94 |
|
| 95 |
+
# on APPLE
|
| 96 |
+
if (APPLE)
|
| 97 |
+
# include Accelerate framework
|
| 98 |
+
if (NOT WHISPER_NO_ACCELERATE)
|
| 99 |
+
find_library(ACCELERATE_FRAMEWORK Accelerate)
|
| 100 |
+
|
| 101 |
+
if (ACCELERATE_FRAMEWORK)
|
| 102 |
+
message(STATUS "Accelerate framework found")
|
| 103 |
|
| 104 |
+
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
|
| 105 |
+
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
|
| 106 |
+
else()
|
| 107 |
+
message(WARNING "Accelerate framework not found")
|
| 108 |
+
endif()
|
| 109 |
+
endif()
|
| 110 |
+
|
| 111 |
+
if (WHISPER_COREML)
|
| 112 |
+
find_library(FOUNDATION_FRAMEWORK Foundation)
|
| 113 |
+
find_library(COREML_FRAMEWORK CoreML)
|
| 114 |
+
|
| 115 |
+
if (COREML_FRAMEWORK)
|
| 116 |
+
message(STATUS "CoreML framework found")
|
| 117 |
+
|
| 118 |
+
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
|
| 119 |
+
else()
|
| 120 |
+
message(WARNING "CoreML framework not found")
|
| 121 |
+
endif()
|
| 122 |
endif()
|
| 123 |
endif()
|
| 124 |
|
|
|
|
| 206 |
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
|
| 207 |
endif()
|
| 208 |
|
| 209 |
+
#
|
| 210 |
+
# whisper.coreml - Core ML support
|
| 211 |
+
#
|
| 212 |
+
|
| 213 |
+
if (WHISPER_COREML)
|
| 214 |
+
set(TARGET whisper.coreml)
|
| 215 |
+
|
| 216 |
+
add_library(${TARGET}
|
| 217 |
+
coreml/whisper-encoder.h
|
| 218 |
+
coreml/whisper-encoder.mm
|
| 219 |
+
coreml/whisper-encoder-impl.h
|
| 220 |
+
coreml/whisper-encoder-impl.m
|
| 221 |
+
)
|
| 222 |
+
|
| 223 |
+
include(DefaultTargetOptions)
|
| 224 |
+
|
| 225 |
+
target_include_directories(${TARGET} PUBLIC
|
| 226 |
+
.
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
|
| 230 |
+
|
| 231 |
+
set_target_properties(${TARGET} PROPERTIES
|
| 232 |
+
COMPILE_FLAGS "-fobjc-arc"
|
| 233 |
+
)
|
| 234 |
+
endif()
|
| 235 |
+
|
| 236 |
#
|
| 237 |
# whisper - this is the main library of the project
|
| 238 |
#
|
|
|
|
| 252 |
.
|
| 253 |
)
|
| 254 |
|
| 255 |
+
if (WHISPER_COREML)
|
| 256 |
+
target_link_libraries(${TARGET} PRIVATE whisper.coreml)
|
| 257 |
+
endif()
|
| 258 |
+
|
| 259 |
if (MSVC)
|
| 260 |
target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
|
| 261 |
|
Makefile
CHANGED
|
@@ -140,6 +140,10 @@ ifndef WHISPER_NO_ACCELERATE
|
|
| 140 |
LDFLAGS += -framework Accelerate
|
| 141 |
endif
|
| 142 |
endif
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
ifdef WHISPER_OPENBLAS
|
| 144 |
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
|
| 145 |
LDFLAGS += -lopenblas
|
|
@@ -195,11 +199,23 @@ ggml.o: ggml.c ggml.h
|
|
| 195 |
whisper.o: whisper.cpp whisper.h ggml.h
|
| 196 |
$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
-
libwhisper.so: ggml.o
|
| 202 |
-
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o
|
| 203 |
|
| 204 |
clean:
|
| 205 |
rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
|
|
@@ -213,24 +229,24 @@ CC_SDL=`sdl2-config --cflags --libs`
|
|
| 213 |
SRC_COMMON = examples/common.cpp
|
| 214 |
SRC_COMMON_SDL = examples/common-sdl.cpp
|
| 215 |
|
| 216 |
-
main: examples/main/main.cpp $(SRC_COMMON) ggml.o
|
| 217 |
-
$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o
|
| 218 |
./main -h
|
| 219 |
|
| 220 |
-
bench: examples/bench/bench.cpp ggml.o
|
| 221 |
-
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o
|
| 222 |
|
| 223 |
-
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o
|
| 224 |
-
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o
|
| 225 |
|
| 226 |
-
command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o
|
| 227 |
-
$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o
|
| 228 |
|
| 229 |
-
talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o
|
| 230 |
-
$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o
|
| 231 |
|
| 232 |
-
talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o
|
| 233 |
-
$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o
|
| 234 |
|
| 235 |
#
|
| 236 |
# Audio samples
|
|
|
|
| 140 |
LDFLAGS += -framework Accelerate
|
| 141 |
endif
|
| 142 |
endif
|
| 143 |
+
ifdef WHISPER_COREML
|
| 144 |
+
CXXFLAGS += -DWHISPER_USE_COREML
|
| 145 |
+
LDFLAGS += -framework Foundation -framework CoreML
|
| 146 |
+
endif
|
| 147 |
ifdef WHISPER_OPENBLAS
|
| 148 |
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
|
| 149 |
LDFLAGS += -lopenblas
|
|
|
|
| 199 |
whisper.o: whisper.cpp whisper.h ggml.h
|
| 200 |
$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
|
| 201 |
|
| 202 |
+
ifndef WHISPER_COREML
|
| 203 |
+
WHISPER_OBJ = whisper.o
|
| 204 |
+
else
|
| 205 |
+
whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
|
| 206 |
+
$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
|
| 207 |
+
|
| 208 |
+
whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
|
| 209 |
+
$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
|
| 210 |
+
|
| 211 |
+
WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
|
| 212 |
+
endif
|
| 213 |
+
|
| 214 |
+
libwhisper.a: ggml.o $(WHISPER_OBJ)
|
| 215 |
+
$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
|
| 216 |
|
| 217 |
+
libwhisper.so: ggml.o $(WHISPER_OBJ)
|
| 218 |
+
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
|
| 219 |
|
| 220 |
clean:
|
| 221 |
rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
|
|
|
|
| 229 |
SRC_COMMON = examples/common.cpp
|
| 230 |
SRC_COMMON_SDL = examples/common-sdl.cpp
|
| 231 |
|
| 232 |
+
main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
|
| 233 |
+
$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
|
| 234 |
./main -h
|
| 235 |
|
| 236 |
+
bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
|
| 237 |
+
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
|
| 238 |
|
| 239 |
+
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
|
| 240 |
+
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
|
| 241 |
|
| 242 |
+
command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
|
| 243 |
+
$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
|
| 244 |
|
| 245 |
+
talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
|
| 246 |
+
$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
|
| 247 |
|
| 248 |
+
talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
|
| 249 |
+
$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
|
| 250 |
|
| 251 |
#
|
| 252 |
# Audio samples
|
coreml/whisper-decoder-impl.h
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// whisper-decoder-impl.h
|
| 3 |
+
//
|
| 4 |
+
// This file was automatically generated and should not be edited.
|
| 5 |
+
//
|
| 6 |
+
|
| 7 |
+
#import <Foundation/Foundation.h>
|
| 8 |
+
#import <CoreML/CoreML.h>
|
| 9 |
+
#include <stdint.h>
|
| 10 |
+
#include <os/log.h>
|
| 11 |
+
|
| 12 |
+
NS_ASSUME_NONNULL_BEGIN
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
/// Model Prediction Input Type
|
| 16 |
+
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
|
| 17 |
+
@interface whisper_decoder_implInput : NSObject<MLFeatureProvider>
|
| 18 |
+
|
| 19 |
+
/// token_data as 1 by 1 matrix of 32-bit integers
|
| 20 |
+
@property (readwrite, nonatomic, strong) MLMultiArray * token_data;
|
| 21 |
+
|
| 22 |
+
/// audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats
|
| 23 |
+
@property (readwrite, nonatomic, strong) MLMultiArray * audio_data;
|
| 24 |
+
- (instancetype)init NS_UNAVAILABLE;
|
| 25 |
+
- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data NS_DESIGNATED_INITIALIZER;
|
| 26 |
+
|
| 27 |
+
@end
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
/// Model Prediction Output Type
|
| 31 |
+
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
|
| 32 |
+
@interface whisper_decoder_implOutput : NSObject<MLFeatureProvider>
|
| 33 |
+
|
| 34 |
+
/// var_1346 as multidimensional array of floats
|
| 35 |
+
@property (readwrite, nonatomic, strong) MLMultiArray * var_1346;
|
| 36 |
+
- (instancetype)init NS_UNAVAILABLE;
|
| 37 |
+
- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 NS_DESIGNATED_INITIALIZER;
|
| 38 |
+
|
| 39 |
+
@end
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
/// Class for model loading and prediction
|
| 43 |
+
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
|
| 44 |
+
@interface whisper_decoder_impl : NSObject
|
| 45 |
+
@property (readonly, nonatomic, nullable) MLModel * model;
|
| 46 |
+
|
| 47 |
+
/**
|
| 48 |
+
URL of the underlying .mlmodelc directory.
|
| 49 |
+
*/
|
| 50 |
+
+ (nullable NSURL *)URLOfModelInThisBundle;
|
| 51 |
+
|
| 52 |
+
/**
|
| 53 |
+
Initialize whisper_decoder_impl instance from an existing MLModel object.
|
| 54 |
+
|
| 55 |
+
Usually the application does not use this initializer unless it makes a subclass of whisper_decoder_impl.
|
| 56 |
+
Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
|
| 57 |
+
*/
|
| 58 |
+
- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
|
| 59 |
+
|
| 60 |
+
/**
|
| 61 |
+
Initialize whisper_decoder_impl instance with the model in this bundle.
|
| 62 |
+
*/
|
| 63 |
+
- (nullable instancetype)init;
|
| 64 |
+
|
| 65 |
+
/**
|
| 66 |
+
Initialize whisper_decoder_impl instance with the model in this bundle.
|
| 67 |
+
|
| 68 |
+
@param configuration The model configuration object
|
| 69 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 70 |
+
*/
|
| 71 |
+
- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 72 |
+
|
| 73 |
+
/**
|
| 74 |
+
Initialize whisper_decoder_impl instance from the model URL.
|
| 75 |
+
|
| 76 |
+
@param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
|
| 77 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 78 |
+
*/
|
| 79 |
+
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 80 |
+
|
| 81 |
+
/**
|
| 82 |
+
Initialize whisper_decoder_impl instance from the model URL.
|
| 83 |
+
|
| 84 |
+
@param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
|
| 85 |
+
@param configuration The model configuration object
|
| 86 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 87 |
+
*/
|
| 88 |
+
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 89 |
+
|
| 90 |
+
/**
|
| 91 |
+
Construct whisper_decoder_impl instance asynchronously with configuration.
|
| 92 |
+
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
|
| 93 |
+
|
| 94 |
+
@param configuration The model configuration
|
| 95 |
+
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
|
| 96 |
+
*/
|
| 97 |
+
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler;
|
| 98 |
+
|
| 99 |
+
/**
|
| 100 |
+
Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
|
| 101 |
+
|
| 102 |
+
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
|
| 103 |
+
|
| 104 |
+
@param modelURL The model URL.
|
| 105 |
+
@param configuration The model configuration
|
| 106 |
+
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
|
| 107 |
+
*/
|
| 108 |
+
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler;
|
| 109 |
+
|
| 110 |
+
/**
|
| 111 |
+
Make a prediction using the standard interface
|
| 112 |
+
@param input an instance of whisper_decoder_implInput to predict from
|
| 113 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 114 |
+
@return the prediction as whisper_decoder_implOutput
|
| 115 |
+
*/
|
| 116 |
+
- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 117 |
+
|
| 118 |
+
/**
|
| 119 |
+
Make a prediction using the standard interface
|
| 120 |
+
@param input an instance of whisper_decoder_implInput to predict from
|
| 121 |
+
@param options prediction options
|
| 122 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 123 |
+
@return the prediction as whisper_decoder_implOutput
|
| 124 |
+
*/
|
| 125 |
+
- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 126 |
+
|
| 127 |
+
/**
|
| 128 |
+
Make a prediction using the convenience interface
|
| 129 |
+
@param token_data as 1 by 1 matrix of 32-bit integers:
|
| 130 |
+
@param audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats:
|
| 131 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 132 |
+
@return the prediction as whisper_decoder_implOutput
|
| 133 |
+
*/
|
| 134 |
+
- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 135 |
+
|
| 136 |
+
/**
|
| 137 |
+
Batch prediction
|
| 138 |
+
@param inputArray array of whisper_decoder_implInput instances to obtain predictions from
|
| 139 |
+
@param options prediction options
|
| 140 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 141 |
+
@return the predictions as NSArray<whisper_decoder_implOutput *>
|
| 142 |
+
*/
|
| 143 |
+
- (nullable NSArray<whisper_decoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_decoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 144 |
+
@end
|
| 145 |
+
|
| 146 |
+
NS_ASSUME_NONNULL_END
|
coreml/whisper-decoder-impl.m
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// whisper-decoder-impl.m
|
| 3 |
+
//
|
| 4 |
+
// This file was automatically generated and should not be edited.
|
| 5 |
+
//
|
| 6 |
+
|
| 7 |
+
#if !__has_feature(objc_arc)
|
| 8 |
+
#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
|
| 9 |
+
#endif
|
| 10 |
+
|
| 11 |
+
#import "whisper-decoder-impl.h"
|
| 12 |
+
|
| 13 |
+
@implementation whisper_decoder_implInput
|
| 14 |
+
|
| 15 |
+
- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data {
|
| 16 |
+
self = [super init];
|
| 17 |
+
if (self) {
|
| 18 |
+
_token_data = token_data;
|
| 19 |
+
_audio_data = audio_data;
|
| 20 |
+
}
|
| 21 |
+
return self;
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
- (NSSet<NSString *> *)featureNames {
|
| 25 |
+
return [NSSet setWithArray:@[@"token_data", @"audio_data"]];
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
|
| 29 |
+
if ([featureName isEqualToString:@"token_data"]) {
|
| 30 |
+
return [MLFeatureValue featureValueWithMultiArray:self.token_data];
|
| 31 |
+
}
|
| 32 |
+
if ([featureName isEqualToString:@"audio_data"]) {
|
| 33 |
+
return [MLFeatureValue featureValueWithMultiArray:self.audio_data];
|
| 34 |
+
}
|
| 35 |
+
return nil;
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
@end
|
| 39 |
+
|
| 40 |
+
@implementation whisper_decoder_implOutput
|
| 41 |
+
|
| 42 |
+
- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 {
|
| 43 |
+
self = [super init];
|
| 44 |
+
if (self) {
|
| 45 |
+
_var_1346 = var_1346;
|
| 46 |
+
}
|
| 47 |
+
return self;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
- (NSSet<NSString *> *)featureNames {
|
| 51 |
+
return [NSSet setWithArray:@[@"var_1346"]];
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
|
| 55 |
+
if ([featureName isEqualToString:@"var_1346"]) {
|
| 56 |
+
return [MLFeatureValue featureValueWithMultiArray:self.var_1346];
|
| 57 |
+
}
|
| 58 |
+
return nil;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
@end
|
| 62 |
+
|
| 63 |
+
@implementation whisper_decoder_impl
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
/**
|
| 67 |
+
URL of the underlying .mlmodelc directory.
|
| 68 |
+
*/
|
| 69 |
+
+ (nullable NSURL *)URLOfModelInThisBundle {
|
| 70 |
+
NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"whisper_decoder_impl" ofType:@"mlmodelc"];
|
| 71 |
+
if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load whisper-decoder-impl.mlmodelc in the bundle resource"); return nil; }
|
| 72 |
+
return [NSURL fileURLWithPath:assetPath];
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
/**
|
| 77 |
+
Initialize whisper_decoder_impl instance from an existing MLModel object.
|
| 78 |
+
|
| 79 |
+
Usually the application does not use this initializer unless it makes a subclass of whisper_decoder_impl.
|
| 80 |
+
Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
|
| 81 |
+
*/
|
| 82 |
+
- (instancetype)initWithMLModel:(MLModel *)model {
|
| 83 |
+
self = [super init];
|
| 84 |
+
if (!self) { return nil; }
|
| 85 |
+
_model = model;
|
| 86 |
+
if (_model == nil) { return nil; }
|
| 87 |
+
return self;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
/**
|
| 92 |
+
Initialize whisper_decoder_impl instance with the model in this bundle.
|
| 93 |
+
*/
|
| 94 |
+
- (nullable instancetype)init {
|
| 95 |
+
return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
/**
|
| 100 |
+
Initialize whisper_decoder_impl instance with the model in this bundle.
|
| 101 |
+
|
| 102 |
+
@param configuration The model configuration object
|
| 103 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 104 |
+
*/
|
| 105 |
+
- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
|
| 106 |
+
return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
/**
|
| 111 |
+
Initialize whisper_decoder_impl instance from the model URL.
|
| 112 |
+
|
| 113 |
+
@param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
|
| 114 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 115 |
+
*/
|
| 116 |
+
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
|
| 117 |
+
MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
|
| 118 |
+
if (model == nil) { return nil; }
|
| 119 |
+
return [self initWithMLModel:model];
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
/**
|
| 124 |
+
Initialize whisper_decoder_impl instance from the model URL.
|
| 125 |
+
|
| 126 |
+
@param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
|
| 127 |
+
@param configuration The model configuration object
|
| 128 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 129 |
+
*/
|
| 130 |
+
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
|
| 131 |
+
MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
|
| 132 |
+
if (model == nil) { return nil; }
|
| 133 |
+
return [self initWithMLModel:model];
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
/**
|
| 138 |
+
Construct whisper_decoder_impl instance asynchronously with configuration.
|
| 139 |
+
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
|
| 140 |
+
|
| 141 |
+
@param configuration The model configuration
|
| 142 |
+
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
|
| 143 |
+
*/
|
| 144 |
+
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler {
|
| 145 |
+
[self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
|
| 146 |
+
configuration:configuration
|
| 147 |
+
completionHandler:handler];
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
/**
|
| 152 |
+
Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
|
| 153 |
+
|
| 154 |
+
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
|
| 155 |
+
|
| 156 |
+
@param modelURL The model URL.
|
| 157 |
+
@param configuration The model configuration
|
| 158 |
+
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
|
| 159 |
+
*/
|
| 160 |
+
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler {
|
| 161 |
+
[MLModel loadContentsOfURL:modelURL
|
| 162 |
+
configuration:configuration
|
| 163 |
+
completionHandler:^(MLModel *model, NSError *error) {
|
| 164 |
+
if (model != nil) {
|
| 165 |
+
whisper_decoder_impl *typedModel = [[whisper_decoder_impl alloc] initWithMLModel:model];
|
| 166 |
+
handler(typedModel, nil);
|
| 167 |
+
} else {
|
| 168 |
+
handler(nil, error);
|
| 169 |
+
}
|
| 170 |
+
}];
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
|
| 174 |
+
return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
|
| 178 |
+
id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
|
| 179 |
+
if (!outFeatures) { return nil; }
|
| 180 |
+
return [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[outFeatures featureValueForName:@"var_1346"].multiArrayValue];
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
|
| 184 |
+
whisper_decoder_implInput *input_ = [[whisper_decoder_implInput alloc] initWithToken_data:token_data audio_data:audio_data];
|
| 185 |
+
return [self predictionFromFeatures:input_ error:error];
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
- (nullable NSArray<whisper_decoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_decoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
|
| 189 |
+
id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
|
| 190 |
+
id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
|
| 191 |
+
if (!outBatch) { return nil; }
|
| 192 |
+
NSMutableArray<whisper_decoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
|
| 193 |
+
for (NSInteger i = 0; i < outBatch.count; i++) {
|
| 194 |
+
id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
|
| 195 |
+
whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[resultProvider featureValueForName:@"var_1346"].multiArrayValue];
|
| 196 |
+
[results addObject:result];
|
| 197 |
+
}
|
| 198 |
+
return results;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
@end
|
coreml/whisper-encoder-impl.h
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// whisper-encoder-impl.h
|
| 3 |
+
//
|
| 4 |
+
// This file was automatically generated and should not be edited.
|
| 5 |
+
//
|
| 6 |
+
|
| 7 |
+
#import <Foundation/Foundation.h>
|
| 8 |
+
#import <CoreML/CoreML.h>
|
| 9 |
+
#include <stdint.h>
|
| 10 |
+
#include <os/log.h>
|
| 11 |
+
|
| 12 |
+
NS_ASSUME_NONNULL_BEGIN
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
/// Model Prediction Input Type
|
| 16 |
+
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
|
| 17 |
+
@interface whisper_encoder_implInput : NSObject<MLFeatureProvider>
|
| 18 |
+
|
| 19 |
+
/// logmel_data as 1 × 80 × 3000 3-dimensional array of floats
|
| 20 |
+
@property (readwrite, nonatomic, strong) MLMultiArray * logmel_data;
|
| 21 |
+
- (instancetype)init NS_UNAVAILABLE;
|
| 22 |
+
- (instancetype)initWithLogmel_data:(MLMultiArray *)logmel_data NS_DESIGNATED_INITIALIZER;
|
| 23 |
+
|
| 24 |
+
@end
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
/// Model Prediction Output Type
|
| 28 |
+
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
|
| 29 |
+
@interface whisper_encoder_implOutput : NSObject<MLFeatureProvider>
|
| 30 |
+
|
| 31 |
+
/// output as multidimensional array of floats
|
| 32 |
+
@property (readwrite, nonatomic, strong) MLMultiArray * output;
|
| 33 |
+
- (instancetype)init NS_UNAVAILABLE;
|
| 34 |
+
- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER;
|
| 35 |
+
|
| 36 |
+
@end
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
/// Class for model loading and prediction
|
| 40 |
+
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
|
| 41 |
+
@interface whisper_encoder_impl : NSObject
|
| 42 |
+
@property (readonly, nonatomic, nullable) MLModel * model;
|
| 43 |
+
|
| 44 |
+
/**
|
| 45 |
+
URL of the underlying .mlmodelc directory.
|
| 46 |
+
*/
|
| 47 |
+
+ (nullable NSURL *)URLOfModelInThisBundle;
|
| 48 |
+
|
| 49 |
+
/**
|
| 50 |
+
Initialize whisper_encoder_impl instance from an existing MLModel object.
|
| 51 |
+
|
| 52 |
+
Usually the application does not use this initializer unless it makes a subclass of whisper_encoder_impl.
|
| 53 |
+
Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
|
| 54 |
+
*/
|
| 55 |
+
- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
|
| 56 |
+
|
| 57 |
+
/**
|
| 58 |
+
Initialize whisper_encoder_impl instance with the model in this bundle.
|
| 59 |
+
*/
|
| 60 |
+
- (nullable instancetype)init;
|
| 61 |
+
|
| 62 |
+
/**
|
| 63 |
+
Initialize whisper_encoder_impl instance with the model in this bundle.
|
| 64 |
+
|
| 65 |
+
@param configuration The model configuration object
|
| 66 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 67 |
+
*/
|
| 68 |
+
- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 69 |
+
|
| 70 |
+
/**
|
| 71 |
+
Initialize whisper_encoder_impl instance from the model URL.
|
| 72 |
+
|
| 73 |
+
@param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
|
| 74 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 75 |
+
*/
|
| 76 |
+
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 77 |
+
|
| 78 |
+
/**
|
| 79 |
+
Initialize whisper_encoder_impl instance from the model URL.
|
| 80 |
+
|
| 81 |
+
@param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
|
| 82 |
+
@param configuration The model configuration object
|
| 83 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 84 |
+
*/
|
| 85 |
+
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 86 |
+
|
| 87 |
+
/**
|
| 88 |
+
Construct whisper_encoder_impl instance asynchronously with configuration.
|
| 89 |
+
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
|
| 90 |
+
|
| 91 |
+
@param configuration The model configuration
|
| 92 |
+
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
|
| 93 |
+
*/
|
| 94 |
+
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler;
|
| 95 |
+
|
| 96 |
+
/**
|
| 97 |
+
Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
|
| 98 |
+
|
| 99 |
+
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
|
| 100 |
+
|
| 101 |
+
@param modelURL The model URL.
|
| 102 |
+
@param configuration The model configuration
|
| 103 |
+
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
|
| 104 |
+
*/
|
| 105 |
+
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler;
|
| 106 |
+
|
| 107 |
+
/**
|
| 108 |
+
Make a prediction using the standard interface
|
| 109 |
+
@param input an instance of whisper_encoder_implInput to predict from
|
| 110 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 111 |
+
@return the prediction as whisper_encoder_implOutput
|
| 112 |
+
*/
|
| 113 |
+
- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 114 |
+
|
| 115 |
+
/**
|
| 116 |
+
Make a prediction using the standard interface
|
| 117 |
+
@param input an instance of whisper_encoder_implInput to predict from
|
| 118 |
+
@param options prediction options
|
| 119 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 120 |
+
@return the prediction as whisper_encoder_implOutput
|
| 121 |
+
*/
|
| 122 |
+
- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 123 |
+
|
| 124 |
+
/**
|
| 125 |
+
Make a prediction using the convenience interface
|
| 126 |
+
@param logmel_data as 1 × 80 × 3000 3-dimensional array of floats:
|
| 127 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 128 |
+
@return the prediction as whisper_encoder_implOutput
|
| 129 |
+
*/
|
| 130 |
+
- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 131 |
+
|
| 132 |
+
/**
|
| 133 |
+
Batch prediction
|
| 134 |
+
@param inputArray array of whisper_encoder_implInput instances to obtain predictions from
|
| 135 |
+
@param options prediction options
|
| 136 |
+
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
|
| 137 |
+
@return the predictions as NSArray<whisper_encoder_implOutput *>
|
| 138 |
+
*/
|
| 139 |
+
- (nullable NSArray<whisper_encoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_encoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
|
| 140 |
+
@end
|
| 141 |
+
|
| 142 |
+
NS_ASSUME_NONNULL_END
|
coreml/whisper-encoder-impl.m
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// whisper-encoder-impl.m
|
| 3 |
+
//
|
| 4 |
+
// This file was automatically generated and should not be edited.
|
| 5 |
+
//
|
| 6 |
+
|
| 7 |
+
#if !__has_feature(objc_arc)
|
| 8 |
+
#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
|
| 9 |
+
#endif
|
| 10 |
+
|
| 11 |
+
#import "whisper-encoder-impl.h"
|
| 12 |
+
|
| 13 |
+
// Feature provider for the encoder's single input, "logmel_data".
@implementation whisper_encoder_implInput

// Designated initializer: stores the given multi-array as the logmel_data feature.
- (instancetype)initWithLogmel_data:(MLMultiArray *)logmel_data {
    self = [super init];
    if (self == nil) {
        return self;
    }
    _logmel_data = logmel_data;
    return self;
}

// MLFeatureProvider: the set of feature names this provider can supply.
- (NSSet<NSString *> *)featureNames {
    NSArray<NSString *> *names = @[@"logmel_data"];
    return [NSSet setWithArray:names];
}

// MLFeatureProvider: returns the wrapped multi-array for "logmel_data", nil for any other name.
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
    if (![featureName isEqualToString:@"logmel_data"]) {
        return nil;
    }
    return [MLFeatureValue featureValueWithMultiArray:self.logmel_data];
}

@end
|
| 35 |
+
|
| 36 |
+
// Feature provider for the encoder's single output, "output".
@implementation whisper_encoder_implOutput

// Designated initializer: stores the given multi-array as the output feature.
- (instancetype)initWithOutput:(MLMultiArray *)output {
    self = [super init];
    if (self == nil) {
        return self;
    }
    _output = output;
    return self;
}

// MLFeatureProvider: the set of feature names this provider can supply.
- (NSSet<NSString *> *)featureNames {
    NSArray<NSString *> *names = @[@"output"];
    return [NSSet setWithArray:names];
}

// MLFeatureProvider: returns the wrapped multi-array for "output", nil for any other name.
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
    if (![featureName isEqualToString:@"output"]) {
        return nil;
    }
    return [MLFeatureValue featureValueWithMultiArray:self.output];
}

@end
|
| 58 |
+
|
| 59 |
+
// Typed wrapper around an MLModel for the Whisper encoder: loads the compiled
// model and exposes prediction helpers that take/return the typed
// whisper_encoder_implInput / whisper_encoder_implOutput feature providers.
@implementation whisper_encoder_impl


/**
    URL of the underlying .mlmodelc directory.

    Looks up the resource "whisper_encoder_impl.mlmodelc" in the bundle that
    contains this class; logs an error and returns nil when it is not found.
*/
+ (nullable NSURL *)URLOfModelInThisBundle {
    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"whisper_encoder_impl" ofType:@"mlmodelc"];
    // NOTE(review): the log text names "whisper-encoder-impl" (hyphens) while the lookup above
    // uses "whisper_encoder_impl" (underscores) - confirm which resource name is intended.
    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load whisper-encoder-impl.mlmodelc in the bundle resource"); return nil; }
    return [NSURL fileURLWithPath:assetPath];
}


/**
    Initialize whisper_encoder_impl instance from an existing MLModel object.

    Usually the application does not use this initializer unless it makes a subclass of whisper_encoder_impl.
    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.

    Returns nil when `model` is nil.
*/
- (instancetype)initWithMLModel:(MLModel *)model {
    self = [super init];
    if (!self) { return nil; }
    _model = model;
    if (_model == nil) { return nil; }
    return self;
}


/**
    Initialize whisper_encoder_impl instance with the model in this bundle.

    Any load error is discarded (error:nil is passed down).
*/
- (nullable instancetype)init {
    // NOTE(review): URLOfModelInThisBundle can return nil; the cast below only silences the
    // nullability warning and does not guard against a nil URL reaching MLModel.
    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
}


/**
    Initialize whisper_encoder_impl instance with the model in this bundle.

    @param configuration The model configuration object
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    // NOTE(review): same unguarded nil-URL cast as in -init.
    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
}


/**
    Initialize whisper_encoder_impl instance from the model URL.

    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
    // Loading failed: `error` (if provided) was filled in by MLModel.
    if (model == nil) { return nil; }
    return [self initWithMLModel:model];
}


/**
    Initialize whisper_encoder_impl instance from the model URL.

    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
    @param configuration The model configuration object
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
    // Loading failed: `error` (if provided) was filled in by MLModel.
    if (model == nil) { return nil; }
    return [self initWithMLModel:model];
}


/**
    Construct whisper_encoder_impl instance asynchronously with configuration.
    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.

    @param configuration The model configuration
    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler {
    // NOTE(review): same unguarded nil-URL cast as in -init.
    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
              configuration:configuration
          completionHandler:handler];
}


/**
    Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.

    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.

    @param modelURL The model URL.
    @param configuration The model configuration
    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler {
    [MLModel loadContentsOfURL:modelURL
                 configuration:configuration
             completionHandler:^(MLModel *model, NSError *error) {
        if (model != nil) {
            // Success: hand back the typed wrapper and no error.
            whisper_encoder_impl *typedModel = [[whisper_encoder_impl alloc] initWithMLModel:model];
            handler(typedModel, nil);
        } else {
            // Failure: forward the error reported by MLModel.
            handler(nil, error);
        }
    }];
}

/**
    Make a prediction with default prediction options.

    @param input an instance of whisper_encoder_implInput to predict from
    @param error forwarded to the prediction call; may be NULL
    @return the prediction as whisper_encoder_implOutput, or nil on failure
*/
- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
}

/**
    Make a prediction with explicit prediction options.

    @param input an instance of whisper_encoder_implInput to predict from
    @param options prediction options
    @param error forwarded to the MLModel call; may be NULL
    @return the model's "output" feature wrapped in whisper_encoder_implOutput, or nil if the model returned no features
*/
- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
    if (!outFeatures) { return nil; }
    return [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
}

/**
    Convenience prediction: wraps `logmel_data` in a whisper_encoder_implInput and predicts with default options.

    @param logmel_data the model input multi-array
    @param error forwarded to the prediction call; may be NULL
    @return the prediction as whisper_encoder_implOutput, or nil on failure
*/
- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    whisper_encoder_implInput *input_ = [[whisper_encoder_implInput alloc] initWithLogmel_data:logmel_data];
    return [self predictionFromFeatures:input_ error:error];
}

/**
    Batch prediction.

    @param inputArray array of whisper_encoder_implInput instances to obtain predictions from
    @param options prediction options
    @param error forwarded to the MLModel call; may be NULL
    @return one whisper_encoder_implOutput per batch entry, in batch order, or nil if the batch prediction failed
*/
- (nullable NSArray<whisper_encoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_encoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
    if (!outBatch) { return nil; }
    NSMutableArray<whisper_encoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
    for (NSInteger i = 0; i < outBatch.count; i++) {
        // Re-wrap each generic feature provider in the typed output class.
        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
        whisper_encoder_implOutput * result = [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
        [results addObject:result];
    }
    return results;
}

@end
|
coreml/whisper-encoder.h
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Wrapper of the Core ML Whisper Encoder model
|
| 2 |
+
//
|
| 3 |
+
// Code is derived from the work of GitHub user @wangchou
|
| 4 |
+
// ref: https://github.com/wangchou/callCoreMLFromCpp
|
| 5 |
+
|
| 6 |
+
#ifdef __cplusplus
extern "C" {
#endif

// Opaque handle owning a loaded Core ML encoder model.
struct whisper_coreml_context;

// Load the compiled Core ML encoder model located at `path_model`.
// Returns a new context, or NULL if the model could not be loaded.
// The returned context must be released with whisper_coreml_free().
struct whisper_coreml_context * whisper_coreml_init(const char * path_model);

// Release the model and the context created by whisper_coreml_init().
void whisper_coreml_free(struct whisper_coreml_context * ctx);

// Run the encoder.
//   ctx - context returned by whisper_coreml_init()
//   mel - input log-mel data, read as a contiguous 1 x 80 x 3000 float32 array
//   out - destination buffer; receives every element of the model output as float,
//         so the caller must size it for the full encoder output
void whisper_coreml_encode(
        const struct whisper_coreml_context * ctx,
                                      float * mel,
                                      float * out);

#ifdef __cplusplus
}
#endif
|
coreml/whisper-encoder.mm
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#import "coreml/whisper-encoder.h"
|
| 2 |
+
#import "coreml/whisper-encoder-impl.h"
|
| 3 |
+
|
| 4 |
+
#import <CoreML/CoreML.h>
|
| 5 |
+
|
| 6 |
+
#include <stdlib.h>
|
| 7 |
+
|
| 8 |
+
#if __cplusplus
|
| 9 |
+
extern "C" {
|
| 10 |
+
#endif
|
| 11 |
+
|
| 12 |
+
// Context handed out to C/C++ callers; hides the Objective-C model object behind a plain pointer.
struct whisper_coreml_context {
    // whisper_encoder_impl instance, retained with CFBridgingRetain in whisper_coreml_init
    // and balanced by CFRelease in whisper_coreml_free.
    const void * data;
};
|
| 15 |
+
|
| 16 |
+
// Load the compiled Core ML encoder model at `path_model` and wrap it in a new context.
//
// Returns NULL when `path_model` is NULL, is not valid UTF-8, or the model cannot be
// loaded. On success the caller owns the context and must pass it to whisper_coreml_free().
struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
    // -initWithUTF8String: raises an exception on NULL, so reject it up front.
    if (path_model == NULL) {
        return NULL;
    }

    NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model];

    // nil means the bytes were not valid UTF-8; +fileURLWithPath: would raise on nil.
    if (path_model_str == nil) {
        return NULL;
    }

    NSURL * url_model = [NSURL fileURLWithPath: path_model_str];

    // CFBridgingRetain takes a +1 ownership that whisper_coreml_free balances with CFRelease.
    // A failed load yields nil, which bridges to NULL.
    const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model error:nil]);

    if (data == NULL) {
        return NULL;
    }

    whisper_coreml_context * ctx = new whisper_coreml_context;

    ctx->data = data;

    return ctx;
}
|
| 33 |
+
|
| 34 |
+
// Release the model held by `ctx` and destroy the context.
// Passing NULL is a no-op, matching the free(NULL) convention.
void whisper_coreml_free(struct whisper_coreml_context * ctx) {
    if (ctx == NULL) {
        return;
    }

    // CFRelease traps on NULL, so only release a model that was actually retained.
    if (ctx->data != NULL) {
        CFRelease(ctx->data);
    }

    delete ctx;
}
|
| 38 |
+
|
| 39 |
+
// Run the Core ML encoder on one log-mel input and copy the result into `out`.
//
//   ctx - context created by whisper_coreml_init()
//   mel - input buffer, wrapped (not copied) as a contiguous 1 x 80 x 3000 float32 array
//   out - destination; receives outMA.count floats, so it must be large enough for the
//         whole encoder output
//
// If the input cannot be wrapped or the prediction fails, `out` is left untouched.
void whisper_coreml_encode(
        const whisper_coreml_context * ctx,
                               float * mel,
                               float * out) {
    // This is called from plain C/C++ threads that may have no enclosing autorelease pool;
    // drain the temporaries Core ML creates on every call instead of letting them pile up.
    @autoreleasepool {
        // deallocator:nil - `mel` stays owned by the caller.
        MLMultiArray * inMultiArray = [
            [MLMultiArray alloc] initWithDataPointer: mel
                                               shape: @[@1, @80, @3000]
                                            dataType: MLMultiArrayDataTypeFloat32
                                             strides: @[@(240000), @(3000), @1]
                                         deallocator: nil
                                               error: nil
        ];

        if (inMultiArray == nil) {
            return;
        }

        whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];

        MLMultiArray * outMA = outCoreML.output;

        // A failed prediction yields nil here; bail out rather than memcpy from a NULL pointer.
        if (outMA == nil) {
            return;
        }

        // NOTE(review): copies as float32 - assumes the model's output dataType is Float32; confirm
        // against the converted model.
        memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
    }
}
|
| 64 |
+
|
| 65 |
+
#if __cplusplus
|
| 66 |
+
}
|
| 67 |
+
#endif
|
extra/bench-all.sh
CHANGED
|
@@ -64,6 +64,10 @@ for model in "${models[@]}"; do
|
|
| 64 |
config="$config BLAS"
|
| 65 |
fi
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
commit=$(git rev-parse --short HEAD)
|
| 68 |
|
| 69 |
printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
|
|
|
|
| 64 |
config="$config BLAS"
|
| 65 |
fi
|
| 66 |
|
| 67 |
+
if [[ $system_info == *"COREML = 1"* ]]; then
|
| 68 |
+
config="$config COREML"
|
| 69 |
+
fi
|
| 70 |
+
|
| 71 |
commit=$(git rev-parse --short HEAD)
|
| 72 |
|
| 73 |
printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
|
models/convert-whisper-to-coreml.py
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import coremltools as ct
|
| 5 |
+
|
| 6 |
+
from torch import Tensor
|
| 7 |
+
from torch import nn
|
| 8 |
+
from typing import Dict
|
| 9 |
+
from typing import Optional
|
| 10 |
+
from ane_transformers.reference.layer_norm import LayerNormANE as LayerNormANEBase
|
| 11 |
+
from coremltools.models.neural_network.quantization_utils import quantize_weights
|
| 12 |
+
from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions
|
| 13 |
+
from whisper import load_model
|
| 14 |
+
|
| 15 |
+
# Use for changing dim of input in encoder and decoder embeddings
|
| 16 |
+
def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict,
                         missing_keys, unexpected_keys, error_msgs):
    """
    Reshape nn.Linear weights in ``state_dict`` so they load into 1x1 nn.Conv2d layers.

    Written with the signature of a load-state-dict pre-hook; only ``state_dict`` is
    used. Every 2-D entry whose key is an attention weight (contains both 'attn' and
    '.weight') or an MLP weight (ends with 'mlp.0.weight' / 'mlp.2.weight') gets two
    trailing singleton dims appended: (out, in) -> (out, in, 1, 1).
    The dict is modified in place; nothing is returned.
    """
    mlp_suffixes = ('mlp.0.weight', 'mlp.2.weight')

    for name in state_dict:
        attention_weight = 'attn' in name and '.weight' in name
        mlp_weight = name.endswith(mlp_suffixes)
        if not (attention_weight or mlp_weight):
            continue

        # Only true matrices are Linear weights; 1-D norm weights and biases stay as they are.
        if len(state_dict[name].shape) == 2:
            state_dict[name] = state_dict[name][:, :, None, None]
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def correct_for_bias_scale_order_inversion(state_dict, prefix, local_metadata,
                                           strict, missing_keys,
                                           unexpected_keys, error_msgs):
    """
    Load-state-dict pre-hook that rescales a layer-norm bias by its weight.

    Replaces ``state_dict[prefix + 'bias']`` with ``bias / weight`` (element-wise) and
    returns the same dict. Only ``state_dict`` and ``prefix`` are used.
    Presumably compensates for the ANE layer norm applying bias before scale - TODO confirm
    against ane_transformers' LayerNormANE.
    """
    bias_key = prefix + 'bias'
    weight_key = prefix + 'weight'

    rescaled_bias = state_dict[bias_key] / state_dict[weight_key]
    state_dict[bias_key] = rescaled_bias

    return state_dict
|
| 34 |
+
|
| 35 |
+
class LayerNormANE(LayerNormANEBase):
    """
    LayerNormANEBase that rescales the incoming bias while a state dict is being loaded,
    via the correct_for_bias_scale_order_inversion pre-hook.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Runs before the parameters are copied in, so the bias is adjusted on load.
        pre_hook = correct_for_bias_scale_order_inversion
        self._register_load_state_dict_pre_hook(pre_hook)
|
| 41 |
+
|
| 42 |
+
class MultiHeadAttentionANE(MultiHeadAttention):
    """
    MultiHeadAttention variant whose query/key/value/out projections are 1x1 nn.Conv2d
    layers and whose attention is computed per head on 4-D tensors.
    """

    def __init__(self, n_state: int, n_head: int):
        super().__init__(n_state, n_head)

        # Replace the projections created by the parent class with 1x1 convolutions.
        # 'key' is the only projection without a bias.
        setattr(self, 'query', nn.Conv2d(n_state, n_state, kernel_size=1))
        setattr(self, 'key', nn.Conv2d(n_state, n_state, kernel_size=1, bias=False))
        setattr(self, 'value', nn.Conv2d(n_state, n_state, kernel_size=1))
        setattr(self, 'out', nn.Conv2d(n_state, n_state, kernel_size=1))

    def forward(self,
                x: Tensor,
                xa: Optional[Tensor] = None,
                mask: Optional[Tensor] = None,
                kv_cache: Optional[dict] = None):
        """
        Attend from ``x`` to itself (``xa`` is None) or to ``xa`` (cross-attention).

        Returns a tuple ``(out, qk)``: the output projection of the attended values and
        the detached float attention scores produced by qkv_attention_ane.
        """

        q = self.query(x)

        if kv_cache is None or xa is None or self.key not in kv_cache:
            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
            # otherwise, perform key/value projections for self- or cross-attention as usual.
            k = self.key(x if xa is None else xa)
            v = self.value(x if xa is None else xa)

        else:
            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
            # The cache is keyed by the projection modules themselves.
            k = kv_cache[self.key]
            v = kv_cache[self.value]

        wv, qk = self.qkv_attention_ane(q, k, v, mask)

        return self.out(wv), qk

    def qkv_attention_ane(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None):
        """
        Scaled dot-product attention computed one head at a time.

        ``q`` must be 4-D; its dims are unpacked as (_, dim, _, seqlen).
        Returns ``(attn, qk)`` where ``attn`` is the per-head results concatenated along
        dim 1 and ``qk`` is the per-head pre-softmax scores (mask already added, when
        given) concatenated along dim 1, cast to float and detached.
        """

        _, dim, _, seqlen = q.size()

        dim_per_head = dim // self.n_head

        # 1/sqrt(d_head), applied to the queries before the dot product
        scale = float(dim_per_head)**-0.5

        q = q * scale

        # Split into one chunk per head: q and v along dim 1; k is transposed (dims 1 and 3
        # swapped) first, so its per-head split happens along dim 3.
        mh_q = q.split(dim_per_head, dim=1)
        mh_k = k.transpose(1,3).split(dim_per_head, dim=3)
        mh_v = v.split(dim_per_head, dim=1)

        # Per-head scores: contract over the channel axis 'c'; the key axis 'k' ends up on dim 1.
        mh_qk = [
            torch.einsum('bchq,bkhc->bkhq', [qi, ki])
            for qi, ki in zip(mh_q, mh_k)
        ]  # (batch_size, max_seq_length, 1, max_seq_length) * n_heads

        if mask is not None:
            # The same mask slice (cropped to seqlen on dims 1 and 3) is added to every head.
            for head_idx in range(self.n_head):
                mh_qk[head_idx] = mh_qk[head_idx] + mask[:, :seqlen, :, :seqlen]

        # softmax over dim 1, i.e. the key axis of the 'bkhq' layout above
        attn_weights = [aw.softmax(dim=1) for aw in mh_qk]  # (batch_size, max_seq_length, 1, max_seq_length) * n_heads
        attn = [torch.einsum('bkhq,bchk->bchq', wi, vi) for wi, vi in zip(attn_weights, mh_v)]  # (batch_size, dim_per_head, 1, max_seq_length) * n_heads
        attn = torch.cat(attn, dim=1)  # (batch_size, dim, 1, max_seq_length)

        return attn, torch.cat(mh_qk, dim=1).float().detach()
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class ResidualAttentionBlockANE(ResidualAttentionBlock):
    """
    ResidualAttentionBlock whose sub-modules are replaced by their ANE variants.

    After the parent constructor has run, the attention modules become
    MultiHeadAttentionANE, the layer norms become LayerNormANE and the MLP
    becomes a pair of 1x1 Conv2d layers around a GELU.
    """

    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
        super().__init__(n_state, n_head, cross_attention)

        self.attn = MultiHeadAttentionANE(n_state, n_head)
        self.attn_ln = LayerNormANE(n_state)

        # cross-attention members exist only when requested, otherwise they are None
        if cross_attention:
            self.cross_attn = MultiHeadAttentionANE(n_state, n_head)
            self.cross_attn_ln = LayerNormANE(n_state)
        else:
            self.cross_attn = None
            self.cross_attn_ln = None

        # MLP with a 4x wider hidden layer
        hidden = n_state * 4
        self.mlp = nn.Sequential(
            nn.Conv2d(n_state, hidden, kernel_size=1),
            nn.GELU(),
            nn.Conv2d(hidden, n_state, kernel_size=1),
        )
        self.mlp_ln = LayerNormANE(n_state)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
class AudioEncoderANE(AudioEncoder):
    """AudioEncoder built from ResidualAttentionBlockANE blocks and a LayerNormANE output norm."""

    def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__(n_mels, n_ctx, n_state, n_head, n_layer)

        ane_blocks = [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
        self.blocks = nn.ModuleList(ane_blocks)
        self.ln_post = LayerNormANE(n_state)

    def forward(self, x: Tensor):
        """
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio

        Returns the output of the final layer norm; no transpose is applied
        (see the TODO at the end of this method).
        """
        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))

        assert x.shape[1:] == self.positional_embedding.shape[::-1], "incorrect audio shape"

        # Add positional embedding and add dummy dim for ANE
        pos_emb = self.positional_embedding.transpose(0, 1)
        x = (x + pos_emb).to(x.dtype).unsqueeze(2)

        for block in self.blocks:
            x = block(x)

        x = self.ln_post(x)

        # TODO:
        # I think we need to transpose the result here to make it fit whisper.cpp memory order.
        # However, even doing this, the results are still wrong. Kind of less wrong compared to
        # not transposing, but still wrong.
        #
        # Also, I don't know why the original OpenAI implementation does not need to transpose
        #
        # transpose to (batch_size, n_ctx, n_state)
        # x : torch.Tensor, shape = (batch_size, n_state, 1, n_ctx)
        #
        # x = x.transpose(1,3)

        return x
|
| 165 |
+
|
| 166 |
+
class TextDecoderANE(TextDecoder):
    """TextDecoder whose blocks and final layer norm are replaced by their ANE variants."""

    def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer)

        setattr(self, 'blocks', nn.ModuleList(
            [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
        ))
        setattr(self, 'ln', LayerNormANE(n_state))

    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
        """
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor
            the encoded audio features to be attended on; convert_decoder() in
            this file traces the model with shape (1, n_audio_state, 1, 1500)
        kv_cache : optional dict of cached key/value tensors; when non-empty,
            dim 3 of its first entry is used as the positional offset

        Returns the logits with shape (batch_size, n_tokens, n_vocab).
        """
        offset = next(iter(kv_cache.values())).shape[3] if kv_cache else 0
        x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]]
        x = x.to(xa.dtype)

        # Reformat for ANE
        mask = self.mask[None, None, :, :].permute(0,3,1,2)
        x = x.transpose(1,2).unsqueeze(2)

        for block in self.blocks:
            x = block(x, xa, mask=mask, kv_cache=kv_cache)

        x = self.ln(x)

        # Reformat back from ANE: (batch, state, 1, tokens) -> (batch, 1, tokens, state),
        # then drop the dummy axis. Squeezing dim 1 (not dim 0) keeps the batch axis
        # intact for any batch size.
        x = x.permute(0,2,3,1).squeeze(1)

        # ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en)
        # or 51,865 (multi-lang) tokens so we need to compute in chunks
        n_vocab = self.token_embedding.weight.shape[0]
        if n_vocab == 51865:
            # split in 11 chunks - 4715 each
            n_chunks = 11
        else:
            # split in 12 chunks - 4322 each
            assert(n_vocab == 51864)
            n_chunks = 12

        splits = self.token_embedding.weight.split(n_vocab // n_chunks, dim=0)

        # Each chunk yields (batch, tokens, chunk_vocab); the chunks must be joined
        # along the vocabulary axis. Joining on dim 0 and reshaping afterwards
        # interleaves rows of different chunks whenever more than one token is decoded.
        logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits], dim=2)

        return logits
|
| 211 |
+
|
| 212 |
+
class WhisperANE(Whisper):
    """
    Whisper model whose encoder and decoder are replaced by the ANE variants.

    A load_state_dict pre-hook (linear_to_conv2d_map) is registered so that
    state dicts can be loaded into the replaced modules.
    """

    def __init__(self, dims: ModelDimensions):
        super().__init__(dims)

        d = self.dims

        self.encoder = AudioEncoderANE(
            d.n_mels,
            d.n_audio_ctx,
            d.n_audio_state,
            d.n_audio_head,
            d.n_audio_layer,
        )
        self.decoder = TextDecoderANE(
            d.n_vocab,
            d.n_text_ctx,
            d.n_text_state,
            d.n_text_head,
            d.n_text_layer,
        )

        self._register_load_state_dict_pre_hook(linear_to_conv2d_map)

    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]:
        """Encode `mel` and feed the result, together with `tokens`, to the decoder."""
        audio_features = self.encoder(mel)
        return self.decoder(tokens, audio_features)

    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
        """
        Register forward hooks on the key/value projections of every
        MultiHeadAttentionANE module in the decoder.

        Each hook stores the projection output in a dict keyed by the module.
        Later outputs are appended along dim 3, unless the output is longer
        than the decoder's positional embedding, in which case it is stored as-is.

        Returns (cache, hooks): the dict (a shallow copy of `cache` when one is
        passed) and the list of hook handles.
        """
        cache = dict(cache) if cache is not None else {}
        hooks = []

        def save_to_cache(module, _, output):
            store_as_is = (
                module not in cache
                or output.shape[3] > self.decoder.positional_embedding.shape[0]
            )
            if store_as_is:
                # save as-is, for the first token or cross attention
                cache[module] = output
            else:
                cache[module] = torch.cat([cache[module], output], dim=3).detach()
            return cache[module]

        def install_hooks(layer: nn.Module):
            if not isinstance(layer, MultiHeadAttentionANE):
                return
            for projection in (layer.key, layer.value):
                hooks.append(projection.register_forward_hook(save_to_cache))

        self.decoder.apply(install_hooks)
        return cache, hooks
|
| 254 |
+
|
| 255 |
+
def convert_encoder(hparams, model, quantize=False):
    """
    Trace the encoder on a random (1, 80, 3000) input and convert it with coremltools.

    hparams  : accepted for symmetry with convert_decoder(); not used here
    model    : the encoder module to convert (switched to eval mode)
    quantize : when true, the weights of the converted model are quantized to 16 bits

    Returns the converted Core ML model.
    """
    model.eval()

    mel_shape = (1, 80, 3000)
    traced_encoder = torch.jit.trace(model, torch.randn(mel_shape))

    # original author's note: convert will fail if weights are quantized, not sure why
    target = None if quantize else "mlprogram"

    mlmodel = ct.convert(
        traced_encoder,
        convert_to=target,
        inputs=[ct.TensorType(name="logmel_data", shape=mel_shape)],
        outputs=[ct.TensorType(name="output")],
        compute_units=ct.ComputeUnit.ALL,
    )

    if not quantize:
        return mlmodel

    return quantize_weights(mlmodel, nbits=16)
|
| 274 |
+
|
| 275 |
+
def convert_decoder(hparams, model, quantize=False):
    """
    Trace the decoder on a single random token plus random audio features and
    convert it with coremltools.

    hparams  : model dimensions; n_audio_state sizes the audio feature input
    model    : the decoder module to convert (switched to eval mode)
    quantize : when true, the weights of the converted model are quantized to 16 bits

    Returns the converted Core ML model.
    """
    model.eval()

    tokens_shape = (1, 1)
    audio_shape = (1, hparams.n_audio_state, 1, 1500)

    # audio tensor is drawn first, then the token, matching the RNG order of the trace inputs
    example_audio = torch.randn(audio_shape)
    example_tokens = torch.randint(50257, tokens_shape).long()
    traced_decoder = torch.jit.trace(model, (example_tokens, example_audio))

    # original author's note: convert will fail if weights are quantized, not sure why
    target = None if quantize else "mlprogram"

    mlmodel = ct.convert(
        traced_decoder,
        convert_to=target,
        inputs=[
            ct.TensorType(name="token_data", shape=tokens_shape, dtype=int),
            ct.TensorType(name="audio_data", shape=audio_shape),
        ],
    )

    if not quantize:
        return mlmodel

    return quantize_weights(mlmodel, nbits=16)
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
if __name__ == "__main__":
    def _parse_bool(text):
        """
        Parse a boolean command-line value.

        argparse's `type=bool` calls bool() on the raw string, so every
        non-empty value - including "False" - becomes True. This parser
        accepts the usual spellings and rejects anything else.
        """
        lowered = text.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y", "on"):
            return True
        # the empty string stays falsy, as it was with type=bool
        if lowered in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {text!r}")

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large)", required=True)
    parser.add_argument("--encoder-only", type=_parse_bool, help="only convert encoder", default=False)
    parser.add_argument("--quantize", type=_parse_bool, help="quantize weights to F16", default=False)
    parser.add_argument("--optimize-ane", type=_parse_bool, help="optimize for ANE execution (currently broken)", default=False)
    args = parser.parse_args()

    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large"]:
        raise ValueError("Invalid model name")

    whisper = load_model(args.model).cpu()
    hparams = whisper.dims
    print(hparams)

    if args.optimize_ane:
        # rebuild the model from ANE modules and copy the weights over
        whisperANE = WhisperANE(hparams).eval()
        whisperANE.load_state_dict(whisper.state_dict())

        encoder = whisperANE.encoder
        decoder = whisperANE.decoder
    else:
        encoder = whisper.encoder
        decoder = whisper.decoder

    # Convert encoder
    encoder = convert_encoder(hparams, encoder, quantize=args.quantize)
    encoder.save(f"models/coreml-encoder-{args.model}.mlpackage")

    if not args.encoder_only:
        # Convert decoder
        decoder = convert_decoder(hparams, decoder, quantize=args.quantize)
        decoder.save(f"models/coreml-decoder-{args.model}.mlpackage")

    print("done converting")
|
models/download-coreml-model.sh
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# This script downloads Whisper model files that have already been converted to Core ML format.
# This way you don't have to convert them yourself.

src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
pfx="resolve/main/ggml"

# get the path of this script
# (prefers realpath; otherwise falls back to cd + pwd -P)
function get_script_path() {
    if [ -x "$(command -v realpath)" ]; then
        dirname "$(realpath "$0")"
    else
        local ret
        ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
        echo "$ret"
    fi
}

models_path="$(get_script_path)"

# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )

# list available models
function list_models {
    local name
    printf "\n"
    printf " Available models:"
    for name in "${models[@]}"; do
        printf " %s" "$name"
    done
    printf "\n\n"
}

if [ "$#" -ne 1 ]; then
    printf "Usage: %s <model>\n" "$0"
    list_models

    exit 1
fi

model=$1

# exact-match lookup: a substring test against the joined list would also
# accept inputs that span two entries, such as "tiny base.en"
model_is_known=0
for known in "${models[@]}"; do
    if [ "$known" = "$model" ]; then
        model_is_known=1
        break
    fi
done

if [ "$model_is_known" -ne 1 ]; then
    printf "Invalid model: %s\n" "$model"
    list_models

    exit 1
fi

# download Core ML model

printf "Downloading Core ML model %s from '%s' ...\n" "$model" "$src"

cd "$models_path" || exit 1

out_file="ggml-$model.mlmodel"
url="$src/$pfx-$model.mlmodel"

if [ -f "$out_file" ]; then
    printf "Model %s already exists. Skipping download.\n" "$model"
    exit 0
fi

if [ -x "$(command -v wget)" ]; then
    wget --quiet --show-progress -O "$out_file" "$url"
    status=$?
elif [ -x "$(command -v curl)" ]; then
    # --fail makes curl exit non-zero on HTTP errors instead of saving the error page
    curl -L --fail --output "$out_file" "$url"
    status=$?
else
    printf "Either wget or curl is required to download models.\n"
    exit 1
fi

if [ "$status" -ne 0 ]; then
    # remove the partial file, otherwise the next run reports "already exists"
    rm -f "$out_file"
    printf "Failed to download Core ML model %s \n" "$model"
    printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
    exit 1
fi

printf "Done! Model '%s' saved in 'models/ggml-%s.mlmodel'\n" "$model" "$model"
printf "Run the following command to compile it:\n\n"
printf " $ xcrun coremlc compile ./models/ggml-%s.mlmodel ./models\n\n" "$model"
printf "You can now use it like this:\n\n"
printf " $ ./main -m models/ggml-%s.bin -f samples/jfk.wav\n" "$model"
printf "\n"
|
models/generate-coreml-interface.sh
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
#
# This generates:
#   - coreml/whisper-encoder-impl.h and coreml/whisper-encoder-impl.m
#   - coreml/whisper-decoder-impl.h and coreml/whisper-decoder-impl.m
#

# stop at the first failing step: without this, a failed conversion or code
# generation would still run the mv/sed/rm steps on missing or stale files
set -e

wd=$(dirname "$0")
cd "$wd/../"

python3 models/convert-whisper-to-coreml.py --model tiny.en

# the same steps apply to the encoder and the decoder package
for part in encoder decoder; do
    mv -v "models/coreml-${part}-tiny.en.mlpackage" "models/whisper-${part}-impl.mlpackage"
    xcrun coremlc generate "models/whisper-${part}-impl.mlpackage" coreml/

    # rename the generated files from underscores to dashes ...
    mv "coreml/whisper_${part}_impl.h" "coreml/whisper-${part}-impl.h"
    mv "coreml/whisper_${part}_impl.m" "coreml/whisper-${part}-impl.m"

    # ... and fix the file names referenced inside them
    sed -i '' "s/whisper_${part}_impl\.h/whisper-${part}-impl.h/g" "coreml/whisper-${part}-impl.m"
    sed -i '' "s/whisper_${part}_impl\.m/whisper-${part}-impl.m/g" "coreml/whisper-${part}-impl.m"
    sed -i '' "s/whisper_${part}_impl\.h/whisper-${part}-impl.h/g" "coreml/whisper-${part}-impl.h"
done

rm -rfv models/whisper-encoder-impl.mlpackage models/whisper-decoder-impl.mlpackage
|
models/generate-coreml-model.sh
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# Usage: ./generate-coreml-model.sh <model-name>
if [ $# -eq 0 ]
  then
    echo "No model name supplied"
    echo "Usage: ./generate-coreml-model.sh <model-name>"
    exit 1
fi

# stop at the first failing step: otherwise a failed conversion or compile
# would still delete the previously compiled model below
set -e

mname="$1"

wd=$(dirname "$0")
cd "$wd/../"

python3 models/convert-whisper-to-coreml.py --model "$mname" --encoder-only True

xcrun coremlc compile "models/coreml-encoder-${mname}.mlpackage" models/

# replace any previously compiled model with the new one
rm -rf "models/ggml-${mname}-encoder.mlmodelc"
mv -v "models/coreml-encoder-${mname}.mlmodelc" "models/ggml-${mname}-encoder.mlmodelc"

# TODO: decoder (sometime in the future maybe)
#xcrun coremlc compile models/whisper-decoder-${mname}.mlpackage models/
#rm -rf models/ggml-${mname}-decoder.mlmodelc
#mv -v models/coreml_decoder_${mname}.mlmodelc models/ggml-${mname}-decoder.mlmodelc
|
whisper.cpp
CHANGED
|
@@ -1,5 +1,8 @@
|
|
| 1 |
#define WHISPER_BUILD
|
| 2 |
#include "whisper.h"
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
#include "ggml.h"
|
| 5 |
|
|
@@ -586,6 +589,11 @@ struct whisper_state {
|
|
| 586 |
|
| 587 |
int lang_id = 0; // english by default
|
| 588 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 589 |
// [EXPERIMENTAL] token-level timestamps data
|
| 590 |
int64_t t_beg = 0;
|
| 591 |
int64_t t_last = 0;
|
|
@@ -1376,6 +1384,7 @@ static bool whisper_encode_internal(
|
|
| 1376 |
}
|
| 1377 |
}
|
| 1378 |
|
|
|
|
| 1379 |
struct ggml_tensor * cur;
|
| 1380 |
|
| 1381 |
// convolution + gelu
|
|
@@ -1683,6 +1692,13 @@ static bool whisper_encode_internal(
|
|
| 1683 |
|
| 1684 |
//ggml_graph_print(&gf);
|
| 1685 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1686 |
|
| 1687 |
// cur
|
| 1688 |
//{
|
|
@@ -2470,6 +2486,20 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
|
|
| 2470 |
// interface implementation
|
| 2471 |
//
|
| 2472 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2473 |
struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
| 2474 |
whisper_state * state = new whisper_state;
|
| 2475 |
|
|
@@ -2497,6 +2527,21 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
|
| 2497 |
fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
| 2498 |
}
|
| 2499 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2500 |
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
|
| 2501 |
|
| 2502 |
state->logits_id.reserve(ctx->model.hparams.n_vocab);
|
|
@@ -2531,6 +2576,7 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
|
|
| 2531 |
}
|
| 2532 |
|
| 2533 |
loader.context = &fin;
|
|
|
|
| 2534 |
loader.read = [](void * ctx, void * output, size_t read_size) {
|
| 2535 |
std::ifstream * fin = (std::ifstream*)ctx;
|
| 2536 |
fin->read((char *)output, read_size);
|
|
@@ -2663,6 +2709,11 @@ void whisper_free_state(struct whisper_state * state)
|
|
| 2663 |
kv_cache_free(state->decoders[i].kv_self);
|
| 2664 |
}
|
| 2665 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2666 |
delete state;
|
| 2667 |
}
|
| 2668 |
}
|
|
@@ -3084,6 +3135,14 @@ void whisper_reset_timings(struct whisper_context * ctx) {
|
|
| 3084 |
}
|
| 3085 |
}
|
| 3086 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3087 |
const char * whisper_print_system_info(void) {
|
| 3088 |
static std::string s;
|
| 3089 |
|
|
@@ -3100,6 +3159,7 @@ const char * whisper_print_system_info(void) {
|
|
| 3100 |
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
| 3101 |
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
| 3102 |
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
|
|
|
| 3103 |
|
| 3104 |
return s.c_str();
|
| 3105 |
}
|
|
|
|
| 1 |
#define WHISPER_BUILD
|
| 2 |
#include "whisper.h"
|
| 3 |
+
// Core ML encoder interface; tested with #ifdef like every other
// WHISPER_USE_COREML check in this file, so that defining the macro
// without a value also works
#ifdef WHISPER_USE_COREML
#include "coreml/whisper-encoder.h"
#endif
|
| 6 |
|
| 7 |
#include "ggml.h"
|
| 8 |
|
|
|
|
| 589 |
|
| 590 |
int lang_id = 0; // english by default
|
| 591 |
|
| 592 |
+
std::string path_model; // populated by whisper_init_from_file()
|
| 593 |
+
#ifdef WHISPER_USE_COREML
    // Core ML encoder handle; stays null until whisper_init_state() assigns it
    whisper_coreml_context * ctx_coreml = nullptr;
#endif
|
| 596 |
+
|
| 597 |
// [EXPERIMENTAL] token-level timestamps data
|
| 598 |
int64_t t_beg = 0;
|
| 599 |
int64_t t_last = 0;
|
|
|
|
| 1384 |
}
|
| 1385 |
}
|
| 1386 |
|
| 1387 |
+
#ifndef WHISPER_USE_COREML
|
| 1388 |
struct ggml_tensor * cur;
|
| 1389 |
|
| 1390 |
// convolution + gelu
|
|
|
|
| 1692 |
|
| 1693 |
//ggml_graph_print(&gf);
|
| 1694 |
}
|
| 1695 |
+
#else
|
| 1696 |
+
wstate.use_buf(ctx0, -1);
|
| 1697 |
+
|
| 1698 |
+
struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
|
| 1699 |
+
|
| 1700 |
+
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
|
| 1701 |
+
#endif
|
| 1702 |
|
| 1703 |
// cur
|
| 1704 |
//{
|
|
|
|
| 2486 |
// interface implementation
|
| 2487 |
//
|
| 2488 |
|
| 2489 |
+
#ifdef WHISPER_USE_COREML
|
| 2490 |
+
// replace the file extension (e.g. ".bin") with "-encoder.mlmodelc"
//
// only a '.' inside the last path component counts as the start of an extension,
// so paths like "./models/ggml-base" or "../ggml-base" are not cut off at the
// directory part; a path without an extension simply gets the suffix appended
static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
    const auto pos_sep = path_bin.find_last_of("/\\");
    const auto pos_dot = path_bin.rfind('.');

    const bool has_ext =
        pos_dot != std::string::npos &&
        (pos_sep == std::string::npos || pos_dot > pos_sep);

    if (has_ext) {
        path_bin = path_bin.substr(0, pos_dot);
    }

    path_bin += "-encoder.mlmodelc";

    return path_bin;
}
|
| 2501 |
+
#endif
|
| 2502 |
+
|
| 2503 |
struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
| 2504 |
whisper_state * state = new whisper_state;
|
| 2505 |
|
|
|
|
| 2527 |
fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
| 2528 |
}
|
| 2529 |
|
| 2530 |
+
#ifdef WHISPER_USE_COREML
    // the Core ML encoder path is derived from the ggml model path
    const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);

    fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);

    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
    if (!state->ctx_coreml) {
        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
        // the caller only receives nullptr, so release the state allocated at
        // the top of this function instead of leaking it
        delete state;
        return nullptr;
    }

    fprintf(stderr, "%s: Core ML model loaded\n", __func__);
#endif
|
| 2544 |
+
|
| 2545 |
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
|
| 2546 |
|
| 2547 |
state->logits_id.reserve(ctx->model.hparams.n_vocab);
|
|
|
|
| 2576 |
}
|
| 2577 |
|
| 2578 |
loader.context = &fin;
|
| 2579 |
+
|
| 2580 |
loader.read = [](void * ctx, void * output, size_t read_size) {
|
| 2581 |
std::ifstream * fin = (std::ifstream*)ctx;
|
| 2582 |
fin->read((char *)output, read_size);
|
|
|
|
| 2709 |
kv_cache_free(state->decoders[i].kv_self);
|
| 2710 |
}
|
| 2711 |
|
| 2712 |
+
#ifdef WHISPER_USE_COREML
|
| 2713 |
+
whisper_coreml_free(state->ctx_coreml);
|
| 2714 |
+
state->ctx_coreml = nullptr;
|
| 2715 |
+
#endif
|
| 2716 |
+
|
| 2717 |
delete state;
|
| 2718 |
}
|
| 2719 |
}
|
|
|
|
| 3135 |
}
|
| 3136 |
}
|
| 3137 |
|
| 3138 |
+
// returns 1 when this file was compiled with WHISPER_USE_COREML defined, 0 otherwise
static int whisper_has_coreml(void) {
    int enabled = 0;
#ifdef WHISPER_USE_COREML
    enabled = 1;
#endif
    return enabled;
}
|
| 3145 |
+
|
| 3146 |
const char * whisper_print_system_info(void) {
|
| 3147 |
static std::string s;
|
| 3148 |
|
|
|
|
| 3159 |
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
| 3160 |
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
| 3161 |
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
| 3162 |
+
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
|
| 3163 |
|
| 3164 |
return s.c_str();
|
| 3165 |
}
|