ggerganov committed · unverified
Commit 1fd4bde · 1 Parent(s): 4b562fc

whisper : add Core ML support (#566)


* coreml : use Core ML encoder inference

* coreml : simplify whisper_encode + log messages

* whisper : resolve rebase conflicts

* coreml : add scripts for CoreML model generation

* bench-all : recognize COREML flag

.gitignore CHANGED
@@ -1,6 +1,7 @@
 *.o
 *.a
 .cache/
+.coreml/
 .test/
 .vs/
 .vscode/
@@ -35,4 +36,6 @@ examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
 
 extra/bench-gg.txt
 
-*.mlmodel*
+models/*.mlmodel
+models/*.mlmodelc
+models/*.mlpackage
CMakeLists.txt CHANGED
@@ -58,6 +58,8 @@ if (APPLE)
     option(WHISPER_NO_AVX  "whisper: disable AVX"  OFF)
     option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
     option(WHISPER_NO_FMA  "whisper: disable FMA"  OFF)
+
+    option(WHISPER_COREML  "whisper: enable Core ML framework" OFF)
 else()
     option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
 endif()
@@ -90,16 +92,33 @@ endif()
 
 find_package(Threads REQUIRED)
 
-# on APPLE - include Accelerate framework
-if (APPLE AND NOT WHISPER_NO_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
+# on APPLE
+if (APPLE)
+    # include Accelerate framework
+    if (NOT WHISPER_NO_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate)
+
+        if (ACCELERATE_FRAMEWORK)
+            message(STATUS "Accelerate framework found")
 
-        set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-    else()
-        message(WARNING "Accelerate framework not found")
+            set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
+            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+        else()
+            message(WARNING "Accelerate framework not found")
+        endif()
+    endif()
+
+    if (WHISPER_COREML)
+        find_library(FOUNDATION_FRAMEWORK Foundation)
+        find_library(COREML_FRAMEWORK CoreML)
+
+        if (COREML_FRAMEWORK)
+            message(STATUS "CoreML framework found")
+
+            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
+        else()
+            message(WARNING "CoreML framework not found")
+        endif()
     endif()
 endif()
 
@@ -187,6 +206,33 @@ if (WHISPER_PERF)
     set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
 endif()
 
+#
+# whisper.coreml - Core ML support
+#
+
+if (WHISPER_COREML)
+    set(TARGET whisper.coreml)
+
+    add_library(${TARGET}
+        coreml/whisper-encoder.h
+        coreml/whisper-encoder.mm
+        coreml/whisper-encoder-impl.h
+        coreml/whisper-encoder-impl.m
+        )
+
+    include(DefaultTargetOptions)
+
+    target_include_directories(${TARGET} PUBLIC
+        .
+        )
+
+    target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
+
+    set_target_properties(${TARGET} PROPERTIES
+        COMPILE_FLAGS "-fobjc-arc"
+        )
+endif()
+
 #
 # whisper - this is the main library of the project
 #
@@ -206,6 +252,10 @@ target_include_directories(${TARGET} PUBLIC
     .
     )
 
+if (WHISPER_COREML)
+    target_link_libraries(${TARGET} PRIVATE whisper.coreml)
+endif()
+
 if (MSVC)
     target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
Makefile CHANGED
@@ -140,6 +140,10 @@ ifndef WHISPER_NO_ACCELERATE
 	LDFLAGS  += -framework Accelerate
 endif
 endif
+ifdef WHISPER_COREML
+	CXXFLAGS += -DWHISPER_USE_COREML
+	LDFLAGS  += -framework Foundation -framework CoreML
+endif
 ifdef WHISPER_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
@@ -195,11 +199,23 @@ ggml.o: ggml.c ggml.h
 whisper.o: whisper.cpp whisper.h ggml.h
 	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
 
-libwhisper.a: ggml.o whisper.o
-	$(AR) rcs libwhisper.a ggml.o whisper.o
+ifndef WHISPER_COREML
+WHISPER_OBJ = whisper.o
+else
+whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
+	$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
+
+whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
+	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
+
+WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
+endif
+
+libwhisper.a: ggml.o $(WHISPER_OBJ)
+	$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
 
-libwhisper.so: ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
+libwhisper.so: ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
 
 clean:
 	rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
@@ -213,24 +229,24 @@ CC_SDL=`sdl2-config --cflags --libs`
 SRC_COMMON     = examples/common.cpp
 SRC_COMMON_SDL = examples/common-sdl.cpp
 
-main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
+main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
 	./main -h
 
-bench: examples/bench/bench.cpp ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
+bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
 
-stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
+stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
 
-command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
+command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
 
-talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
+talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
 
-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk-llama $(CC_SDL) $(LDFLAGS)
+talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
 
 #
 # Audio samples
coreml/whisper-decoder-impl.h ADDED
@@ -0,0 +1,146 @@
+//
+// whisper-decoder-impl.h
+//
+// This file was automatically generated and should not be edited.
+//
+
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#include <stdint.h>
+#include <os/log.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+
+/// Model Prediction Input Type
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_decoder_implInput : NSObject<MLFeatureProvider>
+
+/// token_data as 1 by 1 matrix of 32-bit integers
+@property (readwrite, nonatomic, strong) MLMultiArray * token_data;
+
+/// audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * audio_data;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data NS_DESIGNATED_INITIALIZER;
+
+@end
+
+
+/// Model Prediction Output Type
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_decoder_implOutput : NSObject<MLFeatureProvider>
+
+/// var_1346 as multidimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * var_1346;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 NS_DESIGNATED_INITIALIZER;
+
+@end
+
+
+/// Class for model loading and prediction
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_decoder_impl : NSObject
+@property (readonly, nonatomic, nullable) MLModel * model;
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
++ (nullable NSURL *)URLOfModelInThisBundle;
+
+/**
+    Initialize whisper_decoder_impl instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of whisper_decoder_impl.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
+
+/**
+    Initialize whisper_decoder_impl instance with the model in this bundle.
+*/
+- (nullable instancetype)init;
+
+/**
+    Initialize whisper_decoder_impl instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize whisper_decoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize whisper_decoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Construct whisper_decoder_impl instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
+*/
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler;
+
+/**
+    Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
+*/
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of whisper_decoder_implInput to predict from
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_decoder_implOutput
+*/
+- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of whisper_decoder_implInput to predict from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_decoder_implOutput
+*/
+- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the convenience interface
+    @param token_data as 1 by 1 matrix of 32-bit integers:
+    @param audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats:
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_decoder_implOutput
+*/
+- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Batch prediction
+    @param inputArray array of whisper_decoder_implInput instances to obtain predictions from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the predictions as NSArray<whisper_decoder_implOutput *>
+*/
+- (nullable NSArray<whisper_decoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_decoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+@end
+
+NS_ASSUME_NONNULL_END
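
Note: below is a minimal Objective-C++ usage sketch of the generated decoder interface above; it is not part of the commit. The .mlmodelc path is hypothetical, and the array shapes follow the generated header (token_data: 1 by 1 int32, audio_data: 1 × 384 × 1 × 1500 float32, i.e. the tiny model).

    #import "coreml/whisper-decoder-impl.h"

    static void decoder_smoke_test(void) {
        NSError *error = nil;

        // hypothetical path to a compiled decoder model
        NSURL *url = [NSURL fileURLWithPath:@"models/whisper-decoder-impl.mlmodelc"];
        whisper_decoder_impl *decoder = [[whisper_decoder_impl alloc] initWithContentsOfURL:url error:&error];
        if (decoder == nil) { NSLog(@"load failed: %@", error); return; }

        // token_data is a 1 x 1 matrix of 32-bit integers (a single token id)
        MLMultiArray *tokens = [[MLMultiArray alloc] initWithShape:@[@1, @1]
                                                          dataType:MLMultiArrayDataTypeInt32
                                                             error:&error];
        tokens[@[@0, @0]] = @0; // illustrative token id

        // audio_data carries the encoder output: 1 x 384 x 1 x 1500 floats (tiny model)
        MLMultiArray *audio = [[MLMultiArray alloc] initWithShape:@[@1, @384, @1, @1500]
                                                         dataType:MLMultiArrayDataTypeFloat32
                                                            error:&error];

        whisper_decoder_implOutput *out = [decoder predictionFromToken_data:tokens
                                                                 audio_data:audio
                                                                      error:&error];
        if (out == nil) { NSLog(@"prediction failed: %@", error); return; }
        NSLog(@"logits shape: %@", out.var_1346.shape);
    }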
coreml/whisper-decoder-impl.m ADDED
@@ -0,0 +1,201 @@
+//
+// whisper-decoder-impl.m
+//
+// This file was automatically generated and should not be edited.
+//
+
+#if !__has_feature(objc_arc)
+#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
+#endif
+
+#import "whisper-decoder-impl.h"
+
+@implementation whisper_decoder_implInput
+
+- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data {
+    self = [super init];
+    if (self) {
+        _token_data = token_data;
+        _audio_data = audio_data;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"token_data", @"audio_data"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"token_data"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.token_data];
+    }
+    if ([featureName isEqualToString:@"audio_data"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.audio_data];
+    }
+    return nil;
+}
+
+@end
+
+@implementation whisper_decoder_implOutput
+
+- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 {
+    self = [super init];
+    if (self) {
+        _var_1346 = var_1346;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"var_1346"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"var_1346"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.var_1346];
+    }
+    return nil;
+}
+
+@end
+
+@implementation whisper_decoder_impl
+
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
++ (nullable NSURL *)URLOfModelInThisBundle {
+    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"whisper_decoder_impl" ofType:@"mlmodelc"];
+    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load whisper-decoder-impl.mlmodelc in the bundle resource"); return nil; }
+    return [NSURL fileURLWithPath:assetPath];
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of whisper_decoder_impl.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model {
+    self = [super init];
+    if (!self) { return nil; }
+    _model = model;
+    if (_model == nil) { return nil; }
+    return self;
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance with the model in this bundle.
+*/
+- (nullable instancetype)init {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Construct whisper_decoder_impl instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
+*/
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler {
+    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
+              configuration:configuration
+          completionHandler:handler];
+}
+
+
+/**
+    Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
+*/
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler {
+    [MLModel loadContentsOfURL:modelURL
+                 configuration:configuration
+             completionHandler:^(MLModel *model, NSError *error) {
+        if (model != nil) {
+            whisper_decoder_impl *typedModel = [[whisper_decoder_impl alloc] initWithMLModel:model];
+            handler(typedModel, nil);
+        } else {
+            handler(nil, error);
+        }
+    }];
+}
+
+- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
+}
+
+- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
+    if (!outFeatures) { return nil; }
+    return [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[outFeatures featureValueForName:@"var_1346"].multiArrayValue];
+}
+
+- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    whisper_decoder_implInput *input_ = [[whisper_decoder_implInput alloc] initWithToken_data:token_data audio_data:audio_data];
+    return [self predictionFromFeatures:input_ error:error];
+}
+
+- (nullable NSArray<whisper_decoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_decoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
+    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
+    if (!outBatch) { return nil; }
+    NSMutableArray<whisper_decoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
+    for (NSInteger i = 0; i < outBatch.count; i++) {
+        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
+        whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[resultProvider featureValueForName:@"var_1346"].multiArrayValue];
+        [results addObject:result];
+    }
+    return results;
+}
+
+@end
coreml/whisper-encoder-impl.h ADDED
@@ -0,0 +1,142 @@
+//
+// whisper-encoder-impl.h
+//
+// This file was automatically generated and should not be edited.
+//
+
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#include <stdint.h>
+#include <os/log.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+
+/// Model Prediction Input Type
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_encoder_implInput : NSObject<MLFeatureProvider>
+
+/// logmel_data as 1 × 80 × 3000 3-dimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * logmel_data;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithLogmel_data:(MLMultiArray *)logmel_data NS_DESIGNATED_INITIALIZER;
+
+@end
+
+
+/// Model Prediction Output Type
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_encoder_implOutput : NSObject<MLFeatureProvider>
+
+/// output as multidimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * output;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER;
+
+@end
+
+
+/// Class for model loading and prediction
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_encoder_impl : NSObject
+@property (readonly, nonatomic, nullable) MLModel * model;
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
++ (nullable NSURL *)URLOfModelInThisBundle;
+
+/**
+    Initialize whisper_encoder_impl instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of whisper_encoder_impl.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
+
+/**
+    Initialize whisper_encoder_impl instance with the model in this bundle.
+*/
+- (nullable instancetype)init;
+
+/**
+    Initialize whisper_encoder_impl instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize whisper_encoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize whisper_encoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Construct whisper_encoder_impl instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
+*/
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler;
+
+/**
+    Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
+*/
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of whisper_encoder_implInput to predict from
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_encoder_implOutput
+*/
+- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of whisper_encoder_implInput to predict from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_encoder_implOutput
+*/
+- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the convenience interface
+    @param logmel_data as 1 × 80 × 3000 3-dimensional array of floats:
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_encoder_implOutput
+*/
+- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Batch prediction
+    @param inputArray array of whisper_encoder_implInput instances to obtain predictions from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the predictions as NSArray<whisper_encoder_implOutput *>
+*/
+- (nullable NSArray<whisper_encoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_encoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+@end
+
+NS_ASSUME_NONNULL_END
coreml/whisper-encoder-impl.m ADDED
@@ -0,0 +1,197 @@
+//
+// whisper-encoder-impl.m
+//
+// This file was automatically generated and should not be edited.
+//
+
+#if !__has_feature(objc_arc)
+#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
+#endif
+
+#import "whisper-encoder-impl.h"
+
+@implementation whisper_encoder_implInput
+
+- (instancetype)initWithLogmel_data:(MLMultiArray *)logmel_data {
+    self = [super init];
+    if (self) {
+        _logmel_data = logmel_data;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"logmel_data"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"logmel_data"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.logmel_data];
+    }
+    return nil;
+}
+
+@end
+
+@implementation whisper_encoder_implOutput
+
+- (instancetype)initWithOutput:(MLMultiArray *)output {
+    self = [super init];
+    if (self) {
+        _output = output;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"output"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"output"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.output];
+    }
+    return nil;
+}
+
+@end
+
+@implementation whisper_encoder_impl
+
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
++ (nullable NSURL *)URLOfModelInThisBundle {
+    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"whisper_encoder_impl" ofType:@"mlmodelc"];
+    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load whisper-encoder-impl.mlmodelc in the bundle resource"); return nil; }
+    return [NSURL fileURLWithPath:assetPath];
+}
+
+
+/**
+    Initialize whisper_encoder_impl instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of whisper_encoder_impl.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model {
+    self = [super init];
+    if (!self) { return nil; }
+    _model = model;
+    if (_model == nil) { return nil; }
+    return self;
+}
+
+
+/**
+    Initialize whisper_encoder_impl instance with the model in this bundle.
+*/
+- (nullable instancetype)init {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
+}
+
+
+/**
+    Initialize whisper_encoder_impl instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
+}
+
+
+/**
+    Initialize whisper_encoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Initialize whisper_encoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Construct whisper_encoder_impl instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
+*/
++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler {
+    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
+              configuration:configuration
+          completionHandler:handler];
+}
+
+
+/**
+    Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
+*/
++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler {
+    [MLModel loadContentsOfURL:modelURL
+                 configuration:configuration
+             completionHandler:^(MLModel *model, NSError *error) {
+        if (model != nil) {
+            whisper_encoder_impl *typedModel = [[whisper_encoder_impl alloc] initWithMLModel:model];
+            handler(typedModel, nil);
+        } else {
+            handler(nil, error);
+        }
+    }];
+}
+
+- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
+}
+
+- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
+    if (!outFeatures) { return nil; }
+    return [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
+}
+
+- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    whisper_encoder_implInput *input_ = [[whisper_encoder_implInput alloc] initWithLogmel_data:logmel_data];
+    return [self predictionFromFeatures:input_ error:error];
+}
+
+- (nullable NSArray<whisper_encoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_encoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
+    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
+    if (!outBatch) { return nil; }
+    NSMutableArray<whisper_encoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
+    for (NSInteger i = 0; i < outBatch.count; i++) {
+        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
+        whisper_encoder_implOutput * result = [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
+        [results addObject:result];
+    }
+    return results;
+}
+
+@end
coreml/whisper-encoder.h ADDED
@@ -0,0 +1,22 @@
+// Wrapper of the Core ML Whisper Encoder model
+//
+// Code is derived from the work of Github user @wangchou
+// ref: https://github.com/wangchou/callCoreMLFromCpp
+
+#if __cplusplus
+extern "C" {
+#endif
+
+struct whisper_coreml_context;
+
+struct whisper_coreml_context * whisper_coreml_init(const char * path_model);
+void whisper_coreml_free(struct whisper_coreml_context * ctx);
+
+void whisper_coreml_encode(
+        const whisper_coreml_context * ctx,
+        float * mel,
+        float * out);
+
+#if __cplusplus
+}
+#endif
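
Note: below is a minimal sketch of driving this C interface from C++; it is not part of the commit. The model path is hypothetical, and the output buffer is sized for the tiny model (n_audio_state = 384, n_audio_ctx = 1500) to match the generated interfaces above.

    #include "coreml/whisper-encoder.h"

    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical path to a compiled Core ML encoder
        whisper_coreml_context * ctx = whisper_coreml_init("models/whisper-encoder.mlmodelc");
        if (ctx == NULL) {
            fprintf(stderr, "failed to load Core ML encoder\n");
            return 1;
        }

        std::vector<float> mel(1*80*3000, 0.0f); // log-mel input, filled by the caller
        std::vector<float> out(384*1500);        // encoder output: n_audio_state x n_audio_ctx

        whisper_coreml_encode(ctx, mel.data(), out.data());

        whisper_coreml_free(ctx);
        return 0;
    }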
coreml/whisper-encoder.mm ADDED
@@ -0,0 +1,67 @@
+#import "coreml/whisper-encoder.h"
+#import "coreml/whisper-encoder-impl.h"
+
+#import <CoreML/CoreML.h>
+
+#include <stdlib.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+struct whisper_coreml_context {
+    const void * data;
+};
+
+struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
+    NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model];
+
+    NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
+
+    const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model error:nil]);
+
+    if (data == NULL) {
+        return NULL;
+    }
+
+    whisper_coreml_context * ctx = new whisper_coreml_context;
+
+    ctx->data = data;
+
+    return ctx;
+}
+
+void whisper_coreml_free(struct whisper_coreml_context * ctx) {
+    CFRelease(ctx->data);
+    delete ctx;
+}
+
+void whisper_coreml_encode(
+        const whisper_coreml_context * ctx,
+        float * mel,
+        float * out) {
+    MLMultiArray * inMultiArray = [
+        [MLMultiArray alloc] initWithDataPointer: mel
+                                           shape: @[@1, @80, @3000]
+                                        dataType: MLMultiArrayDataTypeFloat32
+                                         strides: @[@(240000), @(3000), @1]
+                                     deallocator: nil
+                                           error: nil
+    ];
+
+    whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];
+
+    MLMultiArray * outMA = outCoreML.output;
+
+    //NSArray<NSNumber *> * shape   = outMA.shape;
+    //NSArray<NSNumber *> * strides = outMA.strides;
+
+    //printf("shape:   %ld %ld %ld %ld\n", [shape[0] longValue], [shape[1] longValue], [shape[2] longValue], [shape[3] longValue]);
+    //printf("strides: %ld %ld %ld %ld\n", [strides[0] longValue], [strides[1] longValue], [strides[2] longValue], [strides[3] longValue]);
+
+    memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
+}
+
+#if __cplusplus
+}
+#endif
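
Note: the strides @[@(240000), @(3000), @1] passed to MLMultiArray above are the row-major strides of the contiguous (1, 80, 3000) mel buffer: each dimension's stride is the product of the dimension sizes to its right (240000 = 80 × 3000). A small sketch of that computation:

    #include <cstdio>

    int main() {
        const long shape[3] = {1, 80, 3000};
        long strides[3];
        long acc = 1;
        for (int i = 2; i >= 0; --i) {
            strides[i] = acc; // elements to skip per step along dimension i
            acc *= shape[i];
        }
        printf("%ld %ld %ld\n", strides[0], strides[1], strides[2]); // prints: 240000 3000 1
        return 0;
    }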
extra/bench-all.sh CHANGED
@@ -64,6 +64,10 @@ for model in "${models[@]}"; do
         config="$config BLAS"
     fi
 
+    if [[ $system_info == *"COREML = 1"* ]]; then
+        config="$config COREML"
+    fi
+
     commit=$(git rev-parse --short HEAD)
 
     printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
models/convert-whisper-to-coreml.py ADDED
@@ -0,0 +1,334 @@
+import argparse
+import torch
+import torch.nn.functional as F
+import coremltools as ct
+
+from torch import Tensor
+from torch import nn
+from typing import Dict
+from typing import Optional
+from ane_transformers.reference.layer_norm import LayerNormANE as LayerNormANEBase
+from coremltools.models.neural_network.quantization_utils import quantize_weights
+from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions
+from whisper import load_model
+
+# Use for changing dim of input in encoder and decoder embeddings
+def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict,
+                         missing_keys, unexpected_keys, error_msgs):
+    """
+    Unsqueeze twice to map nn.Linear weights to nn.Conv2d weights
+    """
+    for k in state_dict:
+        is_attention = all(substr in k for substr in ['attn', '.weight'])
+        is_mlp = any([k.endswith(s) for s in ['mlp.0.weight', 'mlp.2.weight']])
+
+        if (is_attention or is_mlp) and len(state_dict[k].shape) == 2:
+            state_dict[k] = state_dict[k][:, :, None, None]
+
+
+def correct_for_bias_scale_order_inversion(state_dict, prefix, local_metadata,
+                                           strict, missing_keys,
+                                           unexpected_keys, error_msgs):
+    state_dict[prefix + 'bias'] = state_dict[prefix + 'bias'] / state_dict[prefix + 'weight']
+    return state_dict
+
+class LayerNormANE(LayerNormANEBase):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._register_load_state_dict_pre_hook(
+            correct_for_bias_scale_order_inversion)
+
+class MultiHeadAttentionANE(MultiHeadAttention):
+    def __init__(self, n_state: int, n_head: int):
+        super().__init__(n_state, n_head)
+
+        setattr(self, 'query', nn.Conv2d(n_state, n_state, kernel_size=1))
+        setattr(self, 'key', nn.Conv2d(n_state, n_state, kernel_size=1, bias=False))
+        setattr(self, 'value', nn.Conv2d(n_state, n_state, kernel_size=1))
+        setattr(self, 'out', nn.Conv2d(n_state, n_state, kernel_size=1))
+
+    def forward(self,
+                x: Tensor,
+                xa: Optional[Tensor] = None,
+                mask: Optional[Tensor] = None,
+                kv_cache: Optional[dict] = None):
+
+        q = self.query(x)
+
+        if kv_cache is None or xa is None or self.key not in kv_cache:
+            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
+            # otherwise, perform key/value projections for self- or cross-attention as usual.
+            k = self.key(x if xa is None else xa)
+            v = self.value(x if xa is None else xa)
+
+        else:
+            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
+            k = kv_cache[self.key]
+            v = kv_cache[self.value]
+
+        wv, qk = self.qkv_attention_ane(q, k, v, mask)
+
+        return self.out(wv), qk
+
+    def qkv_attention_ane(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None):
+
+        _, dim, _, seqlen = q.size()
+
+        dim_per_head = dim // self.n_head
+
+        scale = float(dim_per_head)**-0.5
+
+        q = q * scale
+
+        mh_q = q.split(dim_per_head, dim=1)
+        mh_k = k.transpose(1,3).split(dim_per_head, dim=3)
+        mh_v = v.split(dim_per_head, dim=1)
+
+        mh_qk = [
+            torch.einsum('bchq,bkhc->bkhq', [qi, ki])
+            for qi, ki in zip(mh_q, mh_k)
+        ]  # (batch_size, max_seq_length, 1, max_seq_length) * n_heads
+
+        if mask is not None:
+            for head_idx in range(self.n_head):
+                mh_qk[head_idx] = mh_qk[head_idx] + mask[:, :seqlen, :, :seqlen]
+
+        attn_weights = [aw.softmax(dim=1) for aw in mh_qk]  # (batch_size, max_seq_length, 1, max_seq_length) * n_heads
+        attn = [torch.einsum('bkhq,bchk->bchq', wi, vi) for wi, vi in zip(attn_weights, mh_v)]  # (batch_size, dim_per_head, 1, max_seq_length) * n_heads
+        attn = torch.cat(attn, dim=1)  # (batch_size, dim, 1, max_seq_length)
+
+        return attn, torch.cat(mh_qk, dim=1).float().detach()
+
+
+class ResidualAttentionBlockANE(ResidualAttentionBlock):
+    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
+        super().__init__(n_state, n_head, cross_attention)
+
+        setattr(self, 'attn', MultiHeadAttentionANE(n_state, n_head))
+        setattr(self, 'attn_ln', LayerNormANE(n_state))
+
+        setattr(self, 'cross_attn', MultiHeadAttentionANE(n_state, n_head) if cross_attention else None)
+        setattr(self, 'cross_attn_ln', LayerNormANE(n_state) if cross_attention else None)
+
+        n_mlp = n_state * 4
+        setattr(self, 'mlp', nn.Sequential(
+            nn.Conv2d(n_state, n_mlp, kernel_size=1),
+            nn.GELU(),
+            nn.Conv2d(n_mlp, n_state, kernel_size=1)
+        ))
+        setattr(self, 'mlp_ln', LayerNormANE(n_state))
+
+
+class AudioEncoderANE(AudioEncoder):
+    def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
+        super().__init__(n_mels, n_ctx, n_state, n_head, n_layer)
+
+        setattr(self, 'blocks', nn.ModuleList(
+            [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
+        ))
+        setattr(self, 'ln_post', LayerNormANE(n_state))
+
+    def forward(self, x: Tensor):
+        """
+        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
+            the mel spectrogram of the audio
+        """
+        x = F.gelu(self.conv1(x))
+        x = F.gelu(self.conv2(x))
+
+        assert x.shape[1:] == self.positional_embedding.shape[::-1], "incorrect audio shape"
+
+        # Add positional embedding and add dummy dim for ANE
+        x = (x + self.positional_embedding.transpose(0,1)).to(x.dtype).unsqueeze(2)
+
+        for block in self.blocks:
+            x = block(x)
+
+        x = self.ln_post(x)
+
+        # """
+        # TODO:
+        # I think we need to transpose the result here to make it fit whisper.cpp memory order.
+        # However, even doing this, the results are still wrong. Kind of less wrong compared to
+        # not transposing, but still wrong.
+
+        # Also, I don't know why the original OpenAI implementation does not need to transpose
+
+        # transpose to (batch_size, n_ctx, n_state)
+        # x : torch.Tensor, shape = (batch_size, n_state, 1, n_ctx)
+
+        # """
+        # x = x.transpose(1,3)
+
+        return x
+
+class TextDecoderANE(TextDecoder):
+
+    def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
+        super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer)
+
+        setattr(self, 'blocks', nn.ModuleList(
+            [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
+        ))
+        setattr(self, 'ln', LayerNormANE(n_state))
+
+    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
+        """
+        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
+            the text tokens
+        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
+            the encoded audio features to be attended on
+        """
+        offset = next(iter(kv_cache.values())).shape[3] if kv_cache else 0
+        x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]]
+        x = x.to(xa.dtype)
+
+        # Reformat for ANE
+        mask = self.mask[None, None, :, :].permute(0,3,1,2)
+        x = x.transpose(1,2).unsqueeze(2)
+
+        for block in self.blocks:
+            x = block(x, xa, mask=mask, kv_cache=kv_cache)
+
+        x = self.ln(x)
+
+        # Reformat back from ANE
+        x = x.permute(0,2,3,1).squeeze(0)
+
+        # ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
+        if self.token_embedding.weight.shape[0] == 51865:
+            # split in 11 chunks - 4715 each
+            splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0)
+            logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
+        else:
+            # split in 12 chunks - 4322 each
+            assert(self.token_embedding.weight.shape[0] == 51864)
+            splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//12, dim=0)
+            logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
+
+        return logits
+
+class WhisperANE(Whisper):
+    def __init__(self, dims: ModelDimensions):
+        super().__init__(dims)
+
+        setattr(self, 'encoder', AudioEncoderANE(
+            self.dims.n_mels,
+            self.dims.n_audio_ctx,
+            self.dims.n_audio_state,
+            self.dims.n_audio_head,
+            self.dims.n_audio_layer,
+        ))
+        setattr(self, 'decoder', TextDecoderANE(
+            self.dims.n_vocab,
+            self.dims.n_text_ctx,
+            self.dims.n_text_state,
+            self.dims.n_text_head,
+            self.dims.n_text_layer,
+        ))
+
+        self._register_load_state_dict_pre_hook(linear_to_conv2d_map)
+
+    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]:
+        return self.decoder(tokens, self.encoder(mel))
+
+    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
+        cache = {**cache} if cache is not None else {}
+        hooks = []
+
+        def save_to_cache(module, _, output):
+            if module not in cache or output.shape[3] > self.decoder.positional_embedding.shape[0]:
+                cache[module] = output  # save as-is, for the first token or cross attention
+            else:
+                cache[module] = torch.cat([cache[module], output], dim=3).detach()
+            return cache[module]
+
+        def install_hooks(layer: nn.Module):
+            if isinstance(layer, MultiHeadAttentionANE):
+                hooks.append(layer.key.register_forward_hook(save_to_cache))
+                hooks.append(layer.value.register_forward_hook(save_to_cache))
+
+        self.decoder.apply(install_hooks)
+        return cache, hooks
+
+def convert_encoder(hparams, model, quantize=False):
+    model.eval()
+
+    input_shape = (1, 80, 3000)
+    input_data = torch.randn(input_shape)
+    traced_model = torch.jit.trace(model, input_data)
+
+    model = ct.convert(
+        traced_model,
+        convert_to=None if quantize else "mlprogram",  # convert will fail if weights are quantized, not sure why
+        inputs=[ct.TensorType(name="logmel_data", shape=input_shape)],
+        outputs=[ct.TensorType(name="output")],
+        compute_units=ct.ComputeUnit.ALL
+    )
+
+    if quantize:
+        model = quantize_weights(model, nbits=16)
+
+    return model
+
+def convert_decoder(hparams, model, quantize=False):
+    model.eval()
+
+    tokens_shape = (1, 1)
+    audio_shape = (1, hparams.n_audio_state, 1, 1500)
+
+    audio_data = torch.randn(audio_shape)
+    token_data = torch.randint(50257, tokens_shape).long()
+    traced_model = torch.jit.trace(model, (token_data, audio_data))
285
+ model = ct.convert(
286
+ traced_model,
287
+ convert_to=None if quantize else "mlprogram", # convert will fail if weights are quantized, not sure why
288
+ inputs=[
289
+ ct.TensorType(name="token_data", shape=tokens_shape, dtype=int),
290
+ ct.TensorType(name="audio_data", shape=audio_shape)
291
+ ]
292
+ )
293
+
294
+ if quantize:
295
+ model = quantize_weights(model, nbits=16)
296
+
297
+ return model
298
+
299
+
300
+ if __name__ == "__main__":
301
+ parser = argparse.ArgumentParser()
302
+ parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large)", required=True)
303
+ parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
304
+ parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
305
+ parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
306
+ args = parser.parse_args()
307
+
308
+ if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large"]:
309
+ raise ValueError("Invalid model name")
310
+
311
+ whisper = load_model(args.model).cpu()
312
+ hparams = whisper.dims
313
+ print(hparams)
314
+
315
+ if args.optimize_ane:
316
+ whisperANE = WhisperANE(hparams).eval()
317
+ whisperANE.load_state_dict(whisper.state_dict())
318
+
319
+ encoder = whisperANE.encoder
320
+ decoder = whisperANE.decoder
321
+ else:
322
+ encoder = whisper.encoder
323
+ decoder = whisper.decoder
324
+
325
+ # Convert encoder
326
+ encoder = convert_encoder(hparams, encoder, quantize=args.quantize)
327
+ encoder.save(f"models/coreml-encoder-{args.model}.mlpackage")
328
+
329
+ if args.encoder_only is False:
330
+ # Convert decoder
331
+ decoder = convert_decoder(hparams, decoder, quantize=args.quantize)
332
+ decoder.save(f"models/coreml-decoder-{args.model}.mlpackage")
333
+
334
+ print("done converting")
models/download-coreml-model.sh ADDED
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# This script downloads Whisper model files that have already been converted to Core ML format.
+# This way you don't have to convert them yourself.
+
+src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
+pfx="resolve/main/ggml"
+
+# get the path of this script
+function get_script_path() {
+    if [ -x "$(command -v realpath)" ]; then
+        echo "$(dirname $(realpath $0))"
+    else
+        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
+        echo "$ret"
+    fi
+}
+
+models_path="$(get_script_path)"
+
+# Whisper models
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+
+# list available models
+function list_models {
+    printf "\n"
+    printf "  Available models:"
+    for model in "${models[@]}"; do
+        printf " $model"
+    done
+    printf "\n\n"
+}
+
+if [ "$#" -ne 1 ]; then
+    printf "Usage: $0 <model>\n"
+    list_models
+
+    exit 1
+fi
+
+model=$1
+
+if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
+    printf "Invalid model: $model\n"
+    list_models
+
+    exit 1
+fi
+
+# download Core ML model
+
+printf "Downloading Core ML model $model from '$src' ...\n"
+
+cd $models_path
+
+if [ -f "ggml-$model.mlmodel" ]; then
+    printf "Model $model already exists. Skipping download.\n"
+    exit 0
+fi
+
+if [ -x "$(command -v wget)" ]; then
+    wget --quiet --show-progress -O ggml-$model.mlmodel $src/$pfx-$model.mlmodel
+elif [ -x "$(command -v curl)" ]; then
+    curl -L --output ggml-$model.mlmodel $src/$pfx-$model.mlmodel
+else
+    printf "Either wget or curl is required to download models.\n"
+    exit 1
+fi
+
+
+if [ $? -ne 0 ]; then
+    printf "Failed to download Core ML model $model \n"
+    printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
+    exit 1
+fi
+
+printf "Done! Model '$model' saved in 'models/ggml-$model.mlmodel'\n"
+printf "Run the following command to compile it:\n\n"
+printf "  $ xcrun coremlc compile ./models/ggml-$model.mlmodel ./models\n\n"
+printf "You can now use it like this:\n\n"
+printf "  $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
+printf "\n"
models/generate-coreml-interface.sh ADDED
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# This generates:
+#   - coreml/whisper-encoder-impl.h and coreml/whisper-encoder-impl.m
+#   - coreml/whisper-decoder-impl.h and coreml/whisper-decoder-impl.m
+#
+
+wd=$(dirname "$0")
+cd "$wd/../"
+
+python3 models/convert-whisper-to-coreml.py --model tiny.en
+
+mv -v models/coreml-encoder-tiny.en.mlpackage models/whisper-encoder-impl.mlpackage
+xcrun coremlc generate models/whisper-encoder-impl.mlpackage coreml/
+mv coreml/whisper_encoder_impl.h coreml/whisper-encoder-impl.h
+mv coreml/whisper_encoder_impl.m coreml/whisper-encoder-impl.m
+sed -i '' 's/whisper_encoder_impl\.h/whisper-encoder-impl.h/g' coreml/whisper-encoder-impl.m
+sed -i '' 's/whisper_encoder_impl\.m/whisper-encoder-impl.m/g' coreml/whisper-encoder-impl.m
+sed -i '' 's/whisper_encoder_impl\.h/whisper-encoder-impl.h/g' coreml/whisper-encoder-impl.h
+
+mv -v models/coreml-decoder-tiny.en.mlpackage models/whisper-decoder-impl.mlpackage
+xcrun coremlc generate models/whisper-decoder-impl.mlpackage coreml/
+mv coreml/whisper_decoder_impl.h coreml/whisper-decoder-impl.h
+mv coreml/whisper_decoder_impl.m coreml/whisper-decoder-impl.m
+sed -i '' 's/whisper_decoder_impl\.h/whisper-decoder-impl.h/g' coreml/whisper-decoder-impl.m
+sed -i '' 's/whisper_decoder_impl\.m/whisper-decoder-impl.m/g' coreml/whisper-decoder-impl.m
+sed -i '' 's/whisper_decoder_impl\.h/whisper-decoder-impl.h/g' coreml/whisper-decoder-impl.h
+
+rm -rfv models/whisper-encoder-impl.mlpackage models/whisper-decoder-impl.mlpackage
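Note that `sed -i ''` is the BSD/macOS in-place syntax, so this script (like the `xcrun coremlc` calls) assumes a macOS host with the Xcode tools installed. It takes no arguments and is meant to be run once to regenerate the checked-in Objective-C glue:

    ./models/generate-coreml-interface.sh
    # regenerates coreml/whisper-encoder-impl.{h,m} and coreml/whisper-decoder-impl.{h,m}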
models/generate-coreml-model.sh ADDED
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Usage: ./generate-coreml-model.sh <model-name>
+if [ $# -eq 0 ]
+then
+    echo "No model name supplied"
+    echo "Usage: ./generate-coreml-model.sh <model-name>"
+    exit 1
+fi
+
+mname="$1"
+
+wd=$(dirname "$0")
+cd "$wd/../"
+
+python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True
+
+xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/
+rm -rf models/ggml-${mname}-encoder.mlmodelc
+mv -v models/coreml-encoder-${mname}.mlmodelc models/ggml-${mname}-encoder.mlmodelc
+
+# TODO: decoder (sometime in the future maybe)
+#xcrun coremlc compile models/whisper-decoder-${mname}.mlpackage models/
+#rm -rf models/ggml-${mname}-decoder.mlmodelc
+#mv -v models/coreml_decoder_${mname}.mlmodelc models/ggml-${mname}-decoder.mlmodelc
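For example, generating the Core ML encoder for base.en (the model name must be one accepted by the converter above, and the resulting .mlmodelc name matches what whisper.cpp looks for at load time, see the whisper.cpp changes below):

    ./models/generate-coreml-model.sh base.en
    # -> models/ggml-base.en-encoder.mlmodelc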
whisper.cpp CHANGED
@@ -1,5 +1,8 @@
 #define WHISPER_BUILD
 #include "whisper.h"
+#if WHISPER_USE_COREML
+#include "coreml/whisper-encoder.h"
+#endif
 
 #include "ggml.h"
 
@@ -586,6 +589,11 @@ struct whisper_state {
 
     int lang_id = 0; // english by default
 
+    std::string path_model; // populated by whisper_init_from_file()
+#ifdef WHISPER_USE_COREML
+    whisper_coreml_context * ctx_coreml;
+#endif
+
     // [EXPERIMENTAL] token-level timestamps data
     int64_t t_beg = 0;
     int64_t t_last = 0;
@@ -1376,6 +1384,7 @@ static bool whisper_encode_internal(
         }
     }
 
+#ifndef WHISPER_USE_COREML
     struct ggml_tensor * cur;
 
     // convolution + gelu
@@ -1683,6 +1692,13 @@ static bool whisper_encode_internal(
 
         //ggml_graph_print(&gf);
     }
+#else
+    wstate.use_buf(ctx0, -1);
+
+    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
+
+    whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
+#endif
 
     // cur
     //{
@@ -2470,6 +2486,20 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
 // interface implementation
 //
 
+#ifdef WHISPER_USE_COREML
+// replace .bin with -encoder.mlmodelc
+static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
+    auto pos = path_bin.rfind('.');
+    if (pos != std::string::npos) {
+        path_bin = path_bin.substr(0, pos);
+    }
+
+    path_bin += "-encoder.mlmodelc";
+
+    return path_bin;
+}
+#endif
+
 struct whisper_state * whisper_init_state(whisper_context * ctx) {
     whisper_state * state = new whisper_state;
 
@@ -2497,6 +2527,21 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
         fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
+#ifdef WHISPER_USE_COREML
+    const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
+
+    fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
+    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
+
+    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
+    if (!state->ctx_coreml) {
+        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+        return nullptr;
+    }
+
+    fprintf(stderr, "%s: Core ML model loaded\n", __func__);
+#endif
+
     state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
 
     state->logits_id.reserve(ctx->model.hparams.n_vocab);
@@ -2531,6 +2576,7 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
     }
 
     loader.context = &fin;
+
     loader.read = [](void * ctx, void * output, size_t read_size) {
         std::ifstream * fin = (std::ifstream*)ctx;
         fin->read((char *)output, read_size);
@@ -2663,6 +2709,11 @@ void whisper_free_state(struct whisper_state * state)
             kv_cache_free(state->decoders[i].kv_self);
         }
 
+#ifdef WHISPER_USE_COREML
+        whisper_coreml_free(state->ctx_coreml);
+        state->ctx_coreml = nullptr;
+#endif
+
         delete state;
     }
 }
@@ -3084,6 +3135,14 @@ void whisper_reset_timings(struct whisper_context * ctx) {
     }
 }
 
+static int whisper_has_coreml(void) {
+#ifdef WHISPER_USE_COREML
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 const char * whisper_print_system_info(void) {
     static std::string s;
 
@@ -3100,6 +3159,7 @@ const char * whisper_print_system_info(void) {
     s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
     s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+    s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
 
     return s.c_str();
 }
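End-to-end, a Core ML build resolves the encoder path by rewriting the ggml model path (whisper_get_coreml_path_encoder() turns ggml-base.en.bin into ggml-base.en-encoder.mlmodelc). A minimal build-and-run sketch, assuming the WHISPER_COREML CMake option from this commit and the default example binary layout of a CMake build:

    cmake -B build -DWHISPER_COREML=1    # adds -DWHISPER_USE_COREML to the compile flags
    cmake --build build -j
    ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
    # whisper_init_state() then loads models/ggml-base.en-encoder.mlmodelc,
    # and whisper_print_system_info() reports COREML = 1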