ggerganov committed
Commit a5f8f3c · unverified · 1 Parent(s): d91d7d9

whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples

.gitignore CHANGED
@@ -23,6 +23,7 @@ build-sanitize-thread/
23
  /talk
24
  /talk-llama
25
  /bench
 
26
 
27
  arm_neon.h
28
  sync.sh
 
23
  /talk
24
  /talk-llama
25
  /bench
26
+ /quantize
27
 
28
  arm_neon.h
29
  sync.sh
CMakeLists.txt CHANGED
@@ -303,6 +303,12 @@ if (BUILD_SHARED_LIBS)
303
 
304
  target_compile_definitions(${TARGET} PUBLIC
305
  WHISPER_SHARED
306
  )
307
  endif()
308
 
 
303
 
304
  target_compile_definitions(${TARGET} PUBLIC
305
  WHISPER_SHARED
306
+ GGML_SHARED
307
+ )
308
+
309
+ target_compile_definitions(${TARGET} PRIVATE
310
+ WHISPER_BUILD
311
+ GGML_BUILD
312
  )
313
  endif()
314
 
Makefile CHANGED
@@ -1,4 +1,4 @@
1
- default: main bench
2
 
3
  ifndef UNAME_S
4
  UNAME_S := $(shell uname -s)
@@ -243,7 +243,7 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
243
  $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
244
 
245
  clean:
246
- rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
247
 
248
  #
249
  # Examples
@@ -251,7 +251,7 @@ clean:
251
 
252
  CC_SDL=`sdl2-config --cflags --libs`
253
 
254
- SRC_COMMON = examples/common.cpp
255
  SRC_COMMON_SDL = examples/common-sdl.cpp
256
 
257
  main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
@@ -261,6 +261,9 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
261
  bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
262
  $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
263
 
 
 
 
264
  stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
265
  $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
266
 
 
1
+ default: main bench quantize
2
 
3
  ifndef UNAME_S
4
  UNAME_S := $(shell uname -s)
 
243
  $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
244
 
245
  clean:
246
+ rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
247
 
248
  #
249
  # Examples
 
251
 
252
  CC_SDL=`sdl2-config --cflags --libs`
253
 
254
+ SRC_COMMON = examples/common.cpp examples/common-ggml.cpp
255
  SRC_COMMON_SDL = examples/common-sdl.cpp
256
 
257
  main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
 
261
  bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
262
  $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
263
 
264
+ quantize: examples/quantize/quantize.cpp ggml.o $(WHISPER_OBJ) $(SRC_COMMON)
265
+ $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o quantize $(LDFLAGS)
266
+
267
  stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
268
  $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
269
 
README.md CHANGED
@@ -15,6 +15,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
15
  - AVX intrinsics support for x86 architectures
16
  - VSX intrinsics support for POWER architectures
17
  - Mixed F16 / F32 precision
 
18
  - Low memory usage (Flash Attention)
19
  - Zero memory allocations at runtime
20
  - Runs on the CPU
@@ -228,6 +229,22 @@ make large
228
  | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
229
  | large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
230
231
  ## Core ML support
232
 
233
  On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant
 
15
  - AVX intrinsics support for x86 architectures
16
  - VSX intrinsics support for POWER architectures
17
  - Mixed F16 / F32 precision
18
+ - [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
19
  - Low memory usage (Flash Attention)
20
  - Zero memory allocations at runtime
21
  - Runs on the CPU
 
229
  | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
230
  | large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
231
 
232
+ ## Quantization
233
+
234
+ `whisper.cpp` supports integer quantization of the Whisper `ggml` models.
235
+ Quantized models require less memory and disk space and, depending on the hardware, can be processed more efficiently.
236
+
237
+ Here are the steps for creating and using a quantized model:
238
+
239
+ ```bash
240
+ # quantize a model with Q5_0 method
241
+ make quantize
242
+ ./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
243
+
244
+ # run the examples as usual, specifying the quantized model file
245
+ ./main -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
246
+ ```
247
+
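For reference, the methods accepted by the `quantize` tool in this commit are `q4_0`, `q4_1`, `q4_2`, `q5_0`, `q5_1` and `q8_0` (see `GGML_FTYPE_MAP` in `examples/common-ggml.cpp`). A minimal sketch of using one of the other methods — the output filename is only a naming convention, not something the tool requires:

```bash
# same workflow with the Q8_0 method (file names are illustrative)
make quantize
./quantize models/ggml-base.en.bin models/ggml-base.en-q8_0.bin q8_0

# running the tool with the wrong number of arguments prints the usage and the supported types
./quantize
```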
248
  ## Core ML support
249
 
250
  On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant
bindings/javascript/whisper.js CHANGED
The diff for this file is too large to render. See raw diff
 
examples/CMakeLists.txt CHANGED
@@ -21,10 +21,14 @@ set(TARGET common)
21
  add_library(${TARGET} STATIC
22
  common.h
23
  common.cpp
 
 
24
  )
25
 
26
  include(DefaultTargetOptions)
27
 
 
 
28
  set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
29
 
30
  if (WHISPER_SDL2)
@@ -62,6 +66,7 @@ else()
62
  add_subdirectory(stream)
63
  add_subdirectory(command)
64
  add_subdirectory(bench)
 
65
  add_subdirectory(talk)
66
  add_subdirectory(talk-llama)
67
  endif()
 
21
  add_library(${TARGET} STATIC
22
  common.h
23
  common.cpp
24
+ common-ggml.h
25
+ common-ggml.cpp
26
  )
27
 
28
  include(DefaultTargetOptions)
29
 
30
+ target_link_libraries(${TARGET} PRIVATE whisper)
31
+
32
  set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
33
 
34
  if (WHISPER_SDL2)
 
66
  add_subdirectory(stream)
67
  add_subdirectory(command)
68
  add_subdirectory(bench)
69
+ add_subdirectory(quantize)
70
  add_subdirectory(talk)
71
  add_subdirectory(talk-llama)
72
  endif()
examples/addon.node/__test__/whisper.spec.js CHANGED
@@ -14,9 +14,10 @@ const whisperParamsMock = {
14
  };
15
 
16
  describe("Run whisper.node", () => {
17
- test("it should receive a non-empty value", async () => {
18
- let result = await whisperAsync(whisperParamsMock);
19
 
20
- expect(result.length).toBeGreaterThan(0);
21
- });
22
  });
 
 
14
  };
15
 
16
  describe("Run whisper.node", () => {
17
+ test("it should receive a non-empty value", async () => {
18
+ let result = await whisperAsync(whisperParamsMock);
19
 
20
+ expect(result.length).toBeGreaterThan(0);
21
+ }, 10000);
22
  });
23
+
examples/bench.wasm/CMakeLists.txt CHANGED
@@ -31,9 +31,9 @@ endif()
31
  set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
32
  --bind \
33
  -s USE_PTHREADS=1 \
34
- -s PTHREAD_POOL_SIZE=8 \
35
- -s INITIAL_MEMORY=1024MB \
36
- -s TOTAL_MEMORY=1024MB \
37
  -s FORCE_FILESYSTEM=1 \
38
  -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
39
  ${EXTRA_FLAGS} \
 
31
  set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
32
  --bind \
33
  -s USE_PTHREADS=1 \
34
+ -s PTHREAD_POOL_SIZE_STRICT=0 \
35
+ -s INITIAL_MEMORY=2000MB \
36
+ -s TOTAL_MEMORY=2000MB \
37
  -s FORCE_FILESYSTEM=1 \
38
  -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
39
  ${EXTRA_FLAGS} \
examples/bench.wasm/index-tmpl.html CHANGED
@@ -35,6 +35,15 @@
35
 
36
  <br><br>
37
38
  <hr>
39
 
40
  Select the model you would like to use and click the "Bench" button.<br>
@@ -44,11 +53,18 @@
44
 
45
  <div id="model-whisper">
46
  Whisper model: <span id="model-whisper-status"></span>
47
- <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
48
- <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
49
- <span id="fetch-whisper-progress"></span>
50
-
51
  <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
 
 
 
 
 
 
 
 
52
  </div>
53
 
54
  <br>
@@ -160,6 +176,14 @@
160
 
161
  document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
162
  document.getElementById('fetch-whisper-base-en').style.display = 'none';
 
 
 
 
 
 
 
 
163
  document.getElementById('whisper-file' ).style.display = 'none';
164
  document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
165
  }
@@ -168,19 +192,42 @@
168
  let urls = {
169
  'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
170
  'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
 
 
 
 
 
 
 
171
  };
172
 
173
  let sizes = {
174
  'tiny.en': 75,
175
  'base.en': 142,
 
 
 
 
 
 
 
176
  };
177
 
178
  let url = urls[model];
179
  let dst = 'whisper.bin';
180
  let size_mb = sizes[model];
181
 
182
- document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
183
- document.getElementById('fetch-whisper-base-en').style.display = 'none';
 
 
 
 
 
 
 
 
 
184
  document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
185
 
186
  cbProgress = function(p) {
@@ -190,9 +237,18 @@
190
 
191
  cbCancel = function() {
192
  var el;
193
- el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
194
- el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
195
- el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
 
 
 
 
 
 
 
 
 
196
  };
197
 
198
  loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
 
35
 
36
  <br><br>
37
 
38
+ <b>More examples:</b>
39
+ <a href="https://whisper.ggerganov.com/">main</a> |
40
+ <a href="https://whisper.ggerganov.com/bench">bench</a> |
41
+ <a href="https://whisper.ggerganov.com/stream">stream</a> |
42
+ <a href="https://whisper.ggerganov.com/command">command</a> |
43
+ <a href="https://whisper.ggerganov.com/talk">talk</a> |
44
+
45
+ <br><br>
46
+
47
  <hr>
48
 
49
  Select the model you would like to use and click the "Bench" button.<br>
 
53
 
54
  <div id="model-whisper">
55
  Whisper model: <span id="model-whisper-status"></span>
56
+ <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
57
+ <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
58
+ <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
 
59
  <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
60
+ <br><br>
61
+ Quantized models:<br><br>
62
+ <button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
63
+ <button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
64
+ <button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
65
+ <button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
66
+ <button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
67
+ <span id="fetch-whisper-progress"></span>
68
  </div>
69
 
70
  <br>
 
176
 
177
  document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
178
  document.getElementById('fetch-whisper-base-en').style.display = 'none';
179
+ document.getElementById('fetch-whisper-small-en').style.display = 'none';
180
+
181
+ document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
182
+ document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
183
+ document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
184
+ document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
185
+ document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
186
+
187
  document.getElementById('whisper-file' ).style.display = 'none';
188
  document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
189
  }
 
192
  let urls = {
193
  'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
194
  'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
195
+ 'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
196
+
197
+ 'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
198
+ 'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
199
+ 'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
200
+ 'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
201
+ 'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
202
  };
203
 
204
  let sizes = {
205
  'tiny.en': 75,
206
  'base.en': 142,
207
+ 'small.en': 466,
208
+
209
+ 'tiny-en-q5_1': 31,
210
+ 'base-en-q5_1': 57,
211
+ 'small-en-q5_1': 182,
212
+ 'medium-en-q5_0': 515,
213
+ 'large-q5_0': 1030,
214
  };
215
 
216
  let url = urls[model];
217
  let dst = 'whisper.bin';
218
  let size_mb = sizes[model];
219
 
220
+ document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
221
+ document.getElementById('fetch-whisper-base-en').style.display = 'none';
222
+ document.getElementById('fetch-whisper-small-en').style.display = 'none';
223
+
224
+ document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
225
+ document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
226
+ document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
227
+ document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
228
+ document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
229
+
230
+ document.getElementById('whisper-file' ).style.display = 'none';
231
  document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
232
 
233
  cbProgress = function(p) {
 
237
 
238
  cbCancel = function() {
239
  var el;
240
+ el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
241
+ el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
242
+ el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
243
+
244
+ el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
245
+ el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
246
+ el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
247
+ el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
248
+ el = document.getElementById('fetch-whisper-large-q5_0' ); if (el) el.style.display = 'inline-block';
249
+
250
+ el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
251
+ el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
252
  };
253
 
254
  loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
examples/command.wasm/index-tmpl.html CHANGED
@@ -35,6 +35,15 @@
35
 
36
  <br><br>
37
38
  <hr>
39
 
40
  Select the model you would like to use, click the "Start" button and follow the instructions.
@@ -45,6 +54,10 @@
45
  Whisper model: <span id="model-whisper-status"></span>
46
  <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
47
  <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
 
 
 
 
48
  <span id="fetch-whisper-progress"></span>
49
 
50
  <!--
@@ -162,11 +175,17 @@
162
  let urls = {
163
  'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
164
  'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
 
 
 
165
  };
166
 
167
  let sizes = {
168
  'tiny.en': 75,
169
  'base.en': 142,
 
 
 
170
  };
171
 
172
  let url = urls[model];
@@ -177,6 +196,10 @@
177
 
178
  document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
179
  document.getElementById('fetch-whisper-base-en').style.display = 'none';
 
 
 
 
180
  document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
181
 
182
  cbProgress = function(p) {
@@ -188,6 +211,10 @@
188
  var el;
189
  el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
190
  el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
 
 
 
 
191
  el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
192
  };
193
 
 
35
 
36
  <br><br>
37
 
38
+ <b>More examples:</b>
39
+ <a href="https://whisper.ggerganov.com/">main</a> |
40
+ <a href="https://whisper.ggerganov.com/bench">bench</a> |
41
+ <a href="https://whisper.ggerganov.com/stream">stream</a> |
42
+ <a href="https://whisper.ggerganov.com/command">command</a> |
43
+ <a href="https://whisper.ggerganov.com/talk">talk</a> |
44
+
45
+ <br><br>
46
+
47
  <hr>
48
 
49
  Select the model you would like to use, click the "Start" button and follow the instructions.
 
54
  Whisper model: <span id="model-whisper-status"></span>
55
  <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
56
  <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
57
+ <br><br>
58
+ Quantized models:<br><br>
59
+ <button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
60
+ <button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
61
  <span id="fetch-whisper-progress"></span>
62
 
63
  <!--
 
175
  let urls = {
176
  'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
177
  'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
178
+
179
+ 'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
180
+ 'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
181
  };
182
 
183
  let sizes = {
184
  'tiny.en': 75,
185
  'base.en': 142,
186
+
187
+ 'tiny-en-q5_1': 31,
188
+ 'base-en-q5_1': 57,
189
  };
190
 
191
  let url = urls[model];
 
196
 
197
  document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
198
  document.getElementById('fetch-whisper-base-en').style.display = 'none';
199
+
200
+ document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
201
+ document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
202
+
203
  document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
204
 
205
  cbProgress = function(p) {
 
211
  var el;
212
  el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
213
  el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
214
+
215
+ el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
216
+ el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
217
+
218
  el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
219
  };
220
 
examples/common-ggml.cpp ADDED
@@ -0,0 +1,241 @@
1
+ #include "common-ggml.h"
2
+
3
+ #include <regex>
4
+ #include <map>
5
+
6
+ static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
7
+ {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
8
+ {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
9
+ {"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
10
+ {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
11
+ {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
12
+ {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
13
+ };
14
+
15
+ void ggml_print_ftypes(FILE * fp) {
16
+ for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
17
+ fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
18
+ }
19
+ }
20
+
21
+ enum ggml_ftype ggml_parse_ftype(const char * str) {
22
+ enum ggml_ftype ftype;
23
+ if (str[0] == 'q') {
24
+ const auto it = GGML_FTYPE_MAP.find(str);
25
+ if (it == GGML_FTYPE_MAP.end()) {
26
+ fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
27
+ return GGML_FTYPE_UNKNOWN;
28
+ }
29
+ ftype = it->second;
30
+ } else {
31
+ ftype = (enum ggml_ftype) atoi(str);
32
+ }
33
+
34
+ return ftype;
35
+ }
36
+
37
+ bool ggml_common_quantize_0(
38
+ std::ifstream & finp,
39
+ std::ofstream & fout,
40
+ const ggml_ftype ftype,
41
+ const std::vector<std::string> & to_quant,
42
+ const std::vector<std::string> & to_skip) {
43
+
44
+ ggml_type qtype = GGML_TYPE_F32;
45
+
46
+ switch (ftype) {
47
+ case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
48
+ case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
49
+ case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
50
+ case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
51
+ case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
52
+ case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
53
+ case GGML_FTYPE_UNKNOWN:
54
+ case GGML_FTYPE_ALL_F32:
55
+ case GGML_FTYPE_MOSTLY_F16:
56
+ case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
57
+ {
58
+ fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
59
+ return false;
60
+ }
61
+ };
62
+
63
+ if (!ggml_is_quantized(qtype)) {
64
+ fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
65
+ return false;
66
+ }
67
+
68
+ size_t total_size_org = 0;
69
+ size_t total_size_new = 0;
70
+
71
+ std::vector<float> work;
72
+
73
+ std::vector<uint8_t> data_u8;
74
+ std::vector<ggml_fp16_t> data_f16;
75
+ std::vector<float> data_f32;
76
+
77
+ std::vector<int64_t> hist_all(1 << 4, 0);
78
+
79
+ while (true) {
80
+ int32_t n_dims;
81
+ int32_t length;
82
+ int32_t ttype;
83
+
84
+ finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
85
+ finp.read(reinterpret_cast<char *>(&length), sizeof(length));
86
+ finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
87
+
88
+ if (finp.eof()) {
89
+ break;
90
+ }
91
+
92
+ int32_t nelements = 1;
93
+ int32_t ne[2] = { 1, 1 };
94
+ for (int i = 0; i < n_dims; ++i) {
95
+ finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
96
+ nelements *= ne[i];
97
+ }
98
+
99
+ std::string name(length, 0);
100
+ finp.read (&name[0], length);
101
+
102
+ printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_type_name((ggml_type) ttype));
103
+
104
+ bool quantize = false;
105
+
106
+ // check if we should quantize this tensor
107
+ for (const auto & s : to_quant) {
108
+ if (std::regex_match(name, std::regex(s))) {
109
+ quantize = true;
110
+ break;
111
+ }
112
+ }
113
+
114
+ // check if we should skip this tensor
115
+ for (const auto & s : to_skip) {
116
+ if (std::regex_match(name, std::regex(s))) {
117
+ quantize = false;
118
+ break;
119
+ }
120
+ }
121
+
122
+ // quantize only 2D tensors
123
+ quantize &= (n_dims == 2);
124
+
125
+ if (quantize) {
126
+ if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
127
+ fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
128
+ return false;
129
+ }
130
+
131
+ if (ttype == GGML_TYPE_F16) {
132
+ data_f16.resize(nelements);
133
+ finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
134
+ data_f32.resize(nelements);
135
+ for (int i = 0; i < nelements; ++i) {
136
+ data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
137
+ }
138
+ } else {
139
+ data_f32.resize(nelements);
140
+ finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
141
+ }
142
+
143
+ ttype = qtype;
144
+ } else {
145
+ const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
146
+
147
+ data_u8.resize(nelements*bpe);
148
+ finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
149
+ }
150
+
151
+ fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
152
+ fout.write(reinterpret_cast<char *>(&length), sizeof(length));
153
+ fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
154
+ for (int i = 0; i < n_dims; ++i) {
155
+ fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
156
+ }
157
+ fout.write(&name[0], length);
158
+
159
+ if (quantize) {
160
+ work.resize(nelements); // for quantization
161
+
162
+ size_t cur_size = 0;
163
+ std::vector<int64_t> hist_cur(1 << 4, 0);
164
+
165
+ switch ((ggml_type) ttype) {
166
+ case GGML_TYPE_Q4_0:
167
+ {
168
+ cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
169
+ } break;
170
+ case GGML_TYPE_Q4_1:
171
+ {
172
+ cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
173
+ } break;
174
+ case GGML_TYPE_Q4_2:
175
+ {
176
+ cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
177
+ } break;
178
+ case GGML_TYPE_Q5_0:
179
+ {
180
+ cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
181
+ } break;
182
+ case GGML_TYPE_Q5_1:
183
+ {
184
+ cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
185
+ } break;
186
+ case GGML_TYPE_Q8_0:
187
+ {
188
+ cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
189
+ } break;
190
+ case GGML_TYPE_F32:
191
+ case GGML_TYPE_F16:
192
+ case GGML_TYPE_I8:
193
+ case GGML_TYPE_I16:
194
+ case GGML_TYPE_I32:
195
+ case GGML_TYPE_Q8_1:
196
+ case GGML_TYPE_COUNT:
197
+ {
198
+ fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
199
+ return false;
200
+ }
201
+ }
202
+
203
+ fout.write(reinterpret_cast<char *>(work.data()), cur_size);
204
+ total_size_new += cur_size;
205
+
206
+ printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
207
+ for (int i = 0; i < hist_cur.size(); ++i) {
208
+ hist_all[i] += hist_cur[i];
209
+ }
210
+
211
+ for (int i = 0; i < hist_cur.size(); ++i) {
212
+ printf("%5.3f ", hist_cur[i] / (float)nelements);
213
+ }
214
+ printf("\n");
215
+ } else {
216
+ printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
217
+ fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
218
+ total_size_new += data_u8.size();
219
+ }
220
+
221
+ total_size_org += nelements * sizeof(float);
222
+ }
223
+
224
+ printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
225
+ printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
226
+
227
+ {
228
+ int64_t sum_all = 0;
229
+ for (int i = 0; i < hist_all.size(); ++i) {
230
+ sum_all += hist_all[i];
231
+ }
232
+
233
+ printf("%s: hist: ", __func__);
234
+ for (int i = 0; i < hist_all.size(); ++i) {
235
+ printf("%5.3f ", hist_all[i] / (float)sum_all);
236
+ }
237
+ printf("\n");
238
+ }
239
+
240
+ return true;
241
+ }
examples/common-ggml.h ADDED
@@ -0,0 +1,18 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+
5
+ #include <fstream>
6
+ #include <vector>
7
+ #include <string>
8
+
9
+ enum ggml_ftype ggml_parse_ftype(const char * str);
10
+
11
+ void ggml_print_ftypes(FILE * fp = stderr);
12
+
13
+ bool ggml_common_quantize_0(
14
+ std::ifstream & finp,
15
+ std::ofstream & fout,
16
+ const ggml_ftype ftype,
17
+ const std::vector<std::string> & to_quant,
18
+ const std::vector<std::string> & to_skip);
examples/common.cpp CHANGED
@@ -6,13 +6,86 @@
6
  #include "dr_wav.h"
7
 
8
  #include <cmath>
9
- #include <cstdint>
10
  #include <regex>
11
 
12
  #ifndef M_PI
13
  #define M_PI 3.14159265358979323846
14
  #endif
15
16
  std::string trim(const std::string & s) {
17
  std::regex e("^\\s+|\\s+$");
18
  return std::regex_replace(s, e, "");
@@ -28,6 +101,251 @@ std::string replace(const std::string & s, const std::string & from, const std::
28
  return result;
29
  }
30
31
  bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
32
  drwav wav;
33
  std::vector<uint8_t> wav_data; // used for pipe input from stdin
 
6
  #include "dr_wav.h"
7
 
8
  #include <cmath>
9
+ #include <fstream>
10
  #include <regex>
11
 
12
  #ifndef M_PI
13
  #define M_PI 3.14159265358979323846
14
  #endif
15
 
16
+ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
17
+ for (int i = 1; i < argc; i++) {
18
+ std::string arg = argv[i];
19
+
20
+ if (arg == "-s" || arg == "--seed") {
21
+ params.seed = std::stoi(argv[++i]);
22
+ } else if (arg == "-t" || arg == "--threads") {
23
+ params.n_threads = std::stoi(argv[++i]);
24
+ } else if (arg == "-p" || arg == "--prompt") {
25
+ params.prompt = argv[++i];
26
+ } else if (arg == "-n" || arg == "--n_predict") {
27
+ params.n_predict = std::stoi(argv[++i]);
28
+ } else if (arg == "--top_k") {
29
+ params.top_k = std::stoi(argv[++i]);
30
+ } else if (arg == "--top_p") {
31
+ params.top_p = std::stof(argv[++i]);
32
+ } else if (arg == "--temp") {
33
+ params.temp = std::stof(argv[++i]);
34
+ } else if (arg == "-b" || arg == "--batch_size") {
35
+ params.n_batch = std::stoi(argv[++i]);
36
+ } else if (arg == "-m" || arg == "--model") {
37
+ params.model = argv[++i];
38
+ } else if (arg == "-h" || arg == "--help") {
39
+ gpt_print_usage(argc, argv, params);
40
+ exit(0);
41
+ } else {
42
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
43
+ gpt_print_usage(argc, argv, params);
44
+ exit(0);
45
+ }
46
+ }
47
+
48
+ return true;
49
+ }
50
+
51
+ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
52
+ fprintf(stderr, "usage: %s [options]\n", argv[0]);
53
+ fprintf(stderr, "\n");
54
+ fprintf(stderr, "options:\n");
55
+ fprintf(stderr, " -h, --help show this help message and exit\n");
56
+ fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
57
+ fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
58
+ fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
59
+ fprintf(stderr, " prompt to start generation with (default: random)\n");
60
+ fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
61
+ fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
62
+ fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
63
+ fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
64
+ fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
65
+ fprintf(stderr, " -m FNAME, --model FNAME\n");
66
+ fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
67
+ fprintf(stderr, "\n");
68
+ }
69
+
70
+ std::string gpt_random_prompt(std::mt19937 & rng) {
71
+ const int r = rng() % 10;
72
+ switch (r) {
73
+ case 0: return "So";
74
+ case 1: return "Once upon a time";
75
+ case 2: return "When";
76
+ case 3: return "The";
77
+ case 4: return "After";
78
+ case 5: return "If";
79
+ case 6: return "import";
80
+ case 7: return "He";
81
+ case 8: return "She";
82
+ case 9: return "They";
83
+ default: return "To";
84
+ }
85
+
86
+ return "The";
87
+ }
88
+
89
  std::string trim(const std::string & s) {
90
  std::regex e("^\\s+|\\s+$");
91
  return std::regex_replace(s, e, "");
 
101
  return result;
102
  }
103
 
104
+ std::map<std::string, int32_t> json_parse(const std::string & fname) {
105
+ std::map<std::string, int32_t> result;
106
+
107
+ // read file into string
108
+ std::string json;
109
+ {
110
+ std::ifstream ifs(fname);
111
+ if (!ifs) {
112
+ fprintf(stderr, "Failed to open %s\n", fname.c_str());
113
+ exit(1);
114
+ }
115
+
116
+ json = std::string((std::istreambuf_iterator<char>(ifs)),
117
+ (std::istreambuf_iterator<char>()));
118
+ }
119
+
120
+ if (json[0] != '{') {
121
+ return result;
122
+ }
123
+
124
+ // parse json
125
+ {
126
+ bool has_key = false;
127
+ bool in_token = false;
128
+
129
+ std::string str_key = "";
130
+ std::string str_val = "";
131
+
132
+ int n = json.size();
133
+ for (int i = 1; i < n; ++i) {
134
+ if (!in_token) {
135
+ if (json[i] == ' ') continue;
136
+ if (json[i] == '"') {
137
+ in_token = true;
138
+ continue;
139
+ }
140
+ } else {
141
+ if (json[i] == '\\' && i+1 < n) {
142
+ if (has_key == false) {
143
+ str_key += json[i];
144
+ } else {
145
+ str_val += json[i];
146
+ }
147
+ ++i;
148
+ } else if (json[i] == '"') {
149
+ if (has_key == false) {
150
+ has_key = true;
151
+ ++i;
152
+ while (json[i] == ' ') ++i;
153
+ ++i; // :
154
+ while (json[i] == ' ') ++i;
155
+ if (json[i] != '\"') {
156
+ while (json[i] != ',' && json[i] != '}') {
157
+ str_val += json[i++];
158
+ }
159
+ has_key = false;
160
+ } else {
161
+ in_token = true;
162
+ continue;
163
+ }
164
+ } else {
165
+ has_key = false;
166
+ }
167
+
168
+ str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
169
+ str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
170
+ str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> "
171
+
172
+ try {
173
+ result[str_key] = std::stoi(str_val);
174
+ } catch (...) {
175
+ //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
176
+
177
+ }
178
+ str_key = "";
179
+ str_val = "";
180
+ in_token = false;
181
+ continue;
182
+ }
183
+ if (has_key == false) {
184
+ str_key += json[i];
185
+ } else {
186
+ str_val += json[i];
187
+ }
188
+ }
189
+ }
190
+ }
191
+
192
+ return result;
193
+ }
194
+
195
+ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
196
+ std::vector<std::string> words;
197
+
198
+ // first split the text into words
199
+ {
200
+ std::string str = text;
201
+ std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
202
+
203
+ std::regex re(pat);
204
+ std::smatch m;
205
+
206
+ while (std::regex_search(str, m, re)) {
207
+ for (auto x : m) {
208
+ words.push_back(x);
209
+ }
210
+ str = m.suffix();
211
+ }
212
+ }
213
+
214
+ // find the longest tokens that form the words:
215
+ std::vector<gpt_vocab::id> tokens;
216
+ for (const auto & word : words) {
217
+ if (word.size() == 0) continue;
218
+
219
+ int i = 0;
220
+ int n = word.size();
221
+ while (i < n) {
222
+ int j = n;
223
+ while (j > i) {
224
+ auto it = vocab.token_to_id.find(word.substr(i, j-i));
225
+ if (it != vocab.token_to_id.end()) {
226
+ tokens.push_back(it->second);
227
+ i = j;
228
+ break;
229
+ }
230
+ --j;
231
+ }
232
+ if (i == n) {
233
+ break;
234
+ }
235
+ if (j == i) {
236
+ auto sub = word.substr(i, 1);
237
+ if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
238
+ tokens.push_back(vocab.token_to_id.at(sub));
239
+ } else {
240
+ fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
241
+ }
242
+ ++i;
243
+ }
244
+ }
245
+ }
246
+
247
+ return tokens;
248
+ }
249
+
250
+ bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
251
+ printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
252
+
253
+ vocab.token_to_id = ::json_parse(fname);
254
+
255
+ for (const auto & kv : vocab.token_to_id) {
256
+ vocab.id_to_token[kv.second] = kv.first;
257
+ }
258
+
259
+ printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
260
+
261
+ // print the vocabulary
262
+ //for (auto kv : vocab.token_to_id) {
263
+ // printf("'%s' -> %d\n", kv.first.data(), kv.second);
264
+ //}
265
+
266
+ return true;
267
+ }
268
+
269
+ gpt_vocab::id gpt_sample_top_k_top_p(
270
+ const gpt_vocab & vocab,
271
+ const float * logits,
272
+ int top_k,
273
+ double top_p,
274
+ double temp,
275
+ std::mt19937 & rng) {
276
+ int n_logits = vocab.id_to_token.size();
277
+
278
+ std::vector<std::pair<double, gpt_vocab::id>> logits_id;
279
+ logits_id.reserve(n_logits);
280
+
281
+ {
282
+ const double scale = 1.0/temp;
283
+ for (int i = 0; i < n_logits; ++i) {
284
+ logits_id.push_back(std::make_pair(logits[i]*scale, i));
285
+ }
286
+ }
287
+
288
+ // find the top K tokens
289
+ std::partial_sort(
290
+ logits_id.begin(),
291
+ logits_id.begin() + top_k, logits_id.end(),
292
+ [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
293
+ return a.first > b.first;
294
+ });
295
+
296
+ logits_id.resize(top_k);
297
+
298
+ double maxl = -INFINITY;
299
+ for (const auto & kv : logits_id) {
300
+ maxl = std::max(maxl, kv.first);
301
+ }
302
+
303
+ // compute probs for the top K tokens
304
+ std::vector<double> probs;
305
+ probs.reserve(logits_id.size());
306
+
307
+ double sum = 0.0;
308
+ for (const auto & kv : logits_id) {
309
+ double p = exp(kv.first - maxl);
310
+ probs.push_back(p);
311
+ sum += p;
312
+ }
313
+
314
+ // normalize the probs
315
+ for (auto & p : probs) {
316
+ p /= sum;
317
+ }
318
+
319
+ if (top_p < 1.0f) {
320
+ double cumsum = 0.0f;
321
+ for (int i = 0; i < top_k; i++) {
322
+ cumsum += probs[i];
323
+ if (cumsum >= top_p) {
324
+ top_k = i + 1;
325
+ probs.resize(top_k);
326
+ logits_id.resize(top_k);
327
+ break;
328
+ }
329
+ }
330
+
331
+ cumsum = 1.0/cumsum;
332
+ for (int i = 0; i < (int) probs.size(); i++) {
333
+ probs[i] *= cumsum;
334
+ }
335
+ }
336
+
337
+ //printf("\n");
338
+ //for (int i = 0; i < (int) probs.size(); i++) {
339
+ // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
340
+ //}
341
+ //exit(0);
342
+
343
+ std::discrete_distribution<> dist(probs.begin(), probs.end());
344
+ int idx = dist(rng);
345
+
346
+ return logits_id[idx].second;
347
+ }
348
+
349
  bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
350
  drwav wav;
351
  std::vector<uint8_t> wav_data; // used for pipe input from stdin
examples/common.h CHANGED
@@ -1,10 +1,44 @@
 
 
1
  #pragma once
2
 
3
- // needs to match WHISPER_SAMPLE_RATE
 
 
 
 
 
4
  #define COMMON_SAMPLE_RATE 16000
5
 
6
- #include <vector>
7
 - #include <string>
8
 
9
  std::string trim(const std::string & s);
10
 
@@ -13,6 +47,52 @@ std::string replace(
13
  const std::string & from,
14
  const std::string & to);
15
16
  // Read WAV audio file and store the PCM data into pcmf32
17
  // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
18
  // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
 
1
+ // Various helper functions and utilities
2
+
3
  #pragma once
4
 
5
+ #include <string>
6
+ #include <map>
7
+ #include <vector>
8
+ #include <random>
9
+ #include <thread>
10
+
11
  #define COMMON_SAMPLE_RATE 16000
12
 
13
+ //
14
+ // CLI argument parsing
15
+ //
16
+
17
+ struct gpt_params {
18
+ int32_t seed = -1; // RNG seed
19
+ int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
20
+ int32_t n_predict = 200; // new tokens to predict
21
+
22
+ // sampling parameters
23
+ int32_t top_k = 40;
24
+ float top_p = 0.9f;
25
+ float temp = 0.9f;
26
+
27
+ int32_t n_batch = 8; // batch size for prompt processing
28
+
29
+ std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
30
+ std::string prompt;
31
+ };
32
+
33
+ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
34
+
35
+ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
36
+
37
+ std::string gpt_random_prompt(std::mt19937 & rng);
38
+
39
+ //
40
+ // Vocab utils
41
+ //
42
 
43
  std::string trim(const std::string & s);
44
 
 
47
  const std::string & from,
48
  const std::string & to);
49
 
50
+ struct gpt_vocab {
51
+ using id = int32_t;
52
+ using token = std::string;
53
+
54
+ std::map<token, id> token_to_id;
55
+ std::map<id, token> id_to_token;
56
+ };
57
+
58
+ // poor-man's JSON parsing
59
+ std::map<std::string, int32_t> json_parse(const std::string & fname);
60
+
61
+ // split text into tokens
62
+ //
63
+ // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
64
+ //
65
+ // Regex (Python):
66
+ // r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
67
+ //
68
+ // Regex (C++):
69
+ // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
70
+ //
71
+ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
72
+
73
+ // load the tokens from encoder.json
74
+ bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
75
+
76
+ // sample next token given probabilities for each embedding
77
+ //
78
+ // - consider only the top K tokens
79
+ // - from them, consider only the top tokens with cumulative probability > P
80
+ //
81
+ // TODO: not sure if this implementation is correct
82
+ // TODO: temperature is not implemented
83
+ //
84
+ gpt_vocab::id gpt_sample_top_k_top_p(
85
+ const gpt_vocab & vocab,
86
+ const float * logits,
87
+ int top_k,
88
+ double top_p,
89
+ double temp,
90
+ std::mt19937 & rng);
91
+
92
+ //
93
+ // Audio utils
94
+ //
95
+
96
  // Read WAV audio file and store the PCM data into pcmf32
97
  // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
98
  // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
examples/helpers.js CHANGED
@@ -145,7 +145,15 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
145
  var db = event.target.result;
146
  var tx = db.transaction(['models'], 'readwrite');
147
  var os = tx.objectStore('models');
148
 - var rq = os.put(data, url);
149
 
150
  rq.onsuccess = function (event) {
151
  cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
@@ -180,7 +188,6 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
180
 
181
  rq.onabort = function (event) {
182
  cbPrint('loadRemote: failed to open IndexedDB: abort');
183
-
184
  };
185
  }
186
-
 
145
  var db = event.target.result;
146
  var tx = db.transaction(['models'], 'readwrite');
147
  var os = tx.objectStore('models');
148
+
149
+ var rq = null;
150
+ try {
151
+ var rq = os.put(data, url);
152
+ } catch (e) {
153
+ cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB: \n' + e);
154
+ cbCancel();
155
+ return;
156
+ }
157
 
158
  rq.onsuccess = function (event) {
159
  cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
 
188
 
189
  rq.onabort = function (event) {
190
  cbPrint('loadRemote: failed to open IndexedDB: abort');
191
+ cbCancel();
192
  };
193
  }
 
examples/main/main.cpp CHANGED
@@ -496,7 +496,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
496
  value_i("layer", whisper_model_n_text_layer(ctx), true);
497
  end_obj();
498
  value_i("mels", whisper_model_n_mels(ctx));
499
- value_i("f16", whisper_model_f16(ctx), true);
500
  end_obj();
501
  start_obj("params");
502
  value_s("model", params.model.c_str());
 
496
  value_i("layer", whisper_model_n_text_layer(ctx), true);
497
  end_obj();
498
  value_i("mels", whisper_model_n_mels(ctx));
499
+ value_i("ftype", whisper_model_ftype(ctx), true);
500
  end_obj();
501
  start_obj("params");
502
  value_s("model", params.model.c_str());
examples/quantize/CMakeLists.txt ADDED
@@ -0,0 +1,6 @@
1
+ set(TARGET quantize)
2
+ add_executable(${TARGET} quantize.cpp)
3
+
4
+ include(DefaultTargetOptions)
5
+
6
+ target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
examples/quantize/README.md ADDED
@@ -0,0 +1,3 @@
1
+ # quantize
2
+
3
+ Tool for integer quantization of Whisper `ggml` model files
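A minimal usage sketch, mirroring the example from the top-level README added in this commit (model paths are illustrative):

```bash
# build the tool and produce a Q5_0 version of the base.en model
make quantize
./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
```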
examples/quantize/quantize.cpp ADDED
@@ -0,0 +1,215 @@
1
+ #include "ggml.h"
2
+
3
+ #include "common.h"
4
+ #include "common-ggml.h"
5
+
6
+ #include <cassert>
7
+ #include <cmath>
8
+ #include <cstdio>
9
+ #include <cstring>
10
+ #include <fstream>
11
+ #include <map>
12
+ #include <string>
13
+ #include <vector>
14
+ #include <regex>
15
+
16
+ // default hparams (Whisper tiny)
17
+ struct whisper_hparams {
18
+ int32_t n_vocab = 51864;
19
+ int32_t n_audio_ctx = 1500;
20
+ int32_t n_audio_state = 384;
21
+ int32_t n_audio_head = 6;
22
+ int32_t n_audio_layer = 4;
23
+ int32_t n_text_ctx = 448;
24
+ int32_t n_text_state = 384;
25
+ int32_t n_text_head = 6;
26
+ int32_t n_text_layer = 4;
27
+ int32_t n_mels = 80;
28
+ int32_t f16 = 1;
29
+ };
30
+
31
+ struct whisper_filters {
32
+ int32_t n_mel;
33
+ int32_t n_fft;
34
+
35
+ std::vector<float> data;
36
+ };
37
+
38
+ // quantize a model
39
+ bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
40
+ gpt_vocab vocab;
41
+
42
+ printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
43
+
44
+ auto finp = std::ifstream(fname_inp, std::ios::binary);
45
+ if (!finp) {
46
+ fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
47
+ return false;
48
+ }
49
+
50
+ auto fout = std::ofstream(fname_out, std::ios::binary);
51
+ if (!fout) {
52
+ fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
53
+ return false;
54
+ }
55
+
56
+ // verify magic
57
+ {
58
+ uint32_t magic;
59
+ finp.read((char *) &magic, sizeof(magic));
60
+ if (magic != 0x67676d6c) {
61
+ fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
62
+ return false;
63
+ }
64
+
65
+ fout.write((char *) &magic, sizeof(magic));
66
+ }
67
+
68
+ whisper_hparams hparams;
69
+
70
+ // load hparams
71
+ {
72
+ finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
73
+ finp.read((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
74
+ finp.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
75
+ finp.read((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
76
+ finp.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
77
+ finp.read((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
78
+ finp.read((char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
79
+ finp.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
80
+ finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
81
+ finp.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
82
+ finp.read((char *) &hparams.f16, sizeof(hparams.f16));
83
+
84
+ fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
85
+ fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
86
+ fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
87
+ fprintf(stderr, "%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
88
+ fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
89
+ fprintf(stderr, "%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
90
+ fprintf(stderr, "%s: n_text_state = %d\n", __func__, hparams.n_text_state);
91
+ fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
92
+ fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
93
+ fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
94
+ fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
95
+
96
+ fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
97
+ fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
98
+ fout.write((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
99
+ fout.write((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
100
+ fout.write((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
101
+ fout.write((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
102
+ fout.write((char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
103
+ fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
104
+ fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
105
+ fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
106
+ fout.write((char *) &ftype, sizeof(hparams.f16));
107
+ }
108
+
109
+ // load mel filters
110
+ {
111
+ whisper_filters filters;
112
+
113
+ finp.read ((char *) &filters.n_mel, sizeof(filters.n_mel));
114
+ fout.write((char *) &filters.n_mel, sizeof(filters.n_mel));
115
+ finp.read ((char *) &filters.n_fft, sizeof(filters.n_fft));
116
+ fout.write((char *) &filters.n_fft, sizeof(filters.n_fft));
117
+
118
+ filters.data.resize(filters.n_mel * filters.n_fft);
119
+ finp.read ((char *) filters.data.data(), filters.data.size() * sizeof(float));
120
+ fout.write((char *) filters.data.data(), filters.data.size() * sizeof(float));
121
+ }
122
+
123
+ // load vocab
124
+ {
125
+ int32_t n_vocab = 0;
126
+ finp.read ((char *) &n_vocab, sizeof(n_vocab));
127
+ fout.write((char *) &n_vocab, sizeof(n_vocab));
128
+
129
+ //if (n_vocab != hparams.n_vocab) {
130
+ // fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
131
+ // __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
132
+ // return false;
133
+ //}
134
+
135
+ std::string word;
136
+ for (int i = 0; i < n_vocab; i++) {
137
+ uint32_t len;
138
+ finp.read ((char *) &len, sizeof(len));
139
+ fout.write((char *) &len, sizeof(len));
140
+
141
+ word.resize(len);
142
+ finp.read ((char *) word.data(), len);
143
+ fout.write((char *) word.data(), len);
144
+
145
+ vocab.token_to_id[word] = i;
146
+ vocab.id_to_token[i] = word;
147
+ }
148
+ }
149
+
150
+ // regexes of tensor names to not be quantized
151
+ const std::vector<std::string> to_skip = {
152
+ //"encoder.*",
153
+ "encoder.conv1.bias",
154
+ "encoder.conv2.bias",
155
+ "encoder.positional_embedding",
156
+ "decoder.positional_embedding",
157
+ };
158
+
159
+ if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
160
+ fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
161
+ return false;
162
+ }
163
+
164
+ finp.close();
165
+ fout.close();
166
+
167
+ return true;
168
+ }
169
+
170
+ int main(int argc, char ** argv) {
171
+ if (argc != 4) {
172
+ fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
173
+ ggml_print_ftypes(stderr);
174
+ return 1;
175
+ }
176
+
177
+ // needed to initialize f16 tables
178
+ {
179
+ struct ggml_init_params params = { 0, NULL, false };
180
+ struct ggml_context * ctx = ggml_init(params);
181
+ ggml_free(ctx);
182
+ }
183
+
184
+ const std::string fname_inp = argv[1];
185
+ const std::string fname_out = argv[2];
186
+
187
+ const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
188
+
189
+ const int64_t t_main_start_us = ggml_time_us();
190
+
191
+ int64_t t_quantize_us = 0;
192
+
193
+ // load the model
194
+ {
195
+ const int64_t t_start_us = ggml_time_us();
196
+
197
+ if (!whisper_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
198
+ fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
199
+ return 1;
200
+ }
201
+
202
+ t_quantize_us = ggml_time_us() - t_start_us;
203
+ }
204
+
205
+ // report timing
206
+ {
207
+ const int64_t t_main_end_us = ggml_time_us();
208
+
209
+ printf("\n");
210
+ printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
211
+ printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
212
+ }
213
+
214
+ return 0;
215
+ }
examples/stream.wasm/index-tmpl.html CHANGED
@@ -35,6 +35,15 @@
35
 
36
  <br><br>
37
38
  <hr>
39
 
40
  Select the model you would like to use, click the "Start" button and start speaking
@@ -45,6 +54,10 @@
45
  Whisper model: <span id="model-whisper-status"></span>
46
  <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
47
  <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
 
 
 
 
48
  <span id="fetch-whisper-progress"></span>
49
 
50
  <!--
@@ -162,11 +175,17 @@
162
  let urls = {
163
  'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
164
  'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
 
 
 
165
  };
166
 
167
  let sizes = {
168
  'tiny.en': 75,
169
  'base.en': 142,
 
 
 
170
  };
171
 
172
  let url = urls[model];
@@ -177,6 +196,10 @@
177
 
178
  document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
179
  document.getElementById('fetch-whisper-base-en').style.display = 'none';
 
 
 
 
180
  document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
181
 
182
  cbProgress = function(p) {
@@ -188,6 +211,10 @@
188
  var el;
189
  el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
190
  el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
 
 
 
 
191
  el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
192
  };
193
 
 
35
 
36
  <br><br>
37
 
38
+ <b>More examples:</b>
39
+ <a href="https://whisper.ggerganov.com/">main</a> |
40
+ <a href="https://whisper.ggerganov.com/bench">bench</a> |
41
+ <a href="https://whisper.ggerganov.com/stream">stream</a> |
42
+ <a href="https://whisper.ggerganov.com/command">command</a> |
43
+ <a href="https://whisper.ggerganov.com/talk">talk</a> |
44
+
45
+ <br><br>
46
+
47
  <hr>
48
 
49
  Select the model you would like to use, click the "Start" button and start speaking
 
54
  Whisper model: <span id="model-whisper-status"></span>
55
  <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
56
  <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
57
+ <br><br>
58
+ Quantized models:<br><br>
59
+ <button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
60
+ <button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
61
  <span id="fetch-whisper-progress"></span>
62
 
63
  <!--
 
175
  let urls = {
176
  'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
177
  'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
178
+
179
+ 'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
180
+ 'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
181
  };
182
 
183
  let sizes = {
184
  'tiny.en': 75,
185
  'base.en': 142,
186
+
187
+ 'tiny-en-q5_1': 31,
188
+ 'base-en-q5_1': 57,
189
  };
190
 
191
  let url = urls[model];
 
196
 
197
  document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
198
  document.getElementById('fetch-whisper-base-en').style.display = 'none';
199
+
200
+ document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
201
+ document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
202
+
203
  document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
204
 
205
  cbProgress = function(p) {
 
211
  var el;
212
  el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
213
  el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
214
+
215
+ el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
216
+ el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
217
+
218
  el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
219
  };
220
 
examples/talk-llama/{llama_util.h → llama-util.h} RENAMED
@@ -21,12 +21,17 @@
21
  #if defined(_POSIX_MAPPED_FILES)
22
  #include <sys/mman.h>
23
  #endif
 
 
 
24
  #endif
25
  #endif
26
 
27
  #if defined(_WIN32)
28
  #define WIN32_LEAN_AND_MEAN
29
- #define NOMINMAX
 
 
30
  #include <windows.h>
31
  #include <io.h>
32
  #include <stdio.h> // for _fseeki64
@@ -41,8 +46,12 @@
41
  } while (0)
42
 
43
  #ifdef __GNUC__
 
 
 
44
  __attribute__((format(printf, 1, 2)))
45
  #endif
 
46
  static std::string format(const char * fmt, ...) {
47
  va_list ap, ap2;
48
  va_start(ap, fmt);
@@ -55,7 +64,7 @@ static std::string format(const char * fmt, ...) {
55
  va_end(ap2);
56
  va_end(ap);
57
  return std::string(buf.data(), size);
58
- };
59
 
60
  struct llama_file {
61
  // use FILE * so we don't have to re-open the file to mmap
@@ -162,7 +171,7 @@ struct llama_mmap {
162
  #ifdef _POSIX_MAPPED_FILES
163
  static constexpr bool SUPPORTED = true;
164
 
165
- llama_mmap(struct llama_file * file) {
166
  size = file->size;
167
  int fd = fileno(file->fp);
168
  int flags = MAP_SHARED;
@@ -170,15 +179,16 @@ struct llama_mmap {
170
  flags |= MAP_POPULATE;
171
  #endif
172
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
173
- close(fd);
174
  if (addr == MAP_FAILED) {
175
  throw format("mmap failed: %s", strerror(errno));
176
  }
177
 
178
- // Advise the kernel to preload the mapped memory
179
- if (madvise(addr, file->size, MADV_WILLNEED)) {
180
- fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
181
- strerror(errno));
 
 
182
  }
183
  }
184
 
@@ -188,14 +198,13 @@ struct llama_mmap {
188
  #elif defined(_WIN32)
189
  static constexpr bool SUPPORTED = true;
190
 
191
- llama_mmap(struct llama_file * file) {
192
  size = file->size;
193
 
194
  HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
195
 
196
  HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
197
  DWORD error = GetLastError();
198
- CloseHandle(hFile);
199
 
200
  if (hMapping == NULL) {
201
  throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -209,14 +218,20 @@ struct llama_mmap {
209
  throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
210
  }
211
 
212
- // Advise the kernel to preload the mapped memory
213
- WIN32_MEMORY_RANGE_ENTRY range;
214
- range.VirtualAddress = addr;
215
- range.NumberOfBytes = (SIZE_T)size;
216
- if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
217
- fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
218
- llama_format_win_err(GetLastError()).c_str());
 
 
 
219
  }
 
 
 
220
  }
221
 
222
  ~llama_mmap() {
@@ -291,8 +306,18 @@ struct llama_mlock {
291
  if (!mlock(addr, size)) {
292
  return true;
293
  } else {
294
- fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
295
- size, this->size, std::strerror(errno));
 
 
 
 
 
 
 
 
 
 
296
  return false;
297
  }
298
  }
@@ -338,8 +363,8 @@ struct llama_mlock {
338
  // Hopefully a megabyte is enough overhead:
339
  size_t increment = size + 1048576;
340
  // The minimum must be <= the maximum, so we need to increase both:
341
- min_ws_size += size;
342
- max_ws_size += size;
343
  if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
344
  fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
345
  llama_format_win_err(GetLastError()).c_str());
@@ -380,4 +405,29 @@ struct llama_buffer {
380
  delete[] addr;
381
  }
382
  };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  #endif
 
21
  #if defined(_POSIX_MAPPED_FILES)
22
  #include <sys/mman.h>
23
  #endif
24
+ #if defined(_POSIX_MEMLOCK_RANGE)
25
+ #include <sys/resource.h>
26
+ #endif
27
  #endif
28
  #endif
29
 
30
  #if defined(_WIN32)
31
  #define WIN32_LEAN_AND_MEAN
32
+ #ifndef NOMINMAX
33
+ #define NOMINMAX
34
+ #endif
35
  #include <windows.h>
36
  #include <io.h>
37
  #include <stdio.h> // for _fseeki64
 
46
  } while (0)
47
 
48
  #ifdef __GNUC__
49
+ #ifdef __MINGW32__
50
+ __attribute__((format(gnu_printf, 1, 2)))
51
+ #else
52
  __attribute__((format(printf, 1, 2)))
53
  #endif
54
+ #endif
55
  static std::string format(const char * fmt, ...) {
56
  va_list ap, ap2;
57
  va_start(ap, fmt);
 
64
  va_end(ap2);
65
  va_end(ap);
66
  return std::string(buf.data(), size);
67
+ }
68
 
69
  struct llama_file {
70
  // use FILE * so we don't have to re-open the file to mmap
 
171
  #ifdef _POSIX_MAPPED_FILES
172
  static constexpr bool SUPPORTED = true;
173
 
174
+ llama_mmap(struct llama_file * file, bool prefetch = true) {
175
  size = file->size;
176
  int fd = fileno(file->fp);
177
  int flags = MAP_SHARED;
 
179
  flags |= MAP_POPULATE;
180
  #endif
181
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
 
182
  if (addr == MAP_FAILED) {
183
  throw format("mmap failed: %s", strerror(errno));
184
  }
185
 
186
+ if (prefetch) {
187
+ // Advise the kernel to preload the mapped memory
188
+ if (madvise(addr, file->size, MADV_WILLNEED)) {
189
+ fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
190
+ strerror(errno));
191
+ }
192
  }
193
  }
194
 
 
198
  #elif defined(_WIN32)
199
  static constexpr bool SUPPORTED = true;
200
 
201
+ llama_mmap(struct llama_file * file, bool prefetch = true) {
202
  size = file->size;
203
 
204
  HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
205
 
206
  HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
207
  DWORD error = GetLastError();
 
208
 
209
  if (hMapping == NULL) {
210
  throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
 
218
  throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
219
  }
220
 
221
+ #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
222
+ if (prefetch) {
223
+ // Advise the kernel to preload the mapped memory
224
+ WIN32_MEMORY_RANGE_ENTRY range;
225
+ range.VirtualAddress = addr;
226
+ range.NumberOfBytes = (SIZE_T)size;
227
+ if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
228
+ fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
229
+ llama_format_win_err(GetLastError()).c_str());
230
+ }
231
  }
232
+ #else
233
+ #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
234
+ #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
235
  }
236
 
237
  ~llama_mmap() {
 
306
  if (!mlock(addr, size)) {
307
  return true;
308
  } else {
309
+ char* errmsg = std::strerror(errno);
310
+ bool suggest = (errno == ENOMEM);
311
+
312
+ // Check if the resource limit is fine after all
313
+ struct rlimit lock_limit;
314
+ if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
315
+ suggest = false;
316
+ if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
317
+ suggest = false;
318
+
319
+ fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
320
+ size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
321
  return false;
322
  }
323
  }
 
363
  // Hopefully a megabyte is enough overhead:
364
  size_t increment = size + 1048576;
365
  // The minimum must be <= the maximum, so we need to increase both:
366
+ min_ws_size += increment;
367
+ max_ws_size += increment;
368
  if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
369
  fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
370
  llama_format_win_err(GetLastError()).c_str());
 
405
  delete[] addr;
406
  }
407
  };
408
+
409
+ #ifdef GGML_USE_CUBLAS
410
+ #include "ggml-cuda.h"
411
+ struct llama_ctx_buffer {
412
+ uint8_t * addr = NULL;
413
+ size_t size = 0;
414
+
415
+ void resize(size_t size) {
416
+ if (addr) {
417
+ ggml_cuda_host_free(addr);
418
+ }
419
+ addr = (uint8_t *) ggml_cuda_host_malloc(size);
420
+ this->size = size;
421
+ }
422
+
423
+ ~llama_ctx_buffer() {
424
+ if (addr) {
425
+ ggml_cuda_host_free(addr);
426
+ }
427
+ }
428
+ };
429
+ #else
430
+ typedef llama_buffer llama_ctx_buffer;
431
+ #endif
432
+
433
  #endif
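As a quick illustration of the llama_ctx_buffer type introduced above: it is a drop-in replacement for llama_buffer that, when GGML_USE_CUBLAS is defined, allocates pinned host memory via ggml_cuda_host_malloc. A minimal sketch (the 64 MB size is hypothetical):

    #include "llama-util.h"

    int buffer_example() {
        llama_ctx_buffer buf;            // pinned host memory when GGML_USE_CUBLAS is defined
        buf.resize(64u * 1024 * 1024);   // hypothetical 64 MB scratch size
        // buf.addr / buf.size are then handed to ggml (e.g. through ggml_init_params),
        // the same way llama.cpp uses buf_compute and buf_scratch below.
        return buf.addr != nullptr ? 0 : 1;
    }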
examples/talk-llama/llama.cpp CHANGED
@@ -1,10 +1,17 @@
1
- #include "llama_util.h"
 
 
 
 
 
 
 
2
  #include "llama.h"
3
- #include "llama_internal.h"
4
 
5
  #include "ggml.h"
6
 
7
  #include <array>
 
8
  #include <cinttypes>
9
  #include <fstream>
10
  #include <random>
@@ -17,11 +24,15 @@
17
  #include <memory>
18
  #include <algorithm>
19
  #include <initializer_list>
 
 
 
 
 
20
 
21
  #define LLAMA_USE_SCRATCH
22
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
23
 
24
-
25
  // available llama models
26
  enum e_model {
27
  MODEL_UNKNOWN,
@@ -37,36 +48,52 @@ static const size_t MB = 1024*1024;
37
  // TODO: dynamically determine these sizes
38
  // needs modifications in ggml
39
 
40
- static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
41
- { MODEL_7B, 512ull*MB },
42
- { MODEL_13B, 512ull*MB },
43
- { MODEL_30B, 512ull*MB },
44
- { MODEL_65B, 512ull*MB },
45
- };
 
 
 
 
46
 
47
- static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
48
- { MODEL_7B, 512ull*MB },
49
- { MODEL_13B, 512ull*MB },
50
- { MODEL_30B, 512ull*MB },
51
- { MODEL_65B, 512ull*MB },
52
- };
 
 
 
 
53
 
54
  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
55
- static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
56
- { MODEL_7B, 1026ull*MB },
57
- { MODEL_13B, 1608ull*MB },
58
- { MODEL_30B, 3124ull*MB },
59
- { MODEL_65B, 5120ull*MB },
60
- };
 
 
 
 
61
 
62
  // this is mostly needed for temporary mul_mat buffers to dequantize the data
63
  // not actually needed if BLAS is disabled
64
- static const std::map<e_model, size_t> MEM_REQ_EVAL = {
65
- { MODEL_7B, 768ull*MB },
66
- { MODEL_13B, 1024ull*MB },
67
- { MODEL_30B, 1280ull*MB },
68
- { MODEL_65B, 1536ull*MB },
69
- };
 
 
 
 
70
 
71
  // default hparams (LLaMA 7B)
72
  struct llama_hparams {
@@ -77,7 +104,7 @@ struct llama_hparams {
77
  uint32_t n_head = 32;
78
  uint32_t n_layer = 32;
79
  uint32_t n_rot = 64;
80
- uint32_t f16 = 1;
81
 
82
  bool operator!=(const llama_hparams & other) const {
83
  return memcmp(this, &other, sizeof(llama_hparams));
@@ -109,7 +136,7 @@ struct llama_kv_cache {
109
 
110
  struct ggml_context * ctx = NULL;
111
 
112
- llama_buffer buf;
113
 
114
  int n; // number of tokens currently in the cache
115
 
@@ -140,7 +167,7 @@ struct llama_model {
140
  struct llama_kv_cache kv_self;
141
 
142
  // the model memory buffer
143
- llama_buffer buf;
144
 
145
  // model memory mapped file
146
  std::unique_ptr<llama_mmap> mapping;
@@ -201,8 +228,8 @@ struct llama_context {
201
 
202
  // memory buffers used to evaluate the model
203
  // TODO: move in llama_state
204
- llama_buffer buf_compute;
205
- llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
206
 
207
  int buf_last = 0;
208
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -257,22 +284,12 @@ static size_t checked_div(size_t a, size_t b) {
257
  }
258
 
259
  static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
260
- std::string ret = "[" + std::to_string(ne.at(0));
 
261
  for (size_t i = 1; i < ne.size(); i++) {
262
- ret += " x " + std::to_string(ne.at(i));
263
- }
264
- ret += "]";
265
- return ret;
266
- }
267
-
268
- static const char * llama_format_type(enum ggml_type type) {
269
- switch (type) {
270
- case GGML_TYPE_F32: return "f32";
271
- case GGML_TYPE_F16: return "f16";
272
- case GGML_TYPE_Q4_0: return "q4_0";
273
- case GGML_TYPE_Q4_1: return "q4_1";
274
- default: LLAMA_ASSERT(false);
275
  }
 
276
  }
277
 
278
  static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
@@ -427,7 +444,7 @@ struct llama_file_loader {
427
  hparams.n_head = file.read_u32();
428
  hparams.n_layer = file.read_u32();
429
  hparams.n_rot = file.read_u32();
430
- hparams.f16 = file.read_u32();
431
  }
432
  void read_vocab() {
433
  vocab.id_to_token.resize(hparams.n_vocab);
@@ -453,20 +470,25 @@ struct llama_file_loader {
453
  llama_load_tensor_shard shard;
454
  uint32_t n_dims = file.read_u32();
455
  uint32_t name_len = file.read_u32();
456
- uint32_t ftype = file.read_u32();
457
  shard.ne.resize(n_dims);
458
  file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
459
  std::string name = file.read_string(name_len);
460
  if (n_dims < 1 || n_dims > 2) {
461
  throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
462
  }
463
- switch (ftype) {
464
- case 0: shard.type = GGML_TYPE_F32; break;
465
- case 1: shard.type = GGML_TYPE_F16; break;
466
- case 2: shard.type = GGML_TYPE_Q4_0; break;
467
- case 3: shard.type = GGML_TYPE_Q4_1; break;
 
 
 
 
 
468
  default: {
469
- throw format("unrecognized ftype %u\n", ftype);
470
  }
471
  }
472
 
@@ -497,18 +519,18 @@ struct llama_file_loader {
497
  struct llama_file_saver {
498
  llama_file file;
499
  llama_file_loader * any_file_loader;
500
- llama_file_saver(const char * fname, llama_file_loader * any_file_loader, uint32_t new_f16)
501
  : file(fname, "wb"), any_file_loader(any_file_loader) {
502
  fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
503
  write_magic();
504
- write_hparams(new_f16);
505
  write_vocab();
506
  }
507
  void write_magic() {
508
  file.write_u32('ggjt'); // magic
509
  file.write_u32(1); // version
510
  }
511
- void write_hparams(uint32_t new_f16) {
512
  const llama_hparams & hparams = any_file_loader->hparams;
513
  file.write_u32(hparams.n_vocab);
514
  file.write_u32(hparams.n_embd);
@@ -516,7 +538,7 @@ struct llama_file_saver {
516
  file.write_u32(hparams.n_head);
517
  file.write_u32(hparams.n_layer);
518
  file.write_u32(hparams.n_rot);
519
- file.write_u32(new_f16);
520
  }
521
  void write_vocab() {
522
  if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
@@ -531,17 +553,21 @@ struct llama_file_saver {
531
  }
532
  }
533
  void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
534
- uint32_t ftype;
535
  switch (new_type) {
536
- case GGML_TYPE_F32: ftype = 0; break;
537
- case GGML_TYPE_F16: ftype = 1; break;
538
- case GGML_TYPE_Q4_0: ftype = 2; break;
539
- case GGML_TYPE_Q4_1: ftype = 3; break;
 
 
 
 
 
540
  default: LLAMA_ASSERT(false);
541
  }
542
  file.write_u32((uint32_t) tensor.ne.size());
543
  file.write_u32((uint32_t) tensor.name.size());
544
- file.write_u32(ftype);
545
  file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
546
  file.write_raw(tensor.name.data(), tensor.name.size());
547
  file.seek(-file.tell() & 31, SEEK_CUR);
@@ -621,6 +647,7 @@ struct llama_model_loader {
621
  throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
622
  name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
623
  }
 
624
  return get_tensor_for(lt);
625
  }
626
 
@@ -753,7 +780,7 @@ static bool kv_cache_init(
753
  const int n_embd = hparams.n_embd;
754
  const int n_layer = hparams.n_layer;
755
 
756
- const int64_t n_mem = (int64_t)n_layer*n_ctx;
757
  const int64_t n_elements = n_embd*n_mem;
758
 
759
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
@@ -815,6 +842,22 @@ static const char *llama_file_version_name(llama_file_version version) {
815
  }
816
  }
817
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
818
  static const char *llama_model_type_name(e_model type) {
819
  switch (type) {
820
  case MODEL_7B: return "7B";
@@ -867,7 +910,7 @@ static void llama_model_load_internal(
867
  fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
868
  fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
869
  fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
870
- fprintf(stderr, "%s: f16 = %u\n", __func__, hparams.f16);
871
  fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
872
  fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
873
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
@@ -891,13 +934,13 @@ static void llama_model_load_internal(
891
  const size_t mem_required =
892
  ctx_size +
893
  mmapped_size +
894
- MEM_REQ_SCRATCH0.at(model.type) +
895
- MEM_REQ_SCRATCH1.at(model.type) +
896
- MEM_REQ_EVAL.at (model.type);
897
 
898
  // this is the memory required by one llama_state
899
  const size_t mem_required_state =
900
- scale*MEM_REQ_KV_SELF.at(model.type);
901
 
902
  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
903
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -934,8 +977,8 @@ static void llama_model_load_internal(
934
  ml->ggml_ctx = ctx;
935
 
936
  model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
937
- model.norm = ml->get_tensor("norm.weight", {n_embd});
938
- model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
939
 
940
  model.layers.resize(n_layer);
941
  for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1039,7 +1082,7 @@ static bool llama_eval_internal(
1039
  // for big prompts, if BLAS is enabled, it is better to use only one thread
1040
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
1041
  ggml_cgraph gf = {};
1042
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
1043
 
1044
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1045
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1213,9 +1256,11 @@ static bool llama_eval_internal(
1213
  ggml_build_forward_expand(&gf, inpL);
1214
  ggml_graph_compute (ctx0, &gf);
1215
 
 
1216
  // print timing information per ggml operation (for debugging purposes)
1217
  // requires GGML_PERF to be defined
1218
- //ggml_graph_print(&gf);
 
1219
 
1220
  // plot the computation graph in dot format (for debugging purposes)
1221
  //if (n_past%100 == 0) {
@@ -1430,131 +1475,435 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
1430
  // sampling
1431
  //
1432
 
1433
- static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
1434
- // find the top k tokens
1435
- std::partial_sort(
1436
- logits_id.begin(),
1437
- logits_id.begin() + top_k, logits_id.end(),
1438
- [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
1439
- return a.first > b.first;
1440
- });
1441
 
1442
- logits_id.resize(top_k);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1443
  }
1444
 
1445
- static llama_vocab::id llama_sample_top_p_top_k(
1446
- llama_context & lctx,
1447
- const std::vector<llama_vocab::id> & last_n_tokens,
1448
- int top_k,
1449
- float top_p,
1450
- float temp,
1451
- float repeat_penalty) {
1452
- auto & rng = lctx.rng;
1453
-
1454
- const int n_logits = lctx.model.hparams.n_vocab;
1455
-
1456
- const auto & logits = lctx.logits;
1457
- const auto * plogits = logits.data() + logits.size() - n_logits;
1458
-
1459
- if (temp <= 0) {
1460
- // select the token with the highest logit directly
1461
- float max_logit = plogits[0];
1462
- llama_vocab::id max_id = 0;
1463
-
1464
- for (int i = 1; i < n_logits; ++i) {
1465
- if (plogits[i] > max_logit) {
1466
- max_logit = plogits[i];
1467
- max_id = i;
1468
- }
1469
  }
1470
- return max_id;
 
 
 
 
 
1471
  }
 
1472
 
1473
- std::vector<std::pair<float, llama_vocab::id>> logits_id;
1474
- logits_id.reserve(n_logits);
 
 
1475
 
1476
- {
1477
- const float scale = 1.0f/temp;
1478
- for (int i = 0; i < n_logits; ++i) {
1479
- // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
1480
- // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
1481
- if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
1482
- // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
1483
- if (plogits[i] < 0.0f) {
1484
- logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
1485
- } else {
1486
- logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
1487
- }
1488
- } else {
1489
- logits_id.push_back(std::make_pair(plogits[i]*scale, i));
1490
- }
1491
  }
1492
  }
1493
 
1494
- sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
 
1495
 
1496
- // compute probs for the top k tokens
1497
- std::vector<float> probs;
1498
- probs.reserve(logits_id.size());
 
1499
 
1500
- float maxl = logits_id[0].first;
1501
- double sum = 0.0;
1502
- for (const auto & kv : logits_id) {
1503
- const float p = expf(kv.first - maxl);
1504
- probs.push_back(p);
1505
- sum += p;
1506
  }
1507
 
1508
- // normalize the probs
1509
- for (auto & p : probs) {
1510
- p /= sum;
 
 
 
 
 
 
 
 
 
 
1511
  }
1512
 
1513
- if (top_p < 1.0) {
1514
- double cumsum = 0.0;
1515
- for (int i = 0; i < (int) probs.size(); i++) {
1516
- cumsum += probs[i];
1517
- if (cumsum >= top_p) {
1518
- probs.resize(i + 1);
1519
- logits_id.resize(i + 1);
1520
- break;
1521
- }
 
 
 
 
 
 
 
 
 
 
 
1522
  }
1523
  }
1524
 
1525
- //printf("\n");
1526
- //for (int i = 0; i < (int) 10; i++) {
1527
- // printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
1528
- //}
1529
- //printf("\n\n");
1530
- //exit(0);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1531
 
1532
  std::discrete_distribution<> dist(probs.begin(), probs.end());
 
1533
  int idx = dist(rng);
1534
 
1535
- return logits_id[idx].second;
 
 
 
 
1536
  }
1537
 
1538
  //
1539
  // quantization
1540
  //
1541
 
1542
- static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
1543
  ggml_type quantized_type;
1544
- switch (itype) {
1545
- case 2: quantized_type = GGML_TYPE_Q4_0; break;
1546
- case 3: quantized_type = GGML_TYPE_Q4_1; break;
1547
- default: throw format("invalid quantization type %d\n", itype);
 
 
 
 
1548
  };
1549
 
 
 
 
 
1550
  std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
1551
  /*vocab_only*/ false));
1552
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), (uint32_t) itype);
1553
 
1554
  size_t total_size_org = 0;
1555
  size_t total_size_new = 0;
1556
  std::vector<int64_t> hist_all(1 << 4, 0);
1557
 
 
 
 
1558
  size_t idx = 0;
1559
  for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
1560
  llama_buffer read_data;
@@ -1562,10 +1911,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1562
  tensor.data = read_data.addr;
1563
  model_loader->load_data_for(tensor);
1564
 
1565
- printf("[%zu/%zu] %36s - %s, type = %6s, ",
1566
  ++idx, model_loader->tensors_map.tensors.size(),
1567
  tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
1568
- llama_format_type(tensor.type));
1569
 
1570
  // This used to be a regex, but <regex> has an extreme cost to compile times.
1571
  bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
@@ -1573,6 +1922,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1573
  // quantize only 2D tensors
1574
  quantize &= (tensor.ne.size() == 2);
1575
 
 
 
 
 
 
1576
  enum ggml_type new_type;
1577
  void * new_data;
1578
  size_t new_size;
@@ -1598,7 +1952,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1598
  f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
1599
  }
1600
  } else {
1601
- throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type));
1602
  }
1603
 
1604
  printf("quantizing .. ");
@@ -1608,17 +1962,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1608
  new_data = work.addr;
1609
  std::vector<int64_t> hist_cur(1 << 4, 0);
1610
 
1611
- switch (new_type) {
1612
- case GGML_TYPE_Q4_0:
1613
- {
1614
- new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
1615
- } break;
1616
- case GGML_TYPE_Q4_1:
1617
- {
1618
- new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
1619
- } break;
1620
- default:
1621
- LLAMA_ASSERT(false);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1622
  }
1623
 
1624
  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1717,17 +2091,17 @@ struct llama_context * llama_init_from_file(
1717
  if (params.logits_all) {
1718
  ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
1719
  } else {
1720
- ctx->logits.reserve(hparams.n_ctx);
1721
  }
1722
 
1723
  if (params.embedding){
1724
  ctx->embedding.resize(hparams.n_embd);
1725
  }
1726
 
1727
- ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
1728
 
1729
- ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
1730
- ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
1731
  }
1732
 
1733
  return ctx;
@@ -1740,9 +2114,10 @@ void llama_free(struct llama_context * ctx) {
1740
  int llama_model_quantize(
1741
  const char * fname_inp,
1742
  const char * fname_out,
1743
- int itype) {
 
1744
  try {
1745
- llama_model_quantize_internal(fname_inp, fname_out, itype);
1746
  return 0;
1747
  } catch (const std::string & err) {
1748
  fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
@@ -1750,31 +2125,446 @@ int llama_model_quantize(
1750
  }
1751
  }
1752
 
1753
- // Returns the KV cache that will contain the context for the
1754
- // ongoing prediction with the model.
1755
- const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
1756
- return ctx->model.kv_self.buf.addr;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1757
  }
1758
 
1759
- // Returns the size of the KV cache
1760
- size_t llama_get_kv_cache_size(struct llama_context * ctx) {
1761
- return ctx->model.kv_self.buf.size;
 
 
 
 
1762
  }
1763
 
1764
  int llama_get_kv_cache_token_count(struct llama_context * ctx) {
1765
  return ctx->model.kv_self.n;
1766
  }
1767
 
1768
- // Sets the KV cache containing the current context for the model
1769
- void llama_set_kv_cache(
1770
- struct llama_context * ctx,
1771
- const uint8_t * kv_cache,
1772
- size_t n_size,
1773
- int n_token_count) {
1774
- // Make sure we have the same kv cache setup
1775
- LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
1776
- memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
1777
- ctx->model.kv_self.n = n_token_count;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1778
  }
1779
 
1780
  int llama_eval(
@@ -1851,33 +2641,8 @@ llama_token llama_token_eos() {
1851
  return 2;
1852
  }
1853
 
1854
- llama_token llama_sample_top_p_top_k(
1855
- llama_context * ctx,
1856
- const llama_token * last_n_tokens_data,
1857
- int last_n_tokens_size,
1858
- int top_k,
1859
- float top_p,
1860
- float temp,
1861
- float repeat_penalty) {
1862
- const int64_t t_start_sample_us = ggml_time_us();
1863
-
1864
- llama_token result = 0;
1865
-
1866
- // TODO: avoid this ...
1867
- const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
1868
-
1869
- result = llama_sample_top_p_top_k(
1870
- *ctx,
1871
- last_n_tokens,
1872
- top_k,
1873
- top_p,
1874
- temp,
1875
- repeat_penalty);
1876
-
1877
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1878
- ctx->n_sample++;
1879
-
1880
- return result;
1881
  }
1882
 
1883
 
@@ -1907,18 +2672,20 @@ const char * llama_print_system_info(void) {
1907
  static std::string s;
1908
 
1909
  s = "";
1910
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
1911
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
1912
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
1913
- s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
1914
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
1915
- s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
1916
- s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
1917
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
1918
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
1919
- s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
1920
- s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
1921
- s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
 
 
1922
 
1923
  return s.c_str();
1924
  }
@@ -1927,3 +2694,57 @@ const char * llama_print_system_info(void) {
1927
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
1928
  return ctx->model.tensors_by_name;
1929
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Defines fileno on msys:
2
+ #ifndef _GNU_SOURCE
3
+ #define _GNU_SOURCE
4
+ #include <cstdint>
5
+ #include <cstdio>
6
+ #endif
7
+
8
+ #include "llama-util.h"
9
  #include "llama.h"
 
10
 
11
  #include "ggml.h"
12
 
13
  #include <array>
14
+ #include <ctime>
15
  #include <cinttypes>
16
  #include <fstream>
17
  #include <random>
 
24
  #include <memory>
25
  #include <algorithm>
26
  #include <initializer_list>
27
+ #include <thread>
28
+ #include <atomic>
29
+ #include <mutex>
30
+ #include <sstream>
31
+ #include <numeric>
32
 
33
  #define LLAMA_USE_SCRATCH
34
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
35
 
 
36
  // available llama models
37
  enum e_model {
38
  MODEL_UNKNOWN,
 
48
  // TODO: dynamically determine these sizes
49
  // needs modifications in ggml
50
 
51
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
52
+ {
53
+ static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
54
+ { MODEL_7B, 512ull * MB },
55
+ { MODEL_13B, 512ull * MB },
56
+ { MODEL_30B, 512ull * MB },
57
+ { MODEL_65B, 1024ull * MB },
58
+ };
59
+ return _MEM_REQ_SCRATCH0;
60
+ }
61
 
62
+ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
63
+ {
64
+ static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
65
+ { MODEL_7B, 512ull * MB },
66
+ { MODEL_13B, 512ull * MB },
67
+ { MODEL_30B, 512ull * MB },
68
+ { MODEL_65B, 1024ull * MB },
69
+ };
70
+ return _MEM_REQ_SCRATCH1;
71
+ }
72
 
73
  // 2*n_embd*n_ctx*n_layer*sizeof(float16)
74
+ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
75
+ {
76
+ static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
77
+ { MODEL_7B, 1026ull * MB },
78
+ { MODEL_13B, 1608ull * MB },
79
+ { MODEL_30B, 3124ull * MB },
80
+ { MODEL_65B, 5120ull * MB },
81
+ };
82
+ return _MEM_REQ_KV_SELF;
83
+ }
84
 
85
  // this is mostly needed for temporary mul_mat buffers to dequantize the data
86
  // not actually needed if BLAS is disabled
87
+ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
88
+ {
89
+ static std::map<e_model, size_t> _MEM_REQ_EVAL = {
90
+ { MODEL_7B, 768ull * MB },
91
+ { MODEL_13B, 1024ull * MB },
92
+ { MODEL_30B, 1280ull * MB },
93
+ { MODEL_65B, 1536ull * MB },
94
+ };
95
+ return _MEM_REQ_EVAL;
96
+ }
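The four MEM_REQ_* tables above are now functions returning a reference to a function-local static instead of file-scope static maps. A generic sketch of the pattern follows; the rationale given here (side-stepping static initialization order issues across translation units and shared-library builds) is an assumption, not something stated in the commit.

    #include <cstddef>
    #include <map>

    // A function-local static is initialized on first use, so callers can never
    // observe the map before it has been constructed, regardless of the order in
    // which translation units (or shared libraries) are initialized.
    static const std::map<int, std::size_t> & mem_req_example() {
        static std::map<int, std::size_t> req = {
            { 0, 512u * 1024 * 1024 },   // hypothetical entry: model id -> bytes
        };
        return req;
    }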
97
 
98
  // default hparams (LLaMA 7B)
99
  struct llama_hparams {
 
104
  uint32_t n_head = 32;
105
  uint32_t n_layer = 32;
106
  uint32_t n_rot = 64;
107
+ enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
108
 
109
  bool operator!=(const llama_hparams & other) const {
110
  return memcmp(this, &other, sizeof(llama_hparams));
 
136
 
137
  struct ggml_context * ctx = NULL;
138
 
139
+ llama_ctx_buffer buf;
140
 
141
  int n; // number of tokens currently in the cache
142
 
 
167
  struct llama_kv_cache kv_self;
168
 
169
  // the model memory buffer
170
+ llama_ctx_buffer buf;
171
 
172
  // model memory mapped file
173
  std::unique_ptr<llama_mmap> mapping;
 
228
 
229
  // memory buffers used to evaluate the model
230
  // TODO: move in llama_state
231
+ llama_ctx_buffer buf_compute;
232
+ llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
233
 
234
  int buf_last = 0;
235
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
 
284
  }
285
 
286
  static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
287
+ char buf[256];
288
+ snprintf(buf, sizeof(buf), "%5u", ne.at(0));
289
  for (size_t i = 1; i < ne.size(); i++) {
290
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
 
 
 
 
 
 
 
 
 
 
 
 
291
  }
292
+ return buf;
293
  }
294
 
295
  static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
 
444
  hparams.n_head = file.read_u32();
445
  hparams.n_layer = file.read_u32();
446
  hparams.n_rot = file.read_u32();
447
+ hparams.ftype = (enum llama_ftype) file.read_u32();
448
  }
449
  void read_vocab() {
450
  vocab.id_to_token.resize(hparams.n_vocab);
 
470
  llama_load_tensor_shard shard;
471
  uint32_t n_dims = file.read_u32();
472
  uint32_t name_len = file.read_u32();
473
+ shard.type = (enum ggml_type) file.read_u32();
474
  shard.ne.resize(n_dims);
475
  file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
476
  std::string name = file.read_string(name_len);
477
  if (n_dims < 1 || n_dims > 2) {
478
  throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
479
  }
480
+ switch (shard.type) {
481
+ case GGML_TYPE_F32:
482
+ case GGML_TYPE_F16:
483
+ case GGML_TYPE_Q4_0:
484
+ case GGML_TYPE_Q4_1:
485
+ case GGML_TYPE_Q4_2:
486
+ case GGML_TYPE_Q5_0:
487
+ case GGML_TYPE_Q5_1:
488
+ case GGML_TYPE_Q8_0:
489
+ break;
490
  default: {
491
+ throw format("unrecognized tensor type %u\n", shard.type);
492
  }
493
  }
494
 
 
519
  struct llama_file_saver {
520
  llama_file file;
521
  llama_file_loader * any_file_loader;
522
+ llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
523
  : file(fname, "wb"), any_file_loader(any_file_loader) {
524
  fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
525
  write_magic();
526
+ write_hparams(new_ftype);
527
  write_vocab();
528
  }
529
  void write_magic() {
530
  file.write_u32('ggjt'); // magic
531
  file.write_u32(1); // version
532
  }
533
+ void write_hparams(enum llama_ftype new_ftype) {
534
  const llama_hparams & hparams = any_file_loader->hparams;
535
  file.write_u32(hparams.n_vocab);
536
  file.write_u32(hparams.n_embd);
 
538
  file.write_u32(hparams.n_head);
539
  file.write_u32(hparams.n_layer);
540
  file.write_u32(hparams.n_rot);
541
+ file.write_u32(new_ftype);
542
  }
543
  void write_vocab() {
544
  if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
 
553
  }
554
  }
555
  void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
 
556
  switch (new_type) {
557
+ case GGML_TYPE_F32:
558
+ case GGML_TYPE_F16:
559
+ case GGML_TYPE_Q4_0:
560
+ case GGML_TYPE_Q4_1:
561
+ case GGML_TYPE_Q4_2:
562
+ case GGML_TYPE_Q5_0:
563
+ case GGML_TYPE_Q5_1:
564
+ case GGML_TYPE_Q8_0:
565
+ break;
566
  default: LLAMA_ASSERT(false);
567
  }
568
  file.write_u32((uint32_t) tensor.ne.size());
569
  file.write_u32((uint32_t) tensor.name.size());
570
+ file.write_u32(new_type);
571
  file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
572
  file.write_raw(tensor.name.data(), tensor.name.size());
573
  file.seek(-file.tell() & 31, SEEK_CUR);
 
647
  throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
648
  name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
649
  }
650
+
651
  return get_tensor_for(lt);
652
  }
653
 
 
780
  const int n_embd = hparams.n_embd;
781
  const int n_layer = hparams.n_layer;
782
 
783
+ const int64_t n_mem = n_layer*n_ctx;
784
  const int64_t n_elements = n_embd*n_mem;
785
 
786
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
 
842
  }
843
  }
844
 
845
+ static const char *llama_ftype_name(enum llama_ftype ftype) {
846
+ switch (ftype) {
847
+ case LLAMA_FTYPE_ALL_F32: return "all F32";
848
+ case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
849
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
850
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
851
+ case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
852
+ return "mostly Q4_1, some F16";
853
+ case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
854
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
855
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
856
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
857
+ default: return "unknown, may not work";
858
+ }
859
+ }
860
+
861
  static const char *llama_model_type_name(e_model type) {
862
  switch (type) {
863
  case MODEL_7B: return "7B";
 
910
  fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
911
  fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
912
  fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
913
+ fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
914
  fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
915
  fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
916
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
 
934
  const size_t mem_required =
935
  ctx_size +
936
  mmapped_size +
937
+ MEM_REQ_SCRATCH0().at(model.type) +
938
+ MEM_REQ_SCRATCH1().at(model.type) +
939
+ MEM_REQ_EVAL().at(model.type);
940
 
941
  // this is the memory required by one llama_state
942
  const size_t mem_required_state =
943
+ scale*MEM_REQ_KV_SELF().at(model.type);
944
 
945
  fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
946
  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
977
  ml->ggml_ctx = ctx;
978
 
979
  model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
980
+ model.norm = ml->get_tensor("norm.weight", {n_embd});
981
+ model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
982
 
983
  model.layers.resize(n_layer);
984
  for (uint32_t i = 0; i < n_layer; ++i) {
 
1082
  // for big prompts, if BLAS is enabled, it is better to use only one thread
1083
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
1084
  ggml_cgraph gf = {};
1085
+ gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1086
 
1087
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1088
  memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
1256
  ggml_build_forward_expand(&gf, inpL);
1257
  ggml_graph_compute (ctx0, &gf);
1258
 
1259
+ #ifdef GGML_PERF
1260
  // print timing information per ggml operation (for debugging purposes)
1261
  // requires GGML_PERF to be defined
1262
+ ggml_graph_print(&gf);
1263
+ #endif
1264
 
1265
  // plot the computation graph in dot format (for debugging purposes)
1266
  //if (n_past%100 == 0) {
 
1475
  // sampling
1476
  //
1477
 
1478
+ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
1479
+ assert(candidates->size > 0);
1480
+
1481
+ const int64_t t_start_sample_us = ggml_time_us();
 
 
 
 
1482
 
1483
+ // Sort the logits in descending order
1484
+ if (!candidates->sorted) {
1485
+ std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
1486
+ return a.logit > b.logit;
1487
+ });
1488
+ candidates->sorted = true;
1489
+ }
1490
+
1491
+ float max_l = candidates->data[0].logit;
1492
+ float cum_sum = 0.0f;
1493
+ for (size_t i = 0; i < candidates->size; ++i) {
1494
+ float p = expf(candidates->data[i].logit - max_l);
1495
+ candidates->data[i].p = p;
1496
+ cum_sum += p;
1497
+ }
1498
+ for (size_t i = 0; i < candidates->size; ++i) {
1499
+ candidates->data[i].p /= cum_sum;
1500
+ }
1501
+
1502
+ if (ctx) {
1503
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1504
+ }
1505
  }
1506
 
1507
+ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
1508
+ const int64_t t_start_sample_us = ggml_time_us();
1509
+
1510
+ k = std::max(k, (int) min_keep);
1511
+ k = std::min(k, (int) candidates->size);
1512
+
1513
+ // Sort scores in descending order
1514
+ if (!candidates->sorted) {
1515
+ auto comp = [](const llama_token_data & a, const llama_token_data & b) {
1516
+ return a.logit > b.logit;
1517
+ };
1518
+ if (k == (int) candidates->size) {
1519
+ std::sort(candidates->data, candidates->data + candidates->size, comp);
1520
+ } else {
1521
+ std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
 
 
 
 
 
 
 
 
 
1522
  }
1523
+ candidates->sorted = true;
1524
+ }
1525
+ candidates->size = k;
1526
+
1527
+ if (ctx) {
1528
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1529
  }
1530
+ }
1531
 
1532
+ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
1533
+ if (p >= 1.0f) {
1534
+ return;
1535
+ }
1536
 
1537
+ const int64_t t_start_sample_us = ggml_time_us();
1538
+
1539
+ llama_sample_softmax(ctx, candidates);
1540
+
1541
+ // Compute the cumulative probabilities
1542
+ float cum_sum = 0.0f;
1543
+ size_t last_idx = candidates->size;
1544
+
1545
+ for (size_t i = 0; i < candidates->size; ++i) {
1546
+ cum_sum += candidates->data[i].p;
1547
+
1548
+ // Check if the running sum is greater than p or if we have kept at least min_keep tokens
1549
+ if (cum_sum > p && i >= min_keep) {
1550
+ last_idx = i;
1551
+ break;
1552
  }
1553
  }
1554
 
1555
+ // Resize the output vector to keep only the top-p tokens
1556
+ candidates->size = last_idx;
1557
 
1558
+ if (ctx) {
1559
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1560
+ }
1561
+ }
1562
 
1563
+ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
1564
+ if (z >= 1.0f || candidates->size <= 2) {
1565
+ return;
 
 
 
1566
  }
1567
 
1568
+ const int64_t t_start_sample_us = ggml_time_us();
1569
+
1570
+ llama_sample_softmax(nullptr, candidates);
1571
+
1572
+ // Compute the first and second derivatives
1573
+ std::vector<float> first_derivatives(candidates->size - 1);
1574
+ std::vector<float> second_derivatives(candidates->size - 2);
1575
+
1576
+ for (size_t i = 0; i < first_derivatives.size(); ++i) {
1577
+ first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
1578
+ }
1579
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
1580
+ second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
1581
  }
1582
 
1583
+ // Calculate absolute value of second derivatives
1584
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
1585
+ second_derivatives[i] = abs(second_derivatives[i]);
1586
+ }
1587
+
1588
+ // Normalize the second derivatives
1589
+ float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
1590
+ for (float & value : second_derivatives) {
1591
+ value /= second_derivatives_sum;
1592
+ }
1593
+
1594
+ float cum_sum = 0.0f;
1595
+ size_t last_idx = candidates->size;
1596
+ for (size_t i = 0; i < second_derivatives.size(); ++i) {
1597
+ cum_sum += second_derivatives[i];
1598
+
1599
+ // Check if the running sum is greater than z or if we have kept at least min_keep tokens
1600
+ if (cum_sum > z && i >= min_keep) {
1601
+ last_idx = i;
1602
+ break;
1603
  }
1604
  }
1605
 
1606
+ // Resize the output vector to keep only the tokens above the tail location
1607
+ candidates->size = last_idx;
1608
+
1609
+ if (ctx) {
1610
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1611
+ }
1612
+ }
1613
+
1614
+
1615
+ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
1616
+ // Reference implementation:
1617
+ // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
1618
+ if (p >= 1.0f) {
1619
+ return;
1620
+ }
1621
+
1622
+ const int64_t t_start_sample_us = ggml_time_us();
1623
+
1624
+ // Compute the softmax of logits and calculate entropy
1625
+ llama_sample_softmax(nullptr, candidates);
1626
+
1627
+ float entropy = 0.0f;
1628
+ for (size_t i = 0; i < candidates->size; ++i) {
1629
+ entropy += -candidates->data[i].p * logf(candidates->data[i].p);
1630
+ }
1631
+
1632
+ // Compute the absolute difference between negative log probability and entropy for each candidate
1633
+ std::vector<float> shifted_scores;
1634
+ for (size_t i = 0; i < candidates->size; ++i) {
1635
+ float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
1636
+ shifted_scores.push_back(shifted_score);
1637
+ }
1638
+
1639
+ // Sort tokens based on the shifted_scores and their corresponding indices
1640
+ std::vector<size_t> indices(candidates->size);
1641
+ std::iota(indices.begin(), indices.end(), 0);
1642
+
1643
+ std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
1644
+ return shifted_scores[a] < shifted_scores[b];
1645
+ });
1646
+
1647
+ // Compute the cumulative probabilities
1648
+ float cum_sum = 0.0f;
1649
+ size_t last_idx = indices.size();
1650
+
1651
+ for (size_t i = 0; i < indices.size(); ++i) {
1652
+ size_t idx = indices[i];
1653
+ cum_sum += candidates->data[idx].p;
1654
+
1655
+ // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
1656
+ if (cum_sum > p && i >= min_keep - 1) {
1657
+ last_idx = i + 1;
1658
+ break;
1659
+ }
1660
+ }
1661
+
1662
+ // Resize the output vector to keep only the locally typical tokens
1663
+ std::vector<llama_token_data> new_candidates;
1664
+ for (size_t i = 0; i < last_idx; ++i) {
1665
+ size_t idx = indices[i];
1666
+ new_candidates.push_back(candidates->data[idx]);
1667
+ }
1668
+
1669
+ // Replace the data in candidates with the new_candidates data
1670
+ std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
1671
+ candidates->size = new_candidates.size();
1672
+
1673
+ if (ctx) {
1674
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1675
+ }
1676
+ }
1677
+
1678
+ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
1679
+ const int64_t t_start_sample_us = ggml_time_us();
1680
+
1681
+ for (size_t i = 0; i < candidates_p->size; ++i) {
1682
+ candidates_p->data[i].logit /= temp;
1683
+ }
1684
+
1685
+ if (ctx) {
1686
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1687
+ }
1688
+ }
1689
+
1690
+ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty) {
1691
+ if (last_tokens_size == 0 || penalty == 1.0f) {
1692
+ return;
1693
+ }
1694
+
1695
+ const int64_t t_start_sample_us = ggml_time_us();
1696
+
1697
+ for (size_t i = 0; i < candidates->size; ++i) {
1698
+ auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
1699
+ if (token_iter == last_tokens + last_tokens_size) {
1700
+ continue;
1701
+ }
1702
+
1703
+ // The academic publication that described this technique only divided by the penalty, but that would make tokens with negative logits more likely, which is obviously wrong.
1704
+ // A common fix for this problem is to multiply by the penalty instead of dividing when the logit is negative.
1705
+ if (candidates->data[i].logit <= 0) {
1706
+ candidates->data[i].logit *= penalty;
1707
+ } else {
1708
+ candidates->data[i].logit /= penalty;
1709
+ }
1710
+ }
1711
+
1712
+ candidates->sorted = false;
1713
+
1714
+ if (ctx) {
1715
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1716
+ }
1717
+ }
1718
+
1719
+ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
1720
+ if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
1721
+ return;
1722
+ }
1723
+
1724
+ const int64_t t_start_sample_us = ggml_time_us();
1725
+
1726
+ // Create a frequency map to count occurrences of each token in last_tokens
1727
+ std::unordered_map<llama_token, int> token_count;
1728
+ for (size_t i = 0; i < last_tokens_size; ++i) {
1729
+ token_count[last_tokens_p[i]]++;
1730
+ }
1731
+
1732
+ // Apply frequency and presence penalties to the candidates
1733
+ for (size_t i = 0; i < candidates->size; ++i) {
1734
+ auto token_iter = token_count.find(candidates->data[i].id);
1735
+ if (token_iter == token_count.end()) {
1736
+ continue;
1737
+ }
1738
+
1739
+ int count = token_iter->second;
1740
+ candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
1741
+ }
1742
+
1743
+ candidates->sorted = false;
1744
+
1745
+ if (ctx) {
1746
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1747
+ }
1748
+ }
1749
+
1750
+
1751
+ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
1752
+ assert(ctx);
1753
+ auto N = float(llama_n_vocab(ctx));
1754
+ int64_t t_start_sample_us;
1755
+ t_start_sample_us = ggml_time_us();
1756
+
1757
+ llama_sample_softmax(nullptr, candidates);
1758
+
1759
+ // Estimate s_hat using the most probable m tokens
1760
+ float s_hat = 0.0;
1761
+ float sum_ti_bi = 0.0;
1762
+ float sum_ti_sq = 0.0;
1763
+ for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
1764
+ float t_i = logf(float(i + 2) / float(i + 1));
1765
+ float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
1766
+ sum_ti_bi += t_i * b_i;
1767
+ sum_ti_sq += t_i * t_i;
1768
+ }
1769
+ s_hat = sum_ti_bi / sum_ti_sq;
1770
+
1771
+ // Compute k from the estimated s_hat and target surprise value
1772
+ float epsilon_hat = s_hat - 1;
1773
+ float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
1774
+
1775
+ // Sample the next word X using top-k sampling
1776
+ llama_sample_top_k(nullptr, candidates, int(k));
1777
+ if (ctx) {
1778
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1779
+ }
1780
+ llama_token X = llama_sample_token(ctx, candidates);
1781
+ t_start_sample_us = ggml_time_us();
1782
+
1783
+ // Compute error as the difference between observed surprise and target surprise value
1784
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
1785
+ return candidate.id == X;
1786
+ }));
1787
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
1788
+ float e = observed_surprise - tau;
1789
+
1790
+ // Update mu using the learning rate and error
1791
+ *mu = *mu - eta * e;
1792
+
1793
+ if (ctx) {
1794
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1795
+ ctx->n_sample++;
1796
+ }
1797
+ return X;
1798
+ }
1799
+
1800
+ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
1801
+ assert(ctx);
1802
+ int64_t t_start_sample_us;
1803
+ t_start_sample_us = ggml_time_us();
1804
+
1805
+ llama_sample_softmax(ctx, candidates);
1806
+
1807
+ // Truncate the words with surprise values greater than mu
1808
+ candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
1809
+ return -log2f(candidate.p) > *mu;
1810
+ }));
1811
+
1812
+ // Normalize the probabilities of the remaining words
1813
+ llama_sample_softmax(ctx, candidates);
1814
+
1815
+ // Sample the next word X from the remaining words
1816
+ if (ctx) {
1817
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1818
+ }
1819
+ llama_token X = llama_sample_token(ctx, candidates);
1820
+ t_start_sample_us = ggml_time_us();
1821
+
1822
+ // Compute error as the difference between observed surprise and target surprise value
1823
+ size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
1824
+ return candidate.id == X;
1825
+ }));
1826
+ float observed_surprise = -log2f(candidates->data[X_idx].p);
1827
+ float e = observed_surprise - tau;
1828
+
1829
+ // Update mu using the learning rate and error
1830
+ *mu = *mu - eta * e;
1831
+
1832
+ if (ctx) {
1833
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1834
+ }
1835
+ return X;
1836
+ }
1837
+
1838
+ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
1839
+ const int64_t t_start_sample_us = ggml_time_us();
1840
+
1841
+ // Find max element
1842
+ auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
1843
+ return a.logit < b.logit;
1844
+ });
1845
+
1846
+ llama_token result = max_iter->id;
1847
+ if (ctx) {
1848
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1849
+ ctx->n_sample++;
1850
+ }
1851
+ return result;
1852
+ }
1853
+
1854
+ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
1855
+ assert(ctx);
1856
+ const int64_t t_start_sample_us = ggml_time_us();
1857
+ llama_sample_softmax(nullptr, candidates);
1858
+
1859
+ std::vector<float> probs;
1860
+ probs.reserve(candidates->size);
1861
+ for (size_t i = 0; i < candidates->size; ++i) {
1862
+ probs.push_back(candidates->data[i].p);
1863
+ }
1864
 
1865
  std::discrete_distribution<> dist(probs.begin(), probs.end());
1866
+ auto & rng = ctx->rng;
1867
  int idx = dist(rng);
1868
 
1869
+ llama_token result = candidates->data[idx].id;
1870
+
1871
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
1872
+ ctx->n_sample++;
1873
+ return result;
1874
  }
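Both mirostat samplers above apply the same feedback rule to the truncation threshold `mu`; restated compactly in the code's own notation (a summary of the lines above, not additional logic):

\[
s(X) = -\log_2 p(X), \qquad e = s(X) - \tau, \qquad \mu \leftarrow \mu - \eta\, e
\]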
1875
 
1876
  //
1877
  // quantization
1878
  //
1879
 
1880
+ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
1881
  ggml_type quantized_type;
1882
+ switch (ftype) {
1883
+ case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
1884
+ case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
1885
+ case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
1886
+ case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
1887
+ case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
1888
+ case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
1889
+ default: throw format("invalid output file type %d\n", ftype);
1890
  };
1891
 
1892
+ if (nthread <= 0) {
1893
+ nthread = std::thread::hardware_concurrency();
1894
+ }
1895
+
1896
  std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
1897
  /*vocab_only*/ false));
1898
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
1899
 
1900
  size_t total_size_org = 0;
1901
  size_t total_size_new = 0;
1902
  std::vector<int64_t> hist_all(1 << 4, 0);
1903
 
1904
+ std::vector<std::thread> workers;
1905
+ std::mutex mutex;
1906
+
1907
  size_t idx = 0;
1908
  for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
1909
  llama_buffer read_data;
 
1911
  tensor.data = read_data.addr;
1912
  model_loader->load_data_for(tensor);
1913
 
1914
+ printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
1915
  ++idx, model_loader->tensors_map.tensors.size(),
1916
  tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
1917
+ ggml_type_name(tensor.type));
1918
 
1919
  // This used to be a regex, but <regex> has an extreme cost to compile times.
1920
  bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
 
1922
  // quantize only 2D tensors
1923
  quantize &= (tensor.ne.size() == 2);
1924
 
1925
+ // uncomment this to keep the output layer in FP16
1926
+ //if (tensor.name == "output.weight") {
1927
+ // quantize = false;
1928
+ //}
1929
+
1930
  enum ggml_type new_type;
1931
  void * new_data;
1932
  size_t new_size;
 
1952
  f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
1953
  }
1954
  } else {
1955
+ throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
1956
  }
1957
 
1958
  printf("quantizing .. ");
 
1962
  new_data = work.addr;
1963
  std::vector<int64_t> hist_cur(1 << 4, 0);
1964
 
1965
+ int chunk_size = 32 * 512;
1966
+ const int nchunk = (nelements + chunk_size - 1)/chunk_size;
1967
+ const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
1968
+ if (nthread_use < 2) {
1969
+ new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
1970
+ } else {
1971
+ size_t counter = 0;
1972
+ new_size = 0;
1973
+ auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
1974
+ std::vector<int64_t> local_hist;
1975
+ size_t local_size = 0;
1976
+ while (true) {
1977
+ std::unique_lock<std::mutex> lock(mutex);
1978
+ size_t first = counter; counter += chunk_size;
1979
+ if (first >= nelements) {
1980
+ if (!local_hist.empty()) {
1981
+ for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
1982
+ new_size += local_size;
1983
+ }
1984
+ break;
1985
+ }
1986
+ lock.unlock();
1987
+ size_t last = std::min(nelements, first + chunk_size);
1988
+ if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
1989
+ local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
1990
+ }
1991
+ };
1992
+ if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
1993
+ for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
1994
+ compute();
1995
+ for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
1996
  }
1997
 
1998
  printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
 
2091
  if (params.logits_all) {
2092
  ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
2093
  } else {
2094
+ ctx->logits.reserve(hparams.n_vocab);
2095
  }
2096
 
2097
  if (params.embedding){
2098
  ctx->embedding.resize(hparams.n_embd);
2099
  }
2100
 
2101
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
2102
 
2103
+ ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
2104
+ ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
2105
  }
2106
 
2107
  return ctx;
 
2114
  int llama_model_quantize(
2115
  const char * fname_inp,
2116
  const char * fname_out,
2117
+ enum llama_ftype ftype,
2118
+ int nthread) {
2119
  try {
2120
+ llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
2121
  return 0;
2122
  } catch (const std::string & err) {
2123
  fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
 
2125
  }
2126
  }
2127
 
2128
+ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
2129
+ fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
2130
+
2131
+ auto & model = ctx->model;
2132
+
2133
+ const int64_t t_start_lora_us = ggml_time_us();
2134
+
2135
+ auto fin = std::ifstream(path_lora, std::ios::binary);
2136
+ if (!fin) {
2137
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
2138
+ return 1;
2139
+ }
2140
+
2141
+ // verify magic and version
2142
+ {
2143
+ uint32_t magic;
2144
+ fin.read((char *) &magic, sizeof(magic));
2145
+ if (magic != 'ggla') {
2146
+ fprintf(stderr, "%s: bad file magic\n", __func__);
2147
+ return 1;
2148
+ }
2149
+ uint32_t format_version;
2150
+ fin.read((char *) &format_version, sizeof(format_version));
2151
+
2152
+ if (format_version != 1) {
2153
+ fprintf(stderr, "%s: unsupported file version\n", __func__ );
2154
+ return 1;
2155
+ }
2156
+ }
2157
+
2158
+ int32_t lora_r;
2159
+ int32_t lora_alpha;
2160
+ fin.read((char *) &lora_r, sizeof(lora_r));
2161
+ fin.read((char *) &lora_alpha, sizeof(lora_alpha));
2162
+ float scaling = (float)lora_alpha / (float)lora_r;
2163
+
2164
+ fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
2165
+
2166
+
2167
+ // create a temporary ggml context to store the lora tensors
2168
+ // TODO: calculate size from the biggest possible tensor
2169
+ std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
2170
+ struct ggml_init_params params;
2171
+ params.mem_size = lora_buf.size();
2172
+ params.mem_buffer = lora_buf.data();
2173
+ params.no_alloc = false;
2174
+
2175
+ ggml_context * lora_ctx = ggml_init(params);
2176
+ std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
2177
+
2178
+ // create a name -> tensor map of the model to accelerate lookups
2179
+ std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
2180
+ for (auto & kv: model.tensors_by_name) {
2181
+ model_tensors.insert(kv);
2182
+ }
2183
+
2184
+
2185
+ // load base model
2186
+ std::unique_ptr<llama_model_loader> model_loader;
2187
+ ggml_context * base_ctx = NULL;
2188
+ llama_buffer base_buf;
2189
+ if (path_base_model) {
2190
+ fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
2191
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
2192
+
2193
+ size_t ctx_size, mmapped_size;
2194
+ model_loader->calc_sizes(&ctx_size, &mmapped_size);
2195
+ base_buf.resize(ctx_size);
2196
+
2197
+ ggml_init_params base_params;
2198
+ base_params.mem_size = base_buf.size;
2199
+ base_params.mem_buffer = base_buf.addr;
2200
+ base_params.no_alloc = model_loader->use_mmap;
2201
+
2202
+ base_ctx = ggml_init(base_params);
2203
+
2204
+ model_loader->ggml_ctx = base_ctx;
2205
+
2206
+ // maybe this should be in llama_model_loader
2207
+ if (model_loader->use_mmap) {
2208
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
2209
+ }
2210
+ }
2211
+
2212
+ // read tensors and apply
2213
+ bool warned = false;
2214
+ int n_tensors = 0;
2215
+ while (true) {
2216
+ int32_t n_dims;
2217
+ int32_t length;
2218
+ int32_t ftype;
2219
+
2220
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
2221
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
2222
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
2223
+ if (fin.eof()) {
2224
+ break;
2225
+ }
2226
+
2227
+ int32_t ne[2] = { 1, 1 };
2228
+ for (int i = 0; i < n_dims; ++i) {
2229
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
2230
+ }
2231
+
2232
+ std::string name(length, 0);
2233
+ fin.read(&name[0], length);
2234
+
2235
+ // check for lora suffix and get the type of tensor
2236
+ const std::string lora_suffix = ".lora";
2237
+ size_t pos = name.rfind(lora_suffix);
2238
+ if (pos == std::string::npos) {
2239
+ fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
2240
+ return 1;
2241
+ }
2242
+
2243
+ std::string lora_type = name.substr(pos + lora_suffix.length());
2244
+ std::string base_name = name;
2245
+ base_name.erase(pos);
2246
+ // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
2247
+
2248
+ if (model_tensors.find(base_name.data()) == model_tensors.end()) {
2249
+ fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
2250
+ return 1;
2251
+ }
2252
+
2253
+ // create ggml tensor
2254
+ ggml_type wtype;
2255
+ switch (ftype) {
2256
+ case 0: wtype = GGML_TYPE_F32; break;
2257
+ case 1: wtype = GGML_TYPE_F16; break;
2258
+ default:
2259
+ {
2260
+ fprintf(stderr, "%s: invalid tensor data type '%d'\n",
2261
+ __func__, ftype);
2262
+ return 1;
2263
+ }
2264
+ }
2265
+ ggml_tensor* lora_tensor;
2266
+ if (n_dims == 2) {
2267
+ lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
2268
+ }
2269
+ else {
2270
+ fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
2271
+ return 1;
2272
+ }
2273
+
2274
+ // load tensor data
2275
+ size_t offset = fin.tellg();
2276
+ size_t tensor_data_size = ggml_nbytes(lora_tensor);
2277
+ offset = (offset + 31) & -32;
2278
+ fin.seekg(offset);
2279
+ fin.read((char*)lora_tensor->data, tensor_data_size);
2280
+
2281
+ lora_tensors[name] = lora_tensor;
2282
+
2283
+ // check if we have both A and B tensors and apply
2284
+ if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
2285
+ lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
2286
+
2287
+ ggml_tensor * dest_t = model_tensors[base_name];
2288
+ ggml_tensor * base_t;
2289
+ if (model_loader) {
2290
+ // load from base model
2291
+ if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
2292
+ fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
2293
+ return 1;
2294
+ }
2295
+ size_t idx = model_loader->tensors_map.name_to_idx[base_name];
2296
+ llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
2297
+ base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
2298
+ lt.data = (uint8_t *) lt.ggml_tensor->data;
2299
+ model_loader->load_data_for(lt);
2300
+ lt.ggml_tensor->data = lt.data;
2301
+ }
2302
+ else {
2303
+ base_t = dest_t;
2304
+ }
2305
+
2306
+ if (ggml_is_quantized(base_t->type)) {
2307
+ if (!warned) {
2308
+ fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
2309
+ "use a f16 or f32 base model with --lora-base\n", __func__);
2310
+ warned = true;
2311
+ }
2312
+ }
2313
+
2314
+ ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
2315
+ ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
2316
+
2317
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
2318
+ fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
2319
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
2320
+ return 1;
2321
+ }
2322
+
2323
+ // w = w + BA*s
2324
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
2325
+
2326
+ if (scaling != 1.0f) {
2327
+ ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
2328
+ BA = ggml_scale(lora_ctx, BA, scale_tensor);
2329
+ }
2330
+
2331
+ ggml_tensor * r;
2332
+ if (base_t == dest_t) {
2333
+ r = ggml_add_inplace(lora_ctx, dest_t, BA);
2334
+ }
2335
+ else {
2336
+ r = ggml_add(lora_ctx, base_t, BA);
2337
+ r = ggml_cpy(lora_ctx, r, dest_t);
2338
+ }
2339
+
2340
+ struct ggml_cgraph gf = ggml_build_forward(r);
2341
+ gf.n_threads = n_threads;
2342
+ ggml_graph_compute(lora_ctx, &gf);
2343
+
2344
+ // we won't need these tensors again, reset the context to save memory
2345
+ ggml_free(lora_ctx);
2346
+ lora_ctx = ggml_init(params);
2347
+ lora_tensors.clear();
2348
+
2349
+ n_tensors++;
2350
+ if (n_tensors % 4 == 0)
2351
+ fprintf(stderr, ".");
2352
+ }
2353
+ }
2354
+
2355
+ // TODO: this should be in a destructor, it will leak on failure
2356
+ ggml_free(lora_ctx);
2357
+ if (base_ctx) {
2358
+ ggml_free(base_ctx);
2359
+ }
2360
+
2361
+ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
2362
+ fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
2363
+
2364
+ return 0;
2365
  }
2366
 
2367
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
2368
+ try {
2369
+ return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
2370
+ } catch (const std::string & err) {
2371
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
2372
+ return 1;
2373
+ }
2374
  }
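For reference, the per-tensor update performed by the loop above is the standard low-rank adaptation, with the scaling taken from the adapter header (scaling = lora_alpha / lora_r); when --lora-base is given, W is read from that base model instead of the in-memory (possibly quantized) weights:

\[
W' = W + s\,(B A), \qquad s = \frac{\text{lora\_alpha}}{\text{lora\_r}}
\]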
2375
 
2376
  int llama_get_kv_cache_token_count(struct llama_context * ctx) {
2377
  return ctx->model.kv_self.n;
2378
  }
2379
 
2380
+ #define LLAMA_MAX_RNG_STATE 64*1024
2381
+
2382
+ void llama_set_rng_seed(struct llama_context * ctx, int seed) {
2383
+ if (seed <= 0) {
2384
+ seed = time(NULL);
2385
+ }
2386
+ ctx->rng.seed(seed);
2387
+ }
2388
+
2389
+ // Returns the size of the state
2390
+ size_t llama_get_state_size(struct llama_context * ctx) {
2391
+ // we don't know the size of the rng state until we actually serialize it, so reserve more than enough memory for its serialized state.
2392
+ // for reference, std::mt19937(1337) serializes to 6701 bytes.
2393
+ const size_t s_rng_size = sizeof(size_t);
2394
+ const size_t s_rng = LLAMA_MAX_RNG_STATE;
2395
+ const size_t s_logits_capacity = sizeof(size_t);
2396
+ const size_t s_logits_size = sizeof(size_t);
2397
+ const size_t s_logits = ctx->logits.capacity() * sizeof(float);
2398
+ const size_t s_embedding_size = sizeof(size_t);
2399
+ const size_t s_embedding = ctx->embedding.size() * sizeof(float);
2400
+ const size_t s_kv_size = sizeof(size_t);
2401
+ const size_t s_kv_ntok = sizeof(int);
2402
+ const size_t s_kv = ctx->model.kv_self.buf.size;
2403
+
2404
+ const size_t s_total = (
2405
+ + s_rng_size
2406
+ + s_rng
2407
+ + s_logits_capacity
2408
+ + s_logits_size
2409
+ + s_logits
2410
+ + s_embedding_size
2411
+ + s_embedding
2412
+ + s_kv_size
2413
+ + s_kv_ntok
2414
+ + s_kv
2415
+ );
2416
+
2417
+ return s_total;
2418
+ }
2419
+
2420
+ // Copies the state to the specified destination address
2421
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
2422
+ uint8_t * out = dest;
2423
+
2424
+ // copy rng
2425
+ {
2426
+ std::stringstream rng_ss;
2427
+ rng_ss << ctx->rng;
2428
+
2429
+ const size_t rng_size = rng_ss.str().size();
2430
+ char rng_buf[LLAMA_MAX_RNG_STATE];
2431
+
2432
+ memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
2433
+ memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
2434
+
2435
+ memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
2436
+ memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
2437
+ }
2438
+
2439
+ // copy logits
2440
+ {
2441
+ const size_t logits_cap = ctx->logits.capacity();
2442
+ const size_t logits_size = ctx->logits.size();
2443
+
2444
+ memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
2445
+ memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
2446
+
2447
+ if (logits_size) {
2448
+ memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
2449
+ }
2450
+
2451
+ out += logits_cap * sizeof(float);
2452
+ }
2453
+
2454
+ // copy embeddings
2455
+ {
2456
+ const size_t embedding_size = ctx->embedding.size();
2457
+
2458
+ memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
2459
+
2460
+ if (embedding_size) {
2461
+ memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
2462
+ out += embedding_size * sizeof(float);
2463
+ }
2464
+ }
2465
+
2466
+ // copy kv cache
2467
+ {
2468
+ const size_t kv_size = ctx->model.kv_self.buf.size;
2469
+ const int kv_ntok = llama_get_kv_cache_token_count(ctx);
2470
+
2471
+ memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
2472
+ memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
2473
+
2474
+ if (kv_size) {
2475
+ memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
2476
+ }
2477
+ }
2478
+
2479
+ const size_t written = out - dest;
2480
+ const size_t expected = llama_get_state_size(ctx);
2481
+
2482
+ LLAMA_ASSERT(written == expected);
2483
+
2484
+ return written;
2485
+ }
2486
+
2487
+ // Sets the state reading from the specified source address
2488
+ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
2489
+ const uint8_t * in = src;
2490
+
2491
+ // set rng
2492
+ {
2493
+ size_t rng_size;
2494
+ char rng_buf[LLAMA_MAX_RNG_STATE];
2495
+
2496
+ memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
2497
+ memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
2498
+
2499
+ std::stringstream rng_ss;
2500
+ rng_ss.str(std::string(&rng_buf[0], rng_size));
2501
+ rng_ss >> ctx->rng;
2502
+
2503
+ LLAMA_ASSERT(rng_ss.fail() == false);
2504
+ }
2505
+
2506
+ // set logits
2507
+ {
2508
+ size_t logits_cap;
2509
+ size_t logits_size;
2510
+
2511
+ memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
2512
+ memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
2513
+
2514
+ LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
2515
+
2516
+ if (logits_size) {
2517
+ ctx->logits.resize(logits_size);
2518
+ memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
2519
+ }
2520
+
2521
+ in += logits_cap * sizeof(float);
2522
+ }
2523
+
2524
+ // set embeddings
2525
+ {
2526
+ size_t embedding_size;
2527
+
2528
+ memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
2529
+
2530
+ LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
2531
+
2532
+ if (embedding_size) {
2533
+ memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
2534
+ in += embedding_size * sizeof(float);
2535
+ }
2536
+ }
2537
+
2538
+ // set kv cache
2539
+ {
2540
+ size_t kv_size;
2541
+ int kv_ntok;
2542
+
2543
+ memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
2544
+ memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
2545
+
2546
+ if (kv_size) {
2547
+ LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
2548
+
2549
+ void * k_data = ctx->model.kv_self.k->data; // remember data pointers
2550
+ void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
2551
+
2552
+ memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
2553
+
2554
+ ctx->model.kv_self.k->data = k_data; // restore correct data pointers
2555
+ ctx->model.kv_self.v->data = v_data;
2556
+
2557
+ }
2558
+
2559
+ ctx->model.kv_self.n = kv_ntok;
2560
+ }
2561
+
2562
+ const size_t nread = in - src;
2563
+ const size_t expected = llama_get_state_size(ctx);
2564
+
2565
+ LLAMA_ASSERT(nread == expected);
2566
+
2567
+ return nread;
2568
  }
2569
 
2570
  int llama_eval(
 
2641
  return 2;
2642
  }
2643
 
2644
+ llama_token llama_token_nl() {
2645
+ return 13;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2646
  }
2647
 
2648
 
 
2672
  static std::string s;
2673
 
2674
  s = "";
2675
+ s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
2676
+ s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
2677
+ s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
2678
+ s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
2679
+ s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
2680
+ s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
2681
+ s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
2682
+ s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
2683
+ s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
2684
+ s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
2685
+ s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
2686
+ s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
2687
+ s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
2688
+ s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
2689
 
2690
  return s.c_str();
2691
  }
 
2694
  std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
2695
  return ctx->model.tensors_by_name;
2696
  }
2697
+
2698
+ size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
2699
+ // TODO leverage mmap
2700
+ llama_file file(path_session, "rb");
2701
+ const uint32_t magic = file.read_u32();
2702
+ const uint32_t version = file.read_u32();
2703
+
2704
+ if (!(magic == 'ggsn' && version == 0)) {
2705
+ fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
2706
+ return 0;
2707
+ }
2708
+
2709
+ llama_hparams session_hparams;
2710
+ file.read_raw(&session_hparams, sizeof(llama_hparams));
2711
+
2712
+ // REVIEW
2713
+ if (session_hparams != ctx->model.hparams) {
2714
+ fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
2715
+ return 0;
2716
+ }
2717
+
2718
+ const uint32_t n_token_count = file.read_u32();
2719
+ LLAMA_ASSERT(n_token_capacity >= n_token_count);
2720
+ file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
2721
+ *n_token_count_out = n_token_count;
2722
+
2723
+ const size_t n_state_size = file.size - file.tell();
2724
+ const size_t n_orig_state_size = llama_get_state_size(ctx);
2725
+ if (n_state_size != n_orig_state_size) {
2726
+ fprintf(stderr, "%s : failed to validate state size\n", __func__);
2727
+ }
2728
+ std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
2729
+ file.read_raw(state_data.get(), n_state_size);
2730
+ return llama_set_state_data(ctx, state_data.get());
2731
+ }
2732
+
2733
+ size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
2734
+ // TODO save temp & swap
2735
+ llama_file file(path_session, "wb");
2736
+
2737
+ const size_t n_state_size = llama_get_state_size(ctx);
2738
+ std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
2739
+ llama_copy_state_data(ctx, state_data.get());
2740
+
2741
+ file.write_u32('ggsn'); // magic
2742
+ file.write_u32(0); // version
2743
+ file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
2744
+
2745
+ file.write_u32((uint32_t) n_token_count); // REVIEW
2746
+ file.write_raw(tokens, sizeof(llama_token) * n_token_count);
2747
+
2748
+ file.write_raw(state_data.get(), n_state_size);
2749
+ return n_state_size; // REVIEW
2750
+ }
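A minimal usage sketch of the state API defined above (a hypothetical helper, not part of this commit; it assumes an already-initialized llama_context and only exercises the three functions introduced above):

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // Hypothetical helper: serialize the full context state (rng, logits, embeddings, kv cache)
    // into a buffer and immediately restore it, verifying the byte counts reported by
    // llama_get_state_size().
    static bool llama_state_roundtrip(struct llama_context * ctx) {
        const size_t n_state = llama_get_state_size(ctx);

        std::vector<uint8_t> buf(n_state);

        const size_t n_written = llama_copy_state_data(ctx, buf.data());
        const size_t n_read    = llama_set_state_data (ctx, buf.data());

        return n_written == n_state && n_read == n_state;
    }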
examples/talk-llama/llama.h CHANGED
@@ -39,12 +39,16 @@ extern "C" {
39
 
40
  typedef struct llama_token_data {
41
  llama_token id; // token id
42
-
43
  float p; // probability of the token
44
- float plog; // log probability of the token
45
-
46
  } llama_token_data;
47
 
 
 
 
 
 
 
48
  typedef void (*llama_progress_callback)(float progress, void *ctx);
49
 
50
  struct llama_context_params {
@@ -65,6 +69,20 @@ extern "C" {
65
  void * progress_callback_user_data;
66
  };
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  LLAMA_API struct llama_context_params llama_context_default_params();
69
 
70
  LLAMA_API bool llama_mmap_supported();
@@ -82,27 +100,46 @@ extern "C" {
82
 
83
  // TODO: not great API - very likely to change
84
  // Returns 0 on success
 
85
  LLAMA_API int llama_model_quantize(
86
  const char * fname_inp,
87
  const char * fname_out,
88
- int itype);
89
-
90
- // Returns the KV cache that will contain the context for the
91
- // ongoing prediction with the model.
92
- LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
93
-
94
- // Returns the size of the KV cache
95
- LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
 
 
 
 
 
 
96
 
97
  // Returns the number of tokens in the KV cache
98
  LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
99
 
100
- // Sets the KV cache containing the current context for the model
101
- LLAMA_API void llama_set_kv_cache(
102
- struct llama_context * ctx,
103
- const uint8_t * kv_cache,
104
- size_t n_size,
105
- int n_token_count);
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  // Run the llama inference to obtain the logits and probabilities for the next token.
108
  // tokens + n_tokens is the provided batch of new tokens to process
@@ -148,16 +185,52 @@ extern "C" {
148
  // Special tokens
149
  LLAMA_API llama_token llama_token_bos();
150
  LLAMA_API llama_token llama_token_eos();
 
 
 
151
 
152
- // TODO: improve the last_n_tokens interface ?
153
- LLAMA_API llama_token llama_sample_top_p_top_k(
154
- struct llama_context * ctx,
155
- const llama_token * last_n_tokens_data,
156
- int last_n_tokens_size,
157
- int top_k,
158
- float top_p,
159
- float temp,
160
- float repeat_penalty);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  // Performance information
163
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
@@ -170,4 +243,15 @@ extern "C" {
170
  }
171
  #endif
172
 
 
 
 
 
 
 
 
 
 
 
 
173
  #endif // LLAMA_H
 
39
 
40
  typedef struct llama_token_data {
41
  llama_token id; // token id
42
+ float logit; // log-odds of the token
43
  float p; // probability of the token
 
 
44
  } llama_token_data;
45
 
46
+ typedef struct llama_token_data_array {
47
+ llama_token_data * data;
48
+ size_t size;
49
+ bool sorted;
50
+ } llama_token_data_array;
51
+
52
  typedef void (*llama_progress_callback)(float progress, void *ctx);
53
 
54
  struct llama_context_params {
 
69
  void * progress_callback_user_data;
70
  };
71
 
72
+ // model file types
73
+ enum llama_ftype {
74
+ LLAMA_FTYPE_ALL_F32 = 0,
75
+ LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
76
+ LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
77
+ LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
78
+ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
79
+ LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
80
+ // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
81
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
82
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
83
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
84
+ };
85
+
86
  LLAMA_API struct llama_context_params llama_context_default_params();
87
 
88
  LLAMA_API bool llama_mmap_supported();
 
100
 
101
  // TODO: not great API - very likely to change
102
  // Returns 0 on success
103
+ // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
104
  LLAMA_API int llama_model_quantize(
105
  const char * fname_inp,
106
  const char * fname_out,
107
+ enum llama_ftype ftype,
108
+ int nthread);
109
+
110
+ // Apply a LoRA adapter to a loaded model
111
+ // path_base_model is the path to a higher quality model to use as a base for
112
+ // the layers modified by the adapter. Can be NULL to use the current loaded model.
113
+ // The model needs to be reloaded before applying a new adapter, otherwise the adapter
114
+ // will be applied on top of the previous one
115
+ // Returns 0 on success
116
+ LLAMA_API int llama_apply_lora_from_file(
117
+ struct llama_context * ctx,
118
+ const char * path_lora,
119
+ const char * path_base_model,
120
+ int n_threads);
121
 
122
  // Returns the number of tokens in the KV cache
123
  LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
124
 
125
+ // Sets the current rng seed.
126
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
127
+
128
+ // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
129
+ LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
130
+
131
+ // Copies the state to the specified destination address.
132
+ // Destination needs to have allocated enough memory.
133
+ // Returns the number of bytes copied
134
+ LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
135
+
136
+ // Set the state reading from the specified address
137
+ // Returns the number of bytes read
138
+ LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
139
+
140
+ // Save/load session file
141
+ LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
142
+ LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
143
 
144
  // Run the llama inference to obtain the logits and probabilities for the next token.
145
  // tokens + n_tokens is the provided batch of new tokens to process
 
185
  // Special tokens
186
  LLAMA_API llama_token llama_token_bos();
187
  LLAMA_API llama_token llama_token_eos();
188
+ LLAMA_API llama_token llama_token_nl();
189
+
190
+ // Sampling functions
191
 
192
+ /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
193
+ LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);
194
+
195
+ /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
196
+ LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
197
+
198
+ /// @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on the logits.
199
+ LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
200
+
201
+ /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
202
+ LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
203
+
204
+ /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
205
+ LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
206
+
207
+ /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
208
+ LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
209
+
210
+ /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
211
+ LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
212
+ LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
213
+
214
+ /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
215
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
216
+ /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
217
+ /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
218
+ /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
219
+ /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
220
+ LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
221
+
222
+ /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
223
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
224
+ /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
225
+ /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
226
+ /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
227
+ LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
228
+
229
+ /// @details Selects the token with the highest probability.
230
+ LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
231
+
232
+ /// @details Randomly selects a token from the candidates based on their probabilities.
233
+ LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
234
 
235
  // Performance information
236
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
 
243
  }
244
  #endif
245
 
246
+ // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
247
+ #ifdef LLAMA_API_INTERNAL
248
+
249
+ #include <vector>
250
+ #include <string>
251
+ struct ggml_tensor;
252
+
253
+ std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
254
+
255
+ #endif
256
+
257
  #endif // LLAMA_H
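A minimal sketch of the mirostat v2 entry point declared above (hypothetical snippet, not part of this commit; tau and eta are illustrative values, and candidates_p is assumed to be a populated llama_token_data_array built from the logits, as in talk-llama.cpp below). Per the comments above, mu starts at 2*tau and is updated in place on every call:

    #include "llama.h"

    // Hypothetical helper: one mirostat v2 sampling step.
    // `mu` must persist across calls; initialize it to 2*tau before the first call.
    static llama_token sample_mirostat_v2_step(struct llama_context * ctx,
                                               llama_token_data_array * candidates_p,
                                               float * mu) {
        const float tau = 5.0f; // illustrative target surprise
        const float eta = 0.1f; // illustrative learning rate
        return llama_sample_token_mirostat_v2(ctx, candidates_p, tau, eta, mu);
    }

    // before the first call:
    //   float mu = 2.0f * 5.0f; // 2*tau, as documented above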
examples/talk-llama/llama_internal.h DELETED
@@ -1,12 +0,0 @@
1
- // Internal header to be included by llama.cpp and tests/benchmarks only.
2
-
3
- #ifndef LLAMA_INTERNAL_H
4
- #define LLAMA_INTERNAL_H
5
-
6
- #include <vector>
7
- #include <string>
8
- struct ggml_tensor;
9
-
10
- std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
11
-
12
- #endif // LLAMA_INTERNAL_H
 
 
 
 
 
 
 
 
 
 
 
 
 
examples/talk-llama/talk-llama.cpp CHANGED
@@ -487,11 +487,37 @@ int main(int argc, char ** argv) {
487
 
488
  {
489
  auto logits = llama_get_logits(ctx_llama);
 
 
490
  logits[llama_token_eos()] = 0;
491
 
492
- id = llama_sample_top_p_top_k(ctx_llama,
 
 
 
 
 
 
 
 
 
 
 
493
  embd_inp.data() + std::max(0, n_past - repeat_last_n),
494
- repeat_last_n, top_k, top_p, temp, repeat_penalty);
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  }
496
 
497
  if (id != llama_token_eos()) {
 
487
 
488
  {
489
  auto logits = llama_get_logits(ctx_llama);
490
+ auto n_vocab = llama_n_vocab(ctx_llama);
491
+
492
  logits[llama_token_eos()] = 0;
493
 
494
+ std::vector<llama_token_data> candidates;
495
+ candidates.reserve(n_vocab);
496
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
497
+ candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
498
+ }
499
+
500
+ llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
501
+
502
+ // apply repeat penalty
503
+ const float nl_logit = logits[llama_token_nl()];
504
+
505
+ llama_sample_repetition_penalty(ctx_llama, &candidates_p,
506
  embd_inp.data() + std::max(0, n_past - repeat_last_n),
507
+ repeat_last_n, repeat_penalty);
508
+
509
+ logits[llama_token_nl()] = nl_logit;
510
+
511
+ if (temp <= 0) {
512
+ // Greedy sampling
513
+ id = llama_sample_token_greedy(ctx_llama, &candidates_p);
514
+ } else {
515
+ // Temperature sampling
516
+ llama_sample_top_k(ctx_llama, &candidates_p, top_k);
517
+ llama_sample_top_p(ctx_llama, &candidates_p, top_p);
518
+ llama_sample_temperature(ctx_llama, &candidates_p, temp);
519
+ id = llama_sample_token(ctx_llama, &candidates_p);
520
+ }
521
  }
522
 
523
  if (id != llama_token_eos()) {
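A design note on the updated sampling block above: when temp <= 0 the example switches to greedy decoding rather than scaling logits by a non-positive temperature. The same call order, factored into a helper for clarity (a hypothetical refactor sketch, not code from this commit):

    #include "llama.h"

    // Hypothetical helper mirroring the order used above:
    // repetition penalty -> (greedy | top-k -> top-p -> temperature -> sample).
    static llama_token sample_next_token(struct llama_context * ctx, llama_token_data_array * cands,
                                         llama_token * last_tokens, size_t n_last,
                                         int top_k, float top_p, float temp, float repeat_penalty) {
        llama_sample_repetition_penalty(ctx, cands, last_tokens, n_last, repeat_penalty);

        if (temp <= 0.0f) {
            return llama_sample_token_greedy(ctx, cands); // greedy fallback
        }

        llama_sample_top_k      (ctx, cands, top_k);
        llama_sample_top_p      (ctx, cands, top_p);
        llama_sample_temperature(ctx, cands, temp);

        return llama_sample_token(ctx, cands);
    }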
examples/talk.wasm/CMakeLists.txt CHANGED
@@ -13,6 +13,7 @@ include(DefaultTargetOptions)
13
 
14
  target_link_libraries(${TARGET} PRIVATE
15
  whisper
 
16
  )
17
 
18
  unset(EXTRA_FLAGS)
 
13
 
14
  target_link_libraries(${TARGET} PRIVATE
15
  whisper
16
+ common
17
  )
18
 
19
  unset(EXTRA_FLAGS)
examples/talk.wasm/gpt-2.cpp CHANGED
@@ -1,4 +1,6 @@
1
  #include "ggml.h"
 
 
2
  #include "gpt-2.h"
3
 
4
  #include <cmath>
@@ -14,150 +16,6 @@
14
 
15
  /////////////////////// GPT-2 BEGIN /////////////////////////
16
 
17
- //
18
- // Vocab utils
19
- //
20
-
21
- std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
22
- std::vector<std::string> words;
23
-
24
- // first split the text into words
25
- {
26
- std::string str = text;
27
- std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
28
-
29
- std::regex re(pat);
30
- std::smatch m;
31
-
32
- while (std::regex_search(str, m, re)) {
33
- for (auto x : m) {
34
- words.push_back(x);
35
- }
36
- str = m.suffix();
37
- }
38
- }
39
-
40
- // find the longest tokens that form the words:
41
- std::vector<gpt_vocab::id> tokens;
42
- for (const auto & word : words) {
43
- if (word.size() == 0) continue;
44
-
45
- int i = 0;
46
- int n = word.size();
47
- while (i < n) {
48
- int j = n;
49
- while (j > i) {
50
- auto it = vocab.token_to_id.find(word.substr(i, j-i));
51
- if (it != vocab.token_to_id.end()) {
52
- tokens.push_back(it->second);
53
- i = j;
54
- break;
55
- }
56
- --j;
57
- }
58
- if (i == n) {
59
- break;
60
- }
61
- if (j == i) {
62
- auto sub = word.substr(i, 1);
63
- if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
64
- tokens.push_back(vocab.token_to_id.at(sub));
65
- } else {
66
- fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
67
- }
68
- ++i;
69
- }
70
- }
71
- }
72
-
73
- return tokens;
74
- }
75
-
76
- gpt_vocab::id gpt_sample_top_k_top_p(
77
- const gpt_vocab & vocab,
78
- const float * logits,
79
- int top_k,
80
- double top_p,
81
- double temp,
82
- std::mt19937 & rng) {
83
- int n_logits = vocab.id_to_token.size();
84
-
85
- std::vector<std::pair<double, gpt_vocab::id>> logits_id;
86
- logits_id.reserve(n_logits);
87
-
88
- for (int i = 0; i < n_logits; i++) {
89
- logits_id.push_back(std::make_pair(logits[i], i));
90
- }
91
-
92
- // find the top K tokens
93
- std::partial_sort(
94
- logits_id.begin(),
95
- logits_id.begin() + top_k, logits_id.end(),
96
- [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
97
- return a.first > b.first;
98
- });
99
-
100
- logits_id.resize(top_k);
101
-
102
- // normalize
103
- {
104
- double sum = 0.0f;
105
- for (int i = 0; i < (int)logits_id.size(); i++) {
106
- sum += logits_id[i].first;
107
- }
108
-
109
- sum = 1.0/sum;
110
- for (int i = 0; i < (int)logits_id.size(); i++) {
111
- logits_id[i].first *= sum;
112
- }
113
- }
114
-
115
- if (top_p < 1.0f) {
116
- {
117
- double cumsum = 0.0f;
118
- for (int i = 0; i < top_k; i++) {
119
- cumsum += logits_id[i].first;
120
- if (cumsum >= top_p) {
121
- logits_id.resize(i+1);
122
- break;
123
- }
124
- }
125
- }
126
-
127
- // normalize again
128
- {
129
- double sum = 0.0f;
130
- for (int i = 0; i < (int)logits_id.size(); i++) {
131
- sum += logits_id[i].first;
132
- }
133
-
134
- sum = 1.0/sum;
135
- for (int i = 0; i < (int)logits_id.size(); i++) {
136
- logits_id[i].first *= sum;
137
- }
138
- }
139
- }
140
-
141
- //printf("\n");
142
- //for (int i = 0; i < (int)logits_id.size(); i++) {
143
- // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
144
- //}
145
- //exit(0);
146
-
147
- // sample from the obtained distribution
148
- std::vector<double> probs;
149
- probs.reserve(logits_id.size());
150
-
151
- for (int i = 0; i < (int) logits_id.size(); i++) {
152
- probs.push_back(logits_id[i].first);
153
- }
154
-
155
- std::discrete_distribution<> dist(probs.begin(), probs.end());
156
- int idx = dist(rng);
157
-
158
- return logits_id[idx].second;
159
- }
160
-
161
  // default hparams (GPT-2 117M)
162
  struct gpt2_hparams {
163
  int32_t n_vocab = 50257;
@@ -165,7 +23,7 @@ struct gpt2_hparams {
165
  int32_t n_embd = 768;
166
  int32_t n_head = 12;
167
  int32_t n_layer = 12;
168
- int32_t f16 = 1;
169
  };
170
 
171
  struct gpt2_layer {
@@ -187,7 +45,7 @@ struct gpt2_layer {
187
  struct ggml_tensor * c_mlp_fc_w;
188
  struct ggml_tensor * c_mlp_fc_b;
189
 
190
- struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
191
  struct ggml_tensor * c_mlp_proj_b;
192
  };
193
 
@@ -198,8 +56,9 @@ struct gpt2_model {
198
  struct ggml_tensor * ln_f_g;
199
  struct ggml_tensor * ln_f_b;
200
 
201
- struct ggml_tensor * wte; // position embedding
202
- struct ggml_tensor * wpe; // token embedding
 
203
 
204
  std::vector<gpt2_layer> layers;
205
 
@@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
241
  fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
242
  fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
243
  fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
244
- fin.read((char *) &hparams.f16, sizeof(hparams.f16));
245
 
246
  printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
247
  printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
248
  printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
249
  printf("%s: n_head = %d\n", __func__, hparams.n_head);
250
  printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
251
- printf("%s: f16 = %d\n", __func__, hparams.f16);
252
  }
253
 
254
  // load vocab
@@ -275,9 +134,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
275
  }
276
  }
277
 
278
- // for the big tensors, we have the option to store the data in 16-bit floats
279
  // in order to save memory and also to speed up the computation
280
- const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
 
 
 
 
281
 
282
  auto & ctx = model.ctx;
283
 
@@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
291
  const int n_ctx = hparams.n_ctx;
292
  const int n_vocab = hparams.n_vocab;
293
 
294
- ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
295
- ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
296
 
297
- ctx_size += n_vocab*n_embd*ggml_type_size(wtype); // wte
298
- ctx_size += n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
 
299
 
300
- ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
301
- ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
302
 
303
- ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
304
- ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
305
 
306
- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype)); // c_attn_attn_w
307
- ctx_size += n_layer*( 3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
308
 
309
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype)); // c_attn_proj_w
310
- ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b
311
 
312
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_fc_w
313
- ctx_size += n_layer*( 4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
314
 
315
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_proj_w
316
- ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
317
 
318
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
319
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
320
 
321
  ctx_size += (6 + 12*n_layer)*256; // object overhead
322
 
@@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
325
 
326
  // create the ggml context
327
  {
328
- struct ggml_init_params params;
329
- params.mem_size = ctx_size;
330
- params.mem_buffer = NULL;
 
 
331
 
332
  model.ctx = ggml_init(params);
333
  if (!model.ctx) {
@@ -350,36 +217,38 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
350
  model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
351
  model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
352
 
353
- model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
354
- model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
 
355
 
356
  // map by name
357
  model.tensors["model/ln_f/g"] = model.ln_f_g;
358
  model.tensors["model/ln_f/b"] = model.ln_f_b;
359
 
360
- model.tensors["model/wte"] = model.wte;
361
- model.tensors["model/wpe"] = model.wpe;
 
362
 
363
  for (int i = 0; i < n_layer; ++i) {
364
  auto & layer = model.layers[i];
365
 
366
- layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
367
- layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
368
 
369
- layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
370
- layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
371
 
372
- layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd);
373
- layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
374
 
375
- layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
376
- layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
377
 
378
- layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
379
- layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
380
 
381
- layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
382
- layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
383
 
384
  // map by name
385
  model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
@@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
397
  model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
398
  model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
399
 
400
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans;
401
  model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
402
  }
403
  }
@@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
425
  {
426
  size_t total_size = 0;
427
 
 
 
428
  while (true) {
429
  int32_t n_dims;
430
  int32_t length;
431
- int32_t ftype;
432
 
433
  fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
434
  fin.read(reinterpret_cast<char *>(&length), sizeof(length));
435
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
436
 
437
  if (fin.eof()) {
438
  break;
@@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
461
 
462
  if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
463
  fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
464
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
465
  return false;
466
  }
467
 
468
- const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
 
 
 
 
 
469
 
470
- if (nelements*bpe != ggml_nbytes(tensor)) {
471
  fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
472
  __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
473
  return false;
@@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
475
 
476
  fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
477
 
478
- //printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
 
 
 
 
 
 
 
 
479
  total_size += ggml_nbytes(tensor);
480
  }
481
 
@@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
493
  // - n_threads: number of threads to use
494
  // - n_past: the context size so far
495
  // - embd_inp: the embeddings of the tokens in the context
496
- // - embd_w: the predicted probabilities of the next token
497
  //
498
  bool gpt2_eval(
499
  const gpt2_model & model,
@@ -512,12 +396,12 @@ bool gpt2_eval(
512
  const int n_head = hparams.n_head;
513
  const int n_vocab = hparams.n_vocab;
514
 
515
- static size_t buf_size = 640u*1024*1024;
516
  static void * buf = malloc(buf_size);
517
 
518
  if (mem_per_token > 0 && mem_per_token*N > buf_size) {
519
  const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
520
- printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
521
 
522
  // reallocate
523
  buf_size = buf_size_new;
@@ -528,13 +412,14 @@ bool gpt2_eval(
528
  }
529
  }
530
 
531
- struct ggml_init_params params;
532
- params.mem_size = buf_size;
533
- params.mem_buffer = buf;
 
 
534
 
535
  struct ggml_context * ctx0 = ggml_init(params);
536
-
537
- struct ggml_cgraph gf = { };
538
  gf.n_threads = n_threads;
539
 
540
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -578,7 +463,7 @@ bool gpt2_eval(
578
  // [2304, N]
579
  {
580
  cur = ggml_mul_mat(ctx0,
581
- ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
582
  cur);
583
 
584
  cur = ggml_add(ctx0,
@@ -654,11 +539,13 @@ bool gpt2_eval(
654
  // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
655
  // [n_past + N, 64, 12]
656
  struct ggml_tensor * V_trans =
657
- ggml_permute(ctx0,
658
- ggml_reshape_3d(ctx0,
659
- ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
660
- n_embd/n_head, n_head, n_past + N),
661
- 1, 2, 0, 3);
 
 
662
 
663
  // KQV = transpose(V) * KQ_soft_max
664
  // [64, N, 12]
@@ -685,7 +572,7 @@ bool gpt2_eval(
685
  // [768, N]
686
  {
687
  cur = ggml_mul_mat(ctx0,
688
- ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
689
  cur);
690
 
691
  cur = ggml_add(ctx0,
@@ -722,7 +609,7 @@ bool gpt2_eval(
722
  // cur = fc_w*cur + fc_b
723
  // [3072, N]
724
  cur = ggml_mul_mat(ctx0,
725
- ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
726
  cur);
727
 
728
  cur = ggml_add(ctx0,
@@ -742,7 +629,7 @@ bool gpt2_eval(
742
  // cur = proj_w*cur + proj_b
743
  // [768, N]
744
  cur = ggml_mul_mat(ctx0,
745
- model.layers[il].c_mlp_proj_w_trans,
746
  cur);
747
 
748
  cur = ggml_add(ctx0,
@@ -769,12 +656,12 @@ bool gpt2_eval(
769
  }
770
 
771
  // inpL = WTE * inpL
772
- // [ 768, 50257] - model.wte
773
  // [ 768, N] - inpL
774
- inpL = ggml_mul_mat(ctx0, model.wte, inpL);
775
 
776
  // logits -> probs
777
- inpL = ggml_soft_max(ctx0, inpL);
778
 
779
  // run the computation
780
  ggml_build_forward_expand(&gf, inpL);
@@ -788,7 +675,7 @@ bool gpt2_eval(
788
  //embd_w.resize(n_vocab*N);
789
  //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
790
 
791
- // return result for just the last token
792
  embd_w.resize(n_vocab);
793
  memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
794
 
@@ -825,7 +712,7 @@ Me too.
825
  int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
826
 
827
  // sampling parameters
828
- int32_t top_k = 40;
829
  float top_p = 0.9f;
830
  float temp = 1.0f;
831
  };
@@ -833,14 +720,14 @@ Me too.
833
  struct gpt2_context * gpt2_init(const char * path_model) {
834
  gpt2_context * ctx = new gpt2_context;
835
 
836
- ctx->rng = std::mt19937(time(NULL));
837
 
838
  // load the model
839
  {
840
  const int64_t t_start_us = ggml_time_us();
841
 
842
  if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
843
- fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin");
844
  delete ctx;
845
  return nullptr;
846
  }
@@ -885,9 +772,9 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
885
 
886
  std::string result;
887
 
888
- for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
889
  // predict
890
- if (embd.size() > 0) {
891
  if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
892
  printf("gpt-2: failed to generate text\n");
893
  return "";
@@ -914,10 +801,7 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
914
  result += ctx->vocab.id_to_token[embd[0]];
915
 
916
  // end of text token
917
- if (embd.back() == 50256 ||
918
- ctx->vocab.id_to_token[embd.back()] == "." ||
919
- ctx->vocab.id_to_token[embd.back()] == "!" ||
920
- ctx->vocab.id_to_token[embd.back()] == "?") {
921
  break;
922
  }
923
  }
 
1
  #include "ggml.h"
2
+ #include "common-ggml.h"
3
+
4
  #include "gpt-2.h"
5
 
6
  #include <cmath>
 
16
 
17
  /////////////////////// GPT-2 BEGIN /////////////////////////
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  // default hparams (GPT-2 117M)
20
  struct gpt2_hparams {
21
  int32_t n_vocab = 50257;
 
23
  int32_t n_embd = 768;
24
  int32_t n_head = 12;
25
  int32_t n_layer = 12;
26
+ int32_t ftype = 1;
27
  };
28
 
29
  struct gpt2_layer {
 
45
  struct ggml_tensor * c_mlp_fc_w;
46
  struct ggml_tensor * c_mlp_fc_b;
47
 
48
+ struct ggml_tensor * c_mlp_proj_w;
49
  struct ggml_tensor * c_mlp_proj_b;
50
  };
51
 
 
56
  struct ggml_tensor * ln_f_g;
57
  struct ggml_tensor * ln_f_b;
58
 
59
+ struct ggml_tensor * wte; // token embedding
60
+ struct ggml_tensor * wpe; // position embedding
61
+ struct ggml_tensor * lm_head; // language model head
62
 
63
  std::vector<gpt2_layer> layers;
64
 
 
100
  fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
101
  fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
102
  fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
103
+ fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
104
 
105
  printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
106
  printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
107
  printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
108
  printf("%s: n_head = %d\n", __func__, hparams.n_head);
109
  printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
110
+ printf("%s: ftype = %d\n", __func__, hparams.ftype);
111
  }
112
 
113
  // load vocab
 
134
  }
135
  }
136
 
137
+ // for the big tensors, we have the option to store the data in 16-bit floats or quantized
138
  // in order to save memory and also to speed up the computation
139
+ ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
140
+ if (wtype == GGML_TYPE_COUNT) {
141
+ fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
142
+ __func__, fname.c_str(), model.hparams.ftype);
143
+ return false;
144
+ }
145
 
146
  auto & ctx = model.ctx;
147
 
 
155
  const int n_ctx = hparams.n_ctx;
156
  const int n_vocab = hparams.n_vocab;
157
 
158
+ ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
159
+ ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
160
 
161
+ ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
162
+ ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
163
+ ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
164
 
165
+ ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
166
+ ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
167
 
168
+ ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
169
+ ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
170
 
171
+ ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
172
+ ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
173
 
174
+ ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
175
+ ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
176
 
177
+ ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
178
+ ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
179
 
180
+ ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
181
+ ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
182
 
183
+ ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
184
+ ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
185
 
186
  ctx_size += (6 + 12*n_layer)*256; // object overhead
187
 
 
190
 
191
  // create the ggml context
192
  {
193
+ struct ggml_init_params params = {
194
+ .mem_size = ctx_size,
195
+ .mem_buffer = NULL,
196
+ .no_alloc = false,
197
+ };
198
 
199
  model.ctx = ggml_init(params);
200
  if (!model.ctx) {
 
217
  model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
218
  model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
219
 
220
+ model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
221
+ model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
222
+ model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
223
 
224
  // map by name
225
  model.tensors["model/ln_f/g"] = model.ln_f_g;
226
  model.tensors["model/ln_f/b"] = model.ln_f_b;
227
 
228
+ model.tensors["model/wte"] = model.wte;
229
+ model.tensors["model/wpe"] = model.wpe;
230
+ model.tensors["model/lm_head"] = model.lm_head;
231
 
232
  for (int i = 0; i < n_layer; ++i) {
233
  auto & layer = model.layers[i];
234
 
235
+ layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
236
+ layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
237
 
238
+ layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
239
+ layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
240
 
241
+ layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
242
+ layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
243
 
244
+ layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
245
+ layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
246
 
247
+ layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
248
+ layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
249
 
250
+ layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
251
+ layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
252
 
253
  // map by name
254
  model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
 
266
  model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
267
  model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
268
 
269
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
270
  model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
271
  }
272
  }
 
294
  {
295
  size_t total_size = 0;
296
 
297
+ bool has_lm_head = false;
298
+
299
  while (true) {
300
  int32_t n_dims;
301
  int32_t length;
302
+ int32_t ttype;
303
 
304
  fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
305
  fin.read(reinterpret_cast<char *>(&length), sizeof(length));
306
+ fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
307
 
308
  if (fin.eof()) {
309
  break;
 
332
 
333
  if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
334
  fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
335
+ __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
336
  return false;
337
  }
338
 
339
+ // for debugging
340
+ if (0) {
341
+ printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
342
+ }
343
+
344
+ const size_t bpe = ggml_type_size(ggml_type(ttype));
345
 
346
+ if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
347
  fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
348
  __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
349
  return false;
 
351
 
352
  fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
353
 
354
+ // GPT-2 models share the WTE tensor as the LM head
355
+ if (name == "model/wte" && has_lm_head == false) {
356
+ memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
357
+ }
358
+
359
+ if (name == "model/lm_head") {
360
+ has_lm_head = true;
361
+ }
362
+
363
  total_size += ggml_nbytes(tensor);
364
  }
365
 
 
377
  // - n_threads: number of threads to use
378
  // - n_past: the context size so far
379
  // - embd_inp: the embeddings of the tokens in the context
380
+ // - embd_w: the predicted logits for the next token
381
  //
382
  bool gpt2_eval(
383
  const gpt2_model & model,
 
396
  const int n_head = hparams.n_head;
397
  const int n_vocab = hparams.n_vocab;
398
 
399
+ static size_t buf_size = 512u*1024*1024;
400
  static void * buf = malloc(buf_size);
401
 
402
  if (mem_per_token > 0 && mem_per_token*N > buf_size) {
403
  const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
404
+ //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
405
 
406
  // reallocate
407
  buf_size = buf_size_new;
 
412
  }
413
  }
414
 
415
+ struct ggml_init_params params = {
416
+ /*.mem_size =*/ buf_size,
417
+ /*.mem_buffer =*/ buf,
418
+ /*.no_alloc =*/ false,
419
+ };
420
 
421
  struct ggml_context * ctx0 = ggml_init(params);
422
+ struct ggml_cgraph gf = {};
 
423
  gf.n_threads = n_threads;
424
 
425
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
 
463
  // [2304, N]
464
  {
465
  cur = ggml_mul_mat(ctx0,
466
+ model.layers[il].c_attn_attn_w,
467
  cur);
468
 
469
  cur = ggml_add(ctx0,
 
539
  // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
540
  // [n_past + N, 64, 12]
541
  struct ggml_tensor * V_trans =
542
+ ggml_cpy(ctx0,
543
+ ggml_permute(ctx0,
544
+ ggml_reshape_3d(ctx0,
545
+ ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
546
+ n_embd/n_head, n_head, n_past + N),
547
+ 1, 2, 0, 3),
548
+ ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
549
 
550
  // KQV = transpose(V) * KQ_soft_max
551
  // [64, N, 12]
 
572
  // [768, N]
573
  {
574
  cur = ggml_mul_mat(ctx0,
575
+ model.layers[il].c_attn_proj_w,
576
  cur);
577
 
578
  cur = ggml_add(ctx0,
 
609
  // cur = fc_w*cur + fc_b
610
  // [3072, N]
611
  cur = ggml_mul_mat(ctx0,
612
+ model.layers[il].c_mlp_fc_w,
613
  cur);
614
 
615
  cur = ggml_add(ctx0,
 
629
  // cur = proj_w*cur + proj_b
630
  // [768, N]
631
  cur = ggml_mul_mat(ctx0,
632
+ model.layers[il].c_mlp_proj_w,
633
  cur);
634
 
635
  cur = ggml_add(ctx0,
 
656
  }
657
 
658
  // inpL = WTE * inpL
659
+ // [ 768, 50257] - model.lm_head
660
  // [ 768, N] - inpL
661
+ inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
662
 
663
  // logits -> probs
664
+ //inpL = ggml_soft_max(ctx0, inpL);
665
 
666
  // run the computation
667
  ggml_build_forward_expand(&gf, inpL);
 
675
  //embd_w.resize(n_vocab*N);
676
  //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
677
 
678
+ // return result just for the last token
679
  embd_w.resize(n_vocab);
680
  memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
681
 
 
712
  int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
713
 
714
  // sampling parameters
715
+ int32_t top_k = 5;
716
  float top_p = 0.9f;
717
  float temp = 1.0f;
718
  };
 
720
  struct gpt2_context * gpt2_init(const char * path_model) {
721
  gpt2_context * ctx = new gpt2_context;
722
 
723
+ ctx->rng = std::mt19937(time(nullptr));
724
 
725
  // load the model
726
  {
727
  const int64_t t_start_us = ggml_time_us();
728
 
729
  if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
730
+ fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
731
  delete ctx;
732
  return nullptr;
733
  }
 
772
 
773
  std::string result;
774
 
775
+ for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
776
  // predict
777
+ if (!embd.empty()) {
778
  if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
779
  printf("gpt-2: failed to generate text\n");
780
  return "";
 
801
  result += ctx->vocab.id_to_token[embd[0]];
802
 
803
  // end of text token
804
+ if (embd.back() == 50256) {
 
 
 
805
  break;
806
  }
807
  }
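
The updated gpt-2.cpp above creates the 2-D weights directly in the layout that ggml_mul_mat() consumes, which is why the earlier ggml_transpose() wrappers could be removed: ggml_mul_mat(ctx, a, b) with a = [k, m] and b = [k, n] yields [m, n]. The following standalone sketch only illustrates that shape convention with the same pre-backend ggml API used in the diff; the sizes mirror the c_attn_attn_w comments ([2304, N] from a 768-wide input) and everything else is illustrative, not part of the commit.

#include "ggml.h"

#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16u*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // weight stored as [n_embd, 3*n_embd], activations as [n_embd, N]
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 768, 2304); // like c_attn_attn_w
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 768, 4);    // N = 4 tokens

    // no ggml_transpose() needed - the result node already has shape [2304, 4]
    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);

    printf("y shape: [%d, %d]\n", (int) y->ne[0], (int) y->ne[1]);

    ggml_free(ctx);
    return 0;
}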
examples/talk.wasm/gpt-2.h CHANGED
@@ -2,18 +2,12 @@
2
 
3
  // TODO: Change to C-style API and move to ./examples for easy reuse.
4
 
 
 
5
  #include <vector>
6
  #include <map>
7
  #include <string>
8
 
9
- struct gpt_vocab {
10
- using id = int32_t;
11
- using token = std::string;
12
-
13
- std::map<token, id> token_to_id;
14
- std::map<id, token> id_to_token;
15
- };
16
-
17
  struct gpt2_context;
18
 
19
  struct gpt2_context * gpt2_init(const char * path_model);
 
2
 
3
  // TODO: Change to C-style API and move to ./examples for easy reuse.
4
 
5
+ #include "common.h"
6
+
7
  #include <vector>
8
  #include <map>
9
  #include <string>
10
 
 
 
 
 
 
 
 
 
11
  struct gpt2_context;
12
 
13
  struct gpt2_context * gpt2_init(const char * path_model);
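
The gpt_vocab struct removed from this header is now provided by the shared examples/common.h that the example includes instead. For reference only, the definition that was dropped here (and that the example keeps relying on) is roughly:

#include <cstdint>
#include <map>
#include <string>

// mirrors the gpt_vocab that moved into examples/common.h
struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
};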
examples/talk.wasm/index-tmpl.html CHANGED
@@ -44,6 +44,15 @@
44
 
45
  <br><br>
46
 
 
 
 
 
 
 
 
 
 
47
  <hr>
48
 
49
  Select the models you would like to use and click the "Start" button to begin the conversation
@@ -54,6 +63,10 @@
54
  Whisper model: <span id="model-whisper-status"></span>
55
  <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
56
  <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
 
 
 
 
57
  <span id="fetch-whisper-progress"></span>
58
 
59
  <!--
@@ -266,11 +279,17 @@
266
  let urls = {
267
  'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
268
  'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
 
 
 
269
  };
270
 
271
  let sizes = {
272
  'tiny.en': 75,
273
  'base.en': 142,
 
 
 
274
  };
275
 
276
  let url = urls[model];
@@ -281,6 +300,10 @@
281
 
282
  document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
283
  document.getElementById('fetch-whisper-base-en').style.display = 'none';
 
 
 
 
284
  document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
285
 
286
  cbProgress = function(p) {
@@ -292,6 +315,10 @@
292
  var el;
293
  el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
294
  el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
 
 
 
 
295
  el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
296
  };
297
 
 
44
 
45
  <br><br>
46
 
47
+ <b>More examples:</b>
48
+ <a href="https://whisper.ggerganov.com/">main</a> |
49
+ <a href="https://whisper.ggerganov.com/bench">bench</a> |
50
+ <a href="https://whisper.ggerganov.com/stream">stream</a> |
51
+ <a href="https://whisper.ggerganov.com/command">command</a> |
52
+ <a href="https://whisper.ggerganov.com/talk">talk</a> |
53
+
54
+ <br><br>
55
+
56
  <hr>
57
 
58
  Select the models you would like to use and click the "Start" button to begin the conversation
 
63
  Whisper model: <span id="model-whisper-status"></span>
64
  <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
65
  <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
66
+ <br><br>
67
+ Quantized models:<br><br>
68
+ <button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
69
+ <button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
70
  <span id="fetch-whisper-progress"></span>
71
 
72
  <!--
 
279
  let urls = {
280
  'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
281
  'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
282
+
283
+ 'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
284
+ 'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
285
  };
286
 
287
  let sizes = {
288
  'tiny.en': 75,
289
  'base.en': 142,
290
+
291
+ 'tiny-en-q5_1': 31,
292
+ 'base-en-q5_1': 57,
293
  };
294
 
295
  let url = urls[model];
 
300
 
301
  document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
302
  document.getElementById('fetch-whisper-base-en').style.display = 'none';
303
+
304
+ document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
305
+ document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
306
+
307
  document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
308
 
309
  cbProgress = function(p) {
 
315
  var el;
316
  el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
317
  el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
318
+
319
+ el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
320
+ el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
321
+
322
  el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
323
  };
324
 
examples/talk/CMakeLists.txt CHANGED
@@ -1,16 +1,8 @@
1
  if (WHISPER_SDL2)
2
  # talk
3
  set(TARGET talk)
4
- #add_executable(${TARGET} talk.cpp gpt-2.cpp)
5
- #target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
6
- #target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
7
-
8
- # TODO: this is temporary
9
- # need to export ggml symbols for MSVC, but too lazy ..
10
- add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
11
 
12
  include(DefaultTargetOptions)
13
-
14
- target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
15
- target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
16
  endif ()
 
1
  if (WHISPER_SDL2)
2
  # talk
3
  set(TARGET talk)
4
+ add_executable(${TARGET} talk.cpp gpt-2.cpp)
5
+ target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
 
 
 
 
 
6
 
7
  include(DefaultTargetOptions)
 
 
 
8
  endif ()
examples/talk/gpt-2.cpp CHANGED
@@ -1,4 +1,6 @@
1
  #include "ggml.h"
 
 
2
  #include "gpt-2.h"
3
 
4
  #include <cmath>
@@ -14,150 +16,6 @@
14
 
15
  /////////////////////// GPT-2 BEGIN /////////////////////////
16
 
17
- //
18
- // Vocab utils
19
- //
20
-
21
- std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
22
- std::vector<std::string> words;
23
-
24
- // first split the text into words
25
- {
26
- std::string str = text;
27
- std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
28
-
29
- std::regex re(pat);
30
- std::smatch m;
31
-
32
- while (std::regex_search(str, m, re)) {
33
- for (auto x : m) {
34
- words.push_back(x);
35
- }
36
- str = m.suffix();
37
- }
38
- }
39
-
40
- // find the longest tokens that form the words:
41
- std::vector<gpt_vocab::id> tokens;
42
- for (const auto & word : words) {
43
- if (word.empty()) continue;
44
-
45
- int i = 0;
46
- int n = word.size();
47
- while (i < n) {
48
- int j = n;
49
- while (j > i) {
50
- auto it = vocab.token_to_id.find(word.substr(i, j-i));
51
- if (it != vocab.token_to_id.end()) {
52
- tokens.push_back(it->second);
53
- i = j;
54
- break;
55
- }
56
- --j;
57
- }
58
- if (i == n) {
59
- break;
60
- }
61
- if (j == i) {
62
- auto sub = word.substr(i, 1);
63
- if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
64
- tokens.push_back(vocab.token_to_id.at(sub));
65
- } else {
66
- fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
67
- }
68
- ++i;
69
- }
70
- }
71
- }
72
-
73
- return tokens;
74
- }
75
-
76
- gpt_vocab::id gpt_sample_top_k_top_p(
77
- const gpt_vocab & vocab,
78
- const float * logits,
79
- int top_k,
80
- double top_p,
81
- double /*temp*/,
82
- std::mt19937 & rng) {
83
- int n_logits = vocab.id_to_token.size();
84
-
85
- std::vector<std::pair<double, gpt_vocab::id>> logits_id;
86
- logits_id.reserve(n_logits);
87
-
88
- for (int i = 0; i < n_logits; i++) {
89
- logits_id.emplace_back(logits[i], i);
90
- }
91
-
92
- // find the top K tokens
93
- std::partial_sort(
94
- logits_id.begin(),
95
- logits_id.begin() + top_k, logits_id.end(),
96
- [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
97
- return a.first > b.first;
98
- });
99
-
100
- logits_id.resize(top_k);
101
-
102
- // normalize
103
- {
104
- double sum = 0.0f;
105
- for (int i = 0; i < (int)logits_id.size(); i++) {
106
- sum += logits_id[i].first;
107
- }
108
-
109
- sum = 1.0/sum;
110
- for (int i = 0; i < (int)logits_id.size(); i++) {
111
- logits_id[i].first *= sum;
112
- }
113
- }
114
-
115
- if (top_p < 1.0f) {
116
- {
117
- double cumsum = 0.0f;
118
- for (int i = 0; i < top_k; i++) {
119
- cumsum += logits_id[i].first;
120
- if (cumsum >= top_p) {
121
- logits_id.resize(i+1);
122
- break;
123
- }
124
- }
125
- }
126
-
127
- // normalize again
128
- {
129
- double sum = 0.0f;
130
- for (int i = 0; i < (int)logits_id.size(); i++) {
131
- sum += logits_id[i].first;
132
- }
133
-
134
- sum = 1.0/sum;
135
- for (int i = 0; i < (int)logits_id.size(); i++) {
136
- logits_id[i].first *= sum;
137
- }
138
- }
139
- }
140
-
141
- //printf("\n");
142
- //for (int i = 0; i < (int) logits_id.size(); i++) {
143
- // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
144
- //}
145
- //exit(0);
146
-
147
- // sample from the obtained distribution
148
- std::vector<double> probs;
149
- probs.reserve(logits_id.size());
150
-
151
- for (int i = 0; i < (int) logits_id.size(); i++) {
152
- probs.push_back(logits_id[i].first);
153
- }
154
-
155
- std::discrete_distribution<> dist(probs.begin(), probs.end());
156
- int idx = dist(rng);
157
-
158
- return logits_id[idx].second;
159
- }
160
-
161
  // default hparams (GPT-2 117M)
162
  struct gpt2_hparams {
163
  int32_t n_vocab = 50257;
@@ -165,7 +23,7 @@ struct gpt2_hparams {
165
  int32_t n_embd = 768;
166
  int32_t n_head = 12;
167
  int32_t n_layer = 12;
168
- int32_t f16 = 1;
169
  };
170
 
171
  struct gpt2_layer {
@@ -187,7 +45,7 @@ struct gpt2_layer {
187
  struct ggml_tensor * c_mlp_fc_w;
188
  struct ggml_tensor * c_mlp_fc_b;
189
 
190
- struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
191
  struct ggml_tensor * c_mlp_proj_b;
192
  };
193
 
@@ -198,8 +56,9 @@ struct gpt2_model {
198
  struct ggml_tensor * ln_f_g;
199
  struct ggml_tensor * ln_f_b;
200
 
201
- struct ggml_tensor * wte; // position embedding
202
- struct ggml_tensor * wpe; // token embedding
 
203
 
204
  std::vector<gpt2_layer> layers;
205
 
@@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
241
  fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
242
  fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
243
  fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
244
- fin.read((char *) &hparams.f16, sizeof(hparams.f16));
245
 
246
  printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
247
  printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
248
  printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
249
  printf("%s: n_head = %d\n", __func__, hparams.n_head);
250
  printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
251
- printf("%s: f16 = %d\n", __func__, hparams.f16);
252
  }
253
 
254
  // load vocab
@@ -268,16 +127,21 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
268
  fin.read((char *) &len, sizeof(len));
269
 
270
  word.resize(len);
271
- fin.read((char *) &word[0], len);
272
 
273
  vocab.token_to_id[word] = i;
274
  vocab.id_to_token[i] = word;
275
  }
276
  }
277
 
278
- // for the big tensors, we have the option to store the data in 16-bit floats
279
  // in order to save memory and also to speed up the computation
280
- const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
 
 
 
 
281
 
282
  auto & ctx = model.ctx;
283
 
@@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
291
  const int n_ctx = hparams.n_ctx;
292
  const int n_vocab = hparams.n_vocab;
293
 
294
- ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
295
- ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
296
 
297
- ctx_size += n_vocab*n_embd*ggml_type_size(wtype); // wte
298
- ctx_size += n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
 
299
 
300
- ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
301
- ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
302
 
303
- ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
304
- ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
305
 
306
- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype)); // c_attn_attn_w
307
- ctx_size += n_layer*( 3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
308
 
309
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype)); // c_attn_proj_w
310
- ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b
311
 
312
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_fc_w
313
- ctx_size += n_layer*( 4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
314
 
315
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_proj_w
316
- ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
317
 
318
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
319
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
320
 
321
  ctx_size += (6 + 12*n_layer)*256; // object overhead
322
 
@@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
325
 
326
  // create the ggml context
327
  {
328
- struct ggml_init_params params;
329
- params.mem_size = ctx_size;
330
- params.mem_buffer = nullptr;
 
 
331
 
332
  model.ctx = ggml_init(params);
333
  if (!model.ctx) {
@@ -350,36 +217,38 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
350
  model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
351
  model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
352
 
353
- model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
354
- model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
 
355
 
356
  // map by name
357
  model.tensors["model/ln_f/g"] = model.ln_f_g;
358
  model.tensors["model/ln_f/b"] = model.ln_f_b;
359
 
360
- model.tensors["model/wte"] = model.wte;
361
- model.tensors["model/wpe"] = model.wpe;
 
362
 
363
  for (int i = 0; i < n_layer; ++i) {
364
  auto & layer = model.layers[i];
365
 
366
- layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
367
- layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
368
 
369
- layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
370
- layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
371
 
372
- layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd);
373
- layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
374
 
375
- layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
376
- layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
377
 
378
- layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
379
- layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
380
 
381
- layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
382
- layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
383
 
384
  // map by name
385
  model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
@@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
397
  model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
398
  model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
399
 
400
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans;
401
  model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
402
  }
403
  }
@@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
425
  {
426
  size_t total_size = 0;
427
 
 
 
428
  while (true) {
429
  int32_t n_dims;
430
  int32_t length;
431
- int32_t ftype;
432
 
433
  fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
434
  fin.read(reinterpret_cast<char *>(&length), sizeof(length));
435
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
436
 
437
  if (fin.eof()) {
438
  break;
@@ -448,7 +319,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
448
  std::string name(length, 0);
449
  fin.read(&name[0], length);
450
 
451
- if (model.tensors.find(name) == model.tensors.end()) {
452
  fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
453
  return false;
454
  }
@@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
461
 
462
  if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
463
  fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
464
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
465
  return false;
466
  }
467
 
468
- const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
 
 
 
 
 
469
 
470
- if (nelements*bpe != ggml_nbytes(tensor)) {
471
  fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
472
  __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
473
  return false;
@@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
475
 
476
  fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
477
 
478
- //printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
 
 
 
 
 
 
 
 
479
  total_size += ggml_nbytes(tensor);
480
  }
481
 
@@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
493
  // - n_threads: number of threads to use
494
  // - n_past: the context size so far
495
  // - embd_inp: the embeddings of the tokens in the context
496
- // - embd_w: the predicted probabilities of the next token
497
  //
498
  bool gpt2_eval(
499
  const gpt2_model & model,
@@ -512,12 +396,12 @@ bool gpt2_eval(
512
  const int n_head = hparams.n_head;
513
  const int n_vocab = hparams.n_vocab;
514
 
515
- static size_t buf_size = 5640ull*1024*1024;
516
  static void * buf = malloc(buf_size);
517
 
518
  if (mem_per_token > 0 && mem_per_token*N > buf_size) {
519
  const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
520
- printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
521
 
522
  // reallocate
523
  buf_size = buf_size_new;
@@ -528,13 +412,14 @@ bool gpt2_eval(
528
  }
529
  }
530
 
531
- struct ggml_init_params params;
532
- params.mem_size = buf_size;
533
- params.mem_buffer = buf;
 
 
534
 
535
  struct ggml_context * ctx0 = ggml_init(params);
536
-
537
- struct ggml_cgraph gf = { };
538
  gf.n_threads = n_threads;
539
 
540
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -578,7 +463,7 @@ bool gpt2_eval(
578
  // [2304, N]
579
  {
580
  cur = ggml_mul_mat(ctx0,
581
- ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
582
  cur);
583
 
584
  cur = ggml_add(ctx0,
@@ -654,11 +539,13 @@ bool gpt2_eval(
654
  // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
655
  // [n_past + N, 64, 12]
656
  struct ggml_tensor * V_trans =
657
- ggml_permute(ctx0,
658
- ggml_reshape_3d(ctx0,
659
- ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
660
- n_embd/n_head, n_head, n_past + N),
661
- 1, 2, 0, 3);
 
 
662
 
663
  // KQV = transpose(V) * KQ_soft_max
664
  // [64, N, 12]
@@ -685,7 +572,7 @@ bool gpt2_eval(
685
  // [768, N]
686
  {
687
  cur = ggml_mul_mat(ctx0,
688
- ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
689
  cur);
690
 
691
  cur = ggml_add(ctx0,
@@ -722,7 +609,7 @@ bool gpt2_eval(
722
  // cur = fc_w*cur + fc_b
723
  // [3072, N]
724
  cur = ggml_mul_mat(ctx0,
725
- ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
726
  cur);
727
 
728
  cur = ggml_add(ctx0,
@@ -742,7 +629,7 @@ bool gpt2_eval(
742
  // cur = proj_w*cur + proj_b
743
  // [768, N]
744
  cur = ggml_mul_mat(ctx0,
745
- model.layers[il].c_mlp_proj_w_trans,
746
  cur);
747
 
748
  cur = ggml_add(ctx0,
@@ -769,12 +656,12 @@ bool gpt2_eval(
769
  }
770
 
771
  // inpL = WTE * inpL
772
- // [ 768, 50257] - model.wte
773
  // [ 768, N] - inpL
774
- inpL = ggml_mul_mat(ctx0, model.wte, inpL);
775
 
776
  // logits -> probs
777
- inpL = ggml_soft_max(ctx0, inpL);
778
 
779
  // run the computation
780
  ggml_build_forward_expand(&gf, inpL);
@@ -788,7 +675,7 @@ bool gpt2_eval(
788
  //embd_w.resize(n_vocab*N);
789
  //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
790
 
791
- // return result for just the last token
792
  embd_w.resize(n_vocab);
793
  memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
794
 
 
1
  #include "ggml.h"
2
+ #include "common-ggml.h"
3
+
4
  #include "gpt-2.h"
5
 
6
  #include <cmath>
 
16
 
17
  /////////////////////// GPT-2 BEGIN /////////////////////////
18
 
19
  // default hparams (GPT-2 117M)
20
  struct gpt2_hparams {
21
  int32_t n_vocab = 50257;
 
23
  int32_t n_embd = 768;
24
  int32_t n_head = 12;
25
  int32_t n_layer = 12;
26
+ int32_t ftype = 1;
27
  };
28
 
29
  struct gpt2_layer {
 
45
  struct ggml_tensor * c_mlp_fc_w;
46
  struct ggml_tensor * c_mlp_fc_b;
47
 
48
+ struct ggml_tensor * c_mlp_proj_w;
49
  struct ggml_tensor * c_mlp_proj_b;
50
  };
51
 
 
56
  struct ggml_tensor * ln_f_g;
57
  struct ggml_tensor * ln_f_b;
58
 
59
+ struct ggml_tensor * wte;     // token embedding
60
+ struct ggml_tensor * wpe;     // position embedding
61
+ struct ggml_tensor * lm_head; // language model head
62
 
63
  std::vector<gpt2_layer> layers;
64
 
 
100
  fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
101
  fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
102
  fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
103
+ fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
104
 
105
  printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
106
  printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
107
  printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
108
  printf("%s: n_head = %d\n", __func__, hparams.n_head);
109
  printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
110
+ printf("%s: ftype = %d\n", __func__, hparams.ftype);
111
  }
112
 
113
  // load vocab
 
127
  fin.read((char *) &len, sizeof(len));
128
 
129
  word.resize(len);
130
+ fin.read((char *) word.data(), len);
131
 
132
  vocab.token_to_id[word] = i;
133
  vocab.id_to_token[i] = word;
134
  }
135
  }
136
 
137
+ // for the big tensors, we have the option to store the data in 16-bit floats or quantized
138
  // in order to save memory and also to speed up the computation
139
+ ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
140
+ if (wtype == GGML_TYPE_COUNT) {
141
+ fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
142
+ __func__, fname.c_str(), model.hparams.ftype);
143
+ return false;
144
+ }
145
 
146
  auto & ctx = model.ctx;
147
 
 
155
  const int n_ctx = hparams.n_ctx;
156
  const int n_vocab = hparams.n_vocab;
157
 
158
+ ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
159
+ ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
160
 
161
+ ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
162
+ ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
163
+ ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
164
 
165
+ ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
166
+ ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
167
 
168
+ ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
169
+ ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
170
 
171
+ ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
172
+ ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
173
 
174
+ ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
175
+ ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
176
 
177
+ ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
178
+ ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
179
 
180
+ ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
181
+ ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
182
 
183
+ ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
184
+ ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
185
 
186
  ctx_size += (6 + 12*n_layer)*256; // object overhead
187
 
 
190
 
191
  // create the ggml context
192
  {
193
+ struct ggml_init_params params = {
194
+ .mem_size = ctx_size,
195
+ .mem_buffer = NULL,
196
+ .no_alloc = false,
197
+ };
198
 
199
  model.ctx = ggml_init(params);
200
  if (!model.ctx) {
 
217
  model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
218
  model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
219
 
220
+ model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
221
+ model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
222
+ model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
223
 
224
  // map by name
225
  model.tensors["model/ln_f/g"] = model.ln_f_g;
226
  model.tensors["model/ln_f/b"] = model.ln_f_b;
227
 
228
+ model.tensors["model/wte"] = model.wte;
229
+ model.tensors["model/wpe"] = model.wpe;
230
+ model.tensors["model/lm_head"] = model.lm_head;
231
 
232
  for (int i = 0; i < n_layer; ++i) {
233
  auto & layer = model.layers[i];
234
 
235
+ layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
236
+ layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
237
 
238
+ layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
239
+ layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
240
 
241
+ layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
242
+ layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
243
 
244
+ layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
245
+ layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
246
 
247
+ layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
248
+ layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
249
 
250
+ layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
251
+ layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
252
 
253
  // map by name
254
  model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
 
266
  model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
267
  model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
268
 
269
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
270
  model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
271
  }
272
  }
 
294
  {
295
  size_t total_size = 0;
296
 
297
+ bool has_lm_head = false;
298
+
299
  while (true) {
300
  int32_t n_dims;
301
  int32_t length;
302
+ int32_t ttype;
303
 
304
  fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
305
  fin.read(reinterpret_cast<char *>(&length), sizeof(length));
306
+ fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
307
 
308
  if (fin.eof()) {
309
  break;
 
319
  std::string name(length, 0);
320
  fin.read(&name[0], length);
321
 
322
+ if (model.tensors.find(name.data()) == model.tensors.end()) {
323
  fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
324
  return false;
325
  }
 
332
 
333
  if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
334
  fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
335
+ __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
336
  return false;
337
  }
338
 
339
+ // for debugging
340
+ if (0) {
341
+ printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
342
+ }
343
+
344
+ const size_t bpe = ggml_type_size(ggml_type(ttype));
345
 
346
+ if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
347
  fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
348
  __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
349
  return false;
 
351
 
352
  fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
353
 
354
+ // GPT-2 models share the WTE tensor as the LM head
355
+ if (name == "model/wte" && has_lm_head == false) {
356
+ memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
357
+ }
358
+
359
+ if (name == "model/lm_head") {
360
+ has_lm_head = true;
361
+ }
362
+
363
  total_size += ggml_nbytes(tensor);
364
  }
365
 
 
377
  // - n_threads: number of threads to use
378
  // - n_past: the context size so far
379
  // - embd_inp: the embeddings of the tokens in the context
380
+ // - embd_w: the predicted logits for the next token
381
  //
382
  bool gpt2_eval(
383
  const gpt2_model & model,
 
396
  const int n_head = hparams.n_head;
397
  const int n_vocab = hparams.n_vocab;
398
 
399
+ static size_t buf_size = 512u*1024*1024;
400
  static void * buf = malloc(buf_size);
401
 
402
  if (mem_per_token > 0 && mem_per_token*N > buf_size) {
403
  const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
404
+ //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
405
 
406
  // reallocate
407
  buf_size = buf_size_new;
 
412
  }
413
  }
414
 
415
+ struct ggml_init_params params = {
416
+ /*.mem_size =*/ buf_size,
417
+ /*.mem_buffer =*/ buf,
418
+ /*.no_alloc =*/ false,
419
+ };
420
 
421
  struct ggml_context * ctx0 = ggml_init(params);
422
+ struct ggml_cgraph gf = {};
 
423
  gf.n_threads = n_threads;
424
 
425
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
 
463
  // [2304, N]
464
  {
465
  cur = ggml_mul_mat(ctx0,
466
+ model.layers[il].c_attn_attn_w,
467
  cur);
468
 
469
  cur = ggml_add(ctx0,
 
539
  // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
540
  // [n_past + N, 64, 12]
541
  struct ggml_tensor * V_trans =
542
+ ggml_cpy(ctx0,
543
+ ggml_permute(ctx0,
544
+ ggml_reshape_3d(ctx0,
545
+ ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
546
+ n_embd/n_head, n_head, n_past + N),
547
+ 1, 2, 0, 3),
548
+ ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
549
 
550
  // KQV = transpose(V) * KQ_soft_max
551
  // [64, N, 12]
 
572
  // [768, N]
573
  {
574
  cur = ggml_mul_mat(ctx0,
575
+ model.layers[il].c_attn_proj_w,
576
  cur);
577
 
578
  cur = ggml_add(ctx0,
 
609
  // cur = fc_w*cur + fc_b
610
  // [3072, N]
611
  cur = ggml_mul_mat(ctx0,
612
+ model.layers[il].c_mlp_fc_w,
613
  cur);
614
 
615
  cur = ggml_add(ctx0,
 
629
  // cur = proj_w*cur + proj_b
630
  // [768, N]
631
  cur = ggml_mul_mat(ctx0,
632
+ model.layers[il].c_mlp_proj_w,
633
  cur);
634
 
635
  cur = ggml_add(ctx0,
 
656
  }
657
 
658
  // inpL = WTE * inpL
659
+ // [ 768, 50257] - model.lm_head
660
  // [ 768, N] - inpL
661
+ inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
662
 
663
  // logits -> probs
664
+ //inpL = ggml_soft_max(ctx0, inpL);
665
 
666
  // run the computation
667
  ggml_build_forward_expand(&gf, inpL);
 
675
  //embd_w.resize(n_vocab*N);
676
  //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
677
 
678
+ // return result just for the last token
679
  embd_w.resize(n_vocab);
680
  memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
681
 
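
The tensor size check in the loader above divides by ggml_blck_size() so that block-quantized weights (the Q4/Q5 types) validate the same way as F16/F32 ones: for quantized types, ggml_type_size() returns the byte size of one block of ggml_blck_size() elements. A minimal sketch of that arithmetic, using only calls that already appear in the diff (the helper name expected_nbytes is hypothetical):

#include "ggml.h"

// expected byte count for a tensor with `nelements` elements stored as `ttype`;
// for F32/F16 the block size is 1, for quantized types one "type size" covers a whole block
static size_t expected_nbytes(int64_t nelements, enum ggml_type ttype) {
    return (nelements * ggml_type_size(ttype)) / ggml_blck_size(ttype);
}

This is the same quantity compared against ggml_nbytes(tensor) in the loader's "wrong size in model file" check.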
examples/talk/gpt-2.h CHANGED
@@ -2,18 +2,12 @@
2
 
3
  // TODO: Change to C-style API and move to ./examples for easy reuse.
4
 
 
 
5
  #include <vector>
6
  #include <map>
7
  #include <string>
8
 
9
- struct gpt_vocab {
10
- using id = int32_t;
11
- using token = std::string;
12
-
13
- std::map<token, id> token_to_id;
14
- std::map<id, token> id_to_token;
15
- };
16
-
17
  struct gpt2_context;
18
 
19
  struct gpt2_context * gpt2_init(const char * path_model);
 
2
 
3
  // TODO: Change to C-style API and move to ./examples for easy reuse.
4
 
5
+ #include "common.h"
6
+
7
  #include <vector>
8
  #include <map>
9
  #include <string>
10
 
 
 
 
 
 
 
 
 
11
  struct gpt2_context;
12
 
13
  struct gpt2_context * gpt2_init(const char * path_model);
examples/whisper.wasm/CMakeLists.txt CHANGED
@@ -31,9 +31,9 @@ endif()
31
  set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
32
  --bind \
33
  -s USE_PTHREADS=1 \
34
- -s PTHREAD_POOL_SIZE=8 \
35
- -s INITIAL_MEMORY=1500MB \
36
- -s TOTAL_MEMORY=1500MB \
37
  -s FORCE_FILESYSTEM=1 \
38
  -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
39
  ${EXTRA_FLAGS} \
 
31
  set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
32
  --bind \
33
  -s USE_PTHREADS=1 \
34
+ -s PTHREAD_POOL_SIZE_STRICT=0 \
35
+ -s INITIAL_MEMORY=2000MB \
36
+ -s TOTAL_MEMORY=2000MB \
37
  -s FORCE_FILESYSTEM=1 \
38
  -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
39
  ${EXTRA_FLAGS} \
examples/whisper.wasm/emscripten.cpp CHANGED
@@ -10,6 +10,12 @@ std::thread g_worker;
10
 
11
  std::vector<struct whisper_context *> g_contexts(4, nullptr);
12
 
 
 
 
 
 
 
13
  EMSCRIPTEN_BINDINGS(whisper) {
14
  emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
15
  if (g_worker.joinable()) {
@@ -43,7 +49,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
43
  }
44
  }));
45
 
46
- emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
47
  if (g_worker.joinable()) {
48
  g_worker.join();
49
  }
@@ -66,7 +72,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
66
  params.print_special = false;
67
  params.translate = translate;
68
  params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
69
- params.n_threads = std::min(8, (int) std::thread::hardware_concurrency());
70
  params.offset_ms = 0;
71
 
72
  std::vector<float> pcmf32;
 
10
 
11
  std::vector<struct whisper_context *> g_contexts(4, nullptr);
12
 
13
+ static inline int mpow2(int n) {
14
+ int p = 1;
15
+ while (p <= n) p *= 2;
16
+ return p/2;
17
+ }
18
+
19
  EMSCRIPTEN_BINDINGS(whisper) {
20
  emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
21
  if (g_worker.joinable()) {
 
49
  }
50
  }));
51
 
52
+ emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, int nthreads, bool translate) {
53
  if (g_worker.joinable()) {
54
  g_worker.join();
55
  }
 
72
  params.print_special = false;
73
  params.translate = translate;
74
  params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
75
+ params.n_threads = std::min(nthreads, std::min(16, mpow2(std::thread::hardware_concurrency())));
76
  params.offset_ms = 0;
77
 
78
  std::vector<float> pcmf32;
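
The new mpow2() helper clamps the reported hardware concurrency down to a power of two before it is combined with the thread count requested from the page. A standalone sketch of the same clamping (mpow2 is copied from the diff above; the surrounding main() is illustrative only):

#include <algorithm>
#include <cstdio>
#include <thread>

// largest power of two that is <= n, as defined in emscripten.cpp above
static int mpow2(int n) {
    int p = 1;
    while (p <= n) p *= 2;
    return p/2;
}

int main() {
    const int hw        = (int) std::thread::hardware_concurrency(); // e.g. 6
    const int requested = 8;                                         // value from the "threads" slider
    const int n_threads = std::min(requested, std::min(16, mpow2(hw)));

    printf("hardware: %d, using %d threads\n", hw, n_threads); // with hw = 6 this prints 4
    return 0;
}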
examples/whisper.wasm/index-tmpl.html CHANGED
@@ -40,21 +40,42 @@
40
 
41
  Note that the computation is quite heavy and may take a few seconds to complete.<br>
42
  The transcription results will be displayed in the text area below.<br><br>
43
- <b>Important: your browser must support WASM SIMD instructions for this to work.</b>
 
 
 
 
44
 
45
- <br><br><hr>
 
 
 
 
 
 
 
46
 
47
  <div id="model">
48
- Whisper model: <span id="model-whisper-status"></span>
49
  <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
50
  <button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
51
  <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
52
  <button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
53
  <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
54
  <button id="fetch-whisper-small" onclick="loadWhisper('small')">small (466 MB)</button>
55
- <span id="fetch-whisper-progress"></span>
56
-
57
  <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
 
 
 
 
 
 
 
 
 
 
 
 
58
  </div>
59
 
60
  <br>
@@ -161,6 +182,12 @@
161
  <option value="yi">Yiddish</option>
162
  </select>
163
  </td>
 
 
 
 
 
 
164
  <td>
165
  <button onclick="onProcess(false);">Transcribe</button>
166
  </td>
@@ -263,11 +290,13 @@
263
 
264
  Module.FS_createDataFile("/", fname, buf, true, true);
265
 
266
- model_whisper = fname;
267
 
268
  document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
269
 
270
  printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
 
 
271
  }
272
 
273
  function loadFile(event, fname) {
@@ -292,6 +321,17 @@
292
  document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
293
  document.getElementById('fetch-whisper-base' ).style.display = 'none';
294
  document.getElementById('fetch-whisper-small' ).style.display = 'none';
 
 
 
 
 
 
 
 
 
 
 
295
  document.getElementById('whisper-file' ).style.display = 'none';
296
  document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
297
  }
@@ -304,6 +344,16 @@
304
  'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
305
  'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
306
  'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
 
 
 
 
 
 
 
 
 
 
307
  };
308
 
309
  let sizes = {
@@ -313,6 +363,16 @@
313
  'base': 142,
314
  'small.en': 466,
315
  'small': 466,
 
 
 
 
 
 
 
 
 
 
316
  };
317
 
318
  let url = urls[model];
@@ -327,8 +387,19 @@
327
  document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
328
  document.getElementById('fetch-whisper-base' ).style.display = 'none';
329
  document.getElementById('fetch-whisper-small' ).style.display = 'none';
330
- document.getElementById('whisper-file' ).style.display = 'none';
331
- document.getElementById('model-whisper-status' ).innerHTML = 'loading model: ' + model;
 
 
 
 
 
 
 
 
 
 
 
332
 
333
  cbProgress = function(p) {
334
  let el = document.getElementById('fetch-whisper-progress');
@@ -337,14 +408,26 @@
337
 
338
  cbCancel = function() {
339
  var el;
 
340
  el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
341
  el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
342
  el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
343
  el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
344
  el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
345
  el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block';
346
- el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
347
- el = document.getElementById('model-whisper-status' ); if (el) el.innerHTML = '';
 
 
 
 
 
 
 
 
 
 
 
348
  };
349
 
350
  loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
@@ -354,7 +437,8 @@
354
  // audio file
355
  //
356
 
357
- const kMaxAudio_s = 120;
 
358
  const kSampleRate = 16000;
359
 
360
  window.AudioContext = window.AudioContext || window.webkitAudioContext;
@@ -423,7 +507,7 @@
423
  doRecording = false;
424
  }
425
 
426
- // record up to kMaxAudio_s seconds of audio from the microphone
427
  // check if doRecording is false every 1000 ms and stop recording if so
428
  // update progress information
429
  function startRecording() {
@@ -479,9 +563,9 @@
479
  printTextarea('js: audio recorded, size: ' + audio.length);
480
 
481
  // truncate to first 30 seconds
482
- if (audio.length > kMaxAudio_s*kSampleRate) {
483
- audio = audio.slice(0, kMaxAudio_s*kSampleRate);
484
- printTextarea('js: truncated audio to first ' + kMaxAudio_s + ' seconds');
485
  }
486
  setAudio(audio);
487
  });
@@ -509,24 +593,31 @@
509
  });
510
  }
511
 
512
- document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxAudio_s) + '%';
513
- document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxAudio_s).toFixed(0) + '%';
514
  }, 1000);
515
 
516
  printTextarea('js: recording ...');
517
 
518
  setTimeout(function() {
519
  if (doRecording) {
520
- printTextarea('js: recording stopped after ' + kMaxAudio_s + ' seconds');
521
  stopRecording();
522
  }
523
- }, kMaxAudio_s*1000);
524
  }
525
 
526
  //
527
  // transcribe
528
  //
529
 
 
 
 
 
 
 
 
530
  function onProcess(translate) {
531
  if (!instance) {
532
  instance = Module.init('whisper.bin');
@@ -553,7 +644,7 @@
553
  printTextarea('');
554
 
555
  setTimeout(function() {
556
- var ret = Module.full_default(instance, audio, document.getElementById('language').value, translate);
557
  console.log('js: full_default returned: ' + ret);
558
  if (ret) {
559
  printTextarea("js: whisper returned: " + ret);
 
40
 
41
  Note that the computation is quite heavy and may take a few seconds to complete.<br>
42
  The transcription results will be displayed in the text area below.<br><br>
43
+ <b>Important:</b>
44
+ <ul>
45
+ <li>your browser must support WASM SIMD instructions for this to work</li>
46
+ <li>Firefox cannot load files larger than 256 MB - use Chrome instead</li>
47
+ </ul>
48
 
49
+ <b>More examples:</b>
50
+ <a href="https://whisper.ggerganov.com/">main</a> |
51
+ <a href="https://whisper.ggerganov.com/bench">bench</a> |
52
+ <a href="https://whisper.ggerganov.com/stream">stream</a> |
53
+ <a href="https://whisper.ggerganov.com/command">command</a> |
54
+ <a href="https://whisper.ggerganov.com/talk">talk</a> |
55
+
56
+ <hr>
57
 
58
  <div id="model">
59
+ Whisper models: <span id="model-whisper-status"></span><br><br>
60
  <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
61
  <button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
62
  <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
63
  <button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
64
  <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
65
  <button id="fetch-whisper-small" onclick="loadWhisper('small')">small (466 MB)</button>
 
 
66
  <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
67
+ <br><br>
68
+ Quantized models:<br><br>
69
+ <button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
70
+ <button id="fetch-whisper-tiny-q5_1" onclick="loadWhisper('tiny-q5_1')">tiny (Q5_1, 31 MB)</button>
71
+ <button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
72
+ <button id="fetch-whisper-base-q5_1" onclick="loadWhisper('base-q5_1')">base (Q5_1, 57 MB)</button>
73
+ <button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
74
+ <button id="fetch-whisper-small-q5_1" onclick="loadWhisper('small-q5_1')">small (Q5_1, 182 MB)</button><br>
75
+ <button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
76
+ <button id="fetch-whisper-medium-q5_0" onclick="loadWhisper('medium-q5_0')">medium (Q5_0, 515 MB)</button>
77
+ <button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
78
+ <span id="fetch-whisper-progress"></span>
79
  </div>
80
 
81
  <br>
 
182
  <option value="yi">Yiddish</option>
183
  </select>
184
  </td>
185
+ <!-- Slider to select number of threads between 1 and 16 -->
186
+ <td>
187
+ Threads:
188
+ <input type="range" id="threads" name="threads" min="1" max="16" value="8" onchange="changeThreads(this.value)" />
189
+ <span id="threads-value">8</span>
190
+ </td>
191
  <td>
192
  <button onclick="onProcess(false);">Transcribe</button>
193
  </td>
 
290
 
291
  Module.FS_createDataFile("/", fname, buf, true, true);
292
 
293
+ //model_whisper = fname;
294
 
295
  document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
296
 
297
  printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
298
+
299
+ document.getElementById('model').innerHTML = 'Model fetched: ' + model_whisper;
300
  }
301
 
302
  function loadFile(event, fname) {
 
321
  document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
322
  document.getElementById('fetch-whisper-base' ).style.display = 'none';
323
  document.getElementById('fetch-whisper-small' ).style.display = 'none';
324
+
325
+ document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
326
+ document.getElementById('fetch-whisper-tiny-q5_1' ).style.display = 'none';
327
+ document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
328
+ document.getElementById('fetch-whisper-base-q5_1' ).style.display = 'none';
329
+ document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
330
+ document.getElementById('fetch-whisper-small-q5_1' ).style.display = 'none';
331
+ document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
332
+ document.getElementById('fetch-whisper-medium-q5_0' ).style.display = 'none';
333
+ document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
334
+
335
  document.getElementById('whisper-file' ).style.display = 'none';
336
  document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
337
  }
 
344
  'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
345
  'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
346
  'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
347
+
348
+ 'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
349
+ 'tiny-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny-q5_1.bin',
350
+ 'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
351
+ 'base-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base-q5_1.bin',
352
+ 'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
353
+ 'small-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small-q5_1.bin',
354
+ 'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
355
+ 'medium-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-medium-q5_0.bin',
356
+ 'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
357
  };
358
 
359
  let sizes = {
 
363
  'base': 142,
364
  'small.en': 466,
365
  'small': 466,
366
+
367
+ 'tiny-en-q5_1': 31,
368
+ 'tiny-q5_1': 31,
369
+ 'base-en-q5_1': 57,
370
+ 'base-q5_1': 57,
371
+ 'small-en-q5_1': 182,
372
+ 'small-q5_1': 182,
373
+ 'medium-en-q5_0': 515,
374
+ 'medium-q5_0': 515,
375
+ 'large-q5_0': 1030,
376
  };
377
 
378
  let url = urls[model];
 
387
  document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
388
  document.getElementById('fetch-whisper-base' ).style.display = 'none';
389
  document.getElementById('fetch-whisper-small' ).style.display = 'none';
390
+
391
+ document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
392
+ document.getElementById('fetch-whisper-tiny-q5_1' ).style.display = 'none';
393
+ document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
394
+ document.getElementById('fetch-whisper-base-q5_1' ).style.display = 'none';
395
+ document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
396
+ document.getElementById('fetch-whisper-small-q5_1' ).style.display = 'none';
397
+ document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
398
+ document.getElementById('fetch-whisper-medium-q5_0' ).style.display = 'none';
399
+ document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
400
+
401
+ document.getElementById('whisper-file' ).style.display = 'none';
402
+ document.getElementById('model-whisper-status').innerHTML = 'loading model: ' + model;
403
 
404
  cbProgress = function(p) {
405
  let el = document.getElementById('fetch-whisper-progress');
 
408
 
409
  cbCancel = function() {
410
  var el;
411
+
412
  el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
413
  el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
414
  el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
415
  el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
416
  el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
417
  el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block';
418
+
419
+ el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
420
+ el = document.getElementById('fetch-whisper-tiny-q5_1' ); if (el) el.style.display = 'inline-block';
421
+ el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
422
+ el = document.getElementById('fetch-whisper-base-q5_1' ); if (el) el.style.display = 'inline-block';
423
+ el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
424
+ el = document.getElementById('fetch-whisper-small-q5_1' ); if (el) el.style.display = 'inline-block';
425
+ el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
426
+ el = document.getElementById('fetch-whisper-medium-q5_0' ); if (el) el.style.display = 'inline-block';
427
+ el = document.getElementById('fetch-whisper-large-q5_0' ); if (el) el.style.display = 'inline-block';
428
+
429
+ el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
430
+ el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
431
  };
432
 
433
  loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
 
437
  // audio file
438
  //
439
 
440
+ const kMaxAudio_s = 30*60;
441
+ const kMaxRecording_s = 2*60;
442
  const kSampleRate = 16000;
443
 
444
  window.AudioContext = window.AudioContext || window.webkitAudioContext;
 
507
  doRecording = false;
508
  }
509
 
510
+ // record up to kMaxRecording_s seconds of audio from the microphone
511
  // check if doRecording is false every 1000 ms and stop recording if so
512
  // update progress information
513
  function startRecording() {
 
563
  printTextarea('js: audio recorded, size: ' + audio.length);
564
 
565
  // truncate to the first kMaxRecording_s seconds
566
+ if (audio.length > kMaxRecording_s*kSampleRate) {
567
+ audio = audio.slice(0, kMaxRecording_s*kSampleRate);
568
+ printTextarea('js: truncated audio to first ' + kMaxRecording_s + ' seconds');
569
  }
570
  setAudio(audio);
571
  });
 
593
  });
594
  }
595
 
596
+ document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxRecording_s) + '%';
597
+ document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxRecording_s).toFixed(0) + '%';
598
  }, 1000);
599
 
600
  printTextarea('js: recording ...');
601
 
602
  setTimeout(function() {
603
  if (doRecording) {
604
+ printTextarea('js: recording stopped after ' + kMaxRecording_s + ' seconds');
605
  stopRecording();
606
  }
607
+ }, kMaxRecording_s*1000);
608
  }
609
 
610
  //
611
  // transcribe
612
  //
613
 
614
+ var nthreads = 8;
615
+
616
+ function changeThreads(value) {
617
+ nthreads = value;
618
+ document.getElementById('threads-value').innerHTML = nthreads;
619
+ }
620
+
621
  function onProcess(translate) {
622
  if (!instance) {
623
  instance = Module.init('whisper.bin');
 
644
  printTextarea('');
645
 
646
  setTimeout(function() {
647
+ var ret = Module.full_default(instance, audio, document.getElementById('language').value, nthreads, translate);
648
  console.log('js: full_default returned: ' + ret);
649
  if (ret) {
650
  printTextarea("js: whisper returned: " + ret);
extra/quantize-all.sh ADDED
@@ -0,0 +1,45 @@
1
+ #!/bin/bash
2
+
3
+ printf "Usage: $0 <upload>"
4
+
5
+ if [ $# -ne 1 ]; then
6
+ printf "\nError: Invalid number of arguments\n"
7
+ exit 1
8
+ fi
9
+
10
+ qtype0="q5_0"
11
+ qtype1="q5_1"
12
+ upload="$1"
13
+
14
+ cd `dirname $0`
15
+ cd ../
16
+
17
+ ./quantize ./models/ggml-tiny.en.bin ./models/ggml-tiny.en-${qtype1}.bin ${qtype1}
18
+ ./quantize ./models/ggml-tiny.bin ./models/ggml-tiny-${qtype1}.bin ${qtype1}
19
+
20
+ ./quantize ./models/ggml-base.en.bin ./models/ggml-base.en-${qtype1}.bin ${qtype1}
21
+ ./quantize ./models/ggml-base.bin ./models/ggml-base-${qtype1}.bin ${qtype1}
22
+
23
+ ./quantize ./models/ggml-small.en.bin ./models/ggml-small.en-${qtype1}.bin ${qtype1}
24
+ ./quantize ./models/ggml-small.bin ./models/ggml-small-${qtype1}.bin ${qtype1}
25
+
26
+ ./quantize ./models/ggml-medium.en.bin ./models/ggml-medium.en-${qtype0}.bin ${qtype0}
27
+ ./quantize ./models/ggml-medium.bin ./models/ggml-medium-${qtype0}.bin ${qtype0}
28
+
29
+ ./quantize ./models/ggml-large.bin ./models/ggml-large-${qtype0}.bin ${qtype0}
30
+
31
+ if [ "$upload" == "1" ]; then
32
+ scp ./models/ggml-tiny.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-tiny.en-${qtype1}.bin
33
+ scp ./models/ggml-tiny-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-tiny-${qtype1}.bin
34
+
35
+ scp ./models/ggml-base.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-base.en-${qtype1}.bin
36
+ scp ./models/ggml-base-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-base-${qtype1}.bin
37
+
38
+ scp ./models/ggml-small.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-small.en-${qtype1}.bin
39
+ scp ./models/ggml-small-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-small-${qtype1}.bin
40
+
41
+ scp ./models/ggml-medium.en-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-medium.en-${qtype0}.bin
42
+ scp ./models/ggml-medium-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-medium-${qtype0}.bin
43
+
44
+ scp ./models/ggml-large-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-large-${qtype0}.bin
45
+ fi
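
The script above runs the new quantize tool once per model: tiny/base/small are converted to Q5_1 and medium/large to Q5_0, e.g. ./quantize ./models/ggml-base.en.bin ./models/ggml-base.en-q5_1.bin q5_1 for a single model. As an assumed illustration (not the contents of examples/quantize/quantize.cpp), the type-name argument can be mapped onto the new ggml_ftype values from ggml.h like this:

    // Assumed sketch: translate the third command-line argument ("q5_0", "q5_1", ...)
    // into one of the ggml_ftype values introduced by this commit.
    #include <cstring>
    #include "ggml.h"

    static enum ggml_ftype parse_ftype_sketch(const char * s) {
        if (std::strcmp(s, "q4_0") == 0) return GGML_FTYPE_MOSTLY_Q4_0;
        if (std::strcmp(s, "q4_1") == 0) return GGML_FTYPE_MOSTLY_Q4_1;
        if (std::strcmp(s, "q5_0") == 0) return GGML_FTYPE_MOSTLY_Q5_0;
        if (std::strcmp(s, "q5_1") == 0) return GGML_FTYPE_MOSTLY_Q5_1;
        if (std::strcmp(s, "q8_0") == 0) return GGML_FTYPE_MOSTLY_Q8_0;
        return GGML_FTYPE_UNKNOWN;
    }
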
extra/sync-ggml.sh CHANGED
@@ -1,6 +1,10 @@
1
  #!/bin/bash
2
 
3
- cp -rpv ../ggml/src/ggml.c ./ggml.c
4
- cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
5
- cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
6
- cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
 
 
 
 
 
1
  #!/bin/bash
2
 
3
+ cp -rpv ../ggml/src/ggml.c ./ggml.c
4
+ cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
5
+ cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
6
+ cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
7
+ cp -rpv ../ggml/examples/common.h ./examples/common.h
8
+ cp -rpv ../ggml/examples/common.cpp ./examples/common.cpp
9
+ cp -rpv ../ggml/examples/common-ggml.h ./examples/common-ggml.h
10
+ cp -rpv ../ggml/examples/common-ggml.cpp ./examples/common-ggml.cpp
ggml.c CHANGED
@@ -330,7 +330,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
330
  // precomputed f32 table for f16 (256 KB)
331
  static float table_f32_f16[1 << 16];
332
 
333
- #if defined(__ARM_NEON)
334
  #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
335
  #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
336
  #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -3180,6 +3180,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
3180
  }
3181
 
3182
  *s = vaddvq_f32(sumv);
3183
  #elif defined(__AVX2__)
3184
  // Initialize accumulator with zeros
3185
  __m256 acc = _mm256_setzero_ps();
@@ -3311,6 +3377,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3311
  }
3312
 
3313
  *s = vaddvq_f32(sumv) + summs;
3314
  #elif defined(__AVX2__)
3315
  // Initialize accumulator with zeros
3316
  __m256 acc = _mm256_setzero_ps();
@@ -3827,6 +3964,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
3827
  "DIAG_MASK_INF",
3828
  "SOFT_MAX",
3829
  "ROPE",
 
3830
  "CONV_1D_1S",
3831
  "CONV_1D_2S",
3832
 
@@ -3875,6 +4013,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3875
  "diag_mask_inf(x)",
3876
  "soft_max(x)",
3877
  "rope(x)",
 
3878
  "conv_1d_1s(x)",
3879
  "conv_1d_2s(x)",
3880
 
@@ -4055,6 +4194,27 @@ bool ggml_is_quantized(enum ggml_type type) {
4055
  return GGML_IS_QUANTIZED[type];
4056
  }
4057
 
4058
  static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
4059
  return tensor->nb[0] > tensor->nb[1];
4060
  }
 
330
  // precomputed f32 table for f16 (256 KB)
331
  static float table_f32_f16[1 << 16];
332
 
333
+ #if defined(__ARM_NEON) || defined(__wasm_simd128__)
334
  #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
335
  #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
336
  #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
 
3180
  }
3181
 
3182
  *s = vaddvq_f32(sumv);
3183
+ #elif defined(__wasm_simd128__)
3184
+ v128_t sumv = wasm_f32x4_splat(0.0f);
3185
+
3186
+ uint64_t tmp[4];
3187
+
3188
+ for (int i = 0; i < nb; ++i) {
3189
+ const block_q5_0 * restrict x0 = &x[i];
3190
+ const block_q8_0 * restrict y0 = &y[i];
3191
+
3192
+ const v128_t m4b = wasm_i8x16_splat(0x0F);
3193
+ const v128_t s16b = wasm_i8x16_splat(0x10);
3194
+
3195
+ // extract the 5th bit
3196
+ uint32_t qh;
3197
+ memcpy(&qh, x0->qh, sizeof(qh));
3198
+
3199
+ tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
3200
+ tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
3201
+ tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
3202
+ tmp[3] = table_b2b_u[(qh >> 24) ];
3203
+
3204
+ const v128_t qhl = wasm_v128_load(tmp + 0);
3205
+ const v128_t qhh = wasm_v128_load(tmp + 2);
3206
+
3207
+ const v128_t v0 = wasm_v128_load(x0->qs);
3208
+
3209
+ // 4-bit -> 8-bit
3210
+ const v128_t v0l = wasm_v128_and (v0, m4b);
3211
+ const v128_t v0h = wasm_u8x16_shr(v0, 4);
3212
+
3213
+ // interleave
3214
+ const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
3215
+ const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
3216
+
3217
+ // add high bit and sub 16
3218
+ const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
3219
+ const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
3220
+
3221
+ // load y
3222
+ const v128_t v1l = wasm_v128_load(y0->qs);
3223
+ const v128_t v1h = wasm_v128_load(y0->qs + 16);
3224
+
3225
+ // int8x16 -> int16x8
3226
+ const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
3227
+ const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
3228
+ const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
3229
+ const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
3230
+
3231
+ const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
3232
+ const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
3233
+ const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
3234
+ const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
3235
+
3236
+ const float x0d = GGML_FP16_TO_FP32(x0->d);
3237
+
3238
+ // dot product
3239
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
3240
+ wasm_i32x4_add(
3241
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
3242
+ wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
3243
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
3244
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
3245
+ }
3246
+
3247
+ *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
3248
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
3249
  #elif defined(__AVX2__)
3250
  // Initialize accumulator with zeros
3251
  __m256 acc = _mm256_setzero_ps();
 
3377
  }
3378
 
3379
  *s = vaddvq_f32(sumv) + summs;
3380
+ #elif defined(__wasm_simd128__)
3381
+ v128_t sumv = wasm_f32x4_splat(0.0f);
3382
+
3383
+ float summs = 0.0f;
3384
+
3385
+ uint64_t tmp[4];
3386
+
3387
+ for (int i = 0; i < nb; ++i) {
3388
+ const block_q5_1 * restrict x0 = &x[i];
3389
+ const block_q8_1 * restrict y0 = &y[i];
3390
+
3391
+ summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
3392
+
3393
+ const v128_t m4b = wasm_i8x16_splat(0x0F);
3394
+
3395
+ // extract the 5th bit
3396
+ uint32_t qh;
3397
+ memcpy(&qh, x0->qh, sizeof(qh));
3398
+
3399
+ tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
3400
+ tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
3401
+ tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
3402
+ tmp[3] = table_b2b_u[(qh >> 24) ];
3403
+
3404
+ const v128_t qhl = wasm_v128_load(tmp + 0);
3405
+ const v128_t qhh = wasm_v128_load(tmp + 2);
3406
+
3407
+ const v128_t v0 = wasm_v128_load(x0->qs);
3408
+
3409
+ // 4-bit -> 8-bit
3410
+ const v128_t v0l = wasm_v128_and (v0, m4b);
3411
+ const v128_t v0h = wasm_u8x16_shr(v0, 4);
3412
+
3415
+ // interleave
3416
+ const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
3417
+ const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
3418
+
3419
+ // add high bit
3420
+ const v128_t v0lf = wasm_v128_or(v0lz, qhl);
3421
+ const v128_t v0hf = wasm_v128_or(v0hz, qhh);
3422
+
3423
+ // load y
3424
+ const v128_t v1l = wasm_v128_load(y0->qs);
3425
+ const v128_t v1h = wasm_v128_load(y0->qs + 16);
3426
+
3427
+ // int8x16 -> int16x8
3428
+ const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
3429
+ const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
3430
+ const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
3431
+ const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
3432
+
3433
+ const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
3434
+ const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
3435
+ const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
3436
+ const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
3437
+
3438
+ const float x0d = GGML_FP16_TO_FP32(x0->d);
3439
+
3440
+ // dot product
3441
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
3442
+ wasm_i32x4_add(
3443
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
3444
+ wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
3445
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
3446
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
3447
+ }
3448
+
3449
+ *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
3450
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
3451
  #elif defined(__AVX2__)
3452
  // Initialize accumulator with zeros
3453
  __m256 acc = _mm256_setzero_ps();
 
3964
  "DIAG_MASK_INF",
3965
  "SOFT_MAX",
3966
  "ROPE",
3967
+ "ALIBI",
3968
  "CONV_1D_1S",
3969
  "CONV_1D_2S",
3970
 
 
4013
  "diag_mask_inf(x)",
4014
  "soft_max(x)",
4015
  "rope(x)",
4016
+ "alibi(x)",
4017
  "conv_1d_1s(x)",
4018
  "conv_1d_2s(x)",
4019
 
 
4194
  return GGML_IS_QUANTIZED[type];
4195
  }
4196
 
4197
+ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
4198
+ enum ggml_type wtype = GGML_TYPE_COUNT;
4199
+
4200
+ switch (ftype) {
4201
+ case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
4202
+ case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
4203
+ case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
4204
+ case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
4205
+ case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
4206
+ case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
4207
+ case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
4208
+ case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
4209
+ case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
4210
+ case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
4211
+ }
4212
+
4213
+ GGML_ASSERT(wtype != GGML_TYPE_COUNT);
4214
+
4215
+ return wtype;
4216
+ }
4217
+
4218
  static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
4219
  return tensor->nb[0] > tensor->nb[1];
4220
  }
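
The new __wasm_simd128__ branches mirror the existing NEON and AVX2 paths: the low 4 bits of each quantized weight come from the packed nibbles in qs, the fifth bit is expanded from the 32-bit qh mask via the table_b2b_u lookup, and the result is scaled by the fp16 block delta d (Q5_0 additionally recenters by -16, while Q5_1 adds the block minimum m instead). A conceptual scalar sketch of that per-value decode, leaving out the exact lane ordering imposed by the SIMD shuffles, is:

    // Conceptual sketch of the Q5_0 / Q5_1 decode that the WASM SIMD kernels vectorize;
    // the element/bit ordering of the shuffles above is intentionally omitted.
    #include <cstdint>

    // Q5_0: 4-bit low part plus one high bit, centered by -16, scaled by delta d.
    static inline float decode_q5_0_sketch(uint8_t low_nibble, uint32_t qh, int bit, float d) {
        const int q = (int) (low_nibble & 0x0F) | (int) (((qh >> bit) & 1u) << 4);  // 0..31
        return (q - 16) * d;
    }

    // Q5_1: same 5-bit value, scaled by delta d and offset by the block minimum m.
    static inline float decode_q5_1_sketch(uint8_t low_nibble, uint32_t qh, int bit, float d, float m) {
        const int q = (int) (low_nibble & 0x0F) | (int) (((qh >> bit) & 1u) << 4);  // 0..31
        return q * d + m;
    }
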
ggml.h CHANGED
@@ -232,6 +232,20 @@ extern "C" {
232
  GGML_TYPE_COUNT,
233
  };
234
 
235
  // available tensor operations:
236
  enum ggml_op {
237
  GGML_OP_NONE = 0,
@@ -385,6 +399,8 @@ extern "C" {
385
 
386
  GGML_API bool ggml_is_quantized(enum ggml_type type);
387
 
 
 
388
  // main
389
 
390
  GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
 
232
  GGML_TYPE_COUNT,
233
  };
234
 
235
+ // model file types
236
+ enum ggml_ftype {
237
+ GGML_FTYPE_UNKNOWN = -1,
238
+ GGML_FTYPE_ALL_F32 = 0,
239
+ GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
240
+ GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
241
+ GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
242
+ GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
243
+ GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
244
+ GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
245
+ GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
246
+ GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
247
+ };
248
+
249
  // available tensor operations:
250
  enum ggml_op {
251
  GGML_OP_NONE = 0,
 
399
 
400
  GGML_API bool ggml_is_quantized(enum ggml_type type);
401
 
402
+ GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
403
+
404
  // main
405
 
406
  GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
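
Usage sketch of the new ggml_ftype / ggml_ftype_to_ggml_type() API, mirroring what whisper_model_load() does in the whisper.cpp changes that follow (illustrative only, with a hypothetical helper name):

    // Map the ftype stored in a model file header to the ggml_type used for the
    // weight tensors, rejecting unknown or unsupported values.
    #include <cstdio>
    #include "ggml.h"

    static bool pick_weight_type_sketch(int ftype_from_header, enum ggml_type * wtype) {
        *wtype = ggml_ftype_to_ggml_type((enum ggml_ftype) ftype_from_header);
        if (*wtype == GGML_TYPE_COUNT) {
            std::fprintf(stderr, "invalid model (bad ftype value %d)\n", ftype_from_header);
            return false;
        }
        return true;
    }
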
whisper.cpp CHANGED
@@ -1,4 +1,3 @@
1
- #define WHISPER_BUILD
2
  #include "whisper.h"
3
  #if WHISPER_USE_COREML
4
  #include "coreml/whisper-encoder.h"
@@ -255,12 +254,70 @@ static const std::map<e_model, size_t> MEM_REQ_SCRATCH3 = {
255
  { MODEL_LARGE, 9ull*MB },
256
  };
257
 
258
- static const std::map<e_model, size_t> MEM_REQ_MODEL = {
259
- { MODEL_TINY, 74ull*MB },
260
- { MODEL_BASE, 142ull*MB },
261
- { MODEL_SMALL, 466ull*MB },
262
- { MODEL_MEDIUM, 1464ull*MB },
263
- { MODEL_LARGE, 2952ull*MB },
264
  };
265
 
266
  static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
@@ -370,7 +427,7 @@ struct whisper_hparams {
370
  int32_t n_text_head = 6;
371
  int32_t n_text_layer = 4;
372
  int32_t n_mels = 80;
373
- int32_t f16 = 1;
374
  };
375
 
376
  // audio encoding layer
@@ -637,10 +694,11 @@ struct whisper_state {
637
  };
638
 
639
  struct whisper_context {
640
- int64_t t_load_us = 0;
641
  int64_t t_start_us = 0;
642
 
643
- ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 or FP16)
 
644
 
645
  whisper_model model;
646
  whisper_vocab vocab;
@@ -697,7 +755,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
697
  const ggml_type wtype = cache.k->type;
698
  WHISPER_ASSERT(wtype == cache.v->type);
699
 
700
- WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));
701
 
702
  struct ggml_init_params params = {
703
  /*.mem_size =*/ cache.buf.size(),
@@ -770,7 +828,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
770
  read_safe(loader, hparams.n_text_head);
771
  read_safe(loader, hparams.n_text_layer);
772
  read_safe(loader, hparams.n_mels);
773
- read_safe(loader, hparams.f16);
774
 
775
  assert(hparams.n_text_state == hparams.n_audio_state);
776
 
@@ -794,11 +852,15 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
794
  model.type = e_model::MODEL_LARGE;
795
  }
796
 
797
- // for the big tensors, we have the option to store the data in 16-bit floats
798
  // in order to save memory and also to speed up the computation
799
- wctx.wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
 
 
 
800
 
801
- const size_t scale = model.hparams.f16 ? 1 : 2;
802
 
803
  fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
804
  fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
@@ -810,18 +872,18 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
810
  fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
811
  fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
812
  fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
813
- fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
814
  fprintf(stderr, "%s: type = %d\n", __func__, model.type);
815
 
816
  // print memory requirements
817
  {
818
  // this is the total memory required to run the inference
819
  const size_t mem_required =
820
- MEM_REQ_SCRATCH0.at (model.type) +
821
- MEM_REQ_SCRATCH1.at (model.type) +
822
- MEM_REQ_SCRATCH2.at (model.type) +
823
- MEM_REQ_SCRATCH3.at (model.type) +
824
- scale*MEM_REQ_MODEL.at (model.type) +
825
  scale*MEM_REQ_KV_CROSS.at(model.type) +
826
  scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
827
 
@@ -837,7 +899,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
837
  // always have at least one decoder
838
 
839
  wctx.model.buf = new std::vector<uint8_t>();
840
- wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(model.type));
841
 
842
  // we skip initialization of the state until it is needed
843
  // because it might be that state will always be provided externally.
@@ -928,6 +990,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
928
  size_t ctx_size = 0;
929
 
930
  const ggml_type wtype = wctx.wtype;
 
931
 
932
  {
933
  const auto & hparams = model.hparams;
@@ -946,92 +1009,92 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
946
 
947
  // encoder
948
  {
949
- ctx_size += n_audio_ctx*n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_pe;
950
 
951
- ctx_size += 3*n_mels*n_audio_state*ggml_type_size(wtype); // e_conv_1_w
952
- ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_1_b
953
 
954
- ctx_size += 3*n_audio_state*n_audio_state*ggml_type_size(wtype); // e_conv_2_w
955
- ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_2_b
956
 
957
- ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_w;
958
- ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_b;
959
  }
960
 
961
  // decoder
962
  {
963
- ctx_size += n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // d_pe;
964
 
965
- ctx_size += n_vocab*n_text_state*ggml_type_size(wtype); // d_te;
966
 
967
- ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_w;
968
- ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_b;
969
  }
970
 
971
  // encoder layers
972
  {
973
- ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w
974
- ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b
975
 
976
- ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_0_w
977
- ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b
978
 
979
- ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_1_w
980
- ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
981
 
982
- ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
983
- ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
984
 
985
- ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_q_w
986
- ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
987
 
988
- ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_k_w
989
 
990
- ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_v_w
991
- ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
992
 
993
- ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_ln_1_w
994
- ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
995
  }
996
 
997
  // decoder layers
998
  {
999
- ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w
1000
- ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b
1001
 
1002
- ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_0_w
1003
- ctx_size += n_text_layer*( 4*n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b
1004
 
1005
- ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_1_w
1006
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
1007
 
1008
- ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
1009
- ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
1010
 
1011
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_q_w
1012
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
1013
 
1014
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_k_w
1015
 
1016
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_v_w
1017
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
1018
 
1019
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_ln_1_w
1020
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
1021
  //
1022
- ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_w
1023
- ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_b
1024
 
1025
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_q_w
1026
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_q_b
1027
 
1028
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_k_w
1029
 
1030
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_v_w
1031
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_v_b
1032
 
1033
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_ln_1_w
1034
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b
1035
  }
1036
 
1037
  ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
@@ -1077,175 +1140,175 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1077
 
1078
  // encoder
1079
  {
1080
- model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
1081
 
1082
- model.e_conv_1_w = ggml_new_tensor_3d(ctx, wtype, 3, n_mels, n_audio_state);
1083
  model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
1084
 
1085
- model.e_conv_2_w = ggml_new_tensor_3d(ctx, wtype, 3, n_audio_state, n_audio_state);
1086
  model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
1087
 
1088
- model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1089
- model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1090
 
1091
  // map by name
1092
  model.tensors["encoder.positional_embedding"] = model.e_pe;
1093
 
1094
- model.tensors["encoder.conv1.weight"] = model.e_conv_1_w;
1095
- model.tensors["encoder.conv1.bias"] = model.e_conv_1_b;
1096
 
1097
- model.tensors["encoder.conv2.weight"] = model.e_conv_2_w;
1098
- model.tensors["encoder.conv2.bias"] = model.e_conv_2_b;
1099
 
1100
- model.tensors["encoder.ln_post.weight"] = model.e_ln_w;
1101
- model.tensors["encoder.ln_post.bias"] = model.e_ln_b;
1102
 
1103
  for (int i = 0; i < n_audio_layer; ++i) {
1104
  auto & layer = model.layers_encoder[i];
1105
 
1106
- layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1107
- layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1108
 
1109
- layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
1110
- layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
1111
 
1112
- layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
1113
- layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1114
 
1115
- layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1116
- layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1117
 
1118
- layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1119
- layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1120
 
1121
- layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1122
 
1123
- layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1124
- layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1125
 
1126
- layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1127
- layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1128
 
1129
  // map by name
1130
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
1131
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
1132
 
1133
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
1134
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
1135
 
1136
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
1137
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
1138
 
1139
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
1140
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
1141
 
1142
  model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
1143
  model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
1144
 
1145
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
1146
 
1147
  model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
1148
  model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
1149
 
1150
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
1151
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
1152
  }
1153
  }
1154
 
1155
  // decoder
1156
  {
1157
- model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
1158
 
1159
- model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
1160
 
1161
  model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1162
  model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1163
 
1164
  // map by name
1165
- model.tensors["decoder.positional_embedding"] = model.d_pe;
1166
 
1167
  model.tensors["decoder.token_embedding.weight"] = model.d_te;
1168
 
1169
- model.tensors["decoder.ln.weight"] = model.d_ln_w;
1170
- model.tensors["decoder.ln.bias"] = model.d_ln_b;
1171
 
1172
  for (int i = 0; i < n_text_layer; ++i) {
1173
  auto & layer = model.layers_decoder[i];
1174
 
1175
- layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1176
- layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1177
 
1178
- layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
1179
- layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
1180
 
1181
- layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
1182
- layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1183
 
1184
- layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1185
- layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1186
 
1187
- layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1188
- layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1189
 
1190
- layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1191
 
1192
- layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1193
- layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1194
 
1195
- layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1196
- layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1197
 
1198
- layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1199
- layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1200
 
1201
- layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1202
- layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1203
 
1204
- layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1205
 
1206
- layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1207
- layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1208
 
1209
- layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1210
- layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1211
 
1212
  // map by name
1213
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
1214
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
1215
 
1216
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
1217
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
1218
 
1219
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
1220
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
1221
 
1222
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
1223
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
1224
 
1225
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
1226
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
1227
 
1228
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
1229
 
1230
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
1231
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
1232
 
1233
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
1234
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
1235
 
1236
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"] = layer.cross_attn_ln_0_w;
1237
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"] = layer.cross_attn_ln_0_b;
1238
 
1239
  model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.weight"] = layer.cross_attn_q_w;
1240
  model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.bias"] = layer.cross_attn_q_b;
1241
 
1242
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"] = layer.cross_attn_k_w;
1243
 
1244
  model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.weight"] = layer.cross_attn_v_w;
1245
  model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.bias"] = layer.cross_attn_v_b;
1246
 
1247
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"] = layer.cross_attn_ln_1_w;
1248
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"] = layer.cross_attn_ln_1_b;
1249
  }
1250
  }
1251
  }
@@ -1259,11 +1322,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1259
  while (true) {
1260
  int32_t n_dims;
1261
  int32_t length;
1262
- int32_t ftype;
1263
 
1264
  read_safe(loader, n_dims);
1265
  read_safe(loader, length);
1266
- read_safe(loader, ftype);
1267
 
1268
  if (loader->eof(loader->context)) {
1269
  break;
@@ -1298,9 +1361,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1298
  return false;
1299
  }
1300
 
1301
- const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
1302
 
1303
- if (nelements*bpe != ggml_nbytes(tensor)) {
1304
  fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
1305
  __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
1306
  return false;
@@ -1309,7 +1372,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
1309
  loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
1310
  BYTESWAP_TENSOR(tensor);
1311
 
1312
- //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
1313
  total_size += ggml_nbytes(tensor);
1314
  model.n_loaded++;
1315
  }
@@ -1508,14 +1571,14 @@ static bool whisper_encode_internal(
1508
  ggml_permute(ctx0,
1509
  ggml_cpy(ctx0,
1510
  Qcur,
1511
- ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
1512
  0, 2, 1, 3);
1513
 
1514
  struct ggml_tensor * K =
1515
  ggml_permute(ctx0,
1516
  ggml_cpy(ctx0,
1517
  Kcur,
1518
- ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
1519
  0, 2, 1, 3);
1520
 
1521
  struct ggml_tensor * V =
@@ -1525,7 +1588,7 @@ static bool whisper_encode_internal(
1525
  Vcur,
1526
  n_state/n_head, n_head, n_ctx),
1527
  1, 2, 0, 3),
1528
- ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
1529
 
1530
  struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
1531
  #else
@@ -1540,7 +1603,7 @@ static bool whisper_encode_internal(
1540
  ggml_permute(ctx0,
1541
  ggml_cpy(ctx0,
1542
  Kcur,
1543
- ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
1544
  0, 2, 1, 3);
1545
 
1546
  // K * Q
@@ -1561,7 +1624,7 @@ static bool whisper_encode_internal(
1561
  Vcur,
1562
  n_state/n_head, n_head, n_ctx),
1563
  1, 2, 0, 3),
1564
- ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)
1565
  );
1566
 
1567
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
@@ -1619,7 +1682,7 @@ static bool whisper_encode_internal(
1619
  wstate.use_buf(ctx0, 0);
1620
 
1621
  cur = ggml_flash_ff(ctx0,
1622
- ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.wtype, n_state, n_ctx)),
1623
  layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
1624
  #else
1625
  wstate.use_buf(ctx0, 0);
@@ -2537,9 +2600,9 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
2537
  struct whisper_state * whisper_init_state(whisper_context * ctx) {
2538
  whisper_state * state = new whisper_state;
2539
 
2540
- const size_t scale = ctx->model.hparams.f16 ? 1 : 2;
2541
 
2542
- if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) {
2543
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
2544
  delete state;
2545
  return nullptr;
@@ -2550,7 +2613,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
2550
  fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
2551
  }
2552
 
2553
- if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->wtype, ctx->model.hparams.n_audio_ctx)) {
2554
  fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
2555
  delete state;
2556
  return nullptr;
@@ -3049,8 +3112,8 @@ int whisper_model_n_mels(struct whisper_context * ctx) {
3049
  return ctx->model.hparams.n_mels;
3050
  }
3051
 
3052
- int whisper_model_f16(struct whisper_context * ctx) {
3053
- return ctx->model.hparams.f16;
3054
  }
3055
 
3056
  int whisper_model_type(struct whisper_context * ctx) {
@@ -4825,23 +4888,32 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4825
  // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
4826
  std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
4827
 
 
4828
  for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
4829
 
4830
  for (int j = 0; j < (int) sizes.size(); j++) {
 
 
4831
  int n_fp16 = 0;
4832
  int n_fp32 = 0;
4833
 
4834
  // GFLOPS/s
 
 
4835
  double s_fp16 = 0.0;
4836
  double s_fp32 = 0.0;
4837
 
4838
  const size_t N = sizes[j];
4839
 
4840
- for (int k = 0; k < 2; ++k) {
4841
- const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
 
 
 
4842
 
4843
- double & s = k == 0 ? s_fp16 : s_fp32;
4844
- int & n = k == 0 ? n_fp16 : n_fp32;
4845
 
4846
  struct ggml_init_params gparams = {
4847
  /*.mem_size =*/ buf.size(),
@@ -4885,8 +4957,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
4885
  s = ((2.0*N*N*N*n)/tsum)*1e-9;
4886
  }
4887
 
4888
- snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
4889
- N, N, s_fp16, n_fp16, s_fp32, n_fp32);
4890
  s += strbuf;
4891
  }
4892
 
 
 
1
  #include "whisper.h"
2
  #if WHISPER_USE_COREML
3
  #include "coreml/whisper-encoder.h"
 
254
  { MODEL_LARGE, 9ull*MB },
255
  };
256
 
257
+ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
258
+ { GGML_TYPE_F32,
259
+ {
260
+ { MODEL_TINY, 74ull*MB },
261
+ { MODEL_BASE, 142ull*MB },
262
+ { MODEL_SMALL, 466ull*MB },
263
+ { MODEL_MEDIUM, 1464ull*MB },
264
+ { MODEL_LARGE, 2952ull*MB },
265
+ },
266
+ },
267
+ { GGML_TYPE_F16,
268
+ {
269
+ { MODEL_TINY, 74ull*MB },
270
+ { MODEL_BASE, 142ull*MB },
271
+ { MODEL_SMALL, 466ull*MB },
272
+ { MODEL_MEDIUM, 1464ull*MB },
273
+ { MODEL_LARGE, 2952ull*MB },
274
+ },
275
+ },
276
+ { GGML_TYPE_Q4_0,
277
+ {
278
+ { MODEL_TINY, 26ull*MB },
279
+ { MODEL_BASE, 50ull*MB },
280
+ { MODEL_SMALL, 154ull*MB },
281
+ { MODEL_MEDIUM, 470ull*MB },
282
+ { MODEL_LARGE, 940ull*MB },
283
+ },
284
+ },
285
+ { GGML_TYPE_Q4_1,
286
+ {
287
+ { MODEL_TINY, 31ull*MB },
288
+ { MODEL_BASE, 57ull*MB },
289
+ { MODEL_SMALL, 181ull*MB },
290
+ { MODEL_MEDIUM, 559ull*MB },
291
+ { MODEL_LARGE, 1122ull*MB },
292
+ },
293
+ },
294
+ { GGML_TYPE_Q4_2,
295
+ {
296
+ { MODEL_TINY, 26ull*MB },
297
+ { MODEL_BASE, 50ull*MB },
298
+ { MODEL_SMALL, 154ull*MB },
299
+ { MODEL_MEDIUM, 470ull*MB },
300
+ { MODEL_LARGE, 940ull*MB },
301
+ },
302
+ },
303
+ { GGML_TYPE_Q5_0, // TODO: fix
304
+ {
305
+ { MODEL_TINY, 31ull*MB },
306
+ { MODEL_BASE, 57ull*MB },
307
+ { MODEL_SMALL, 181ull*MB },
308
+ { MODEL_MEDIUM, 559ull*MB },
309
+ { MODEL_LARGE, 1122ull*MB },
310
+ },
311
+ },
312
+ { GGML_TYPE_Q5_1,
313
+ {
314
+ { MODEL_TINY, 31ull*MB },
315
+ { MODEL_BASE, 57ull*MB },
316
+ { MODEL_SMALL, 181ull*MB },
317
+ { MODEL_MEDIUM, 559ull*MB },
318
+ { MODEL_LARGE, 1122ull*MB },
319
+ },
320
+ },
321
  };
322
 
323
  static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
 
427
  int32_t n_text_head = 6;
428
  int32_t n_text_layer = 4;
429
  int32_t n_mels = 80;
430
+ int32_t ftype = 1;
431
  };
432
 
433
  // audio encoding layer
 
694
  };
695
 
696
  struct whisper_context {
697
+ int64_t t_load_us = 0;
698
  int64_t t_start_us = 0;
699
 
700
+ ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
701
+ ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
702
 
703
  whisper_model model;
704
  whisper_vocab vocab;
 
755
  const ggml_type wtype = cache.k->type;
756
  WHISPER_ASSERT(wtype == cache.v->type);
757
 
758
+ WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype));
759
 
760
  struct ggml_init_params params = {
761
  /*.mem_size =*/ cache.buf.size(),
 
828
  read_safe(loader, hparams.n_text_head);
829
  read_safe(loader, hparams.n_text_layer);
830
  read_safe(loader, hparams.n_mels);
831
+ read_safe(loader, hparams.ftype);
832
 
833
  assert(hparams.n_text_state == hparams.n_audio_state);
834
 
 
852
  model.type = e_model::MODEL_LARGE;
853
  }
854
 
855
+ // for the big tensors, we have the option to store the data in 16-bit floats or in a quantized format
856
  // in order to save memory and also to speed up the computation
857
+ wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
858
+ if (wctx.wtype == GGML_TYPE_COUNT) {
859
+ fprintf(stderr, "%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
860
+ return false;
861
+ }
862
 
863
+ const size_t scale = model.hparams.ftype ? 1 : 2;
864
 
865
  fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
866
  fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
 
872
  fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
873
  fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
874
  fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
875
+ fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
876
  fprintf(stderr, "%s: type = %d\n", __func__, model.type);
877
 
878
  // print memory requirements
879
  {
880
  // this is the total memory required to run the inference
881
  const size_t mem_required =
882
+ MEM_REQ_SCRATCH0.at(model.type) +
883
+ MEM_REQ_SCRATCH1.at(model.type) +
884
+ MEM_REQ_SCRATCH2.at(model.type) +
885
+ MEM_REQ_SCRATCH3.at(model.type) +
886
+ scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type) +
887
  scale*MEM_REQ_KV_CROSS.at(model.type) +
888
  scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
889
 
 
899
  // always have at least one decoder
900
 
901
  wctx.model.buf = new std::vector<uint8_t>();
902
+ wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type));
903
 
904
  // we skip initialization of the state until it is needed
905
  // because it might be that state will always be provided externally.
 
990
  size_t ctx_size = 0;
991
 
992
  const ggml_type wtype = wctx.wtype;
993
+ const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
994
 
995
  {
996
  const auto & hparams = model.hparams;
 
1009
 
1010
  // encoder
1011
  {
1012
+ ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe;
1013
 
1014
+ ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype); // e_conv_1_w
1015
+ ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b
1016
 
1017
+ ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype); // e_conv_2_w
1018
+ ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b
1019
 
1020
+ ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w;
1021
+ ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b;
1022
  }
1023
 
1024
  // decoder
1025
  {
1026
+ ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe;
1027
 
1028
+ ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te;
1029
 
1030
+ ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w;
1031
+ ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b;
1032
  }
1033
 
1034
  // encoder layers
1035
  {
1036
+ ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
1037
+ ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
1038
 
1039
+ ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w
1040
+ ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
1041
 
1042
+ ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w
1043
+ ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
1044
 
1045
+ ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
1046
+ ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
1047
 
1048
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w
1049
+ ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
1050
 
1051
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w
1052
 
1053
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w
1054
+ ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
1055
 
1056
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w
1057
+ ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
1058
  }
1059
 
1060
  // decoder layers
1061
  {
1062
+ ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
1063
+ ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
1064
 
1065
+ ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w
1066
+ ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
1067
 
1068
+ ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w
1069
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
1070
 
1071
+ ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
1072
+ ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
1073
 
1074
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w
1075
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
1076
 
1077
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w
1078
 
1079
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_v_w
1080
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
1081
 
1082
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w
1083
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
1084
  //
1085
+ ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w
1086
+ ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b
1087
 
1088
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w
1089
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b
1090
 
1091
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w
1092
 
1093
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w
1094
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b
1095
 
1096
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w
1097
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
1098
  }
1099
 
1100
  ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
 
1140
 
1141
  // encoder
1142
  {
1143
+ model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
1144
 
1145
+ model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
1146
  model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
1147
 
1148
+ model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
1149
  model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
1150
 
1151
+ model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1152
+ model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1153
 
1154
  // map by name
1155
  model.tensors["encoder.positional_embedding"] = model.e_pe;
1156
 
1157
+ model.tensors["encoder.conv1.weight"] = model.e_conv_1_w;
1158
+ model.tensors["encoder.conv1.bias"] = model.e_conv_1_b;
1159
 
1160
+ model.tensors["encoder.conv2.weight"] = model.e_conv_2_w;
1161
+ model.tensors["encoder.conv2.bias"] = model.e_conv_2_b;
1162
 
1163
+ model.tensors["encoder.ln_post.weight"] = model.e_ln_w;
1164
+ model.tensors["encoder.ln_post.bias"] = model.e_ln_b;
1165
 
1166
  for (int i = 0; i < n_audio_layer; ++i) {
1167
  auto & layer = model.layers_encoder[i];
1168
 
1169
+ layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1170
+ layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1171
 
1172
+ layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
1173
+ layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
1174
 
1175
+ layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
1176
+ layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1177
 
1178
+ layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1179
+ layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1180
 
1181
+ layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1182
+ layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1183
 
1184
+ layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1185
 
1186
+ layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1187
+ layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1188
 
1189
+ layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
1190
+ layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
1191
 
1192
  // map by name
1193
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
1194
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
1195
 
1196
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
1197
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
1198
 
1199
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
1200
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
1201
 
1202
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
1203
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
1204
 
1205
  model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
1206
  model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
1207
 
1208
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
1209
 
1210
  model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
1211
  model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
1212
 
1213
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
1214
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
1215
  }
1216
  }
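
The map-by-name entries registered above are what the loader uses later to route every tensor read from the file into its pre-allocated destination. A small illustration of that lookup against model.tensors (a std::map keyed by the original PyTorch tensor names):

    // illustration only: resolve a destination tensor by name
    const std::string name = "encoder.blocks.0.attn.query.weight";
    auto it = model.tensors.find(name);
    if (it == model.tensors.end()) {
        fprintf(stderr, "%s: unknown tensor '%s'\n", __func__, name.c_str());
    } else {
        struct ggml_tensor * dst = it->second; // allocated with wtype/F32 above
    }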
1217
 
1218
  // decoder
1219
  {
1220
+ model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
1221
 
1222
+ model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
1223
 
1224
  model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1225
  model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1226
 
1227
  // map by name
1228
+ model.tensors["decoder.positional_embedding"] = model.d_pe;
1229
 
1230
  model.tensors["decoder.token_embedding.weight"] = model.d_te;
1231
 
1232
+ model.tensors["decoder.ln.weight"] = model.d_ln_w;
1233
+ model.tensors["decoder.ln.bias"] = model.d_ln_b;
1234
 
1235
  for (int i = 0; i < n_text_layer; ++i) {
1236
  auto & layer = model.layers_decoder[i];
1237
 
1238
+ layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1239
+ layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1240
 
1241
+ layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
1242
+ layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
1243
 
1244
+ layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
1245
+ layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1246
 
1247
+ layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1248
+ layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1249
 
1250
+ layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1251
+ layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1252
 
1253
+ layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1254
 
1255
+ layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1256
+ layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1257
 
1258
+ layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1259
+ layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1260
 
1261
+ layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1262
+ layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1263
 
1264
+ layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1265
+ layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1266
 
1267
+ layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1268
 
1269
+ layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1270
+ layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1271
 
1272
+ layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
1273
+ layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
1274
 
1275
  // map by name
1276
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
1277
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
1278
 
1279
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
1280
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
1281
 
1282
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
1283
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
1284
 
1285
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
1286
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
1287
 
1288
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
1289
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
1290
 
1291
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
1292
 
1293
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
1294
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
1295
 
1296
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
1297
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
1298
 
1299
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"] = layer.cross_attn_ln_0_w;
1300
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"] = layer.cross_attn_ln_0_b;
1301
 
1302
  model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.weight"] = layer.cross_attn_q_w;
1303
  model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.bias"] = layer.cross_attn_q_b;
1304
 
1305
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"] = layer.cross_attn_k_w;
1306
 
1307
  model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.weight"] = layer.cross_attn_v_w;
1308
  model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.bias"] = layer.cross_attn_v_b;
1309
 
1310
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"] = layer.cross_attn_ln_1_w;
1311
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"] = layer.cross_attn_ln_1_b;
1312
  }
1313
  }
1314
  }
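
Note the pattern in the allocations above: layer norms and biases stay GGML_TYPE_F32, while the large matrices use wtype and the conv kernels use vtype, both derived from the ftype field in the model header. A hedged sketch of that selection; treat the exact numeric mapping as illustrative:

    // sketch: choosing the weight element type from hparams.ftype
    ggml_type wtype;
    switch (model.hparams.ftype) {
        case 0:  wtype = GGML_TYPE_F32;  break; // all-f32 model
        case 1:  wtype = GGML_TYPE_F16;  break; // default f16 weights
        case 2:  wtype = GGML_TYPE_Q4_0; break; // quantized
        case 3:  wtype = GGML_TYPE_Q4_1; break;
        default: wtype = GGML_TYPE_F16;  break; // illustrative fallback
    }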
 
1322
  while (true) {
1323
  int32_t n_dims;
1324
  int32_t length;
1325
+ int32_t ttype;
1326
 
1327
  read_safe(loader, n_dims);
1328
  read_safe(loader, length);
1329
+ read_safe(loader, ttype);
1330
 
1331
  if (loader->eof(loader->context)) {
1332
  break;
 
1361
  return false;
1362
  }
1363
 
1364
+ const size_t bpe = ggml_type_size(ggml_type(ttype));
1365
 
1366
+ if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
1367
  fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
1368
  __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
1369
  return false;
 
1372
  loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
1373
  BYTESWAP_TENSOR(tensor);
1374
 
1375
+ //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
1376
  total_size += ggml_nbytes(tensor);
1377
  model.n_loaded++;
1378
  }
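
The byte-size check above handles both float and block-quantized tensors because ggml_type_size() returns the bytes per block while ggml_blck_size() returns the elements per block. A worked example with assumed figures (32 elements per block, 20 bytes per block; the real constants depend on the ggml type):

    //   512 x 512 tensor                  -> nelements = 262144
    //   bpe  = ggml_type_size(type)       = 20 bytes per block    (assumed)
    //   blck = ggml_blck_size(type)       = 32 elements per block (assumed)
    //   nelements*bpe/blck = 262144*20/32 = 163840 bytes == ggml_nbytes(tensor)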
 
1571
  ggml_permute(ctx0,
1572
  ggml_cpy(ctx0,
1573
  Qcur,
1574
+ ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
1575
  0, 2, 1, 3);
1576
 
1577
  struct ggml_tensor * K =
1578
  ggml_permute(ctx0,
1579
  ggml_cpy(ctx0,
1580
  Kcur,
1581
+ ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
1582
  0, 2, 1, 3);
1583
 
1584
  struct ggml_tensor * V =
 
1588
  Vcur,
1589
  n_state/n_head, n_head, n_ctx),
1590
  1, 2, 0, 3),
1591
+ ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
1592
 
1593
  struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
1594
  #else
 
1603
  ggml_permute(ctx0,
1604
  ggml_cpy(ctx0,
1605
  Kcur,
1606
+ ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
1607
  0, 2, 1, 3);
1608
 
1609
  // K * Q
 
1624
  Vcur,
1625
  n_state/n_head, n_head, n_ctx),
1626
  1, 2, 0, 3),
1627
+ ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
1628
  );
1629
 
1630
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
1682
  wstate.use_buf(ctx0, 0);
1683
 
1684
  cur = ggml_flash_ff(ctx0,
1685
+ ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
1686
  layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
1687
  #else
1688
  wstate.use_buf(ctx0, 0);
 
2600
  struct whisper_state * whisper_init_state(whisper_context * ctx) {
2601
  whisper_state * state = new whisper_state;
2602
 
2603
+ const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
2604
 
2605
+ if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
2606
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
2607
  delete state;
2608
  return nullptr;
 
2613
  fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
2614
  }
2615
 
2616
+ if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
2617
  fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
2618
  delete state;
2619
  return nullptr;
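
The scale factor above doubles the pre-tabulated KV cache budget only for full-F32 models (hparams.ftype == 0), presumably because the tables are sized for 2-byte cache elements. A tiny illustration with a made-up table entry:

    // illustration only: 'base' stands in for a MEM_REQ_KV_SELF entry
    const size_t base  = 16ull*1024*1024;                   // 16 MB, assumed
    const size_t scale = ctx->model.hparams.ftype ? 1 : 2;  // F32 model -> 2x
    const size_t bytes = scale*base;                        // 16 MB or 32 MB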
 
3112
  return ctx->model.hparams.n_mels;
3113
  }
3114
 
3115
+ int whisper_model_ftype(struct whisper_context * ctx) {
3116
+ return ctx->model.hparams.ftype;
3117
  }
3118
 
3119
  int whisper_model_type(struct whisper_context * ctx) {
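
A usage sketch for the new accessor (the model path is only an example; the returned value is whatever ftype was written into the model header, e.g. by the quantize tool):

    struct whisper_context * wctx = whisper_init_from_file("models/ggml-base.en-q5_0.bin");
    if (wctx) {
        printf("n_mels = %d, ftype = %d\n", whisper_model_n_mels(wctx), whisper_model_ftype(wctx));
        whisper_free(wctx);
    }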
 
4888
  // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
4889
  std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
4890
 
4891
+ // fill the buffer with some arbitrary data
4892
  for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
4893
 
4894
  for (int j = 0; j < (int) sizes.size(); j++) {
4895
+ int n_q4_0 = 0;
4896
+ int n_q4_1 = 0;
4897
  int n_fp16 = 0;
4898
  int n_fp32 = 0;
4899
 
4900
  // GFLOPS/s
4901
+ double s_q4_0 = 0.0;
4902
+ double s_q4_1 = 0.0;
4903
  double s_fp16 = 0.0;
4904
  double s_fp32 = 0.0;
4905
 
4906
  const size_t N = sizes[j];
4907
 
4908
+ for (int k = 0; k < 4; ++k) {
4909
+ const ggml_type wtype =
4910
+ k == 0 ? GGML_TYPE_Q4_0 :
4911
+ k == 1 ? GGML_TYPE_Q4_1 :
4912
+ k == 2 ? GGML_TYPE_F16 :
4913
+ GGML_TYPE_F32;
4914
 
4915
+ double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32;
4916
+ int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32;
4917
 
4918
  struct ggml_init_params gparams = {
4919
  /*.mem_size =*/ buf.size(),
 
4957
  s = ((2.0*N*N*N*n)/tsum)*1e-9;
4958
  }
4959
 
4960
+ snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n",
4961
+ N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32);
4962
  s += strbuf;
4963
  }
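
For reference, the reported throughput follows from the standard 2*N^3 floating-point operation count of an N x N matrix multiplication, as in the s = ((2.0*N*N*N*n)/tsum)*1e-9 line above. A worked example with assumed timings:

    //   N = 512, n = 100 runs, tsum = 0.27 s   (assumed)
    //   flops = 2*N*N*N*n = 2*512^3*100 = 26,843,545,600
    //   s     = flops/tsum*1e-9 ≈ 99.4 GFLOPS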
4964
 
whisper.h CHANGED
@@ -258,7 +258,7 @@ extern "C" {
258
  WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
259
  WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
260
  WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
261
- WHISPER_API int whisper_model_f16 (struct whisper_context * ctx);
262
  WHISPER_API int whisper_model_type (struct whisper_context * ctx);
263
 
264
  // Token logits obtained from the last call to whisper_decode()
 
258
  WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
259
  WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
260
  WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
261
+ WHISPER_API int whisper_model_ftype (struct whisper_context * ctx);
262
  WHISPER_API int whisper_model_type (struct whisper_context * ctx);
263
 
264
  // Token logits obtained from the last call to whisper_decode()