whisper : add integer quantization support (#540)
* whisper : add integer quantization support
* examples : add common-ggml + prepare to add "quantize" tool
* whisper : quantization tool ready
* whisper : fix F32 support
* whisper : try to fix shared lib linkage
* wasm : update quantized models to Q5
* bench.wasm : remove "medium" button
* bench.wasm : fix custom model button
* ggml : add Q5_0 and Q5_1 WASM SIMD
* wasm : add quantized models to all WASM examples
* wasm : bump DB version number to 2
* talk-llama : update example to latest llama.cpp
* node : increase test timeout to 10s
* readme : add information for model quantization
* wasm : add links to other examples
- .gitignore +1 -0
- CMakeLists.txt +6 -0
- Makefile +6 -3
- README.md +17 -0
- bindings/javascript/whisper.js +0 -0
- examples/CMakeLists.txt +5 -0
- examples/addon.node/__test__/whisper.spec.js +5 -4
- examples/bench.wasm/CMakeLists.txt +3 -3
- examples/bench.wasm/index-tmpl.html +65 -9
- examples/command.wasm/index-tmpl.html +27 -0
- examples/common-ggml.cpp +241 -0
- examples/common-ggml.h +18 -0
- examples/common.cpp +319 -1
- examples/common.h +83 -3
- examples/helpers.js +10 -3
- examples/main/main.cpp +1 -1
- examples/quantize/CMakeLists.txt +6 -0
- examples/quantize/README.md +3 -0
- examples/quantize/quantize.cpp +215 -0
- examples/stream.wasm/index-tmpl.html +27 -0
- examples/talk-llama/{llama_util.h → llama-util.h} +71 -21
- examples/talk-llama/llama.cpp +1057 -236
- examples/talk-llama/llama.h +110 -26
- examples/talk-llama/llama_internal.h +0 -12
- examples/talk-llama/talk-llama.cpp +28 -2
- examples/talk.wasm/CMakeLists.txt +1 -0
- examples/talk.wasm/gpt-2.cpp +110 -226
- examples/talk.wasm/gpt-2.h +2 -8
- examples/talk.wasm/index-tmpl.html +27 -0
- examples/talk/CMakeLists.txt +2 -10
- examples/talk/gpt-2.cpp +106 -219
- examples/talk/gpt-2.h +2 -8
- examples/whisper.wasm/CMakeLists.txt +3 -3
- examples/whisper.wasm/emscripten.cpp +8 -2
- examples/whisper.wasm/index-tmpl.html +111 -20
- extra/quantize-all.sh +45 -0
- extra/sync-ggml.sh +8 -4
- ggml.c +161 -1
- ggml.h +16 -0
- whisper.cpp +252 -180
- whisper.h +1 -1
.gitignore
CHANGED
@@ -23,6 +23,7 @@ build-sanitize-thread/
 /talk
 /talk-llama
 /bench
+/quantize
 
 arm_neon.h
 sync.sh
CMakeLists.txt
CHANGED
@@ -303,6 +303,12 @@ if (BUILD_SHARED_LIBS)
 
     target_compile_definitions(${TARGET} PUBLIC
         WHISPER_SHARED
+        GGML_SHARED
+    )
+
+    target_compile_definitions(${TARGET} PRIVATE
+        WHISPER_BUILD
+        GGML_BUILD
     )
 endif()
 
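The extra `GGML_SHARED` / `GGML_BUILD` definitions matter because the public headers pick symbol visibility based on them; this is presumably what the "try to fix shared lib linkage" item in the commit message addresses. The snippet below is an illustrative sketch of the usual export-macro pattern, paraphrased rather than copied from this commit:

// Sketch of the export-macro pattern that GGML_SHARED / GGML_BUILD
// (and WHISPER_SHARED / WHISPER_BUILD) typically drive in the public headers.
// Paraphrased for explanation; not the verbatim header from this commit.
#ifdef GGML_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef GGML_BUILD
#            define GGML_API __declspec(dllexport)   // compiling the DLL itself
#        else
#            define GGML_API __declspec(dllimport)   // consuming the DLL
#        endif
#    else
#        define GGML_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define GGML_API                                  // static build: no decoration
#endif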
Makefile
CHANGED
@@ -1,4 +1,4 @@
-default: main bench
+default: main bench quantize
 
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -243,7 +243,7 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
 
 clean:
-	rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
 
 #
 # Examples
@@ -251,7 +251,7 @@ clean:
 
 CC_SDL=`sdl2-config --cflags --libs`
 
-SRC_COMMON
+SRC_COMMON = examples/common.cpp examples/common-ggml.cpp
 SRC_COMMON_SDL = examples/common-sdl.cpp
 
 main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
@@ -261,6 +261,9 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
 bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
 
+quantize: examples/quantize/quantize.cpp ggml.o $(WHISPER_OBJ) $(SRC_COMMON)
+	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o quantize $(LDFLAGS)
+
 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
 
README.md
CHANGED
@@ -15,6 +15,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - AVX intrinsics support for x86 architectures
 - VSX intrinsics support for POWER architectures
 - Mixed F16 / F32 precision
+- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
 - Low memory usage (Flash Attention)
 - Zero memory allocations at runtime
 - Runs on the CPU
@@ -228,6 +229,22 @@ make large
 | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
 | large  | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
 
+## Quantization
+
+`whisper.cpp` supports integer quantization of the Whisper `ggml` models.
+Quantized models require less memory and disk space and depending on the hardware can be processed more efficiently.
+
+Here are the steps for creating and using a quantized model:
+
+```bash
+# quantize a model with Q5_0 method
+make quantize
+./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
+
+# run the examples as usual, specifying the quantized model file
+./main -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
+```
+
 ## Core ML support
 
 On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant
bindings/javascript/whisper.js
CHANGED
The diff for this file is too large to render. See raw diff.
examples/CMakeLists.txt
CHANGED
@@ -21,10 +21,14 @@ set(TARGET common)
 add_library(${TARGET} STATIC
     common.h
     common.cpp
+    common-ggml.h
+    common-ggml.cpp
     )
 
 include(DefaultTargetOptions)
 
+target_link_libraries(${TARGET} PRIVATE whisper)
+
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 if (WHISPER_SDL2)
@@ -62,6 +66,7 @@ else()
     add_subdirectory(stream)
     add_subdirectory(command)
     add_subdirectory(bench)
+    add_subdirectory(quantize)
     add_subdirectory(talk)
     add_subdirectory(talk-llama)
 endif()
examples/addon.node/__test__/whisper.spec.js
CHANGED
@@ -14,9 +14,10 @@ const whisperParamsMock = {
 };
 
 describe("Run whisper.node", () => {
+    test("it should receive a non-empty value", async () => {
+        let result = await whisperAsync(whisperParamsMock);
 
+        expect(result.length).toBeGreaterThan(0);
+    }, 10000);
 });
+
examples/bench.wasm/CMakeLists.txt
CHANGED
@@ -31,9 +31,9 @@ endif()
 set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
     --bind \
     -s USE_PTHREADS=1 \
-    -s
-    -s INITIAL_MEMORY=
-    -s TOTAL_MEMORY=
+    -s PTHREAD_POOL_SIZE_STRICT=0 \
+    -s INITIAL_MEMORY=2000MB \
+    -s TOTAL_MEMORY=2000MB \
     -s FORCE_FILESYSTEM=1 \
     -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
     ${EXTRA_FLAGS} \
examples/bench.wasm/index-tmpl.html
CHANGED
@@ -35,6 +35,15 @@
 
 <br><br>
 
+<b>More examples:</b>
+<a href="https://whisper.ggerganov.com/">main</a> |
+<a href="https://whisper.ggerganov.com/bench">bench</a> |
+<a href="https://whisper.ggerganov.com/stream">stream</a> |
+<a href="https://whisper.ggerganov.com/command">command</a> |
+<a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+<br><br>
+
 <hr>
 
 Select the model you would like to use and click the "Bench" button.<br>
@@ -44,11 +53,18 @@
 
 <div id="model-whisper">
    Whisper model: <span id="model-whisper-status"></span>
-   <button id="fetch-whisper-tiny-en"
-   <button id="fetch-whisper-base-en"
-   <
+   <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
+   <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+   <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
    <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
+   <br><br>
+   Quantized models:<br><br>
+   <button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+   <button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
+   <button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
+   <button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
+   <button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
+   <span id="fetch-whisper-progress"></span>
 </div>
 
 <br>
@@ -160,6 +176,14 @@
 
             document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
             document.getElementById('fetch-whisper-base-en').style.display = 'none';
+            document.getElementById('fetch-whisper-small-en').style.display = 'none';
+
+            document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
+            document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
+            document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
+            document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
+            document.getElementById('fetch-whisper-large-q5_0'    ).style.display = 'none';
+
             document.getElementById('whisper-file'        ).style.display = 'none';
             document.getElementById('model-whisper-status').innerHTML = 'loaded model: ' + file.name;
         }
@@ -168,19 +192,42 @@
             let urls = {
                 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+                'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
+
+                'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
+                'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
+                'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
+                'large-q5_0':    'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
             };
 
             let sizes = {
                 'tiny.en': 75,
                 'base.en': 142,
+                'small.en': 466,
+
+                'tiny-en-q5_1':   31,
+                'base-en-q5_1':   57,
+                'small-en-q5_1': 182,
+                'medium-en-q5_0': 515,
+                'large-q5_0':    1030,
             };
 
             let url = urls[model];
             let dst = 'whisper.bin';
             let size_mb = sizes[model];
 
-            document.getElementById('fetch-whisper-tiny-en').style.display
-            document.getElementById('fetch-whisper-base-en').style.display
+            document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
+            document.getElementById('fetch-whisper-base-en').style.display = 'none';
+            document.getElementById('fetch-whisper-small-en').style.display = 'none';
+
+            document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
+            document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
+            document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
+            document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
+            document.getElementById('fetch-whisper-large-q5_0'    ).style.display = 'none';
+
+            document.getElementById('whisper-file'        ).style.display = 'none';
             document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
 
             cbProgress = function(p) {
@@ -190,9 +237,18 @@
 
             cbCancel = function() {
                 var el;
-                el = document.getElementById('fetch-whisper-tiny-en');
-                el = document.getElementById('fetch-whisper-base-en');
-                el = document.getElementById('
+                el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
+                el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+                el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
+
+                el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
+                el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
+                el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
+                el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
+                el = document.getElementById('fetch-whisper-large-q5_0'    ); if (el) el.style.display = 'inline-block';
+
+                el = document.getElementById('whisper-file'        ); if (el) el.style.display = 'inline-block';
+                el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
             };
 
             loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
examples/command.wasm/index-tmpl.html
CHANGED
@@ -35,6 +35,15 @@
 
 <br><br>
 
+<b>More examples:</b>
+<a href="https://whisper.ggerganov.com/">main</a> |
+<a href="https://whisper.ggerganov.com/bench">bench</a> |
+<a href="https://whisper.ggerganov.com/stream">stream</a> |
+<a href="https://whisper.ggerganov.com/command">command</a> |
+<a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+<br><br>
+
 <hr>
 
 Select the model you would like to use, click the "Start" button and follow the instructions.
@@ -45,6 +54,10 @@
    Whisper model: <span id="model-whisper-status"></span>
    <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
    <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+   <br><br>
+   Quantized models:<br><br>
+   <button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+   <button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
    <span id="fetch-whisper-progress"></span>
 
    <!--
@@ -162,11 +175,17 @@
             let urls = {
                 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+
+                'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
             };
 
             let sizes = {
                 'tiny.en': 75,
                 'base.en': 142,
+
+                'tiny-en-q5_1': 31,
+                'base-en-q5_1': 57,
             };
 
             let url = urls[model];
@@ -177,6 +196,10 @@
 
             document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
             document.getElementById('fetch-whisper-base-en').style.display = 'none';
+
+            document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
+            document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
+
             document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
 
             cbProgress = function(p) {
@@ -188,6 +211,10 @@
                 var el;
                 el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                 el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+
+                el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
+                el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
+
                 el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
             };
 
examples/common-ggml.cpp
ADDED
@@ -0,0 +1,241 @@
#include "common-ggml.h"

#include <regex>
#include <map>

static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
    {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
    {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
    {"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
};

void ggml_print_ftypes(FILE * fp) {
    for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
        fprintf(fp, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
    }
}

enum ggml_ftype ggml_parse_ftype(const char * str) {
    enum ggml_ftype ftype;
    if (str[0] == 'q') {
        const auto it = GGML_FTYPE_MAP.find(str);
        if (it == GGML_FTYPE_MAP.end()) {
            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
            return GGML_FTYPE_UNKNOWN;
        }
        ftype = it->second;
    } else {
        ftype = (enum ggml_ftype) atoi(str);
    }

    return ftype;
}

bool ggml_common_quantize_0(
        std::ifstream & finp,
        std::ofstream & fout,
        const ggml_ftype ftype,
        const std::vector<std::string> & to_quant,
        const std::vector<std::string> & to_skip) {

    ggml_type qtype = GGML_TYPE_F32;

    switch (ftype) {
        case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
        case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
        case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
        case GGML_FTYPE_UNKNOWN:
        case GGML_FTYPE_ALL_F32:
        case GGML_FTYPE_MOSTLY_F16:
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
                {
                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                    return false;
                }
    };

    if (!ggml_is_quantized(qtype)) {
        fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
        return false;
    }

    size_t total_size_org = 0;
    size_t total_size_new = 0;

    std::vector<float> work;

    std::vector<uint8_t>     data_u8;
    std::vector<ggml_fp16_t> data_f16;
    std::vector<float>       data_f32;

    std::vector<int64_t> hist_all(1 << 4, 0);

    while (true) {
        int32_t n_dims;
        int32_t length;
        int32_t ttype;

        finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
        finp.read(reinterpret_cast<char *>(&length), sizeof(length));
        finp.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

        if (finp.eof()) {
            break;
        }

        int32_t nelements = 1;
        int32_t ne[2] = { 1, 1 };
        for (int i = 0; i < n_dims; ++i) {
            finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
            nelements *= ne[i];
        }

        std::string name(length, 0);
        finp.read (&name[0], length);

        printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_type_name((ggml_type) ttype));

        bool quantize = false;

        // check if we should quantize this tensor
        for (const auto & s : to_quant) {
            if (std::regex_match(name, std::regex(s))) {
                quantize = true;
                break;
            }
        }

        // check if we should skip this tensor
        for (const auto & s : to_skip) {
            if (std::regex_match(name, std::regex(s))) {
                quantize = false;
                break;
            }
        }

        // quantize only 2D tensors
        quantize &= (n_dims == 2);

        if (quantize) {
            if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
                fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
                return false;
            }

            if (ttype == GGML_TYPE_F16) {
                data_f16.resize(nelements);
                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
                data_f32.resize(nelements);
                for (int i = 0; i < nelements; ++i) {
                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
                }
            } else {
                data_f32.resize(nelements);
                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
            }

            ttype = qtype;
        } else {
            const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);

            data_u8.resize(nelements*bpe);
            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
        }

        fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
        fout.write(reinterpret_cast<char *>(&length), sizeof(length));
        fout.write(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
        for (int i = 0; i < n_dims; ++i) {
            fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
        }
        fout.write(&name[0], length);

        if (quantize) {
            work.resize(nelements); // for quantization

            size_t cur_size = 0;
            std::vector<int64_t> hist_cur(1 << 4, 0);

            switch ((ggml_type) ttype) {
                case GGML_TYPE_Q4_0:
                    {
                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_Q4_1:
                    {
                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_Q4_2:
                    {
                        cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_Q5_0:
                    {
                        cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_Q5_1:
                    {
                        cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_Q8_0:
                    {
                        cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                    } break;
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                case GGML_TYPE_I8:
                case GGML_TYPE_I16:
                case GGML_TYPE_I32:
                case GGML_TYPE_Q8_1:
                case GGML_TYPE_COUNT:
                    {
                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
                        return false;
                    }
            }

            fout.write(reinterpret_cast<char *>(work.data()), cur_size);
            total_size_new += cur_size;

            printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
            for (int i = 0; i < hist_cur.size(); ++i) {
                hist_all[i] += hist_cur[i];
            }

            for (int i = 0; i < hist_cur.size(); ++i) {
                printf("%5.3f ", hist_cur[i] / (float)nelements);
            }
            printf("\n");
        } else {
            printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
            fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
            total_size_new += data_u8.size();
        }

        total_size_org += nelements * sizeof(float);
    }

    printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
    printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));

    {
        int64_t sum_all = 0;
        for (int i = 0; i < hist_all.size(); ++i) {
            sum_all += hist_all[i];
        }

        printf("%s: hist: ", __func__);
        for (int i = 0; i < hist_all.size(); ++i) {
            printf("%5.3f ", hist_all[i] / (float)sum_all);
        }
        printf("\n");
    }

    return true;
}
examples/common-ggml.h
ADDED
@@ -0,0 +1,18 @@
#pragma once

#include "ggml.h"

#include <fstream>
#include <vector>
#include <string>

enum ggml_ftype ggml_parse_ftype(const char * str);

void ggml_print_ftypes(FILE * fp = stderr);

bool ggml_common_quantize_0(
        std::ifstream & finp,
        std::ofstream & fout,
        const ggml_ftype ftype,
        const std::vector<std::string> & to_quant,
        const std::vector<std::string> & to_skip);
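For orientation, here is a minimal sketch of how a quantization tool is expected to wire these helpers together; it mirrors the pattern of `examples/quantize/quantize.cpp` further below, but the file names are placeholders and the model-specific header copying is omitted:

// Minimal sketch (not part of this commit): drive the common quantization helpers.
// File names are placeholders; a real tool also copies the model-specific header
// (hparams, mel filters, vocab) from finp to fout before calling ggml_common_quantize_0.
#include "common-ggml.h"

#include <cstdio>
#include <fstream>

int main(int argc, char ** argv) {
    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
        ggml_print_ftypes(stderr); // lists q4_0, q4_1, q4_2, q5_0, q5_1, q8_0
        return 1;
    }

    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
    if (ftype == GGML_FTYPE_UNKNOWN) {
        return 1;
    }

    std::ifstream finp(argv[1], std::ios::binary);
    std::ofstream fout(argv[2], std::ios::binary);

    // ... copy the model header here (model specific) ...

    // quantize all 2D tensors except the ones matched by the to_skip regexes
    const std::vector<std::string> to_quant = { ".*" };
    const std::vector<std::string> to_skip  = { };

    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, to_skip)) {
        fprintf(stderr, "failed to quantize model\n");
        return 1;
    }

    return 0;
}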
examples/common.cpp
CHANGED
@@ -6,13 +6,86 @@
 #include "dr_wav.h"
 
 #include <cmath>
-#include <
+#include <fstream>
 #include <regex>
 
 #ifndef M_PI
 #define M_PI 3.14159265358979323846
 #endif
 
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-s" || arg == "--seed") {
+            params.seed = std::stoi(argv[++i]);
+        } else if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(argv[++i]);
+        } else if (arg == "-p" || arg == "--prompt") {
+            params.prompt = argv[++i];
+        } else if (arg == "-n" || arg == "--n_predict") {
+            params.n_predict = std::stoi(argv[++i]);
+        } else if (arg == "--top_k") {
+            params.top_k = std::stoi(argv[++i]);
+        } else if (arg == "--top_p") {
+            params.top_p = std::stof(argv[++i]);
+        } else if (arg == "--temp") {
+            params.temp = std::stof(argv[++i]);
+        } else if (arg == "-b" || arg == "--batch_size") {
+            params.n_batch = std::stoi(argv[++i]);
+        } else if (arg == "-m" || arg == "--model") {
+            params.model = argv[++i];
+        } else if (arg == "-h" || arg == "--help") {
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
+    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
+    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
+    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
+    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
+    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
+    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "\n");
+}
+
+std::string gpt_random_prompt(std::mt19937 & rng) {
+    const int r = rng() % 10;
+    switch (r) {
+        case 0: return "So";
+        case 1: return "Once upon a time";
+        case 2: return "When";
+        case 3: return "The";
+        case 4: return "After";
+        case 5: return "If";
+        case 6: return "import";
+        case 7: return "He";
+        case 8: return "She";
+        case 9: return "They";
+        default: return "To";
+    }
+
+    return "The";
+}
+
 std::string trim(const std::string & s) {
     std::regex e("^\\s+|\\s+$");
     return std::regex_replace(s, e, "");
@@ -28,6 +101,251 @@ std::string replace(const std::string & s, const std::string & from, const std::
     return result;
 }
 
+std::map<std::string, int32_t> json_parse(const std::string & fname) {
+    std::map<std::string, int32_t> result;
+
+    // read file into string
+    std::string json;
+    {
+        std::ifstream ifs(fname);
+        if (!ifs) {
+            fprintf(stderr, "Failed to open %s\n", fname.c_str());
+            exit(1);
+        }
+
+        json = std::string((std::istreambuf_iterator<char>(ifs)),
+                (std::istreambuf_iterator<char>()));
+    }
+
+    if (json[0] != '{') {
+        return result;
+    }
+
+    // parse json
+    {
+        bool has_key  = false;
+        bool in_token = false;
+
+        std::string str_key = "";
+        std::string str_val = "";
+
+        int n = json.size();
+        for (int i = 1; i < n; ++i) {
+            if (!in_token) {
+                if (json[i] == ' ') continue;
+                if (json[i] == '"') {
+                    in_token = true;
+                    continue;
+                }
+            } else {
+                if (json[i] == '\\' && i+1 < n) {
+                    if (has_key == false) {
+                        str_key += json[i];
+                    } else {
+                        str_val += json[i];
+                    }
+                    ++i;
+                } else if (json[i] == '"') {
+                    if (has_key == false) {
+                        has_key = true;
+                        ++i;
+                        while (json[i] == ' ') ++i;
+                        ++i; // :
+                        while (json[i] == ' ') ++i;
+                        if (json[i] != '\"') {
+                            while (json[i] != ',' && json[i] != '}') {
+                                str_val += json[i++];
+                            }
+                            has_key = false;
+                        } else {
+                            in_token = true;
+                            continue;
+                        }
+                    } else {
+                        has_key = false;
+                    }
+
+                    str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
+                    str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
+                    str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> "
+
+                    try {
+                        result[str_key] = std::stoi(str_val);
+                    } catch (...) {
+                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
+
+                    }
+                    str_key = "";
+                    str_val = "";
+                    in_token = false;
+                    continue;
+                }
+                if (has_key == false) {
+                    str_key += json[i];
+                } else {
+                    str_val += json[i];
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
+    std::vector<std::string> words;
+
+    // first split the text into words
+    {
+        std::string str = text;
+        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
+
+        std::regex re(pat);
+        std::smatch m;
+
+        while (std::regex_search(str, m, re)) {
+            for (auto x : m) {
+                words.push_back(x);
+            }
+            str = m.suffix();
+        }
+    }
+
+    // find the longest tokens that form the words:
+    std::vector<gpt_vocab::id> tokens;
+    for (const auto & word : words) {
+        if (word.size() == 0) continue;
+
+        int i = 0;
+        int n = word.size();
+        while (i < n) {
+            int j = n;
+            while (j > i) {
+                auto it = vocab.token_to_id.find(word.substr(i, j-i));
+                if (it != vocab.token_to_id.end()) {
+                    tokens.push_back(it->second);
+                    i = j;
+                    break;
+                }
+                --j;
+            }
+            if (i == n) {
+                break;
+            }
+            if (j == i) {
+                auto sub = word.substr(i, 1);
+                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
+                    tokens.push_back(vocab.token_to_id.at(sub));
+                } else {
+                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
+                }
+                ++i;
+            }
+        }
+    }
+
+    return tokens;
+}
+
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
+    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
+
+    vocab.token_to_id = ::json_parse(fname);
+
+    for (const auto & kv : vocab.token_to_id) {
+        vocab.id_to_token[kv.second] = kv.first;
+    }
+
+    printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
+
+    // print the vocabulary
+    //for (auto kv : vocab.token_to_id) {
+    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
+    //}
+
+    return true;
+}
+
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const gpt_vocab & vocab,
+        const float * logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        std::mt19937 & rng) {
+    int n_logits = vocab.id_to_token.size();
+
+    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
+    logits_id.reserve(n_logits);
+
+    {
+        const double scale = 1.0/temp;
+        for (int i = 0; i < n_logits; ++i) {
+            logits_id.push_back(std::make_pair(logits[i]*scale, i));
+        }
+    }
+
+    // find the top K tokens
+    std::partial_sort(
+            logits_id.begin(),
+            logits_id.begin() + top_k, logits_id.end(),
+            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
+        return a.first > b.first;
+    });
+
+    logits_id.resize(top_k);
+
+    double maxl = -INFINITY;
+    for (const auto & kv : logits_id) {
+        maxl = std::max(maxl, kv.first);
+    }
+
+    // compute probs for the top K tokens
+    std::vector<double> probs;
+    probs.reserve(logits_id.size());
+
+    double sum = 0.0;
+    for (const auto & kv : logits_id) {
+        double p = exp(kv.first - maxl);
+        probs.push_back(p);
+        sum += p;
+    }
+
+    // normalize the probs
+    for (auto & p : probs) {
+        p /= sum;
+    }
+
+    if (top_p < 1.0f) {
+        double cumsum = 0.0f;
+        for (int i = 0; i < top_k; i++) {
+            cumsum += probs[i];
+            if (cumsum >= top_p) {
+                top_k = i + 1;
+                probs.resize(top_k);
+                logits_id.resize(top_k);
+                break;
+            }
+        }
+
+        cumsum = 1.0/cumsum;
+        for (int i = 0; i < (int) probs.size(); i++) {
+            probs[i] *= cumsum;
+        }
+    }
+
+    //printf("\n");
+    //for (int i = 0; i < (int) probs.size(); i++) {
+    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //}
+    //exit(0);
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    return logits_id[idx].second;
+}
+
 bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
     drwav wav;
     std::vector<uint8_t> wav_data; // used for pipe input from stdin
examples/common.h
CHANGED
@@ -1,10 +1,44 @@
+// Various helper functions and utilities
+
 #pragma once
 
+#include <string>
+#include <map>
+#include <vector>
+#include <random>
+#include <thread>
+
 #define COMMON_SAMPLE_RATE 16000
 
+//
+// CLI argument parsing
+//
+
+struct gpt_params {
+    int32_t seed      = -1; // RNG seed
+    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_predict = 200; // new tokens to predict
+
+    // sampling parameters
+    int32_t top_k = 40;
+    float   top_p = 0.9f;
+    float   temp  = 0.9f;
+
+    int32_t n_batch = 8; // batch size for prompt processing
+
+    std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
+    std::string prompt;
+};
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+//
+// Vocab utils
+//
 
 std::string trim(const std::string & s);
 
@@ -13,6 +47,52 @@ std::string replace(
         const std::string & from,
         const std::string & to);
 
+struct gpt_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    std::map<token, id> token_to_id;
+    std::map<id, token> id_to_token;
+};
+
+// poor-man's JSON parsing
+std::map<std::string, int32_t> json_parse(const std::string & fname);
+
+// split text into tokens
+//
+// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
+//
+// Regex (Python):
+// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+//
+// Regex (C++):
+// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
+//
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
+
+// load the tokens from encoder.json
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
+
+// sample next token given probabilities for each embedding
+//
+//   - consider only the top K tokens
+//   - from them, consider only the top tokens with cumulative probability > P
+//
+// TODO: not sure if this implementation is correct
+// TODO: temperature is not implemented
+//
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const gpt_vocab & vocab,
+        const float * logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        std::mt19937 & rng);
+
+//
+// Audio utils
+//
+
 // Read WAV audio file and store the PCM data into pcmf32
 // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
 // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
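As a quick orientation for the GPT helpers that common.h now exposes (they are shared by the talk / talk.wasm examples), here is a hedged sketch of how they fit together; the vocab path, prompt, and logits buffer are placeholders, not values from this commit:

// Minimal sketch (not part of this commit): load a vocab, tokenize a prompt,
// then sample one token id from a logits array using the common.h helpers.
#include "common.h"

#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

int main() {
    gpt_vocab vocab;
    if (!gpt_vocab_init("encoder.json", vocab)) { // placeholder path
        return 1;
    }

    const std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, "Hello, world");
    printf("prompt has %zu tokens\n", tokens.size());

    // in a real example the model produces these logits; here they are dummy values
    std::vector<float> logits(vocab.id_to_token.size(), 0.0f);

    std::mt19937 rng(0);
    const int top_k = std::min<int>(40, (int) logits.size());
    const gpt_vocab::id next = gpt_sample_top_k_top_p(vocab, logits.data(), top_k, 0.9, 0.9, rng);

    printf("sampled token: %s\n", vocab.id_to_token.at(next).c_str());
    return 0;
}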
examples/helpers.js
CHANGED
@@ -145,7 +145,15 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
                 var db = event.target.result;
                 var tx = db.transaction(['models'], 'readwrite');
                 var os = tx.objectStore('models');
+
+                var rq = null;
+                try {
+                    var rq = os.put(data, url);
+                } catch (e) {
+                    cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB: \n' + e);
+                    cbCancel();
+                    return;
+                }
 
                 rq.onsuccess = function (event) {
                     cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
@@ -180,7 +188,6 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
 
         rq.onabort = function (event) {
             cbPrint('loadRemote: failed to open IndexedDB: abort');
+            cbCancel();
         };
     }
examples/main/main.cpp
CHANGED
@@ -496,7 +496,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
             value_i("layer", whisper_model_n_text_layer(ctx), true);
         end_obj();
         value_i("mels", whisper_model_n_mels(ctx));
-        value_i("
+        value_i("ftype", whisper_model_ftype(ctx), true);
     end_obj();
     start_obj("params");
         value_s("model", params.model.c_str());
examples/quantize/CMakeLists.txt
ADDED
@@ -0,0 +1,6 @@
set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)

include(DefaultTargetOptions)

target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
examples/quantize/README.md
ADDED

@@ -0,0 +1,3 @@
+ # quantize
+
+ Tool for integer quantization of Whisper `ggml` model files
examples/quantize/quantize.cpp
ADDED

@@ -0,0 +1,215 @@

#include "ggml.h"

#include "common.h"
#include "common-ggml.h"

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <regex>

// default hparams (Whisper tiny)
struct whisper_hparams {
    int32_t n_vocab       = 51864;
    int32_t n_audio_ctx   = 1500;
    int32_t n_audio_state = 384;
    int32_t n_audio_head  = 6;
    int32_t n_audio_layer = 4;
    int32_t n_text_ctx    = 448;
    int32_t n_text_state  = 384;
    int32_t n_text_head   = 6;
    int32_t n_text_layer  = 4;
    int32_t n_mels        = 80;
    int32_t f16           = 1;
};

struct whisper_filters {
    int32_t n_mel;
    int32_t n_fft;

    std::vector<float> data;
};

// quantize a model
bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
    gpt_vocab vocab;

    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

    auto finp = std::ifstream(fname_inp, std::ios::binary);
    if (!finp) {
        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
        return false;
    }

    auto fout = std::ofstream(fname_out, std::ios::binary);
    if (!fout) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
        return false;
    }

    // verify magic
    {
        uint32_t magic;
        finp.read((char *) &magic, sizeof(magic));
        if (magic != 0x67676d6c) {
            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
            return false;
        }

        fout.write((char *) &magic, sizeof(magic));
    }

    whisper_hparams hparams;

    // load hparams
    {
        finp.read((char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
        finp.read((char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
        finp.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
        finp.read((char *) &hparams.n_audio_head,  sizeof(hparams.n_audio_head));
        finp.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
        finp.read((char *) &hparams.n_text_ctx,    sizeof(hparams.n_text_ctx));
        finp.read((char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
        finp.read((char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
        finp.read((char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
        finp.read((char *) &hparams.n_mels,        sizeof(hparams.n_mels));
        finp.read((char *) &hparams.f16,           sizeof(hparams.f16));

        fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
        fprintf(stderr, "%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
        fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
        fprintf(stderr, "%s: n_audio_head  = %d\n", __func__, hparams.n_audio_head);
        fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
        fprintf(stderr, "%s: n_text_ctx    = %d\n", __func__, hparams.n_text_ctx);
        fprintf(stderr, "%s: n_text_state  = %d\n", __func__, hparams.n_text_state);
        fprintf(stderr, "%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
        fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
        fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
        fprintf(stderr, "%s: f16           = %d\n", __func__, hparams.f16);

        fout.write((char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
        fout.write((char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
        fout.write((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
        fout.write((char *) &hparams.n_audio_head,  sizeof(hparams.n_audio_head));
        fout.write((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
        fout.write((char *) &hparams.n_text_ctx,    sizeof(hparams.n_text_ctx));
        fout.write((char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
        fout.write((char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
        fout.write((char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
        fout.write((char *) &hparams.n_mels,        sizeof(hparams.n_mels));
        fout.write((char *) &ftype,                 sizeof(hparams.f16));
    }

    // load mel filters
    {
        whisper_filters filters;

        finp.read ((char *) &filters.n_mel, sizeof(filters.n_mel));
        fout.write((char *) &filters.n_mel, sizeof(filters.n_mel));
        finp.read ((char *) &filters.n_fft, sizeof(filters.n_fft));
        fout.write((char *) &filters.n_fft, sizeof(filters.n_fft));

        filters.data.resize(filters.n_mel * filters.n_fft);
        finp.read ((char *) filters.data.data(), filters.data.size() * sizeof(float));
        fout.write((char *) filters.data.data(), filters.data.size() * sizeof(float));
    }

    // load vocab
    {
        int32_t n_vocab = 0;
        finp.read ((char *) &n_vocab, sizeof(n_vocab));
        fout.write((char *) &n_vocab, sizeof(n_vocab));

        //if (n_vocab != hparams.n_vocab) {
        //    fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
        //            __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
        //    return false;
        //}

        std::string word;
        for (int i = 0; i < n_vocab; i++) {
            uint32_t len;
            finp.read ((char *) &len, sizeof(len));
            fout.write((char *) &len, sizeof(len));

            word.resize(len);
            finp.read ((char *) word.data(), len);
            fout.write((char *) word.data(), len);

            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
        }
    }

    // regexes of tensor names to not be quantized
    const std::vector<std::string> to_skip = {
        //"encoder.*",
        "encoder.conv1.bias",
        "encoder.conv2.bias",
        "encoder.positional_embedding",
        "decoder.positional_embedding",
    };

    if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
        return false;
    }

    finp.close();
    fout.close();

    return true;
}

int main(int argc, char ** argv) {
    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
        ggml_print_ftypes(stderr);
        return 1;
    }

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }

    const std::string fname_inp = argv[1];
    const std::string fname_out = argv[2];

    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);

    const int64_t t_main_start_us = ggml_time_us();

    int64_t t_quantize_us = 0;

    // load the model
    {
        const int64_t t_start_us = ggml_time_us();

        if (!whisper_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }

        t_quantize_us = ggml_time_us() - t_start_us;
    }

    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();

        printf("\n");
        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
    }

    return 0;
}
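In the listing above, every tensor matching the include pattern { ".*" } is quantized except those whose names match an entry in to_skip, which keeps the convolution biases and the positional embeddings in their original precision. The matching itself happens inside ggml_common_quantize_0() in examples/common-ggml.cpp, which is not shown in this diff. The sketch below only illustrates how such a regex-based include/skip filter could look, under the assumption that both lists are treated as regular expressions over tensor names; the helper name is made up.

#include <regex>
#include <string>
#include <vector>

// Sketch of an include/skip filter over tensor names. A tensor is quantized
// only if it matches at least one include pattern and no skip pattern.
// Illustrative only; the real logic lives in examples/common-ggml.cpp.
static bool should_quantize_sketch(
        const std::string & name,
        const std::vector<std::string> & to_quant,
        const std::vector<std::string> & to_skip) {
    for (const auto & s : to_skip) {
        if (std::regex_match(name, std::regex(s))) {
            return false;
        }
    }
    for (const auto & q : to_quant) {
        if (std::regex_match(name, std::regex(q))) {
            return true;
        }
    }
    return false;
}

With the lists used by quantize.cpp, a name like "encoder.conv1.bias" is skipped while "encoder.blocks.0.attn.query.weight" would be quantized.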
examples/stream.wasm/index-tmpl.html
CHANGED

@@ -35,6 +35,15 @@

  <br><br>

+ <b>More examples:</b>
+ <a href="https://whisper.ggerganov.com/">main</a> |
+ <a href="https://whisper.ggerganov.com/bench">bench</a> |
+ <a href="https://whisper.ggerganov.com/stream">stream</a> |
+ <a href="https://whisper.ggerganov.com/command">command</a> |
+ <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+ <br><br>
+
  <hr>

  Select the model you would like to use, click the "Start" button and start speaking

@@ -45,6 +54,10 @@
  Whisper model: <span id="model-whisper-status"></span>
  <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
  <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+ <br><br>
+ Quantized models:<br><br>
+ <button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+ <button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
  <span id="fetch-whisper-progress"></span>

  <!--

@@ -162,11 +175,17 @@
  let urls = {
      'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
      'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+
+     'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+     'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
  };

  let sizes = {
      'tiny.en': 75,
      'base.en': 142,
+
+     'tiny-en-q5_1': 31,
+     'base-en-q5_1': 57,
  };

  let url = urls[model];

@@ -177,6 +196,10 @@
  document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
  document.getElementById('fetch-whisper-base-en').style.display = 'none';
+
+ document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
+ document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
+
  document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

  cbProgress = function(p) {

@@ -188,6 +211,10 @@
  var el;
  el = document.getElementById('fetch-whisper-tiny-en');      if (el) el.style.display = 'inline-block';
  el = document.getElementById('fetch-whisper-base-en');      if (el) el.style.display = 'inline-block';
+
+ el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
+ el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
+
  el = document.getElementById('model-whisper-status');       if (el) el.innerHTML = '';
  };
examples/talk-llama/{llama_util.h → llama-util.h}
RENAMED

@@ -21,12 +21,17 @@
  #if defined(_POSIX_MAPPED_FILES)
  #include <sys/mman.h>
  #endif
+ #if defined(_POSIX_MEMLOCK_RANGE)
+ #include <sys/resource.h>
+ #endif
  #endif
  #endif

  #if defined(_WIN32)
  #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #define NOMINMAX
+ #endif
  #include <windows.h>
  #include <io.h>
  #include <stdio.h> // for _fseeki64

@@ -41,8 +46,12 @@
  } while (0)

  #ifdef __GNUC__
+ #ifdef __MINGW32__
+ __attribute__((format(gnu_printf, 1, 2)))
+ #else
  __attribute__((format(printf, 1, 2)))
  #endif
+ #endif
  static std::string format(const char * fmt, ...) {
      va_list ap, ap2;
      va_start(ap, fmt);

@@ -55,7 +64,7 @@ static std::string format(const char * fmt, ...) {
      va_end(ap2);
      va_end(ap);
      return std::string(buf.data(), size);
  }

  struct llama_file {
      // use FILE * so we don't have to re-open the file to mmap

@@ -162,7 +171,7 @@ struct llama_mmap {
  #ifdef _POSIX_MAPPED_FILES
      static constexpr bool SUPPORTED = true;

+     llama_mmap(struct llama_file * file, bool prefetch = true) {
          size = file->size;
          int fd = fileno(file->fp);
          int flags = MAP_SHARED;

@@ -170,15 +179,16 @@ struct llama_mmap {
          flags |= MAP_POPULATE;
  #endif
          addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-         close(fd);
          if (addr == MAP_FAILED) {
              throw format("mmap failed: %s", strerror(errno));
          }

+         if (prefetch) {
+             // Advise the kernel to preload the mapped memory
+             if (madvise(addr, file->size, MADV_WILLNEED)) {
+                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                         strerror(errno));
+             }
          }
      }

@@ -188,14 +198,13 @@ struct llama_mmap {
  #elif defined(_WIN32)
      static constexpr bool SUPPORTED = true;

+     llama_mmap(struct llama_file * file, bool prefetch = true) {
          size = file->size;

          HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

          HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
          DWORD error = GetLastError();
-         CloseHandle(hFile);

          if (hMapping == NULL) {
              throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());

@@ -209,14 +218,20 @@ struct llama_mmap {
              throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
          }

+ #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+         if (prefetch) {
+             // Advise the kernel to preload the mapped memory
+             WIN32_MEMORY_RANGE_ENTRY range;
+             range.VirtualAddress = addr;
+             range.NumberOfBytes = (SIZE_T)size;
+             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                         llama_format_win_err(GetLastError()).c_str());
+             }
          }
+ #else
+ #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+ #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
      }

      ~llama_mmap() {

@@ -291,8 +306,18 @@ struct llama_mlock {
          if (!mlock(addr, size)) {
              return true;
          } else {
+             char* errmsg = std::strerror(errno);
+             bool suggest = (errno == ENOMEM);
+
+             // Check if the resource limit is fine after all
+             struct rlimit lock_limit;
+             if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+                 suggest = false;
+             if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+                 suggest = false;
+
+             fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                     size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
              return false;
          }
      }

@@ -338,8 +363,8 @@ struct llama_mlock {
          // Hopefully a megabyte is enough overhead:
          size_t increment = size + 1048576;
          // The minimum must be <= the maximum, so we need to increase both:
+         min_ws_size += increment;
+         max_ws_size += increment;
          if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
              fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
                      llama_format_win_err(GetLastError()).c_str());

@@ -380,4 +405,29 @@ struct llama_buffer {
          delete[] addr;
      }
  };
+
+ #ifdef GGML_USE_CUBLAS
+ #include "ggml-cuda.h"
+ struct llama_ctx_buffer {
+     uint8_t * addr = NULL;
+     size_t size = 0;
+
+     void resize(size_t size) {
+         if (addr) {
+             ggml_cuda_host_free(addr);
+         }
+         addr = (uint8_t *) ggml_cuda_host_malloc(size);
+         this->size = size;
+     }
+
+     ~llama_ctx_buffer() {
+         if (addr) {
+             ggml_cuda_host_free(addr);
+         }
+     }
+ };
+ #else
+ typedef llama_buffer llama_ctx_buffer;
+ #endif
+
  #endif
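One detail worth calling out before the llama.cpp diff that follows: the MEM_REQ_* lookup tables change from global std::map constants into functions that return a reference to a function-local static map. The commit does not state the motivation, but this is the usual construct-on-first-use idiom, presumably meant to avoid static-initialization-order problems when the code is built into a shared library. A minimal sketch of the idiom with made-up names:

#include <cstddef>
#include <map>

enum e_model_sketch { MODEL_A, MODEL_B };

// Construct-on-first-use: the map is built the first time the function is
// called, so callers never observe an uninitialized global object.
static const std::map<e_model_sketch, size_t> & mem_req_sketch() {
    static std::map<e_model_sketch, size_t> table = {
        { MODEL_A,  512ull * 1024 * 1024 },
        { MODEL_B, 1024ull * 1024 * 1024 },
    };
    return table;
}

Callers then write mem_req_sketch().at(MODEL_A) instead of reading a global, which mirrors the MEM_REQ_SCRATCH0().at(model.type) call sites in the diff below.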
examples/talk-llama/llama.cpp
CHANGED
|
@@ -1,10 +1,17 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
#include "llama.h"
|
| 3 |
-
#include "llama_internal.h"
|
| 4 |
|
| 5 |
#include "ggml.h"
|
| 6 |
|
| 7 |
#include <array>
|
|
|
|
| 8 |
#include <cinttypes>
|
| 9 |
#include <fstream>
|
| 10 |
#include <random>
|
|
@@ -17,11 +24,15 @@
|
|
| 17 |
#include <memory>
|
| 18 |
#include <algorithm>
|
| 19 |
#include <initializer_list>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
#define LLAMA_USE_SCRATCH
|
| 22 |
#define LLAMA_MAX_SCRATCH_BUFFERS 16
|
| 23 |
|
| 24 |
-
|
| 25 |
// available llama models
|
| 26 |
enum e_model {
|
| 27 |
MODEL_UNKNOWN,
|
|
@@ -37,36 +48,52 @@ static const size_t MB = 1024*1024;
|
|
| 37 |
// TODO: dynamically determine these sizes
|
| 38 |
// needs modifications in ggml
|
| 39 |
|
| 40 |
-
static const std::map<e_model, size_t> MEM_REQ_SCRATCH0
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
static const std::map<e_model, size_t> MEM_REQ_SCRATCH1
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
|
| 55 |
-
static const std::map<e_model, size_t> MEM_REQ_KV_SELF
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
| 63 |
// not actually needed if BLAS is disabled
|
| 64 |
-
static const std::map<e_model, size_t> MEM_REQ_EVAL
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
// default hparams (LLaMA 7B)
|
| 72 |
struct llama_hparams {
|
|
@@ -77,7 +104,7 @@ struct llama_hparams {
|
|
| 77 |
uint32_t n_head = 32;
|
| 78 |
uint32_t n_layer = 32;
|
| 79 |
uint32_t n_rot = 64;
|
| 80 |
-
|
| 81 |
|
| 82 |
bool operator!=(const llama_hparams & other) const {
|
| 83 |
return memcmp(this, &other, sizeof(llama_hparams));
|
|
@@ -109,7 +136,7 @@ struct llama_kv_cache {
|
|
| 109 |
|
| 110 |
struct ggml_context * ctx = NULL;
|
| 111 |
|
| 112 |
-
|
| 113 |
|
| 114 |
int n; // number of tokens currently in the cache
|
| 115 |
|
|
@@ -140,7 +167,7 @@ struct llama_model {
|
|
| 140 |
struct llama_kv_cache kv_self;
|
| 141 |
|
| 142 |
// the model memory buffer
|
| 143 |
-
|
| 144 |
|
| 145 |
// model memory mapped file
|
| 146 |
std::unique_ptr<llama_mmap> mapping;
|
|
@@ -201,8 +228,8 @@ struct llama_context {
|
|
| 201 |
|
| 202 |
// memory buffers used to evaluate the model
|
| 203 |
// TODO: move in llama_state
|
| 204 |
-
|
| 205 |
-
|
| 206 |
|
| 207 |
int buf_last = 0;
|
| 208 |
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
|
@@ -257,22 +284,12 @@ static size_t checked_div(size_t a, size_t b) {
|
|
| 257 |
}
|
| 258 |
|
| 259 |
static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
|
| 260 |
-
|
|
|
|
| 261 |
for (size_t i = 1; i < ne.size(); i++) {
|
| 262 |
-
|
| 263 |
-
}
|
| 264 |
-
ret += "]";
|
| 265 |
-
return ret;
|
| 266 |
-
}
|
| 267 |
-
|
| 268 |
-
static const char * llama_format_type(enum ggml_type type) {
|
| 269 |
-
switch (type) {
|
| 270 |
-
case GGML_TYPE_F32: return "f32";
|
| 271 |
-
case GGML_TYPE_F16: return "f16";
|
| 272 |
-
case GGML_TYPE_Q4_0: return "q4_0";
|
| 273 |
-
case GGML_TYPE_Q4_1: return "q4_1";
|
| 274 |
-
default: LLAMA_ASSERT(false);
|
| 275 |
}
|
|
|
|
| 276 |
}
|
| 277 |
|
| 278 |
static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
|
|
@@ -427,7 +444,7 @@ struct llama_file_loader {
|
|
| 427 |
hparams.n_head = file.read_u32();
|
| 428 |
hparams.n_layer = file.read_u32();
|
| 429 |
hparams.n_rot = file.read_u32();
|
| 430 |
-
hparams.
|
| 431 |
}
|
| 432 |
void read_vocab() {
|
| 433 |
vocab.id_to_token.resize(hparams.n_vocab);
|
|
@@ -453,20 +470,25 @@ struct llama_file_loader {
|
|
| 453 |
llama_load_tensor_shard shard;
|
| 454 |
uint32_t n_dims = file.read_u32();
|
| 455 |
uint32_t name_len = file.read_u32();
|
| 456 |
-
|
| 457 |
shard.ne.resize(n_dims);
|
| 458 |
file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
|
| 459 |
std::string name = file.read_string(name_len);
|
| 460 |
if (n_dims < 1 || n_dims > 2) {
|
| 461 |
throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
|
| 462 |
}
|
| 463 |
-
switch (
|
| 464 |
-
case
|
| 465 |
-
case
|
| 466 |
-
case
|
| 467 |
-
case
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
default: {
|
| 469 |
-
throw format("unrecognized
|
| 470 |
}
|
| 471 |
}
|
| 472 |
|
|
@@ -497,18 +519,18 @@ struct llama_file_loader {
|
|
| 497 |
struct llama_file_saver {
|
| 498 |
llama_file file;
|
| 499 |
llama_file_loader * any_file_loader;
|
| 500 |
-
llama_file_saver(const char * fname, llama_file_loader * any_file_loader,
|
| 501 |
: file(fname, "wb"), any_file_loader(any_file_loader) {
|
| 502 |
fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
|
| 503 |
write_magic();
|
| 504 |
-
write_hparams(
|
| 505 |
write_vocab();
|
| 506 |
}
|
| 507 |
void write_magic() {
|
| 508 |
file.write_u32('ggjt'); // magic
|
| 509 |
file.write_u32(1); // version
|
| 510 |
}
|
| 511 |
-
void write_hparams(
|
| 512 |
const llama_hparams & hparams = any_file_loader->hparams;
|
| 513 |
file.write_u32(hparams.n_vocab);
|
| 514 |
file.write_u32(hparams.n_embd);
|
|
@@ -516,7 +538,7 @@ struct llama_file_saver {
|
|
| 516 |
file.write_u32(hparams.n_head);
|
| 517 |
file.write_u32(hparams.n_layer);
|
| 518 |
file.write_u32(hparams.n_rot);
|
| 519 |
-
file.write_u32(
|
| 520 |
}
|
| 521 |
void write_vocab() {
|
| 522 |
if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
|
|
@@ -531,17 +553,21 @@ struct llama_file_saver {
|
|
| 531 |
}
|
| 532 |
}
|
| 533 |
void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
|
| 534 |
-
uint32_t ftype;
|
| 535 |
switch (new_type) {
|
| 536 |
-
case GGML_TYPE_F32:
|
| 537 |
-
case GGML_TYPE_F16:
|
| 538 |
-
case GGML_TYPE_Q4_0:
|
| 539 |
-
case GGML_TYPE_Q4_1:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 540 |
default: LLAMA_ASSERT(false);
|
| 541 |
}
|
| 542 |
file.write_u32((uint32_t) tensor.ne.size());
|
| 543 |
file.write_u32((uint32_t) tensor.name.size());
|
| 544 |
-
file.write_u32(
|
| 545 |
file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
|
| 546 |
file.write_raw(tensor.name.data(), tensor.name.size());
|
| 547 |
file.seek(-file.tell() & 31, SEEK_CUR);
|
|
@@ -621,6 +647,7 @@ struct llama_model_loader {
|
|
| 621 |
throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
|
| 622 |
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
|
| 623 |
}
|
|
|
|
| 624 |
return get_tensor_for(lt);
|
| 625 |
}
|
| 626 |
|
|
@@ -753,7 +780,7 @@ static bool kv_cache_init(
|
|
| 753 |
const int n_embd = hparams.n_embd;
|
| 754 |
const int n_layer = hparams.n_layer;
|
| 755 |
|
| 756 |
-
const int64_t n_mem =
|
| 757 |
const int64_t n_elements = n_embd*n_mem;
|
| 758 |
|
| 759 |
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
|
@@ -815,6 +842,22 @@ static const char *llama_file_version_name(llama_file_version version) {
|
|
| 815 |
}
|
| 816 |
}
|
| 817 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
static const char *llama_model_type_name(e_model type) {
|
| 819 |
switch (type) {
|
| 820 |
case MODEL_7B: return "7B";
|
|
@@ -867,7 +910,7 @@ static void llama_model_load_internal(
|
|
| 867 |
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
| 868 |
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
| 869 |
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
|
| 870 |
-
fprintf(stderr, "%s:
|
| 871 |
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
| 872 |
fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
|
| 873 |
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
|
@@ -891,13 +934,13 @@ static void llama_model_load_internal(
|
|
| 891 |
const size_t mem_required =
|
| 892 |
ctx_size +
|
| 893 |
mmapped_size +
|
| 894 |
-
MEM_REQ_SCRATCH0.at(model.type) +
|
| 895 |
-
MEM_REQ_SCRATCH1.at(model.type) +
|
| 896 |
-
MEM_REQ_EVAL.at
|
| 897 |
|
| 898 |
// this is the memory required by one llama_state
|
| 899 |
const size_t mem_required_state =
|
| 900 |
-
scale*MEM_REQ_KV_SELF.at(model.type);
|
| 901 |
|
| 902 |
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
| 903 |
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
|
@@ -934,8 +977,8 @@ static void llama_model_load_internal(
|
|
| 934 |
ml->ggml_ctx = ctx;
|
| 935 |
|
| 936 |
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
|
| 937 |
-
model.norm
|
| 938 |
-
model.output
|
| 939 |
|
| 940 |
model.layers.resize(n_layer);
|
| 941 |
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
@@ -1039,7 +1082,7 @@ static bool llama_eval_internal(
|
|
| 1039 |
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
| 1040 |
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
| 1041 |
ggml_cgraph gf = {};
|
| 1042 |
-
gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
|
| 1043 |
|
| 1044 |
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
| 1045 |
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
|
@@ -1213,9 +1256,11 @@ static bool llama_eval_internal(
|
|
| 1213 |
ggml_build_forward_expand(&gf, inpL);
|
| 1214 |
ggml_graph_compute (ctx0, &gf);
|
| 1215 |
|
|
|
|
| 1216 |
// print timing information per ggml operation (for debugging purposes)
|
| 1217 |
// requires GGML_PERF to be defined
|
| 1218 |
-
|
|
|
|
| 1219 |
|
| 1220 |
// plot the computation graph in dot format (for debugging purposes)
|
| 1221 |
//if (n_past%100 == 0) {
|
|
@@ -1430,131 +1475,435 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
|
|
| 1430 |
// sampling
|
| 1431 |
//
|
| 1432 |
|
| 1433 |
-
|
| 1434 |
-
|
| 1435 |
-
|
| 1436 |
-
|
| 1437 |
-
logits_id.begin() + top_k, logits_id.end(),
|
| 1438 |
-
[](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
|
| 1439 |
-
return a.first > b.first;
|
| 1440 |
-
});
|
| 1441 |
|
| 1442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1443 |
}
|
| 1444 |
|
| 1445 |
-
|
| 1446 |
-
|
| 1447 |
-
|
| 1448 |
-
|
| 1449 |
-
|
| 1450 |
-
|
| 1451 |
-
|
| 1452 |
-
|
| 1453 |
-
|
| 1454 |
-
|
| 1455 |
-
|
| 1456 |
-
|
| 1457 |
-
|
| 1458 |
-
|
| 1459 |
-
|
| 1460 |
-
// select the token with the highest logit directly
|
| 1461 |
-
float max_logit = plogits[0];
|
| 1462 |
-
llama_vocab::id max_id = 0;
|
| 1463 |
-
|
| 1464 |
-
for (int i = 1; i < n_logits; ++i) {
|
| 1465 |
-
if (plogits[i] > max_logit) {
|
| 1466 |
-
max_logit = plogits[i];
|
| 1467 |
-
max_id = i;
|
| 1468 |
-
}
|
| 1469 |
}
|
| 1470 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1471 |
}
|
|
|
|
| 1472 |
|
| 1473 |
-
|
| 1474 |
-
|
|
|
|
|
|
|
| 1475 |
|
| 1476 |
-
|
| 1477 |
-
|
| 1478 |
-
|
| 1479 |
-
|
| 1480 |
-
|
| 1481 |
-
|
| 1482 |
-
|
| 1483 |
-
|
| 1484 |
-
|
| 1485 |
-
|
| 1486 |
-
|
| 1487 |
-
|
| 1488 |
-
|
| 1489 |
-
|
| 1490 |
-
|
| 1491 |
}
|
| 1492 |
}
|
| 1493 |
|
| 1494 |
-
|
|
|
|
| 1495 |
|
| 1496 |
-
|
| 1497 |
-
|
| 1498 |
-
|
|
|
|
| 1499 |
|
| 1500 |
-
|
| 1501 |
-
|
| 1502 |
-
|
| 1503 |
-
const float p = expf(kv.first - maxl);
|
| 1504 |
-
probs.push_back(p);
|
| 1505 |
-
sum += p;
|
| 1506 |
}
|
| 1507 |
|
| 1508 |
-
|
| 1509 |
-
|
| 1510 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1511 |
}
|
| 1512 |
|
| 1513 |
-
|
| 1514 |
-
|
| 1515 |
-
|
| 1516 |
-
|
| 1517 |
-
|
| 1518 |
-
|
| 1519 |
-
|
| 1520 |
-
|
| 1521 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1522 |
}
|
| 1523 |
}
|
| 1524 |
|
| 1525 |
-
//
|
| 1526 |
-
|
| 1527 |
-
|
| 1528 |
-
|
| 1529 |
-
|
| 1530 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1531 |
|
| 1532 |
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
|
|
|
| 1533 |
int idx = dist(rng);
|
| 1534 |
|
| 1535 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1536 |
}
|
| 1537 |
|
| 1538 |
//
|
| 1539 |
// quantization
|
| 1540 |
//
|
| 1541 |
|
| 1542 |
-
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int
|
| 1543 |
ggml_type quantized_type;
|
| 1544 |
-
switch (
|
| 1545 |
-
case
|
| 1546 |
-
case
|
| 1547 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1548 |
};
|
| 1549 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1550 |
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
|
| 1551 |
/*vocab_only*/ false));
|
| 1552 |
-
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(),
|
| 1553 |
|
| 1554 |
size_t total_size_org = 0;
|
| 1555 |
size_t total_size_new = 0;
|
| 1556 |
std::vector<int64_t> hist_all(1 << 4, 0);
|
| 1557 |
|
|
|
|
|
|
|
|
|
|
| 1558 |
size_t idx = 0;
|
| 1559 |
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
|
| 1560 |
llama_buffer read_data;
|
|
@@ -1562,10 +1911,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
| 1562 |
tensor.data = read_data.addr;
|
| 1563 |
model_loader->load_data_for(tensor);
|
| 1564 |
|
| 1565 |
-
printf("[%
|
| 1566 |
++idx, model_loader->tensors_map.tensors.size(),
|
| 1567 |
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
|
| 1568 |
-
|
| 1569 |
|
| 1570 |
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
| 1571 |
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
|
|
@@ -1573,6 +1922,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
| 1573 |
// quantize only 2D tensors
|
| 1574 |
quantize &= (tensor.ne.size() == 2);
|
| 1575 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1576 |
enum ggml_type new_type;
|
| 1577 |
void * new_data;
|
| 1578 |
size_t new_size;
|
|
@@ -1598,7 +1952,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
| 1598 |
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
| 1599 |
}
|
| 1600 |
} else {
|
| 1601 |
-
throw format("type %s unsupported for integer quantization",
|
| 1602 |
}
|
| 1603 |
|
| 1604 |
printf("quantizing .. ");
|
|
@@ -1608,17 +1962,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
| 1608 |
new_data = work.addr;
|
| 1609 |
std::vector<int64_t> hist_cur(1 << 4, 0);
|
| 1610 |
|
| 1611 |
-
|
| 1612 |
-
|
| 1613 |
-
|
| 1614 |
-
|
| 1615 |
-
|
| 1616 |
-
|
| 1617 |
-
|
| 1618 |
-
|
| 1619 |
-
|
| 1620 |
-
|
| 1621 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1622 |
}
|
| 1623 |
|
| 1624 |
printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
|
|
@@ -1717,17 +2091,17 @@ struct llama_context * llama_init_from_file(
|
|
| 1717 |
if (params.logits_all) {
|
| 1718 |
ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
|
| 1719 |
} else {
|
| 1720 |
-
ctx->logits.reserve(hparams.
|
| 1721 |
}
|
| 1722 |
|
| 1723 |
if (params.embedding){
|
| 1724 |
ctx->embedding.resize(hparams.n_embd);
|
| 1725 |
}
|
| 1726 |
|
| 1727 |
-
ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
|
| 1728 |
|
| 1729 |
-
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
|
| 1730 |
-
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
|
| 1731 |
}
|
| 1732 |
|
| 1733 |
return ctx;
|
|
@@ -1740,9 +2114,10 @@ void llama_free(struct llama_context * ctx) {
|
|
| 1740 |
int llama_model_quantize(
|
| 1741 |
const char * fname_inp,
|
| 1742 |
const char * fname_out,
|
| 1743 |
-
|
|
|
|
| 1744 |
try {
|
| 1745 |
-
llama_model_quantize_internal(fname_inp, fname_out,
|
| 1746 |
return 0;
|
| 1747 |
} catch (const std::string & err) {
|
| 1748 |
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
|
|
@@ -1750,31 +2125,446 @@ int llama_model_quantize(
|
|
| 1750 |
}
|
| 1751 |
}
|
| 1752 |
|
| 1753 |
-
|
| 1754 |
-
|
| 1755 |
-
|
| 1756 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1757 |
}
|
| 1758 |
|
| 1759 |
-
|
| 1760 |
-
|
| 1761 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1762 |
}
|
| 1763 |
|
| 1764 |
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
|
| 1765 |
return ctx->model.kv_self.n;
|
| 1766 |
}
|
| 1767 |
|
| 1768 |
-
|
| 1769 |
-
|
| 1770 |
-
|
| 1771 |
-
|
| 1772 |
-
|
| 1773 |
-
|
| 1774 |
-
|
| 1775 |
-
|
| 1776 |
-
|
| 1777 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1778 |
}
|
| 1779 |
|
| 1780 |
int llama_eval(
|
|
@@ -1851,33 +2641,8 @@ llama_token llama_token_eos() {
|
|
| 1851 |
return 2;
|
| 1852 |
}
|
| 1853 |
|
| 1854 |
-
llama_token
|
| 1855 |
-
|
| 1856 |
-
const llama_token * last_n_tokens_data,
|
| 1857 |
-
int last_n_tokens_size,
|
| 1858 |
-
int top_k,
|
| 1859 |
-
float top_p,
|
| 1860 |
-
float temp,
|
| 1861 |
-
float repeat_penalty) {
|
| 1862 |
-
const int64_t t_start_sample_us = ggml_time_us();
|
| 1863 |
-
|
| 1864 |
-
llama_token result = 0;
|
| 1865 |
-
|
| 1866 |
-
// TODO: avoid this ...
|
| 1867 |
-
const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
|
| 1868 |
-
|
| 1869 |
-
result = llama_sample_top_p_top_k(
|
| 1870 |
-
*ctx,
|
| 1871 |
-
last_n_tokens,
|
| 1872 |
-
top_k,
|
| 1873 |
-
top_p,
|
| 1874 |
-
temp,
|
| 1875 |
-
repeat_penalty);
|
| 1876 |
-
|
| 1877 |
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1878 |
-
ctx->n_sample++;
|
| 1879 |
-
|
| 1880 |
-
return result;
|
| 1881 |
}
|
| 1882 |
|
| 1883 |
|
|
@@ -1907,18 +2672,20 @@ const char * llama_print_system_info(void) {
|
|
| 1907 |
static std::string s;
|
| 1908 |
|
| 1909 |
s = "";
|
| 1910 |
-
s += "AVX = "
|
| 1911 |
-
s += "AVX2 = "
|
| 1912 |
-
s += "AVX512 = "
|
| 1913 |
-
s += "
|
| 1914 |
-
s += "
|
| 1915 |
-
s += "
|
| 1916 |
-
s += "
|
| 1917 |
-
s += "
|
| 1918 |
-
s += "
|
| 1919 |
-
s += "
|
| 1920 |
-
s += "
|
| 1921 |
-
s += "
|
|
|
|
|
|
|
| 1922 |
|
| 1923 |
return s.c_str();
|
| 1924 |
}
|
|
@@ -1927,3 +2694,57 @@ const char * llama_print_system_info(void) {
|
|
| 1927 |
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
| 1928 |
return ctx->model.tensors_by_name;
|
| 1929 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Defines fileno on msys:
|
| 2 |
+
#ifndef _GNU_SOURCE
|
| 3 |
+
#define _GNU_SOURCE
|
| 4 |
+
#include <cstdint>
|
| 5 |
+
#include <cstdio>
|
| 6 |
+
#endif
|
| 7 |
+
|
| 8 |
+
#include "llama-util.h"
|
| 9 |
#include "llama.h"
|
|
|
|
| 10 |
|
| 11 |
#include "ggml.h"
|
| 12 |
|
| 13 |
#include <array>
|
| 14 |
+
#include <ctime>
|
| 15 |
#include <cinttypes>
|
| 16 |
#include <fstream>
|
| 17 |
#include <random>
|
|
|
|
| 24 |
#include <memory>
|
| 25 |
#include <algorithm>
|
| 26 |
#include <initializer_list>
|
| 27 |
+
#include <thread>
|
| 28 |
+
#include <atomic>
|
| 29 |
+
#include <mutex>
|
| 30 |
+
#include <sstream>
|
| 31 |
+
#include <numeric>
|
| 32 |
|
| 33 |
#define LLAMA_USE_SCRATCH
|
| 34 |
#define LLAMA_MAX_SCRATCH_BUFFERS 16
|
| 35 |
|
|
|
|
| 36 |
// available llama models
|
| 37 |
enum e_model {
|
| 38 |
MODEL_UNKNOWN,
|
|
|
|
| 48 |
// TODO: dynamically determine these sizes
|
| 49 |
// needs modifications in ggml
|
| 50 |
|
| 51 |
+
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
| 52 |
+
{
|
| 53 |
+
static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
|
| 54 |
+
{ MODEL_7B, 512ull * MB },
|
| 55 |
+
{ MODEL_13B, 512ull * MB },
|
| 56 |
+
{ MODEL_30B, 512ull * MB },
|
| 57 |
+
{ MODEL_65B, 1024ull * MB },
|
| 58 |
+
};
|
| 59 |
+
return _MEM_REQ_SCRATCH0;
|
| 60 |
+
}
|
| 61 |
|
| 62 |
+
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
| 63 |
+
{
|
| 64 |
+
static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
|
| 65 |
+
{ MODEL_7B, 512ull * MB },
|
| 66 |
+
{ MODEL_13B, 512ull * MB },
|
| 67 |
+
{ MODEL_30B, 512ull * MB },
|
| 68 |
+
{ MODEL_65B, 1024ull * MB },
|
| 69 |
+
};
|
| 70 |
+
return _MEM_REQ_SCRATCH1;
|
| 71 |
+
}
|
| 72 |
|
| 73 |
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
|
| 74 |
+
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
| 75 |
+
{
|
| 76 |
+
static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
|
| 77 |
+
{ MODEL_7B, 1026ull * MB },
|
| 78 |
+
{ MODEL_13B, 1608ull * MB },
|
| 79 |
+
{ MODEL_30B, 3124ull * MB },
|
| 80 |
+
{ MODEL_65B, 5120ull * MB },
|
| 81 |
+
};
|
| 82 |
+
return _MEM_REQ_KV_SELF;
|
| 83 |
+
}
|
| 84 |
|
| 85 |
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
| 86 |
// not actually needed if BLAS is disabled
|
| 87 |
+
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
| 88 |
+
{
|
| 89 |
+
static std::map<e_model, size_t> _MEM_REQ_EVAL = {
|
| 90 |
+
{ MODEL_7B, 768ull * MB },
|
| 91 |
+
{ MODEL_13B, 1024ull * MB },
|
| 92 |
+
{ MODEL_30B, 1280ull * MB },
|
| 93 |
+
{ MODEL_65B, 1536ull * MB },
|
| 94 |
+
};
|
| 95 |
+
return _MEM_REQ_EVAL;
|
| 96 |
+
}
|
| 97 |
|
| 98 |
// default hparams (LLaMA 7B)
|
| 99 |
struct llama_hparams {
|
|
|
|
| 104 |
uint32_t n_head = 32;
|
| 105 |
uint32_t n_layer = 32;
|
| 106 |
uint32_t n_rot = 64;
|
| 107 |
+
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
| 108 |
|
| 109 |
bool operator!=(const llama_hparams & other) const {
|
| 110 |
return memcmp(this, &other, sizeof(llama_hparams));
|
|
|
|
| 136 |
|
| 137 |
struct ggml_context * ctx = NULL;
|
| 138 |
|
| 139 |
+
llama_ctx_buffer buf;
|
| 140 |
|
| 141 |
int n; // number of tokens currently in the cache
|
| 142 |
|
|
|
|
| 167 |
struct llama_kv_cache kv_self;
|
| 168 |
|
| 169 |
// the model memory buffer
|
| 170 |
+
llama_ctx_buffer buf;
|
| 171 |
|
| 172 |
// model memory mapped file
|
| 173 |
std::unique_ptr<llama_mmap> mapping;
|
|
|
|
| 228 |
|
| 229 |
// memory buffers used to evaluate the model
|
| 230 |
// TODO: move in llama_state
|
| 231 |
+
llama_ctx_buffer buf_compute;
|
| 232 |
+
llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
|
| 233 |
|
| 234 |
int buf_last = 0;
|
| 235 |
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
|
|
|
| 284 |
}
|
| 285 |
|
| 286 |
static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
|
| 287 |
+
char buf[256];
|
| 288 |
+
snprintf(buf, sizeof(buf), "%5u", ne.at(0));
|
| 289 |
for (size_t i = 1; i < ne.size(); i++) {
|
| 290 |
+
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
}
|
| 292 |
+
return buf;
|
| 293 |
}
|
| 294 |
|
| 295 |
static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
|
|
|
|
| 444 |
hparams.n_head = file.read_u32();
|
| 445 |
hparams.n_layer = file.read_u32();
|
| 446 |
hparams.n_rot = file.read_u32();
|
| 447 |
+
hparams.ftype = (enum llama_ftype) file.read_u32();
|
| 448 |
}
|
| 449 |
void read_vocab() {
|
| 450 |
vocab.id_to_token.resize(hparams.n_vocab);
|
|
|
|
| 470 |
llama_load_tensor_shard shard;
|
| 471 |
uint32_t n_dims = file.read_u32();
|
| 472 |
uint32_t name_len = file.read_u32();
|
| 473 |
+
shard.type = (enum ggml_type) file.read_u32();
|
| 474 |
shard.ne.resize(n_dims);
|
| 475 |
file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
|
| 476 |
std::string name = file.read_string(name_len);
|
| 477 |
if (n_dims < 1 || n_dims > 2) {
|
| 478 |
throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
|
| 479 |
}
|
| 480 |
+
switch (shard.type) {
|
| 481 |
+
case GGML_TYPE_F32:
|
| 482 |
+
case GGML_TYPE_F16:
|
| 483 |
+
case GGML_TYPE_Q4_0:
|
| 484 |
+
case GGML_TYPE_Q4_1:
|
| 485 |
+
case GGML_TYPE_Q4_2:
|
| 486 |
+
case GGML_TYPE_Q5_0:
|
| 487 |
+
case GGML_TYPE_Q5_1:
|
| 488 |
+
case GGML_TYPE_Q8_0:
|
| 489 |
+
break;
|
| 490 |
default: {
|
| 491 |
+
throw format("unrecognized tensor type %u\n", shard.type);
|
| 492 |
}
|
| 493 |
}
|
| 494 |
|
|
|
|
| 519 |
struct llama_file_saver {
|
| 520 |
llama_file file;
|
| 521 |
llama_file_loader * any_file_loader;
|
| 522 |
+
llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
|
| 523 |
: file(fname, "wb"), any_file_loader(any_file_loader) {
|
| 524 |
fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
|
| 525 |
write_magic();
|
| 526 |
+
write_hparams(new_ftype);
|
| 527 |
write_vocab();
|
| 528 |
}
|
| 529 |
void write_magic() {
|
| 530 |
file.write_u32('ggjt'); // magic
|
| 531 |
file.write_u32(1); // version
|
| 532 |
}
|
| 533 |
+
void write_hparams(enum llama_ftype new_ftype) {
|
| 534 |
const llama_hparams & hparams = any_file_loader->hparams;
|
| 535 |
file.write_u32(hparams.n_vocab);
|
| 536 |
file.write_u32(hparams.n_embd);
|
|
|
|
| 538 |
file.write_u32(hparams.n_head);
|
| 539 |
file.write_u32(hparams.n_layer);
|
| 540 |
file.write_u32(hparams.n_rot);
|
| 541 |
+
file.write_u32(new_ftype);
|
| 542 |
}
|
| 543 |
void write_vocab() {
|
| 544 |
if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
|
|
|
|
| 553 |
}
|
| 554 |
}
|
| 555 |
void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
|
|
|
|
| 556 |
switch (new_type) {
|
| 557 |
+
case GGML_TYPE_F32:
|
| 558 |
+
case GGML_TYPE_F16:
|
| 559 |
+
case GGML_TYPE_Q4_0:
|
| 560 |
+
case GGML_TYPE_Q4_1:
|
| 561 |
+
case GGML_TYPE_Q4_2:
|
| 562 |
+
case GGML_TYPE_Q5_0:
|
| 563 |
+
case GGML_TYPE_Q5_1:
|
| 564 |
+
case GGML_TYPE_Q8_0:
|
| 565 |
+
break;
|
| 566 |
default: LLAMA_ASSERT(false);
|
| 567 |
}
|
| 568 |
file.write_u32((uint32_t) tensor.ne.size());
|
| 569 |
file.write_u32((uint32_t) tensor.name.size());
|
| 570 |
+
file.write_u32(new_type);
|
| 571 |
file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
|
| 572 |
file.write_raw(tensor.name.data(), tensor.name.size());
|
| 573 |
file.seek(-file.tell() & 31, SEEK_CUR);
|
|
|
|
| 647 |
throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
|
| 648 |
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
|
| 649 |
}
|
| 650 |
+
|
| 651 |
return get_tensor_for(lt);
|
| 652 |
}
|
| 653 |
|
|
|
|
| 780 |
const int n_embd = hparams.n_embd;
|
| 781 |
const int n_layer = hparams.n_layer;
|
| 782 |
|
| 783 |
+
const int64_t n_mem = n_layer*n_ctx;
|
| 784 |
const int64_t n_elements = n_embd*n_mem;
|
| 785 |
|
| 786 |
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
|
|
|
| 842 |
}
|
| 843 |
}
|
| 844 |
|
| 845 |
+
static const char *llama_ftype_name(enum llama_ftype ftype) {
|
| 846 |
+
switch (ftype) {
|
| 847 |
+
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
| 848 |
+
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
|
| 849 |
+
case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
|
| 850 |
+
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
|
| 851 |
+
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
| 852 |
+
return "mostly Q4_1, some F16";
|
| 853 |
+
case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
|
| 854 |
+
case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
|
| 855 |
+
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
|
| 856 |
+
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
| 857 |
+
default: return "unknown, may not work";
|
| 858 |
+
}
|
| 859 |
+
}
|
| 860 |
+
|
| 861 |
static const char *llama_model_type_name(e_model type) {
|
| 862 |
switch (type) {
|
| 863 |
case MODEL_7B: return "7B";
|
|
|
|
| 910 |
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
| 911 |
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
| 912 |
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
|
| 913 |
+
fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
|
| 914 |
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
| 915 |
fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
|
| 916 |
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
|
|
|
| 934 |
const size_t mem_required =
|
| 935 |
ctx_size +
|
| 936 |
mmapped_size +
|
| 937 |
+
MEM_REQ_SCRATCH0().at(model.type) +
|
| 938 |
+
MEM_REQ_SCRATCH1().at(model.type) +
|
| 939 |
+
MEM_REQ_EVAL().at(model.type);
|
| 940 |
|
| 941 |
// this is the memory required by one llama_state
|
| 942 |
const size_t mem_required_state =
|
| 943 |
+
scale*MEM_REQ_KV_SELF().at(model.type);
|
| 944 |
|
| 945 |
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
| 946 |
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
|
|
|
| 977 |
ml->ggml_ctx = ctx;
|
| 978 |
|
| 979 |
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
|
| 980 |
+
model.norm = ml->get_tensor("norm.weight", {n_embd});
|
| 981 |
+
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
|
| 982 |
|
| 983 |
model.layers.resize(n_layer);
|
| 984 |
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
|
|
| 1082 |
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
| 1083 |
// otherwise, the threads spin-lock waiting for the BLAS calls and degrade the performance
|
| 1084 |
ggml_cgraph gf = {};
|
| 1085 |
+
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
| 1086 |
|
| 1087 |
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
| 1088 |
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
|
|
|
| 1256 |
ggml_build_forward_expand(&gf, inpL);
|
| 1257 |
ggml_graph_compute (ctx0, &gf);
|
| 1258 |
|
| 1259 |
+
#ifdef GGML_PERF
|
| 1260 |
// print timing information per ggml operation (for debugging purposes)
|
| 1261 |
// requires GGML_PERF to be defined
|
| 1262 |
+
ggml_graph_print(&gf);
|
| 1263 |
+
#endif
|
| 1264 |
|
| 1265 |
// plot the computation graph in dot format (for debugging purposes)
|
| 1266 |
//if (n_past%100 == 0) {
|
|
|
|
| 1475 |
// sampling
|
| 1476 |
//
|
| 1477 |
|
| 1478 |
+
void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
|
| 1479 |
+
assert(candidates->size > 0);
|
| 1480 |
+
|
| 1481 |
+
const int64_t t_start_sample_us = ggml_time_us();
|
| 1482 |
|
| 1483 |
+
// Sort the logits in descending order
|
| 1484 |
+
if (!candidates->sorted) {
|
| 1485 |
+
std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
| 1486 |
+
return a.logit > b.logit;
|
| 1487 |
+
});
|
| 1488 |
+
candidates->sorted = true;
|
| 1489 |
+
}
|
| 1490 |
+
|
| 1491 |
+
float max_l = candidates->data[0].logit;
|
| 1492 |
+
float cum_sum = 0.0f;
|
| 1493 |
+
for (size_t i = 0; i < candidates->size; ++i) {
|
| 1494 |
+
float p = expf(candidates->data[i].logit - max_l);
|
| 1495 |
+
candidates->data[i].p = p;
|
| 1496 |
+
cum_sum += p;
|
| 1497 |
+
}
|
| 1498 |
+
for (size_t i = 0; i < candidates->size; ++i) {
|
| 1499 |
+
candidates->data[i].p /= cum_sum;
|
| 1500 |
+
}
|
| 1501 |
+
|
| 1502 |
+
if (ctx) {
|
| 1503 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1504 |
+
}
|
| 1505 |
}
|
| 1506 |
|
| 1507 |
+
void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
|
| 1508 |
+
const int64_t t_start_sample_us = ggml_time_us();
|
| 1509 |
+
|
| 1510 |
+
k = std::max(k, (int) min_keep);
|
| 1511 |
+
k = std::min(k, (int) candidates->size);
|
| 1512 |
+
|
| 1513 |
+
// Sort scores in descending order
|
| 1514 |
+
if (!candidates->sorted) {
|
| 1515 |
+
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
|
| 1516 |
+
return a.logit > b.logit;
|
| 1517 |
+
};
|
| 1518 |
+
if (k == (int) candidates->size) {
|
| 1519 |
+
std::sort(candidates->data, candidates->data + candidates->size, comp);
|
| 1520 |
+
} else {
|
| 1521 |
+
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
|
| 1522 |
}
|
| 1523 |
+
candidates->sorted = true;
|
| 1524 |
+
}
|
| 1525 |
+
candidates->size = k;
|
| 1526 |
+
|
| 1527 |
+
if (ctx) {
|
| 1528 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1529 |
}
|
| 1530 |
+
}
|
| 1531 |
|
| 1532 |
+
void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
|
| 1533 |
+
if (p >= 1.0f) {
|
| 1534 |
+
return;
|
| 1535 |
+
}
|
| 1536 |
|
| 1537 |
+
const int64_t t_start_sample_us = ggml_time_us();
|
| 1538 |
+
|
| 1539 |
+
llama_sample_softmax(ctx, candidates);
|
| 1540 |
+
|
| 1541 |
+
// Compute the cumulative probabilities
|
| 1542 |
+
float cum_sum = 0.0f;
|
| 1543 |
+
size_t last_idx = candidates->size;
|
| 1544 |
+
|
| 1545 |
+
for (size_t i = 0; i < candidates->size; ++i) {
|
| 1546 |
+
cum_sum += candidates->data[i].p;
|
| 1547 |
+
|
| 1548 |
+
// Check if the running sum is greater than p or if we have kept at least min_keep tokens
|
| 1549 |
+
if (cum_sum > p && i >= min_keep) {
|
| 1550 |
+
last_idx = i;
|
| 1551 |
+
break;
|
| 1552 |
}
|
| 1553 |
}
|
| 1554 |
|
| 1555 |
+
// Resize the output vector to keep only the top-p tokens
|
| 1556 |
+
candidates->size = last_idx;
|
| 1557 |
|
| 1558 |
+
if (ctx) {
|
| 1559 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1560 |
+
}
|
| 1561 |
+
}
|
| 1562 |
|
| 1563 |
+
void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
|
| 1564 |
+
if (z >= 1.0f || candidates->size <= 2) {
|
| 1565 |
+
return;
|
| 1566 |
}
|
| 1567 |
|
| 1568 |
+
const int64_t t_start_sample_us = ggml_time_us();
|
| 1569 |
+
|
| 1570 |
+
llama_sample_softmax(nullptr, candidates);
|
| 1571 |
+
|
| 1572 |
+
// Compute the first and second derivatives
|
| 1573 |
+
std::vector<float> first_derivatives(candidates->size - 1);
|
| 1574 |
+
std::vector<float> second_derivatives(candidates->size - 2);
|
| 1575 |
+
|
| 1576 |
+
for (size_t i = 0; i < first_derivatives.size(); ++i) {
|
| 1577 |
+
first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
|
| 1578 |
+
}
|
| 1579 |
+
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
| 1580 |
+
second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
|
| 1581 |
}
|
| 1582 |
|
| 1583 |
+
// Calculate absolute value of second derivatives
|
| 1584 |
+
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
| 1585 |
+
second_derivatives[i] = abs(second_derivatives[i]);
|
| 1586 |
+
}
|
| 1587 |
+
|
| 1588 |
+
// Normalize the second derivatives
|
| 1589 |
+
float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
|
| 1590 |
+
for (float & value : second_derivatives) {
|
| 1591 |
+
value /= second_derivatives_sum;
|
| 1592 |
+
}
|
| 1593 |
+
|
| 1594 |
+
float cum_sum = 0.0f;
|
| 1595 |
+
size_t last_idx = candidates->size;
|
| 1596 |
+
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
| 1597 |
+
cum_sum += second_derivatives[i];
|
| 1598 |
+
|
| 1599 |
+
// Check if the running sum is greater than z or if we have kept at least min_keep tokens
|
| 1600 |
+
if (cum_sum > z && i >= min_keep) {
|
| 1601 |
+
last_idx = i;
|
| 1602 |
+
break;
|
| 1603 |
}
|
| 1604 |
}
|
| 1605 |
|
| 1606 |
+
// Resize the output vector to keep only the tokens above the tail location
|
| 1607 |
+
candidates->size = last_idx;
|
| 1608 |
+
|
| 1609 |
+
if (ctx) {
|
| 1610 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1611 |
+
}
|
| 1612 |
+
}
|
| 1613 |
+
|
| 1614 |
+
|
| 1615 |
+
void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
|
| 1616 |
+
// Reference implementation:
|
| 1617 |
+
// https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
|
| 1618 |
+
if (p >= 1.0f) {
|
| 1619 |
+
return;
|
| 1620 |
+
}
|
| 1621 |
+
|
| 1622 |
+
const int64_t t_start_sample_us = ggml_time_us();
|
| 1623 |
+
|
| 1624 |
+
// Compute the softmax of logits and calculate entropy
|
| 1625 |
+
llama_sample_softmax(nullptr, candidates);
|
| 1626 |
+
|
| 1627 |
+
float entropy = 0.0f;
|
| 1628 |
+
for (size_t i = 0; i < candidates->size; ++i) {
|
| 1629 |
+
entropy += -candidates->data[i].p * logf(candidates->data[i].p);
|
| 1630 |
+
}
|
| 1631 |
+
|
| 1632 |
+
// Compute the absolute difference between negative log probability and entropy for each candidate
|
| 1633 |
+
std::vector<float> shifted_scores;
|
| 1634 |
+
for (size_t i = 0; i < candidates->size; ++i) {
|
| 1635 |
+
float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
|
| 1636 |
+
shifted_scores.push_back(shifted_score);
|
| 1637 |
+
}
|
| 1638 |
+
|
| 1639 |
+
// Sort tokens based on the shifted_scores and their corresponding indices
|
| 1640 |
+
std::vector<size_t> indices(candidates->size);
|
| 1641 |
+
std::iota(indices.begin(), indices.end(), 0);
|
| 1642 |
+
|
| 1643 |
+
std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
|
| 1644 |
+
return shifted_scores[a] < shifted_scores[b];
|
| 1645 |
+
});
|
| 1646 |
+
|
| 1647 |
+
// Compute the cumulative probabilities
|
| 1648 |
+
float cum_sum = 0.0f;
|
| 1649 |
+
size_t last_idx = indices.size();
|
| 1650 |
+
|
| 1651 |
+
for (size_t i = 0; i < indices.size(); ++i) {
|
| 1652 |
+
size_t idx = indices[i];
|
| 1653 |
+
cum_sum += candidates->data[idx].p;
|
| 1654 |
+
|
| 1655 |
+
// Check if the running sum is greater than typical or if we have kept at least min_keep tokens
|
| 1656 |
+
if (cum_sum > p && i >= min_keep - 1) {
|
| 1657 |
+
last_idx = i + 1;
|
| 1658 |
+
break;
|
| 1659 |
+
}
|
| 1660 |
+
}
|
| 1661 |
+
|
| 1662 |
+
// Resize the output vector to keep only the locally typical tokens
|
| 1663 |
+
std::vector<llama_token_data> new_candidates;
|
| 1664 |
+
for (size_t i = 0; i < last_idx; ++i) {
|
| 1665 |
+
size_t idx = indices[i];
|
| 1666 |
+
new_candidates.push_back(candidates->data[idx]);
|
| 1667 |
+
}
|
| 1668 |
+
|
| 1669 |
+
// Replace the data in candidates with the new_candidates data
|
| 1670 |
+
std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
|
| 1671 |
+
candidates->size = new_candidates.size();
|
| 1672 |
+
|
| 1673 |
+
if (ctx) {
|
| 1674 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1675 |
+
}
|
| 1676 |
+
}
|
| 1677 |
+
|
| 1678 |
+
void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
| 1679 |
+
const int64_t t_start_sample_us = ggml_time_us();
|
| 1680 |
+
|
| 1681 |
+
for (size_t i = 0; i < candidates_p->size; ++i) {
|
| 1682 |
+
candidates_p->data[i].logit /= temp;
|
| 1683 |
+
}
|
| 1684 |
+
|
| 1685 |
+
if (ctx) {
|
| 1686 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1687 |
+
}
|
| 1688 |
+
}
|
| 1689 |
+
|
| 1690 |
+
void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty) {
|
| 1691 |
+
if (last_tokens_size == 0 || penalty == 1.0f) {
|
| 1692 |
+
return;
|
| 1693 |
+
}
|
| 1694 |
+
|
| 1695 |
+
const int64_t t_start_sample_us = ggml_time_us();
|
| 1696 |
+
|
| 1697 |
+
for (size_t i = 0; i < candidates->size; ++i) {
|
| 1698 |
+
auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
|
| 1699 |
+
if (token_iter == last_tokens + last_tokens_size) {
|
| 1700 |
+
continue;
|
| 1701 |
+
}
|
| 1702 |
+
|
| 1703 |
+
// The academic publication that described this technique only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
|
| 1704 |
+
// The common fix for this problem is to multiply by the penalty instead of dividing.
|
| 1705 |
+
if (candidates->data[i].logit <= 0) {
|
| 1706 |
+
candidates->data[i].logit *= penalty;
|
| 1707 |
+
} else {
|
| 1708 |
+
candidates->data[i].logit /= penalty;
|
| 1709 |
+
}
|
| 1710 |
+
}
|
| 1711 |
+
|
| 1712 |
+
candidates->sorted = false;
|
| 1713 |
+
|
| 1714 |
+
if (ctx) {
|
| 1715 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1716 |
+
}
|
| 1717 |
+
}
|
| 1718 |
+
|
| 1719 |
+
void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
|
| 1720 |
+
if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
|
| 1721 |
+
return;
|
| 1722 |
+
}
|
| 1723 |
+
|
| 1724 |
+
const int64_t t_start_sample_us = ggml_time_us();
|
| 1725 |
+
|
| 1726 |
+
// Create a frequency map to count occurrences of each token in last_tokens
|
| 1727 |
+
std::unordered_map<llama_token, int> token_count;
|
| 1728 |
+
for (size_t i = 0; i < last_tokens_size; ++i) {
|
| 1729 |
+
token_count[last_tokens_p[i]]++;
|
| 1730 |
+
}
|
| 1731 |
+
|
| 1732 |
+
// Apply frequency and presence penalties to the candidates
|
| 1733 |
+
for (size_t i = 0; i < candidates->size; ++i) {
|
| 1734 |
+
auto token_iter = token_count.find(candidates->data[i].id);
|
| 1735 |
+
if (token_iter == token_count.end()) {
|
| 1736 |
+
continue;
|
| 1737 |
+
}
|
| 1738 |
+
|
| 1739 |
+
int count = token_iter->second;
|
| 1740 |
+
candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
|
| 1741 |
+
}
|
| 1742 |
+
|
| 1743 |
+
candidates->sorted = false;
|
| 1744 |
+
|
| 1745 |
+
if (ctx) {
|
| 1746 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1747 |
+
}
|
| 1748 |
+
}
|
| 1749 |
+
|
| 1750 |
+
|
| 1751 |
+
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
|
| 1752 |
+
assert(ctx);
|
| 1753 |
+
auto N = float(llama_n_vocab(ctx));
|
| 1754 |
+
int64_t t_start_sample_us;
|
| 1755 |
+
t_start_sample_us = ggml_time_us();
|
| 1756 |
+
|
| 1757 |
+
llama_sample_softmax(nullptr, candidates);
|
| 1758 |
+
|
| 1759 |
+
// Estimate s_hat using the most probable m tokens
|
| 1760 |
+
float s_hat = 0.0;
|
| 1761 |
+
float sum_ti_bi = 0.0;
|
| 1762 |
+
float sum_ti_sq = 0.0;
|
| 1763 |
+
for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
|
| 1764 |
+
float t_i = logf(float(i + 2) / float(i + 1));
|
| 1765 |
+
float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
|
| 1766 |
+
sum_ti_bi += t_i * b_i;
|
| 1767 |
+
sum_ti_sq += t_i * t_i;
|
| 1768 |
+
}
|
| 1769 |
+
s_hat = sum_ti_bi / sum_ti_sq;
|
| 1770 |
+
|
| 1771 |
+
// Compute k from the estimated s_hat and target surprise value
|
| 1772 |
+
float epsilon_hat = s_hat - 1;
|
| 1773 |
+
float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
|
| 1774 |
+
|
| 1775 |
+
// Sample the next word X using top-k sampling
|
| 1776 |
+
llama_sample_top_k(nullptr, candidates, int(k));
|
| 1777 |
+
if (ctx) {
|
| 1778 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1779 |
+
}
|
| 1780 |
+
llama_token X = llama_sample_token(ctx, candidates);
|
| 1781 |
+
t_start_sample_us = ggml_time_us();
|
| 1782 |
+
|
| 1783 |
+
// Compute error as the difference between observed surprise and target surprise value
|
| 1784 |
+
size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
| 1785 |
+
return candidate.id == X;
|
| 1786 |
+
}));
|
| 1787 |
+
float observed_surprise = -log2f(candidates->data[X_idx].p);
|
| 1788 |
+
float e = observed_surprise - tau;
|
| 1789 |
+
|
| 1790 |
+
// Update mu using the learning rate and error
|
| 1791 |
+
*mu = *mu - eta * e;
|
| 1792 |
+
|
| 1793 |
+
if (ctx) {
|
| 1794 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1795 |
+
ctx->n_sample++;
|
| 1796 |
+
}
|
| 1797 |
+
return X;
|
| 1798 |
+
}
|
| 1799 |
+
|
| 1800 |
+
llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
|
| 1801 |
+
assert(ctx);
|
| 1802 |
+
int64_t t_start_sample_us;
|
| 1803 |
+
t_start_sample_us = ggml_time_us();
|
| 1804 |
+
|
| 1805 |
+
llama_sample_softmax(ctx, candidates);
|
| 1806 |
+
|
| 1807 |
+
// Truncate the words with surprise values greater than mu
|
| 1808 |
+
candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
| 1809 |
+
return -log2f(candidate.p) > *mu;
|
| 1810 |
+
}));
|
| 1811 |
+
|
| 1812 |
+
// Normalize the probabilities of the remaining words
|
| 1813 |
+
llama_sample_softmax(ctx, candidates);
|
| 1814 |
+
|
| 1815 |
+
// Sample the next word X from the remaining words
|
| 1816 |
+
if (ctx) {
|
| 1817 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1818 |
+
}
|
| 1819 |
+
llama_token X = llama_sample_token(ctx, candidates);
|
| 1820 |
+
t_start_sample_us = ggml_time_us();
|
| 1821 |
+
|
| 1822 |
+
// Compute error as the difference between observed surprise and target surprise value
|
| 1823 |
+
size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
| 1824 |
+
return candidate.id == X;
|
| 1825 |
+
}));
|
| 1826 |
+
float observed_surprise = -log2f(candidates->data[X_idx].p);
|
| 1827 |
+
float e = observed_surprise - tau;
|
| 1828 |
+
|
| 1829 |
+
// Update mu using the learning rate and error
|
| 1830 |
+
*mu = *mu - eta * e;
|
| 1831 |
+
|
| 1832 |
+
if (ctx) {
|
| 1833 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1834 |
+
}
|
| 1835 |
+
return X;
|
| 1836 |
+
}
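A minimal sketch of how a caller might drive the Mirostat 2.0 sampler above; `ctx` and `candidates_p` are assumed to be a loaded context and a candidate array built from the current logits, and the tau/eta values are only common starting points, not prescribed by this code:

    float tau = 5.0f;       // target surprise (cross-entropy)
    float eta = 0.1f;       // learning rate for mu
    float mu  = 2.0f * tau; // running maximum cross-entropy, updated by the sampler

    llama_token id = llama_sample_token_mirostat_v2(ctx, &candidates_p, tau, eta, &mu);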
|
| 1837 |
+
|
| 1838 |
+
llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
|
| 1839 |
+
const int64_t t_start_sample_us = ggml_time_us();
|
| 1840 |
+
|
| 1841 |
+
// Find max element
|
| 1842 |
+
auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
| 1843 |
+
return a.logit < b.logit;
|
| 1844 |
+
});
|
| 1845 |
+
|
| 1846 |
+
llama_token result = max_iter->id;
|
| 1847 |
+
if (ctx) {
|
| 1848 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1849 |
+
ctx->n_sample++;
|
| 1850 |
+
}
|
| 1851 |
+
return result;
|
| 1852 |
+
}
|
| 1853 |
+
|
| 1854 |
+
llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
|
| 1855 |
+
assert(ctx);
|
| 1856 |
+
const int64_t t_start_sample_us = ggml_time_us();
|
| 1857 |
+
llama_sample_softmax(nullptr, candidates);
|
| 1858 |
+
|
| 1859 |
+
std::vector<float> probs;
|
| 1860 |
+
probs.reserve(candidates->size);
|
| 1861 |
+
for (size_t i = 0; i < candidates->size; ++i) {
|
| 1862 |
+
probs.push_back(candidates->data[i].p);
|
| 1863 |
+
}
|
| 1864 |
|
| 1865 |
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
| 1866 |
+
auto & rng = ctx->rng;
|
| 1867 |
int idx = dist(rng);
|
| 1868 |
|
| 1869 |
+
llama_token result = candidates->data[idx].id;
|
| 1870 |
+
|
| 1871 |
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
| 1872 |
+
ctx->n_sample++;
|
| 1873 |
+
return result;
|
| 1874 |
}
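Taken together, the functions above form a small pipeline over a llama_token_data_array. A minimal sketch of a typical call sequence (the candidate-building loop, `last_tokens`, and the parameter values are illustrative assumptions, not part of this change):

    // build the candidate array from the logits of the last evaluated token
    std::vector<llama_token_data> candidates;
    candidates.reserve(llama_n_vocab(ctx));
    const float * logits = llama_get_logits(ctx);
    for (llama_token id = 0; id < llama_n_vocab(ctx); ++id) {
        candidates.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    // penalize recent tokens, truncate the tail, then sample
    llama_sample_repetition_penalty(ctx, &candidates_p, last_tokens.data(), last_tokens.size(), 1.10f);
    llama_sample_top_k      (ctx, &candidates_p, 40, 1);
    llama_sample_top_p      (ctx, &candidates_p, 0.95f, 1);
    llama_sample_temperature(ctx, &candidates_p, 0.80f);
    llama_token id = llama_sample_token(ctx, &candidates_p);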
|
| 1875 |
|
| 1876 |
//
|
| 1877 |
// quantization
|
| 1878 |
//
|
| 1879 |
|
| 1880 |
+
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
|
| 1881 |
ggml_type quantized_type;
|
| 1882 |
+
switch (ftype) {
|
| 1883 |
+
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
| 1884 |
+
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
| 1885 |
+
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
|
| 1886 |
+
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
| 1887 |
+
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
| 1888 |
+
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
| 1889 |
+
default: throw format("invalid output file type %d\n", ftype);
|
| 1890 |
};
|
| 1891 |
|
| 1892 |
+
if (nthread <= 0) {
|
| 1893 |
+
nthread = std::thread::hardware_concurrency();
|
| 1894 |
+
}
|
| 1895 |
+
|
| 1896 |
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
|
| 1897 |
/*vocab_only*/ false));
|
| 1898 |
+
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
|
| 1899 |
|
| 1900 |
size_t total_size_org = 0;
|
| 1901 |
size_t total_size_new = 0;
|
| 1902 |
std::vector<int64_t> hist_all(1 << 4, 0);
|
| 1903 |
|
| 1904 |
+
std::vector<std::thread> workers;
|
| 1905 |
+
std::mutex mutex;
|
| 1906 |
+
|
| 1907 |
size_t idx = 0;
|
| 1908 |
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
|
| 1909 |
llama_buffer read_data;
|
|
|
|
| 1911 |
tensor.data = read_data.addr;
|
| 1912 |
model_loader->load_data_for(tensor);
|
| 1913 |
|
| 1914 |
+
printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
|
| 1915 |
++idx, model_loader->tensors_map.tensors.size(),
|
| 1916 |
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
|
| 1917 |
+
ggml_type_name(tensor.type));
|
| 1918 |
|
| 1919 |
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
| 1920 |
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
|
|
|
|
| 1922 |
// quantize only 2D tensors
|
| 1923 |
quantize &= (tensor.ne.size() == 2);
|
| 1924 |
|
| 1925 |
+
// uncomment this to keep the output layer in FP16
|
| 1926 |
+
//if (tensor.name == "output.weight") {
|
| 1927 |
+
// quantize = false;
|
| 1928 |
+
//}
|
| 1929 |
+
|
| 1930 |
enum ggml_type new_type;
|
| 1931 |
void * new_data;
|
| 1932 |
size_t new_size;
|
|
|
|
| 1952 |
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
| 1953 |
}
|
| 1954 |
} else {
|
| 1955 |
+
throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
|
| 1956 |
}
|
| 1957 |
|
| 1958 |
printf("quantizing .. ");
|
|
|
|
| 1962 |
new_data = work.addr;
|
| 1963 |
std::vector<int64_t> hist_cur(1 << 4, 0);
|
| 1964 |
|
| 1965 |
+
int chunk_size = 32 * 512;
|
| 1966 |
+
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
|
| 1967 |
+
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
| 1968 |
+
if (nthread_use < 2) {
|
| 1969 |
+
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
|
| 1970 |
+
} else {
|
| 1971 |
+
size_t counter = 0;
|
| 1972 |
+
new_size = 0;
|
| 1973 |
+
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
|
| 1974 |
+
std::vector<int64_t> local_hist;
|
| 1975 |
+
size_t local_size = 0;
|
| 1976 |
+
while (true) {
|
| 1977 |
+
std::unique_lock<std::mutex> lock(mutex);
|
| 1978 |
+
size_t first = counter; counter += chunk_size;
|
| 1979 |
+
if (first >= nelements) {
|
| 1980 |
+
if (!local_hist.empty()) {
|
| 1981 |
+
for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
|
| 1982 |
+
new_size += local_size;
|
| 1983 |
+
}
|
| 1984 |
+
break;
|
| 1985 |
+
}
|
| 1986 |
+
lock.unlock();
|
| 1987 |
+
size_t last = std::min(nelements, first + chunk_size);
|
| 1988 |
+
if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
|
| 1989 |
+
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
|
| 1990 |
+
}
|
| 1991 |
+
};
|
| 1992 |
+
if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
|
| 1993 |
+
for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
|
| 1994 |
+
compute();
|
| 1995 |
+
for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
|
| 1996 |
}
|
| 1997 |
|
| 1998 |
printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
|
|
|
|
| 2091 |
if (params.logits_all) {
|
| 2092 |
ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
|
| 2093 |
} else {
|
| 2094 |
+
ctx->logits.reserve(hparams.n_vocab);
|
| 2095 |
}
|
| 2096 |
|
| 2097 |
if (params.embedding){
|
| 2098 |
ctx->embedding.resize(hparams.n_embd);
|
| 2099 |
}
|
| 2100 |
|
| 2101 |
+
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
|
| 2102 |
|
| 2103 |
+
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
|
| 2104 |
+
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
| 2105 |
}
|
| 2106 |
|
| 2107 |
return ctx;
|
|
|
|
| 2114 |
int llama_model_quantize(
|
| 2115 |
const char * fname_inp,
|
| 2116 |
const char * fname_out,
|
| 2117 |
+
enum llama_ftype ftype,
|
| 2118 |
+
int nthread) {
|
| 2119 |
try {
|
| 2120 |
+
llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
|
| 2121 |
return 0;
|
| 2122 |
} catch (const std::string & err) {
|
| 2123 |
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
|
|
|
|
| 2125 |
}
|
| 2126 |
}
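A minimal sketch of driving the quantization entry point above; the file names are placeholders, and the actual `quantize` tools handle argument parsing and ftype selection:

    #include "llama.h"
    #include <cstdio>

    int main() {
        const char * fname_inp = "models/7B/ggml-model-f16.bin";  // placeholder paths
        const char * fname_out = "models/7B/ggml-model-q5_1.bin";

        // nthread <= 0 falls back to std::thread::hardware_concurrency()
        if (llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q5_1, 0) != 0) {
            std::fprintf(stderr, "failed to quantize '%s'\n", fname_inp);
            return 1;
        }
        return 0;
    }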
|
| 2127 |
|
| 2128 |
+
int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
| 2129 |
+
fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
| 2130 |
+
|
| 2131 |
+
auto & model = ctx->model;
|
| 2132 |
+
|
| 2133 |
+
const int64_t t_start_lora_us = ggml_time_us();
|
| 2134 |
+
|
| 2135 |
+
auto fin = std::ifstream(path_lora, std::ios::binary);
|
| 2136 |
+
if (!fin) {
|
| 2137 |
+
fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
|
| 2138 |
+
return 1;
|
| 2139 |
+
}
|
| 2140 |
+
|
| 2141 |
+
// verify magic and version
|
| 2142 |
+
{
|
| 2143 |
+
uint32_t magic;
|
| 2144 |
+
fin.read((char *) &magic, sizeof(magic));
|
| 2145 |
+
if (magic != 'ggla') {
|
| 2146 |
+
fprintf(stderr, "%s: bad file magic\n", __func__);
|
| 2147 |
+
return 1;
|
| 2148 |
+
}
|
| 2149 |
+
uint32_t format_version;
|
| 2150 |
+
fin.read((char *) &format_version, sizeof(format_version));
|
| 2151 |
+
|
| 2152 |
+
if (format_version != 1) {
|
| 2153 |
+
fprintf(stderr, "%s: unsupported file version\n", __func__ );
|
| 2154 |
+
return 1;
|
| 2155 |
+
}
|
| 2156 |
+
}
|
| 2157 |
+
|
| 2158 |
+
int32_t lora_r;
|
| 2159 |
+
int32_t lora_alpha;
|
| 2160 |
+
fin.read((char *) &lora_r, sizeof(lora_r));
|
| 2161 |
+
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
| 2162 |
+
float scaling = (float)lora_alpha / (float)lora_r;
|
| 2163 |
+
|
| 2164 |
+
fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
| 2165 |
+
|
| 2166 |
+
|
| 2167 |
+
// create a temporary ggml context to store the lora tensors
|
| 2168 |
+
// todo: calculate size from biggest possible tensor
|
| 2169 |
+
std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
|
| 2170 |
+
struct ggml_init_params params;
|
| 2171 |
+
params.mem_size = lora_buf.size();
|
| 2172 |
+
params.mem_buffer = lora_buf.data();
|
| 2173 |
+
params.no_alloc = false;
|
| 2174 |
+
|
| 2175 |
+
ggml_context * lora_ctx = ggml_init(params);
|
| 2176 |
+
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
| 2177 |
+
|
| 2178 |
+
// create a name -> tensor map of the model to accelerate lookups
|
| 2179 |
+
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
| 2180 |
+
for (auto & kv: model.tensors_by_name) {
|
| 2181 |
+
model_tensors.insert(kv);
|
| 2182 |
+
}
|
| 2183 |
+
|
| 2184 |
+
|
| 2185 |
+
// load base model
|
| 2186 |
+
std::unique_ptr<llama_model_loader> model_loader;
|
| 2187 |
+
ggml_context * base_ctx = NULL;
|
| 2188 |
+
llama_buffer base_buf;
|
| 2189 |
+
if (path_base_model) {
|
| 2190 |
+
fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
|
| 2191 |
+
model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
|
| 2192 |
+
|
| 2193 |
+
size_t ctx_size, mmapped_size;
|
| 2194 |
+
model_loader->calc_sizes(&ctx_size, &mmapped_size);
|
| 2195 |
+
base_buf.resize(ctx_size);
|
| 2196 |
+
|
| 2197 |
+
ggml_init_params base_params;
|
| 2198 |
+
base_params.mem_size = base_buf.size;
|
| 2199 |
+
base_params.mem_buffer = base_buf.addr;
|
| 2200 |
+
base_params.no_alloc = model_loader->use_mmap;
|
| 2201 |
+
|
| 2202 |
+
base_ctx = ggml_init(base_params);
|
| 2203 |
+
|
| 2204 |
+
model_loader->ggml_ctx = base_ctx;
|
| 2205 |
+
|
| 2206 |
+
// maybe this should be in llama_model_loader
|
| 2207 |
+
if (model_loader->use_mmap) {
|
| 2208 |
+
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
|
| 2209 |
+
}
|
| 2210 |
+
}
|
| 2211 |
+
|
| 2212 |
+
// read tensors and apply
|
| 2213 |
+
bool warned = false;
|
| 2214 |
+
int n_tensors = 0;
|
| 2215 |
+
while (true) {
|
| 2216 |
+
int32_t n_dims;
|
| 2217 |
+
int32_t length;
|
| 2218 |
+
int32_t ftype;
|
| 2219 |
+
|
| 2220 |
+
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
| 2221 |
+
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
| 2222 |
+
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
| 2223 |
+
if (fin.eof()) {
|
| 2224 |
+
break;
|
| 2225 |
+
}
|
| 2226 |
+
|
| 2227 |
+
int32_t ne[2] = { 1, 1 };
|
| 2228 |
+
for (int i = 0; i < n_dims; ++i) {
|
| 2229 |
+
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
| 2230 |
+
}
|
| 2231 |
+
|
| 2232 |
+
std::string name(length, 0);
|
| 2233 |
+
fin.read(&name[0], length);
|
| 2234 |
+
|
| 2235 |
+
// check for lora suffix and get the type of tensor
|
| 2236 |
+
const std::string lora_suffix = ".lora";
|
| 2237 |
+
size_t pos = name.rfind(lora_suffix);
|
| 2238 |
+
if (pos == std::string::npos) {
|
| 2239 |
+
fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
| 2240 |
+
return 1;
|
| 2241 |
+
}
|
| 2242 |
+
|
| 2243 |
+
std::string lora_type = name.substr(pos + lora_suffix.length());
|
| 2244 |
+
std::string base_name = name;
|
| 2245 |
+
base_name.erase(pos);
|
| 2246 |
+
// fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
| 2247 |
+
|
| 2248 |
+
if (model_tensors.find(base_name.data()) == model_tensors.end()) {
|
| 2249 |
+
fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
| 2250 |
+
return 1;
|
| 2251 |
+
}
|
| 2252 |
+
|
| 2253 |
+
// create ggml tensor
|
| 2254 |
+
ggml_type wtype;
|
| 2255 |
+
switch (ftype) {
|
| 2256 |
+
case 0: wtype = GGML_TYPE_F32; break;
|
| 2257 |
+
case 1: wtype = GGML_TYPE_F16; break;
|
| 2258 |
+
default:
|
| 2259 |
+
{
|
| 2260 |
+
fprintf(stderr, "%s: invalid tensor data type '%d'\n",
|
| 2261 |
+
__func__, ftype);
|
| 2262 |
+
return false;
|
| 2263 |
+
}
|
| 2264 |
+
}
|
| 2265 |
+
ggml_tensor* lora_tensor;
|
| 2266 |
+
if (n_dims == 2) {
|
| 2267 |
+
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
|
| 2268 |
+
}
|
| 2269 |
+
else {
|
| 2270 |
+
fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
| 2271 |
+
return 1;
|
| 2272 |
+
}
|
| 2273 |
+
|
| 2274 |
+
// load tensor data
|
| 2275 |
+
size_t offset = fin.tellg();
|
| 2276 |
+
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
| 2277 |
+
offset = (offset + 31) & -32;
|
| 2278 |
+
fin.seekg(offset);
|
| 2279 |
+
fin.read((char*)lora_tensor->data, tensor_data_size);
|
| 2280 |
+
|
| 2281 |
+
lora_tensors[name] = lora_tensor;
|
| 2282 |
+
|
| 2283 |
+
// check if we have both A and B tensors and apply
|
| 2284 |
+
if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
|
| 2285 |
+
lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
|
| 2286 |
+
|
| 2287 |
+
ggml_tensor * dest_t = model_tensors[base_name];
|
| 2288 |
+
ggml_tensor * base_t;
|
| 2289 |
+
if (model_loader) {
|
| 2290 |
+
// load from base model
|
| 2291 |
+
if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
|
| 2292 |
+
fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
| 2293 |
+
return 1;
|
| 2294 |
+
}
|
| 2295 |
+
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
|
| 2296 |
+
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
|
| 2297 |
+
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
|
| 2298 |
+
lt.data = (uint8_t *) lt.ggml_tensor->data;
|
| 2299 |
+
model_loader->load_data_for(lt);
|
| 2300 |
+
lt.ggml_tensor->data = lt.data;
|
| 2301 |
+
}
|
| 2302 |
+
else {
|
| 2303 |
+
base_t = dest_t;
|
| 2304 |
+
}
|
| 2305 |
+
|
| 2306 |
+
if (ggml_is_quantized(base_t->type)) {
|
| 2307 |
+
if (!warned) {
|
| 2308 |
+
fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
| 2309 |
+
"use a f16 or f32 base model with --lora-base\n", __func__);
|
| 2310 |
+
warned = true;
|
| 2311 |
+
}
|
| 2312 |
+
}
|
| 2313 |
+
|
| 2314 |
+
ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
|
| 2315 |
+
ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
|
| 2316 |
+
|
| 2317 |
+
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
| 2318 |
+
fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
| 2319 |
+
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
| 2320 |
+
return 1;
|
| 2321 |
+
}
|
| 2322 |
+
|
| 2323 |
+
// w = w + BA*s
|
| 2324 |
+
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
| 2325 |
+
|
| 2326 |
+
if (scaling != 1.0f) {
|
| 2327 |
+
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
|
| 2328 |
+
BA = ggml_scale(lora_ctx, BA, scale_tensor);
|
| 2329 |
+
}
|
| 2330 |
+
|
| 2331 |
+
ggml_tensor * r;
|
| 2332 |
+
if (base_t == dest_t) {
|
| 2333 |
+
r = ggml_add_inplace(lora_ctx, dest_t, BA);
|
| 2334 |
+
}
|
| 2335 |
+
else {
|
| 2336 |
+
r = ggml_add(lora_ctx, base_t, BA);
|
| 2337 |
+
r = ggml_cpy(lora_ctx, r, dest_t);
|
| 2338 |
+
}
|
| 2339 |
+
|
| 2340 |
+
struct ggml_cgraph gf = ggml_build_forward(r);
|
| 2341 |
+
gf.n_threads = n_threads;
|
| 2342 |
+
ggml_graph_compute(lora_ctx, &gf);
|
| 2343 |
+
|
| 2344 |
+
// we won't need these tensors again, reset the context to save memory
|
| 2345 |
+
ggml_free(lora_ctx);
|
| 2346 |
+
lora_ctx = ggml_init(params);
|
| 2347 |
+
lora_tensors.clear();
|
| 2348 |
+
|
| 2349 |
+
n_tensors++;
|
| 2350 |
+
if (n_tensors % 4 == 0)
|
| 2351 |
+
fprintf(stderr, ".");
|
| 2352 |
+
}
|
| 2353 |
+
}
|
| 2354 |
+
|
| 2355 |
+
// TODO: this should be in a destructor, it will leak on failure
|
| 2356 |
+
ggml_free(lora_ctx);
|
| 2357 |
+
if (base_ctx) {
|
| 2358 |
+
ggml_free(base_ctx);
|
| 2359 |
+
}
|
| 2360 |
+
|
| 2361 |
+
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
| 2362 |
+
fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
|
| 2363 |
+
|
| 2364 |
+
return 0;
|
| 2365 |
}
|
| 2366 |
|
| 2367 |
+
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
| 2368 |
+
try {
|
| 2369 |
+
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
|
| 2370 |
+
} catch (const std::string & err) {
|
| 2371 |
+
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
|
| 2372 |
+
return 1;
|
| 2373 |
+
}
|
| 2374 |
}
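A minimal sketch of applying a LoRA adapter once a context has been created; the paths are placeholders, and as the header notes, the model should be reloaded before applying a different adapter:

    // ctx is assumed to come from llama_init_from_file(...)
    const char * lora_path = "lora/ggml-adapter-model.bin";   // placeholder
    const char * base_path = "models/7B/ggml-model-f16.bin";  // f16/f32 base recommended for quantized models

    if (llama_apply_lora_from_file(ctx, lora_path, base_path, /*n_threads =*/ 4) != 0) {
        fprintf(stderr, "failed to apply lora adapter\n");
    }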
|
| 2375 |
|
| 2376 |
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
|
| 2377 |
return ctx->model.kv_self.n;
|
| 2378 |
}
|
| 2379 |
|
| 2380 |
+
#define LLAMA_MAX_RNG_STATE 64*1024
|
| 2381 |
+
|
| 2382 |
+
void llama_set_rng_seed(struct llama_context * ctx, int seed) {
|
| 2383 |
+
if (seed <= 0) {
|
| 2384 |
+
seed = time(NULL);
|
| 2385 |
+
}
|
| 2386 |
+
ctx->rng.seed(seed);
|
| 2387 |
+
}
|
| 2388 |
+
|
| 2389 |
+
// Returns the size of the state
|
| 2390 |
+
size_t llama_get_state_size(struct llama_context * ctx) {
|
| 2391 |
+
// we don't know the size of the rng until we actually serialize it, so reserve more than enough memory for its serialized state.
|
| 2392 |
+
// for reference, std::mt19937(1337) serializes to 6701 bytes.
|
| 2393 |
+
const size_t s_rng_size = sizeof(size_t);
|
| 2394 |
+
const size_t s_rng = LLAMA_MAX_RNG_STATE;
|
| 2395 |
+
const size_t s_logits_capacity = sizeof(size_t);
|
| 2396 |
+
const size_t s_logits_size = sizeof(size_t);
|
| 2397 |
+
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
|
| 2398 |
+
const size_t s_embedding_size = sizeof(size_t);
|
| 2399 |
+
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
|
| 2400 |
+
const size_t s_kv_size = sizeof(size_t);
|
| 2401 |
+
const size_t s_kv_ntok = sizeof(int);
|
| 2402 |
+
const size_t s_kv = ctx->model.kv_self.buf.size;
|
| 2403 |
+
|
| 2404 |
+
const size_t s_total = (
|
| 2405 |
+
+ s_rng_size
|
| 2406 |
+
+ s_rng
|
| 2407 |
+
+ s_logits_capacity
|
| 2408 |
+
+ s_logits_size
|
| 2409 |
+
+ s_logits
|
| 2410 |
+
+ s_embedding_size
|
| 2411 |
+
+ s_embedding
|
| 2412 |
+
+ s_kv_size
|
| 2413 |
+
+ s_kv_ntok
|
| 2414 |
+
+ s_kv
|
| 2415 |
+
);
|
| 2416 |
+
|
| 2417 |
+
return s_total;
|
| 2418 |
+
}
|
| 2419 |
+
|
| 2420 |
+
// Copies the state to the specified destination address
|
| 2421 |
+
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
|
| 2422 |
+
uint8_t * out = dest;
|
| 2423 |
+
|
| 2424 |
+
// copy rng
|
| 2425 |
+
{
|
| 2426 |
+
std::stringstream rng_ss;
|
| 2427 |
+
rng_ss << ctx->rng;
|
| 2428 |
+
|
| 2429 |
+
const size_t rng_size = rng_ss.str().size();
|
| 2430 |
+
char rng_buf[LLAMA_MAX_RNG_STATE];
|
| 2431 |
+
|
| 2432 |
+
memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
|
| 2433 |
+
memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
|
| 2434 |
+
|
| 2435 |
+
memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
|
| 2436 |
+
memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
|
| 2437 |
+
}
|
| 2438 |
+
|
| 2439 |
+
// copy logits
|
| 2440 |
+
{
|
| 2441 |
+
const size_t logits_cap = ctx->logits.capacity();
|
| 2442 |
+
const size_t logits_size = ctx->logits.size();
|
| 2443 |
+
|
| 2444 |
+
memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
|
| 2445 |
+
memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
|
| 2446 |
+
|
| 2447 |
+
if (logits_size) {
|
| 2448 |
+
memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
|
| 2449 |
+
}
|
| 2450 |
+
|
| 2451 |
+
out += logits_cap * sizeof(float);
|
| 2452 |
+
}
|
| 2453 |
+
|
| 2454 |
+
// copy embeddings
|
| 2455 |
+
{
|
| 2456 |
+
const size_t embedding_size = ctx->embedding.size();
|
| 2457 |
+
|
| 2458 |
+
memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
|
| 2459 |
+
|
| 2460 |
+
if (embedding_size) {
|
| 2461 |
+
memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
|
| 2462 |
+
out += embedding_size * sizeof(float);
|
| 2463 |
+
}
|
| 2464 |
+
}
|
| 2465 |
+
|
| 2466 |
+
// copy kv cache
|
| 2467 |
+
{
|
| 2468 |
+
const size_t kv_size = ctx->model.kv_self.buf.size;
|
| 2469 |
+
const int kv_ntok = llama_get_kv_cache_token_count(ctx);
|
| 2470 |
+
|
| 2471 |
+
memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
|
| 2472 |
+
memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
|
| 2473 |
+
|
| 2474 |
+
if (kv_size) {
|
| 2475 |
+
memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
|
| 2476 |
+
}
|
| 2477 |
+
}
|
| 2478 |
+
|
| 2479 |
+
const size_t written = out - dest;
|
| 2480 |
+
const size_t expected = llama_get_state_size(ctx);
|
| 2481 |
+
|
| 2482 |
+
LLAMA_ASSERT(written == expected);
|
| 2483 |
+
|
| 2484 |
+
return written;
|
| 2485 |
+
}
|
| 2486 |
+
|
| 2487 |
+
// Sets the state reading from the specified source address
|
| 2488 |
+
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
| 2489 |
+
const uint8_t * in = src;
|
| 2490 |
+
|
| 2491 |
+
// set rng
|
| 2492 |
+
{
|
| 2493 |
+
size_t rng_size;
|
| 2494 |
+
char rng_buf[LLAMA_MAX_RNG_STATE];
|
| 2495 |
+
|
| 2496 |
+
memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
|
| 2497 |
+
memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
|
| 2498 |
+
|
| 2499 |
+
std::stringstream rng_ss;
|
| 2500 |
+
rng_ss.str(std::string(&rng_buf[0], rng_size));
|
| 2501 |
+
rng_ss >> ctx->rng;
|
| 2502 |
+
|
| 2503 |
+
LLAMA_ASSERT(rng_ss.fail() == false);
|
| 2504 |
+
}
|
| 2505 |
+
|
| 2506 |
+
// set logits
|
| 2507 |
+
{
|
| 2508 |
+
size_t logits_cap;
|
| 2509 |
+
size_t logits_size;
|
| 2510 |
+
|
| 2511 |
+
memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
|
| 2512 |
+
memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
|
| 2513 |
+
|
| 2514 |
+
LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
|
| 2515 |
+
|
| 2516 |
+
if (logits_size) {
|
| 2517 |
+
ctx->logits.resize(logits_size);
|
| 2518 |
+
memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
|
| 2519 |
+
}
|
| 2520 |
+
|
| 2521 |
+
in += logits_cap * sizeof(float);
|
| 2522 |
+
}
|
| 2523 |
+
|
| 2524 |
+
// set embeddings
|
| 2525 |
+
{
|
| 2526 |
+
size_t embedding_size;
|
| 2527 |
+
|
| 2528 |
+
memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
|
| 2529 |
+
|
| 2530 |
+
LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
|
| 2531 |
+
|
| 2532 |
+
if (embedding_size) {
|
| 2533 |
+
memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
|
| 2534 |
+
in += embedding_size * sizeof(float);
|
| 2535 |
+
}
|
| 2536 |
+
}
|
| 2537 |
+
|
| 2538 |
+
// set kv cache
|
| 2539 |
+
{
|
| 2540 |
+
size_t kv_size;
|
| 2541 |
+
int kv_ntok;
|
| 2542 |
+
|
| 2543 |
+
memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
|
| 2544 |
+
memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
|
| 2545 |
+
|
| 2546 |
+
if (kv_size) {
|
| 2547 |
+
LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
|
| 2548 |
+
|
| 2549 |
+
void * k_data = ctx->model.kv_self.k->data; // remember data pointers
|
| 2550 |
+
void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
|
| 2551 |
+
|
| 2552 |
+
memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
|
| 2553 |
+
|
| 2554 |
+
ctx->model.kv_self.k->data = k_data; // restore correct data pointers
|
| 2555 |
+
ctx->model.kv_self.v->data = v_data;
|
| 2556 |
+
|
| 2557 |
+
}
|
| 2558 |
+
|
| 2559 |
+
ctx->model.kv_self.n = kv_ntok;
|
| 2560 |
+
}
|
| 2561 |
+
|
| 2562 |
+
const size_t nread = in - src;
|
| 2563 |
+
const size_t expected = llama_get_state_size(ctx);
|
| 2564 |
+
|
| 2565 |
+
LLAMA_ASSERT(nread == expected);
|
| 2566 |
+
|
| 2567 |
+
return nread;
|
| 2568 |
}
|
| 2569 |
|
| 2570 |
int llama_eval(
|
|
|
|
| 2641 |
return 2;
|
| 2642 |
}
|
| 2643 |
|
| 2644 |
+
llama_token llama_token_nl() {
|
| 2645 |
+
return 13;
|
|
|
| 2646 |
}
|
| 2647 |
|
| 2648 |
|
|
|
|
| 2672 |
static std::string s;
|
| 2673 |
|
| 2674 |
s = "";
|
| 2675 |
+
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
| 2676 |
+
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
| 2677 |
+
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
| 2678 |
+
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
| 2679 |
+
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
|
| 2680 |
+
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
| 2681 |
+
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
| 2682 |
+
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
| 2683 |
+
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
| 2684 |
+
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
| 2685 |
+
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
| 2686 |
+
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
| 2687 |
+
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
| 2688 |
+
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
| 2689 |
|
| 2690 |
return s.c_str();
|
| 2691 |
}
|
|
|
|
| 2694 |
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
| 2695 |
return ctx->model.tensors_by_name;
|
| 2696 |
}
|
| 2697 |
+
|
| 2698 |
+
size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
| 2699 |
+
// TODO leverage mmap
|
| 2700 |
+
llama_file file(path_session, "rb");
|
| 2701 |
+
const uint32_t magic = file.read_u32();
|
| 2702 |
+
const uint32_t version = file.read_u32();
|
| 2703 |
+
|
| 2704 |
+
if (!(magic == 'ggsn' && version == 0)) {
|
| 2705 |
+
fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
|
| 2706 |
+
return 0;
|
| 2707 |
+
}
|
| 2708 |
+
|
| 2709 |
+
llama_hparams session_hparams;
|
| 2710 |
+
file.read_raw(&session_hparams, sizeof(llama_hparams));
|
| 2711 |
+
|
| 2712 |
+
// REVIEW
|
| 2713 |
+
if (session_hparams != ctx->model.hparams) {
|
| 2714 |
+
fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
|
| 2715 |
+
return 0;
|
| 2716 |
+
}
|
| 2717 |
+
|
| 2718 |
+
const uint32_t n_token_count = file.read_u32();
|
| 2719 |
+
LLAMA_ASSERT(n_token_capacity >= n_token_count);
|
| 2720 |
+
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
|
| 2721 |
+
*n_token_count_out = n_token_count;
|
| 2722 |
+
|
| 2723 |
+
const size_t n_state_size = file.size - file.tell();
|
| 2724 |
+
const size_t n_orig_state_size = llama_get_state_size(ctx);
|
| 2725 |
+
if (n_state_size != n_orig_state_size) {
|
| 2726 |
+
fprintf(stderr, "%s : failed to validate state size\n", __func__);
|
| 2727 |
+
}
|
| 2728 |
+
std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
|
| 2729 |
+
file.read_raw(state_data.get(), n_state_size);
|
| 2730 |
+
return llama_set_state_data(ctx, state_data.get());
|
| 2731 |
+
}
|
| 2732 |
+
|
| 2733 |
+
size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
| 2734 |
+
// TODO save temp & swap
|
| 2735 |
+
llama_file file(path_session, "wb");
|
| 2736 |
+
|
| 2737 |
+
const size_t n_state_size = llama_get_state_size(ctx);
|
| 2738 |
+
std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
|
| 2739 |
+
llama_copy_state_data(ctx, state_data.get());
|
| 2740 |
+
|
| 2741 |
+
file.write_u32('ggsn'); // magic
|
| 2742 |
+
file.write_u32(0); // version
|
| 2743 |
+
file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
|
| 2744 |
+
|
| 2745 |
+
file.write_u32((uint32_t) n_token_count); // REVIEW
|
| 2746 |
+
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
|
| 2747 |
+
|
| 2748 |
+
file.write_raw(state_data.get(), n_state_size);
|
| 2749 |
+
return n_state_size; // REVIEW
|
| 2750 |
+
}
|
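A minimal sketch of how the new state functions might be used to snapshot a context in memory and roll it back later (buffer management is left to the caller; assumes a valid llama_context * ctx and <vector> in scope):

    // snapshot the full context state (rng, logits, embeddings, kv cache)
    const size_t n_state = llama_get_state_size(ctx);
    std::vector<uint8_t> state(n_state);
    llama_copy_state_data(ctx, state.data());

    // ... evaluate more tokens ...

    // restore the earlier state
    llama_set_state_data(ctx, state.data());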
examples/talk-llama/llama.h
CHANGED
|
@@ -39,12 +39,16 @@ extern "C" {
|
|
| 39 |
|
| 40 |
typedef struct llama_token_data {
|
| 41 |
llama_token id; // token id
|
| 42 |
-
|
| 43 |
float p; // probability of the token
|
| 44 |
-
float plog; // log probability of the token
|
| 45 |
-
|
| 46 |
} llama_token_data;
|
| 47 |
|
|
|
|
| 48 |
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
| 49 |
|
| 50 |
struct llama_context_params {
|
|
@@ -65,6 +69,20 @@ extern "C" {
|
|
| 65 |
void * progress_callback_user_data;
|
| 66 |
};
|
| 67 |
|
|
|
|
| 68 |
LLAMA_API struct llama_context_params llama_context_default_params();
|
| 69 |
|
| 70 |
LLAMA_API bool llama_mmap_supported();
|
|
@@ -82,27 +100,46 @@ extern "C" {
|
|
| 82 |
|
| 83 |
// TODO: not great API - very likely to change
|
| 84 |
// Returns 0 on success
|
|
|
|
| 85 |
LLAMA_API int llama_model_quantize(
|
| 86 |
const char * fname_inp,
|
| 87 |
const char * fname_out,
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
//
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
//
|
| 95 |
-
|
|
|
|
| 96 |
|
| 97 |
// Returns the number of tokens in the KV cache
|
| 98 |
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
|
| 99 |
|
| 100 |
-
// Sets the
|
| 101 |
-
LLAMA_API void
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
| 106 |
|
| 107 |
// Run the llama inference to obtain the logits and probabilities for the next token.
|
| 108 |
// tokens + n_tokens is the provided batch of new tokens to process
|
|
@@ -148,16 +185,52 @@ extern "C" {
|
|
| 148 |
// Special tokens
|
| 149 |
LLAMA_API llama_token llama_token_bos();
|
| 150 |
LLAMA_API llama_token llama_token_eos();
|
|
|
|
| 151 |
|
| 152 |
-
|
| 153 |
-
LLAMA_API llama_token
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
| 161 |
|
| 162 |
// Performance information
|
| 163 |
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
|
@@ -170,4 +243,15 @@ extern "C" {
|
|
| 170 |
}
|
| 171 |
#endif
|
| 172 |
|
|
|
|
|
|
|
| 173 |
#endif // LLAMA_H
|
|
|
|
| 39 |
|
| 40 |
typedef struct llama_token_data {
|
| 41 |
llama_token id; // token id
|
| 42 |
+
float logit; // log-odds of the token
|
| 43 |
float p; // probability of the token
|
|
|
|
|
|
|
| 44 |
} llama_token_data;
|
| 45 |
|
| 46 |
+
typedef struct llama_token_data_array {
|
| 47 |
+
llama_token_data * data;
|
| 48 |
+
size_t size;
|
| 49 |
+
bool sorted;
|
| 50 |
+
} llama_token_data_array;
|
| 51 |
+
|
| 52 |
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
| 53 |
|
| 54 |
struct llama_context_params {
|
|
|
|
| 69 |
void * progress_callback_user_data;
|
| 70 |
};
|
| 71 |
|
| 72 |
+
// model file types
|
| 73 |
+
enum llama_ftype {
|
| 74 |
+
LLAMA_FTYPE_ALL_F32 = 0,
|
| 75 |
+
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
| 76 |
+
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
| 77 |
+
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
| 78 |
+
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
| 79 |
+
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
| 80 |
+
// LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
|
| 81 |
+
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
| 82 |
+
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
| 83 |
+
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
| 84 |
+
};
|
| 85 |
+
|
| 86 |
LLAMA_API struct llama_context_params llama_context_default_params();
|
| 87 |
|
| 88 |
LLAMA_API bool llama_mmap_supported();
|
|
|
|
| 100 |
|
| 101 |
// TODO: not great API - very likely to change
|
| 102 |
// Returns 0 on success
|
| 103 |
+
// nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
|
| 104 |
LLAMA_API int llama_model_quantize(
|
| 105 |
const char * fname_inp,
|
| 106 |
const char * fname_out,
|
| 107 |
+
enum llama_ftype ftype,
|
| 108 |
+
int nthread);
|
| 109 |
+
|
| 110 |
+
// Apply a LoRA adapter to a loaded model
|
| 111 |
+
// path_base_model is the path to a higher quality model to use as a base for
|
| 112 |
+
// the layers modified by the adapter. Can be NULL to use the current loaded model.
|
| 113 |
+
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
| 114 |
+
// will be applied on top of the previous one
|
| 115 |
+
// Returns 0 on success
|
| 116 |
+
LLAMA_API int llama_apply_lora_from_file(
|
| 117 |
+
struct llama_context * ctx,
|
| 118 |
+
const char * path_lora,
|
| 119 |
+
const char * path_base_model,
|
| 120 |
+
int n_threads);
|
| 121 |
|
| 122 |
// Returns the number of tokens in the KV cache
|
| 123 |
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
|
| 124 |
|
| 125 |
+
// Sets the current rng seed.
|
| 126 |
+
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
|
| 127 |
+
|
| 128 |
+
// Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
|
| 129 |
+
LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
|
| 130 |
+
|
| 131 |
+
// Copies the state to the specified destination address.
|
| 132 |
+
// Destination needs to have allocated enough memory.
|
| 133 |
+
// Returns the number of bytes copied
|
| 134 |
+
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
|
| 135 |
+
|
| 136 |
+
// Set the state reading from the specified address
|
| 137 |
+
// Returns the number of bytes read
|
| 138 |
+
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
|
| 139 |
+
|
| 140 |
+
// Save/load session file
|
| 141 |
+
LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
|
| 142 |
+
LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
|
| 143 |
|
| 144 |
// Run the llama inference to obtain the logits and probabilities for the next token.
|
| 145 |
// tokens + n_tokens is the provided batch of new tokens to process
|
|
|
|
| 185 |
// Special tokens
|
| 186 |
LLAMA_API llama_token llama_token_bos();
|
| 187 |
LLAMA_API llama_token llama_token_eos();
|
| 188 |
+
LLAMA_API llama_token llama_token_nl();
|
| 189 |
+
|
| 190 |
+
// Sampling functions
|
| 191 |
|
| 192 |
+
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
|
| 193 |
+
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);
|
| 194 |
+
|
| 195 |
+
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
|
| 196 |
+
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
|
| 197 |
+
|
| 198 |
+
/// @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on the logits.
|
| 199 |
+
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
|
| 200 |
+
|
| 201 |
+
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
| 202 |
+
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
|
| 203 |
+
|
| 204 |
+
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
| 205 |
+
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
|
| 206 |
+
|
| 207 |
+
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
| 208 |
+
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
|
| 209 |
+
|
| 210 |
+
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
| 211 |
+
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
|
| 212 |
+
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
|
| 213 |
+
|
| 214 |
+
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
| 215 |
+
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
| 216 |
+
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
| 217 |
+
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
| 218 |
+
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
|
| 219 |
+
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
| 220 |
+
LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
|
| 221 |
+
|
| 222 |
+
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
| 223 |
+
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
| 224 |
+
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
| 225 |
+
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
| 226 |
+
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
| 227 |
+
LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
|
| 228 |
+
|
| 229 |
+
/// @details Selects the token with the highest probability.
|
| 230 |
+
LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
|
| 231 |
+
|
| 232 |
+
/// @details Randomly selects a token from the candidates based on their probabilities.
|
| 233 |
+
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
|
| 234 |
|
| 235 |
// Performance information
|
| 236 |
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
|
|
|
| 243 |
}
|
| 244 |
#endif
|
| 245 |
|
| 246 |
+
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
|
| 247 |
+
#ifdef LLAMA_API_INTERNAL
|
| 248 |
+
|
| 249 |
+
#include <vector>
|
| 250 |
+
#include <string>
|
| 251 |
+
struct ggml_tensor;
|
| 252 |
+
|
| 253 |
+
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
| 254 |
+
|
| 255 |
+
#endif
|
| 256 |
+
|
| 257 |
#endif // LLAMA_H
|
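The new state and session-file entry points are easiest to read as a load-then-save round trip. Below is a minimal sketch under stated assumptions: `ctx` is an already-initialized `llama_context`, and the 1024-token capacity and the zero-return-means-failure check are illustrative choices, not guarantees from the header.

```cpp
#include <cstdio>
#include <vector>

// Minimal sketch of a session round trip (assumes llama.h from this commit is included
// and that ctx points to a valid, already-created llama_context).
static void session_roundtrip(struct llama_context * ctx, const char * path_session) {
    std::vector<llama_token> tokens(1024);   // capacity chosen arbitrarily for the sketch
    size_t n_loaded = 0;

    // Try to restore a previously saved session; tokens receives the prompt that was saved.
    if (llama_load_session_file(ctx, path_session, tokens.data(), tokens.size(), &n_loaded) == 0) {
        fprintf(stderr, "no session restored from '%s'\n", path_session);  // assumption: 0 signals failure
    }
    tokens.resize(n_loaded);

    // ... evaluate new tokens and append them to `tokens` ...

    // Persist the context so the next run can skip re-evaluating the same prompt.
    llama_save_session_file(ctx, path_session, tokens.data(), tokens.size());
}
```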
examples/talk-llama/llama_internal.h
DELETED
@@ -1,12 +0,0 @@
- // Internal header to be included by llama.cpp and tests/benchmarks only.
-
- #ifndef LLAMA_INTERNAL_H
- #define LLAMA_INTERNAL_H
-
- #include <vector>
- #include <string>
- struct ggml_tensor;
-
- std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
-
- #endif // LLAMA_INTERNAL_H
examples/talk-llama/talk-llama.cpp
CHANGED
@@ -487,11 +487,37 @@ int main(int argc, char ** argv) {

      {
          auto logits = llama_get_logits(ctx_llama);
+         auto n_vocab = llama_n_vocab(ctx_llama);
+
          logits[llama_token_eos()] = 0;

+         std::vector<llama_token_data> candidates;
+         candidates.reserve(n_vocab);
+         for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+             candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+         }
+
+         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+         // apply repeat penalty
+         const float nl_logit = logits[llama_token_nl()];
+
+         llama_sample_repetition_penalty(ctx_llama, &candidates_p,
              embd_inp.data() + std::max(0, n_past - repeat_last_n),
+             repeat_last_n, repeat_penalty);
+
+         logits[llama_token_nl()] = nl_logit;
+
+         if (temp <= 0) {
+             // Greedy sampling
+             id = llama_sample_token_greedy(ctx_llama, &candidates_p);
+         } else {
+             // Temperature sampling
+             llama_sample_top_k(ctx_llama, &candidates_p, top_k);
+             llama_sample_top_p(ctx_llama, &candidates_p, top_p);
+             llama_sample_temperature(ctx_llama, &candidates_p, temp);
+             id = llama_sample_token(ctx_llama, &candidates_p);
+         }
      }

      if (id != llama_token_eos()) {
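The only non-obvious part of the new sampling block is the pointer arithmetic that selects the window the repetition penalty operates on: the penalty looks at the most recent `repeat_last_n` tokens of the context, not the whole prompt. A small standalone sketch of just that windowing, with made-up numbers (the variable names mirror the diff; the concrete values are illustrative only):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> embd_inp(100, 0);  // hypothetical context of 100 token ids
    const int n_past        = 80;       // tokens evaluated so far (made up)
    const int repeat_last_n = 64;       // penalty window size (made up)

    // Same expression as in the diff: start of the window, clamped so it never
    // points before the beginning of the context.
    const int first = std::max(0, n_past - repeat_last_n);
    const int * window = embd_inp.data() + first;

    // The window (here tokens 16..79) is what gets passed, together with
    // repeat_last_n, to llama_sample_repetition_penalty.
    printf("penalty window starts at token %d, covers %d tokens\n", first, repeat_last_n);
    (void) window;
    return 0;
}
```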
examples/talk.wasm/CMakeLists.txt
CHANGED
@@ -13,6 +13,7 @@ include(DefaultTargetOptions)

  target_link_libraries(${TARGET} PRIVATE
      whisper
+     common
      )

  unset(EXTRA_FLAGS)
examples/talk.wasm/gpt-2.cpp
CHANGED
@@ -1,4 +1,6 @@
  #include "ggml.h"
+ #include "common-ggml.h"
+
  #include "gpt-2.h"

  #include <cmath>
@@ -14,150 +16,6 @@

  /////////////////////// GPT-2 BEGIN /////////////////////////

- //
- // Vocab utils
- //
-
- std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
-     std::vector<std::string> words;
-
-     // first split the text into words
-     {
-         std::string str = text;
-         std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-
-         std::regex re(pat);
-         std::smatch m;
-
-         while (std::regex_search(str, m, re)) {
-             for (auto x : m) {
-                 words.push_back(x);
-             }
-             str = m.suffix();
-         }
-     }
-
-     // find the longest tokens that form the words:
-     std::vector<gpt_vocab::id> tokens;
-     for (const auto & word : words) {
-         if (word.size() == 0) continue;
-
-         int i = 0;
-         int n = word.size();
-         while (i < n) {
-             int j = n;
-             while (j > i) {
-                 auto it = vocab.token_to_id.find(word.substr(i, j-i));
-                 if (it != vocab.token_to_id.end()) {
-                     tokens.push_back(it->second);
-                     i = j;
-                     break;
-                 }
-                 --j;
-             }
-             if (i == n) {
-                 break;
-             }
-             if (j == i) {
-                 auto sub = word.substr(i, 1);
-                 if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
-                     tokens.push_back(vocab.token_to_id.at(sub));
-                 } else {
-                     fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
-                 }
-                 ++i;
-             }
-         }
-     }
-
-     return tokens;
- }
-
- gpt_vocab::id gpt_sample_top_k_top_p(
-         const gpt_vocab & vocab,
-         const float * logits,
-         int top_k,
-         double top_p,
-         double temp,
-         std::mt19937 & rng) {
-     int n_logits = vocab.id_to_token.size();
-
-     std::vector<std::pair<double, gpt_vocab::id>> logits_id;
-     logits_id.reserve(n_logits);
-
-     for (int i = 0; i < n_logits; i++) {
-         logits_id.push_back(std::make_pair(logits[i], i));
-     }
-
-     // find the top K tokens
-     std::partial_sort(
-             logits_id.begin(),
-             logits_id.begin() + top_k, logits_id.end(),
-             [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
-         return a.first > b.first;
-     });
-
-     logits_id.resize(top_k);
-
-     // normalize
-     {
-         double sum = 0.0f;
-         for (int i = 0; i < (int)logits_id.size(); i++) {
-             sum += logits_id[i].first;
-         }
-
-         sum = 1.0/sum;
-         for (int i = 0; i < (int)logits_id.size(); i++) {
-             logits_id[i].first *= sum;
-         }
-     }
-
-     if (top_p < 1.0f) {
-         {
-             double cumsum = 0.0f;
-             for (int i = 0; i < top_k; i++) {
-                 cumsum += logits_id[i].first;
-                 if (cumsum >= top_p) {
-                     logits_id.resize(i+1);
-                     break;
-                 }
-             }
-         }
-
-         // normalize again
-         {
-             double sum = 0.0f;
-             for (int i = 0; i < (int)logits_id.size(); i++) {
-                 sum += logits_id[i].first;
-             }
-
-             sum = 1.0/sum;
-             for (int i = 0; i < (int)logits_id.size(); i++) {
-                 logits_id[i].first *= sum;
-             }
-         }
-     }
-
-     //printf("\n");
-     //for (int i = 0; i < (int)logits_id.size(); i++) {
-     //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
-     //}
-     //exit(0);
-
-     // sample from the obtained distribution
-     std::vector<double> probs;
-     probs.reserve(logits_id.size());
-
-     for (int i = 0; i < (int) logits_id.size(); i++) {
-         probs.push_back(logits_id[i].first);
-     }
-
-     std::discrete_distribution<> dist(probs.begin(), probs.end());
-     int idx = dist(rng);
-
-     return logits_id[idx].second;
- }
-
  // default hparams (GPT-2 117M)
  struct gpt2_hparams {
      int32_t n_vocab = 50257;
@@ -165,7 +23,7 @@ struct gpt2_hparams {
      int32_t n_embd = 768;
      int32_t n_head = 12;
      int32_t n_layer = 12;
+     int32_t ftype = 1;
  };

  struct gpt2_layer {
@@ -187,7 +45,7 @@ struct gpt2_layer {
      struct ggml_tensor * c_mlp_fc_w;
      struct ggml_tensor * c_mlp_fc_b;

+     struct ggml_tensor * c_mlp_proj_w;
      struct ggml_tensor * c_mlp_proj_b;
  };
@@ -198,8 +56,9 @@ struct gpt2_model {
      struct ggml_tensor * ln_f_g;
      struct ggml_tensor * ln_f_b;

-     struct ggml_tensor * wte;
-     struct ggml_tensor * wpe;
+     struct ggml_tensor * wte;     // position embedding
+     struct ggml_tensor * wpe;     // token embedding
+     struct ggml_tensor * lm_head; // language model head

      std::vector<gpt2_layer> layers;
@@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
      fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
      fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
      fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
+     fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

      printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
      printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
      printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
      printf("%s: n_head = %d\n", __func__, hparams.n_head);
      printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+     printf("%s: ftype = %d\n", __func__, hparams.ftype);
  }

  // load vocab
@@ -275,9 +134,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
      }
  }

- // for the big tensors, we have the option to store the data in 16-bit floats
+ // for the big tensors, we have the option to store the data in 16-bit floats or quantized
  // in order to save memory and also to speed up the computation
+ ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+ if (wtype == GGML_TYPE_COUNT) {
+     fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+             __func__, fname.c_str(), model.hparams.ftype);
+     return false;
+ }

  auto & ctx = model.ctx;
@@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
      const int n_ctx = hparams.n_ctx;
      const int n_vocab = hparams.n_vocab;

+     ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
+     ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

+     ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);       // wte
+     ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+     ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);       // lm_head

+     ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
+     ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b

+     ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
+     ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

+     ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));        // c_attn_attn_w
+     ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32));      // c_attn_attn_b

+     ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));          // c_attn_proj_w
+     ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32));        // c_attn_proj_b

+     ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));        // c_mlp_fc_w
+     ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32));      // c_mlp_fc_b

+     ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));        // c_mlp_proj_w
+     ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32));        // c_mlp_proj_b

+     ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+     ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

      ctx_size += (6 + 12*n_layer)*256; // object overhead
@@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

      // create the ggml context
      {
+         struct ggml_init_params params = {
+             .mem_size   = ctx_size,
+             .mem_buffer = NULL,
+             .no_alloc   = false,
+         };

          model.ctx = ggml_init(params);
          if (!model.ctx) {
@@ -350,36 +217,38 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
      model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
      model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

+     model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
+     model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
+     model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);

      // map by name
      model.tensors["model/ln_f/g"] = model.ln_f_g;
      model.tensors["model/ln_f/b"] = model.ln_f_b;

+     model.tensors["model/wte"]     = model.wte;
+     model.tensors["model/wpe"]     = model.wpe;
+     model.tensors["model/lm_head"] = model.lm_head;

      for (int i = 0; i < n_layer; ++i) {
          auto & layer = model.layers[i];

+         layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+         layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

+         layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+         layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

+         layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
+         layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);

+         layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+         layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

+         layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
+         layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);

+         layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
+         layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

          // map by name
          model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
@@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
          model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
          model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;

+         model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
          model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
      }
  }
@@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
  {
      size_t total_size = 0;

+     bool has_lm_head = false;
+
      while (true) {
          int32_t n_dims;
          int32_t length;
+         int32_t ttype;

          fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
          fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+         fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));

          if (fin.eof()) {
              break;
@@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

          if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
              fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                     __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+                     __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
              return false;
          }

+         // for debugging
+         if (0) {
+             printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+         }
+
+         const size_t bpe = ggml_type_size(ggml_type(ttype));

+         if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
              fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                      __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
              return false;
@@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

          fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

+         // GPT-2 models share the WTE tensor as the LM head
+         if (name == "model/wte" && has_lm_head == false) {
+             memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
+         }
+
+         if (name == "model/lm_head") {
+             has_lm_head = true;
+         }
+
          total_size += ggml_nbytes(tensor);
      }
@@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
  // - n_threads: number of threads to use
  // - n_past:    the context size so far
  // - embd_inp:  the embeddings of the tokens in the context
+ // - embd_w:    the predicted logits for the next token
  //
  bool gpt2_eval(
          const gpt2_model & model,
@@ -512,12 +396,12 @@ bool gpt2_eval(
      const int n_head = hparams.n_head;
      const int n_vocab = hparams.n_vocab;

+     static size_t buf_size = 512u*1024*1024;
      static void * buf = malloc(buf_size);

      if (mem_per_token > 0 && mem_per_token*N > buf_size) {
          const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-         printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+         //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

          // reallocate
          buf_size = buf_size_new;
@@ -528,13 +412,14 @@ bool gpt2_eval(
      }
  }

+     struct ggml_init_params params = {
+         /*.mem_size   =*/ buf_size,
+         /*.mem_buffer =*/ buf,
+         /*.no_alloc   =*/ false,
+     };

      struct ggml_context * ctx0 = ggml_init(params);
-     struct ggml_cgraph gf = { };
+     struct ggml_cgraph gf = {};
      gf.n_threads = n_threads;

      struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -578,7 +463,7 @@ bool gpt2_eval(
      // [2304, N]
      {
          cur = ggml_mul_mat(ctx0,
+                 model.layers[il].c_attn_attn_w,
                  cur);

          cur = ggml_add(ctx0,
@@ -654,11 +539,13 @@ bool gpt2_eval(
      // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
      // [n_past + N, 64, 12]
      struct ggml_tensor * V_trans =
+         ggml_cpy(ctx0,
+                 ggml_permute(ctx0,
+                     ggml_reshape_3d(ctx0,
+                         ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                         n_embd/n_head, n_head, n_past + N),
+                     1, 2, 0, 3),
+                 ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));

      // KQV = transpose(V) * KQ_soft_max
      // [64, N, 12]
@@ -685,7 +572,7 @@ bool gpt2_eval(
      // [768, N]
      {
          cur = ggml_mul_mat(ctx0,
+                 model.layers[il].c_attn_proj_w,
                  cur);

          cur = ggml_add(ctx0,
@@ -722,7 +609,7 @@ bool gpt2_eval(
      // cur = fc_w*cur + fc_b
      // [3072, N]
      cur = ggml_mul_mat(ctx0,
+             model.layers[il].c_mlp_fc_w,
              cur);

      cur = ggml_add(ctx0,
@@ -742,7 +629,7 @@ bool gpt2_eval(
      // cur = proj_w*cur + proj_b
      // [768, N]
      cur = ggml_mul_mat(ctx0,
+             model.layers[il].c_mlp_proj_w,
              cur);

      cur = ggml_add(ctx0,
@@ -769,12 +656,12 @@ bool gpt2_eval(
  }

  // inpL = WTE * inpL
+ // [ 768, 50257] - model.lm_head
  // [ 768, N]     - inpL
+ inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);

  // logits -> probs
- inpL = ggml_soft_max(ctx0, inpL);
+ //inpL = ggml_soft_max(ctx0, inpL);

  // run the computation
  ggml_build_forward_expand(&gf, inpL);
@@ -788,7 +675,7 @@ bool gpt2_eval(
  //embd_w.resize(n_vocab*N);
  //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

+ // return result just for the last token
  embd_w.resize(n_vocab);
  memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
@@ -825,7 +712,7 @@ Me too.
      int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());

      // sampling parameters
+     int32_t top_k = 5;
      float top_p = 0.9f;
      float temp  = 1.0f;
  };
@@ -833,14 +720,14 @@ Me too.
  struct gpt2_context * gpt2_init(const char * path_model) {
      gpt2_context * ctx = new gpt2_context;

+     ctx->rng = std::mt19937(time(nullptr));

      // load the model
      {
          const int64_t t_start_us = ggml_time_us();

          if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
+             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
              delete ctx;
              return nullptr;
          }
@@ -885,9 +772,9 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)

      std::string result;

-     for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
+     for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
          // predict
+         if (!embd.empty()) {
              if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
                  printf("gpt-2: failed to generate text\n");
                  return "";
@@ -914,10 +801,7 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
          result += ctx->vocab.id_to_token[embd[0]];

          // end of text token
-         if (embd.back() == 50256
-                 ctx->vocab.id_to_token[embd.back()] == "." ||
-                 ctx->vocab.id_to_token[embd.back()] == "!" ||
-                 ctx->vocab.id_to_token[embd.back()] == "?") {
+         if (embd.back() == 50256) {
              break;
          }
      }
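The size check above changes from a plain `nelements*bpe` comparison to one that divides by the block size, because quantized ggml types store a whole block of elements (scale plus packed quants) in `ggml_type_size()` bytes. A small sketch of the same arithmetic, using the ggml helpers that appear in the diff; the F16 case really is just two bytes per element, while the exact byte counts for the quantized types depend on the ggml build and are not asserted here:

```cpp
#include <cstdint>
#include <cstdio>
#include "ggml.h"

// Expected size of a 2-D tensor, mirroring the check in gpt2_model_load():
//   (nelements * ggml_type_size(type)) / ggml_blck_size(type) == ggml_nbytes(tensor)
static size_t expected_nbytes(int64_t ne0, int64_t ne1, enum ggml_type type) {
    const int64_t nelements = ne0 * ne1;
    return (nelements * ggml_type_size(type)) / ggml_blck_size(type);
}

int main() {
    // For F16 the block size is 1, so this is simply nelements * 2 bytes.
    printf("wte as F16  : %zu bytes\n", expected_nbytes(768, 50257, GGML_TYPE_F16));
    // For a quantized type such as Q5_1, ggml_type_size() is the size of one block
    // and ggml_blck_size() the number of elements it packs, so the total is much smaller.
    printf("wte as Q5_1 : %zu bytes\n", expected_nbytes(768, 50257, GGML_TYPE_Q5_1));
    return 0;
}
```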
examples/talk.wasm/gpt-2.h
CHANGED
@@ -2,18 +2,12 @@

  // TODO: Change to C-style API and move to ./examples for easy reuse.

+ #include "common.h"
+
  #include <vector>
  #include <map>
  #include <string>

- struct gpt_vocab {
-     using id = int32_t;
-     using token = std::string;
-
-     std::map<token, id> token_to_id;
-     std::map<id, token> id_to_token;
- };
-
  struct gpt2_context;

  struct gpt2_context * gpt2_init(const char * path_model);
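The `gpt_vocab` definition deleted here is not gone: the header now pulls in `common.h`, and the rest of gpt-2.cpp keeps using `vocab.token_to_id` / `vocab.id_to_token` unchanged. As a reading aid, here is a sketch of the shape callers still depend on, reconstructed from the removed lines; the lookup helper at the bottom is hypothetical and not part of the example code:

```cpp
#include <cstdint>
#include <map>
#include <string>

// Shape of the vocab type the examples now share via common.h (as removed from this header).
struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
};

// Hypothetical helper illustrating the typical lookup pattern used by the GPT-2 examples.
static gpt_vocab::token token_to_str(const gpt_vocab & vocab, gpt_vocab::id id) {
    const auto it = vocab.id_to_token.find(id);
    return it != vocab.id_to_token.end() ? it->second : "<unk>";
}
```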
examples/talk.wasm/index-tmpl.html
CHANGED
@@ -44,6 +44,15 @@

      <br><br>

+     <b>More examples:</b>
+     <a href="https://whisper.ggerganov.com/">main</a> |
+     <a href="https://whisper.ggerganov.com/bench">bench</a> |
+     <a href="https://whisper.ggerganov.com/stream">stream</a> |
+     <a href="https://whisper.ggerganov.com/command">command</a> |
+     <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+     <br><br>
+
      <hr>

      Select the models you would like to use and click the "Start" button to begin the conversation
@@ -54,6 +63,10 @@
      Whisper model: <span id="model-whisper-status"></span>
      <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
      <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+     <br><br>
+     Quantized models:<br><br>
+     <button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+     <button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
      <span id="fetch-whisper-progress"></span>

      <!--
@@ -266,11 +279,17 @@
      let urls = {
          'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
          'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+
+         'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+         'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
      };

      let sizes = {
          'tiny.en': 75,
          'base.en': 142,
+
+         'tiny-en-q5_1': 31,
+         'base-en-q5_1': 57,
      };

      let url = urls[model];
@@ -281,6 +300,10 @@

      document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
      document.getElementById('fetch-whisper-base-en').style.display = 'none';
+
+     document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
+     document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
+
      document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

      cbProgress = function(p) {
@@ -292,6 +315,10 @@
      var el;
      el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
      el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+
+     el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
+     el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
+
      el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
  };
examples/talk/CMakeLists.txt
CHANGED
@@ -1,16 +1,8 @@
  if (WHISPER_SDL2)
      # talk
      set(TARGET talk)
-     #target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-
-     # TODO: this is temporary
-     # need to export ggml symbols for MSVC, but too lazy ..
-     add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
+     add_executable(${TARGET} talk.cpp gpt-2.cpp)
+     target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})

      include(DefaultTargetOptions)
-
-     target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
-     target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
  endif ()
examples/talk/gpt-2.cpp
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
#include "ggml.h"
|
|
|
|
|
|
|
| 2 |
#include "gpt-2.h"
|
| 3 |
|
| 4 |
#include <cmath>
|
|
@@ -14,150 +16,6 @@
|
|
| 14 |
|
| 15 |
/////////////////////// GPT-2 BEGIN /////////////////////////
|
| 16 |
|
| 17 |
-
//
|
| 18 |
-
// Vocab utils
|
| 19 |
-
//
|
| 20 |
-
|
| 21 |
-
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
|
| 22 |
-
std::vector<std::string> words;
|
| 23 |
-
|
| 24 |
-
// first split the text into words
|
| 25 |
-
{
|
| 26 |
-
std::string str = text;
|
| 27 |
-
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
| 28 |
-
|
| 29 |
-
std::regex re(pat);
|
| 30 |
-
std::smatch m;
|
| 31 |
-
|
| 32 |
-
while (std::regex_search(str, m, re)) {
|
| 33 |
-
for (auto x : m) {
|
| 34 |
-
words.push_back(x);
|
| 35 |
-
}
|
| 36 |
-
str = m.suffix();
|
| 37 |
-
}
|
| 38 |
-
}
|
| 39 |
-
|
| 40 |
-
// find the longest tokens that form the words:
|
| 41 |
-
std::vector<gpt_vocab::id> tokens;
|
| 42 |
-
for (const auto & word : words) {
|
| 43 |
-
if (word.empty()) continue;
|
| 44 |
-
|
| 45 |
-
int i = 0;
|
| 46 |
-
int n = word.size();
|
| 47 |
-
while (i < n) {
|
| 48 |
-
int j = n;
|
| 49 |
-
while (j > i) {
|
| 50 |
-
auto it = vocab.token_to_id.find(word.substr(i, j-i));
|
| 51 |
-
if (it != vocab.token_to_id.end()) {
|
| 52 |
-
tokens.push_back(it->second);
|
| 53 |
-
i = j;
|
| 54 |
-
break;
|
| 55 |
-
}
|
| 56 |
-
--j;
|
| 57 |
-
}
|
| 58 |
-
if (i == n) {
|
| 59 |
-
break;
|
| 60 |
-
}
|
| 61 |
-
if (j == i) {
|
| 62 |
-
auto sub = word.substr(i, 1);
|
| 63 |
-
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
|
| 64 |
-
tokens.push_back(vocab.token_to_id.at(sub));
|
| 65 |
-
} else {
|
| 66 |
-
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
|
| 67 |
-
}
|
| 68 |
-
++i;
|
| 69 |
-
}
|
| 70 |
-
}
|
| 71 |
-
}
|
| 72 |
-
|
| 73 |
-
return tokens;
|
| 74 |
-
}
|
| 75 |
-
|
| 76 |
-
gpt_vocab::id gpt_sample_top_k_top_p(
|
| 77 |
-
const gpt_vocab & vocab,
|
| 78 |
-
const float * logits,
|
| 79 |
-
int top_k,
|
| 80 |
-
double top_p,
|
| 81 |
-
double /*temp*/,
|
| 82 |
-
std::mt19937 & rng) {
|
| 83 |
-
int n_logits = vocab.id_to_token.size();
|
| 84 |
-
|
| 85 |
-
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
|
| 86 |
-
logits_id.reserve(n_logits);
|
| 87 |
-
|
| 88 |
-
for (int i = 0; i < n_logits; i++) {
|
| 89 |
-
logits_id.emplace_back(logits[i], i);
|
| 90 |
-
}
|
| 91 |
-
|
| 92 |
-
// find the top K tokens
|
| 93 |
-
std::partial_sort(
|
| 94 |
-
logits_id.begin(),
|
| 95 |
-
logits_id.begin() + top_k, logits_id.end(),
|
| 96 |
-
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
|
| 97 |
-
return a.first > b.first;
|
| 98 |
-
});
|
| 99 |
-
|
| 100 |
-
logits_id.resize(top_k);
|
| 101 |
-
|
| 102 |
-
// normalize
|
| 103 |
-
{
|
| 104 |
-
double sum = 0.0f;
|
| 105 |
-
for (int i = 0; i < (int)logits_id.size(); i++) {
|
| 106 |
-
sum += logits_id[i].first;
|
| 107 |
-
}
|
| 108 |
-
|
| 109 |
-
sum = 1.0/sum;
|
| 110 |
-
for (int i = 0; i < (int)logits_id.size(); i++) {
|
| 111 |
-
logits_id[i].first *= sum;
|
| 112 |
-
}
|
| 113 |
-
}
|
| 114 |
-
|
| 115 |
-
if (top_p < 1.0f) {
|
| 116 |
-
{
|
| 117 |
-
double cumsum = 0.0f;
|
| 118 |
-
for (int i = 0; i < top_k; i++) {
|
| 119 |
-
cumsum += logits_id[i].first;
|
| 120 |
-
if (cumsum >= top_p) {
|
| 121 |
-
logits_id.resize(i+1);
|
| 122 |
-
break;
|
| 123 |
-
}
|
| 124 |
-
}
|
| 125 |
-
}
|
| 126 |
-
|
| 127 |
-
// normalize again
|
| 128 |
-
{
|
| 129 |
-
double sum = 0.0f;
|
| 130 |
-
for (int i = 0; i < (int)logits_id.size(); i++) {
|
| 131 |
-
sum += logits_id[i].first;
|
| 132 |
-
}
|
| 133 |
-
|
| 134 |
-
sum = 1.0/sum;
|
| 135 |
-
for (int i = 0; i < (int)logits_id.size(); i++) {
|
| 136 |
-
logits_id[i].first *= sum;
|
| 137 |
-
}
|
| 138 |
-
}
|
| 139 |
-
}
|
| 140 |
-
|
| 141 |
-
//printf("\n");
|
| 142 |
-
//for (int i = 0; i < (int) logits_id.size(); i++) {
|
| 143 |
-
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
|
| 144 |
-
//}
|
| 145 |
-
//exit(0);
|
| 146 |
-
|
| 147 |
-
// sample from the obtained distribution
|
| 148 |
-
std::vector<double> probs;
|
| 149 |
-
probs.reserve(logits_id.size());
|
| 150 |
-
|
| 151 |
-
for (int i = 0; i < (int) logits_id.size(); i++) {
|
| 152 |
-
probs.push_back(logits_id[i].first);
|
| 153 |
-
}
|
| 154 |
-
|
| 155 |
-
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
| 156 |
-
int idx = dist(rng);
|
| 157 |
-
|
| 158 |
-
return logits_id[idx].second;
|
| 159 |
-
}
|
| 160 |
-
|
| 161 |
// default hparams (GPT-2 117M)
|
| 162 |
struct gpt2_hparams {
|
| 163 |
int32_t n_vocab = 50257;
|
|
@@ -165,7 +23,7 @@ struct gpt2_hparams {
|
|
| 165 |
int32_t n_embd = 768;
|
| 166 |
int32_t n_head = 12;
|
| 167 |
int32_t n_layer = 12;
|
| 168 |
-
int32_t
|
| 169 |
};
|
| 170 |
|
| 171 |
struct gpt2_layer {
|
|
@@ -187,7 +45,7 @@ struct gpt2_layer {
|
|
| 187 |
struct ggml_tensor * c_mlp_fc_w;
|
| 188 |
struct ggml_tensor * c_mlp_fc_b;
|
| 189 |
|
| 190 |
-
struct ggml_tensor *
|
| 191 |
struct ggml_tensor * c_mlp_proj_b;
|
| 192 |
};
|
| 193 |
|
|
@@ -198,8 +56,9 @@ struct gpt2_model {
|
|
| 198 |
struct ggml_tensor * ln_f_g;
|
| 199 |
struct ggml_tensor * ln_f_b;
|
| 200 |
|
| 201 |
-
struct ggml_tensor * wte;
|
| 202 |
-
struct ggml_tensor * wpe;
|
|
|
|
| 203 |
|
| 204 |
std::vector<gpt2_layer> layers;
|
| 205 |
|
|
@@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|
| 241 |
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
| 242 |
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
| 243 |
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
| 244 |
-
fin.read((char *) &hparams.
|
| 245 |
|
| 246 |
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
| 247 |
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
| 248 |
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
| 249 |
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
| 250 |
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
| 251 |
-
printf("%s:
|
| 252 |
}
|
| 253 |
|
| 254 |
// load vocab
|
|
@@ -268,16 +127,21 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|
| 268 |
fin.read((char *) &len, sizeof(len));
|
| 269 |
|
| 270 |
word.resize(len);
|
| 271 |
-
fin.read((char *)
|
| 272 |
|
| 273 |
vocab.token_to_id[word] = i;
|
| 274 |
vocab.id_to_token[i] = word;
|
| 275 |
}
|
| 276 |
}
|
| 277 |
|
| 278 |
-
// for the big tensors, we have the option to store the data in 16-bit floats
|
| 279 |
// in order to save memory and also to speed up the computation
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
auto & ctx = model.ctx;
|
| 283 |
|
|
@@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|
| 291 |
const int n_ctx = hparams.n_ctx;
|
| 292 |
const int n_vocab = hparams.n_vocab;
|
| 293 |
|
| 294 |
-
ctx_size += n_embd*
|
| 295 |
-
ctx_size += n_embd*
|
| 296 |
|
| 297 |
-
ctx_size += n_vocab*n_embd*
|
| 298 |
-
ctx_size += n_ctx*n_embd*
|
|
|
|
| 299 |
|
| 300 |
-
ctx_size += n_layer*(n_embd*
|
| 301 |
-
ctx_size += n_layer*(n_embd*
|
| 302 |
|
| 303 |
-
ctx_size += n_layer*(n_embd*
|
| 304 |
-
ctx_size += n_layer*(n_embd*
|
| 305 |
|
| 306 |
-
ctx_size += n_layer*(3*n_embd*n_embd*
|
| 307 |
-
ctx_size += n_layer*( 3*n_embd*
|
| 308 |
|
| 309 |
-
ctx_size += n_layer*(n_embd*n_embd*
|
| 310 |
-
ctx_size += n_layer*( n_embd*
|
| 311 |
|
| 312 |
-
ctx_size += n_layer*(4*n_embd*n_embd*
|
| 313 |
-
ctx_size += n_layer*( 4*n_embd*
|
| 314 |
|
| 315 |
-
ctx_size += n_layer*(4*n_embd*n_embd*
|
| 316 |
-
ctx_size += n_layer*( n_embd*
|
| 317 |
|
| 318 |
-
ctx_size += n_ctx*n_layer*n_embd*
|
| 319 |
-
ctx_size += n_ctx*n_layer*n_embd*
|
| 320 |
|
| 321 |
ctx_size += (6 + 12*n_layer)*256; // object overhead
|
| 322 |
|
|
@@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|
| 325 |
|
| 326 |
// create the ggml context
|
| 327 |
{
|
| 328 |
-
struct ggml_init_params params
|
| 329 |
-
|
| 330 |
-
|
|
|
|
|
|
|
| 331 |
|
| 332 |
model.ctx = ggml_init(params);
|
| 333 |
if (!model.ctx) {
|
|
@@ -350,36 +217,38 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|
| 350 |
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 351 |
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 352 |
|
| 353 |
-
model.wte
|
| 354 |
-
model.wpe
|
|
|
|
| 355 |
|
| 356 |
// map by name
|
| 357 |
model.tensors["model/ln_f/g"] = model.ln_f_g;
|
| 358 |
model.tensors["model/ln_f/b"] = model.ln_f_b;
|
| 359 |
|
| 360 |
-
model.tensors["model/wte"]
|
- model.tensors["model/wpe"]

for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];

- layer.ln_1_g
- layer.ln_1_b

- layer.ln_2_g
- layer.ln_2_b

- layer.c_attn_attn_w
- layer.c_attn_attn_b

- layer.c_attn_proj_w
- layer.c_attn_proj_b

- layer.c_mlp_fc_w
- layer.c_mlp_fc_b

- layer.
- layer.c_mlp_proj_b

// map by name
model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;

@@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;

- model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
}
}

@@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
{
size_t total_size = 0;

while (true) {
int32_t n_dims;
int32_t length;
- int32_t

fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
- fin.read(reinterpret_cast<char *>(&

if (fin.eof()) {
break;

@@ -448,7 +319,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
std::string name(length, 0);
fin.read(&name[0], length);

- if (model.tensors.find(name) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}

@@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
return false;
}

-

- if (nelements*bpe != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;

@@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

- //
total_size += ggml_nbytes(tensor);
}

@@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
- // - embd_w: the predicted
//
bool gpt2_eval(
const gpt2_model & model,

@@ -512,12 +396,12 @@ bool gpt2_eval(
const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;

- static size_t buf_size =
static void * buf = malloc(buf_size);

if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
- printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

// reallocate
buf_size = buf_size_new;

@@ -528,13 +412,14 @@ bool gpt2_eval(
}
}

- struct ggml_init_params params
-
-

struct ggml_context * ctx0 = ggml_init(params);
-
- struct ggml_cgraph gf = { };
gf.n_threads = n_threads;

struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);

@@ -578,7 +463,7 @@ bool gpt2_eval(
// [2304, N]
{
cur = ggml_mul_mat(ctx0,
-
cur);

cur = ggml_add(ctx0,

@@ -654,11 +539,13 @@ bool gpt2_eval(
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
-
-
-
-
-

// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]

@@ -685,7 +572,7 @@ bool gpt2_eval(
// [768, N]
{
cur = ggml_mul_mat(ctx0,
-
cur);

cur = ggml_add(ctx0,

@@ -722,7 +609,7 @@
// cur = fc_w*cur + fc_b
// [3072, N]
cur = ggml_mul_mat(ctx0,
-
cur);

cur = ggml_add(ctx0,

@@ -742,7 +629,7 @@
// cur = proj_w*cur + proj_b
// [768, N]
cur = ggml_mul_mat(ctx0,
- model.layers[il].
cur);

cur = ggml_add(ctx0,

@@ -769,12 +656,12 @@
}

// inpL = WTE * inpL
- // [ 768, 50257] - model.
// [ 768, N] - inpL
- inpL = ggml_mul_mat(ctx0, model.

// logits -> probs
- inpL = ggml_soft_max(ctx0, inpL);

// run the computation
ggml_build_forward_expand(&gf, inpL);

@@ -788,7 +675,7 @@
//embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

- // return result for
embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);

| 1 |
#include "ggml.h"
|
| 2 |
+
#include "common-ggml.h"
|
| 3 |
+
|
| 4 |
#include "gpt-2.h"
|
| 5 |
|
| 6 |
#include <cmath>
|
|
|
|
| 16 |
|
| 17 |
/////////////////////// GPT-2 BEGIN /////////////////////////
|
| 18 |
|
|
|
|
|
|
|
| 19 |
// default hparams (GPT-2 117M)
|
| 20 |
struct gpt2_hparams {
|
| 21 |
int32_t n_vocab = 50257;
|
|
|
|
| 23 |
int32_t n_embd = 768;
|
| 24 |
int32_t n_head = 12;
|
| 25 |
int32_t n_layer = 12;
|
| 26 |
+
int32_t ftype = 1;
|
| 27 |
};
|
| 28 |
|
| 29 |
struct gpt2_layer {
|
|
|
|
| 45 |
struct ggml_tensor * c_mlp_fc_w;
|
| 46 |
struct ggml_tensor * c_mlp_fc_b;
|
| 47 |
|
| 48 |
+
struct ggml_tensor * c_mlp_proj_w;
|
| 49 |
struct ggml_tensor * c_mlp_proj_b;
|
| 50 |
};
|
| 51 |
|
|
|
|
| 56 |
struct ggml_tensor * ln_f_g;
|
| 57 |
struct ggml_tensor * ln_f_b;
|
| 58 |
|
| 59 |
+
struct ggml_tensor * wte; // position embedding
|
| 60 |
+
struct ggml_tensor * wpe; // token embedding
|
| 61 |
+
struct ggml_tensor * lm_head; // language model head
|
| 62 |
|
| 63 |
std::vector<gpt2_layer> layers;
|
| 64 |
|
|
|
|
| 100 |
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
| 101 |
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
| 102 |
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
| 103 |
+
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
| 104 |
|
| 105 |
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
| 106 |
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
| 107 |
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
| 108 |
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
| 109 |
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
| 110 |
+
printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
| 111 |
}
|
| 112 |
|
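
The loader now reads one extra header field, `ftype`, recording how the weights were stored (F32, F16, or a quantized format). A minimal sketch of reading that header, assuming the usual `ggml` magic value and the field order used by these examples (`gpt2_file_header` is a made-up name for illustration):

```cpp
// Sketch only: read the GPT-2 ggml header in the order the loader above expects it.
#include <cstdint>
#include <fstream>

struct gpt2_file_header {
    int32_t n_vocab, n_ctx, n_embd, n_head, n_layer, ftype;
};

bool read_gpt2_header(std::ifstream & fin, gpt2_file_header & h) {
    int32_t magic = 0;
    fin.read(reinterpret_cast<char *>(&magic), sizeof(magic));
    if (magic != 0x67676d6c) { // "ggml"
        return false;
    }
    fin.read(reinterpret_cast<char *>(&h), sizeof(h));
    return static_cast<bool>(fin);
}
```
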
| 113 |
// load vocab
|
|
|
|
| 127 |
fin.read((char *) &len, sizeof(len));
|
| 128 |
|
| 129 |
word.resize(len);
|
| 130 |
+
fin.read((char *) word.data(), len);
|
| 131 |
|
| 132 |
vocab.token_to_id[word] = i;
|
| 133 |
vocab.id_to_token[i] = word;
|
| 134 |
}
|
| 135 |
}
|
| 136 |
|
| 137 |
+
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
| 138 |
// in order to save memory and also to speed up the computation
|
| 139 |
+
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
| 140 |
+
if (wtype == GGML_TYPE_COUNT) {
|
| 141 |
+
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
| 142 |
+
__func__, fname.c_str(), model.hparams.ftype);
|
| 143 |
+
return false;
|
| 144 |
+
}
|
| 145 |
|
| 146 |
auto & ctx = model.ctx;
|
| 147 |
|
|
|
|
| 155 |
const int n_ctx = hparams.n_ctx;
|
| 156 |
const int n_vocab = hparams.n_vocab;
|
| 157 |
|
| 158 |
+
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
|
| 159 |
+
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
|
| 160 |
|
| 161 |
+
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
|
| 162 |
+
ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
|
| 163 |
+
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
|
| 164 |
|
| 165 |
+
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
|
| 166 |
+
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
|
| 167 |
|
| 168 |
+
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
|
| 169 |
+
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
|
| 170 |
|
| 171 |
+
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
|
| 172 |
+
ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
|
| 173 |
|
| 174 |
+
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
|
| 175 |
+
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
|
| 176 |
|
| 177 |
+
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
|
| 178 |
+
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
|
| 179 |
|
| 180 |
+
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
| 181 |
+
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
| 182 |
|
| 183 |
+
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
|
| 184 |
+
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
|
| 185 |
|
| 186 |
ctx_size += (6 + 12*n_layer)*256; // object overhead
|
| 187 |
|
|
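
Because `wtype` can now be a block-quantized type, the per-tensor contributions above use `ggml_type_sizef()`, which returns the (possibly fractional) number of bytes per element. A quick standalone check of how much that saves for the token embedding, assuming the `ggml.h` from this repo:

```cpp
// Sketch: compare the footprint of one [n_embd, n_vocab] matrix in F16 vs Q5_1.
#include "ggml.h"
#include <cstdio>

int main() {
    const int n_vocab = 50257;
    const int n_embd  = 768;

    const double f16  = n_vocab*n_embd*ggml_type_sizef(GGML_TYPE_F16);
    const double q5_1 = n_vocab*n_embd*ggml_type_sizef(GGML_TYPE_Q5_1);

    printf("wte as F16 : %6.1f MB\n", f16 /1024.0/1024.0);
    printf("wte as Q5_1: %6.1f MB\n", q5_1/1024.0/1024.0);
    return 0;
}
```
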
|
|
| 190 |
|
| 191 |
// create the ggml context
|
| 192 |
{
|
| 193 |
+
struct ggml_init_params params = {
|
| 194 |
+
.mem_size = ctx_size,
|
| 195 |
+
.mem_buffer = NULL,
|
| 196 |
+
.no_alloc = false,
|
| 197 |
+
};
|
| 198 |
|
| 199 |
model.ctx = ggml_init(params);
|
| 200 |
if (!model.ctx) {
|
|
|
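
For reference, the three `ggml_init_params` fields filled in above are all that is needed to create a context; a minimal sketch (the size is a placeholder):

```cpp
#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,  // placeholder size
        /*.mem_buffer =*/ NULL,          // let ggml allocate the buffer itself
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        return 1;
    }

    ggml_free(ctx);
    return 0;
}
```
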
|
| 217 |
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 218 |
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 219 |
|
| 220 |
+
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
| 221 |
+
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
|
| 222 |
+
model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
| 223 |
|
| 224 |
// map by name
|
| 225 |
model.tensors["model/ln_f/g"] = model.ln_f_g;
|
| 226 |
model.tensors["model/ln_f/b"] = model.ln_f_b;
|
| 227 |
|
| 228 |
+
model.tensors["model/wte"] = model.wte;
|
| 229 |
+
model.tensors["model/wpe"] = model.wpe;
|
| 230 |
+
model.tensors["model/lm_head"] = model.lm_head;
|
| 231 |
|
| 232 |
for (int i = 0; i < n_layer; ++i) {
|
| 233 |
auto & layer = model.layers[i];
|
| 234 |
|
| 235 |
+
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 236 |
+
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 237 |
|
| 238 |
+
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 239 |
+
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 240 |
|
| 241 |
+
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
|
| 242 |
+
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
|
| 243 |
|
| 244 |
+
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
| 245 |
+
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 246 |
|
| 247 |
+
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
|
| 248 |
+
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
| 249 |
|
| 250 |
+
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
| 251 |
+
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
| 252 |
|
| 253 |
// map by name
|
| 254 |
model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
|
|
|
|
| 266 |
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
|
| 267 |
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
|
| 268 |
|
| 269 |
+
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
|
| 270 |
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
|
| 271 |
}
|
| 272 |
}
|
|
|
|
| 294 |
{
|
| 295 |
size_t total_size = 0;
|
| 296 |
|
| 297 |
+
bool has_lm_head = false;
|
| 298 |
+
|
| 299 |
while (true) {
|
| 300 |
int32_t n_dims;
|
| 301 |
int32_t length;
|
| 302 |
+
int32_t ttype;
|
| 303 |
|
| 304 |
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
| 305 |
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
| 306 |
+
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
| 307 |
|
| 308 |
if (fin.eof()) {
|
| 309 |
break;
|
|
|
|
| 319 |
std::string name(length, 0);
|
| 320 |
fin.read(&name[0], length);
|
| 321 |
|
| 322 |
+
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
| 323 |
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
| 324 |
return false;
|
| 325 |
}
|
|
|
|
| 332 |
|
| 333 |
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
| 334 |
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
| 335 |
+
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
|
| 336 |
return false;
|
| 337 |
}
|
| 338 |
|
| 339 |
+
// for debugging
|
| 340 |
+
if (0) {
|
| 341 |
+
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
| 342 |
+
}
|
| 343 |
+
|
| 344 |
+
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
| 345 |
|
| 346 |
+
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
| 347 |
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
| 348 |
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
| 349 |
return false;
|
|
|
|
| 351 |
|
| 352 |
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
| 353 |
|
| 354 |
+
// GPT-2 models share the WTE tensor as the LM head
|
| 355 |
+
if (name == "model/wte" && has_lm_head == false) {
|
| 356 |
+
memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
+
if (name == "model/lm_head") {
|
| 360 |
+
has_lm_head = true;
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
total_size += ggml_nbytes(tensor);
|
| 364 |
}
|
| 365 |
|
|
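
The size check in the loop above is the general rule for quantized tensors: the expected byte count is `nelements / ggml_blck_size(type) * ggml_type_size(type)`, which reduces to `nelements * type_size` for non-blocked types such as F16 and F32. A small sketch, assuming this repo's `ggml.h`:

```cpp
#include "ggml.h"
#include <cstdint>
#include <cstdio>

static size_t expected_nbytes(int64_t nelements, enum ggml_type type) {
    // bytes = number of blocks * bytes per block
    return nelements/ggml_blck_size(type)*ggml_type_size(type);
}

int main() {
    printf("1024 elements as F16 : %zu bytes\n", expected_nbytes(1024, GGML_TYPE_F16));
    printf("1024 elements as Q5_1: %zu bytes\n", expected_nbytes(1024, GGML_TYPE_Q5_1));
    return 0;
}
```
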
|
|
| 377 |
// - n_threads: number of threads to use
|
| 378 |
// - n_past: the context size so far
|
| 379 |
// - embd_inp: the embeddings of the tokens in the context
|
| 380 |
+
// - embd_w: the predicted logits for the next token
|
| 381 |
//
|
| 382 |
bool gpt2_eval(
|
| 383 |
const gpt2_model & model,
|
|
|
|
| 396 |
const int n_head = hparams.n_head;
|
| 397 |
const int n_vocab = hparams.n_vocab;
|
| 398 |
|
| 399 |
+
static size_t buf_size = 512u*1024*1024;
|
| 400 |
static void * buf = malloc(buf_size);
|
| 401 |
|
| 402 |
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
| 403 |
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
| 404 |
+
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
| 405 |
|
| 406 |
// reallocate
|
| 407 |
buf_size = buf_size_new;
|
|
|
|
| 412 |
}
|
| 413 |
}
|
| 414 |
|
| 415 |
+
struct ggml_init_params params = {
|
| 416 |
+
/*.mem_size =*/ buf_size,
|
| 417 |
+
/*.mem_buffer =*/ buf,
|
| 418 |
+
/*.no_alloc =*/ false,
|
| 419 |
+
};
|
| 420 |
|
| 421 |
struct ggml_context * ctx0 = ggml_init(params);
|
| 422 |
+
struct ggml_cgraph gf = {};
|
|
|
|
| 423 |
gf.n_threads = n_threads;
|
| 424 |
|
| 425 |
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
|
|
|
| 463 |
// [2304, N]
|
| 464 |
{
|
| 465 |
cur = ggml_mul_mat(ctx0,
|
| 466 |
+
model.layers[il].c_attn_attn_w,
|
| 467 |
cur);
|
| 468 |
|
| 469 |
cur = ggml_add(ctx0,
|
|
|
|
| 539 |
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
| 540 |
// [n_past + N, 64, 12]
|
| 541 |
struct ggml_tensor * V_trans =
|
| 542 |
+
ggml_cpy(ctx0,
|
| 543 |
+
ggml_permute(ctx0,
|
| 544 |
+
ggml_reshape_3d(ctx0,
|
| 545 |
+
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
| 546 |
+
n_embd/n_head, n_head, n_past + N),
|
| 547 |
+
1, 2, 0, 3),
|
| 548 |
+
ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
|
| 549 |
|
| 550 |
// KQV = transpose(V) * KQ_soft_max
|
| 551 |
// [64, N, 12]
|
|
|
|
| 572 |
// [768, N]
|
| 573 |
{
|
| 574 |
cur = ggml_mul_mat(ctx0,
|
| 575 |
+
model.layers[il].c_attn_proj_w,
|
| 576 |
cur);
|
| 577 |
|
| 578 |
cur = ggml_add(ctx0,
|
|
|
|
| 609 |
// cur = fc_w*cur + fc_b
|
| 610 |
// [3072, N]
|
| 611 |
cur = ggml_mul_mat(ctx0,
|
| 612 |
+
model.layers[il].c_mlp_fc_w,
|
| 613 |
cur);
|
| 614 |
|
| 615 |
cur = ggml_add(ctx0,
|
|
|
|
| 629 |
// cur = proj_w*cur + proj_b
|
| 630 |
// [768, N]
|
| 631 |
cur = ggml_mul_mat(ctx0,
|
| 632 |
+
model.layers[il].c_mlp_proj_w,
|
| 633 |
cur);
|
| 634 |
|
| 635 |
cur = ggml_add(ctx0,
|
|
|
|
| 656 |
}
|
| 657 |
|
| 658 |
// inpL = WTE * inpL
|
| 659 |
+
// [ 768, 50257] - model.lm_head
|
| 660 |
// [ 768, N] - inpL
|
| 661 |
+
inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
|
| 662 |
|
| 663 |
// logits -> probs
|
| 664 |
+
//inpL = ggml_soft_max(ctx0, inpL);
|
| 665 |
|
| 666 |
// run the computation
|
| 667 |
ggml_build_forward_expand(&gf, inpL);
|
|
|
|
| 675 |
//embd_w.resize(n_vocab*N);
|
| 676 |
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
| 677 |
|
| 678 |
+
// return result just for the last token
|
| 679 |
embd_w.resize(n_vocab);
|
| 680 |
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
| 681 |
|
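
Since `embd_w` now holds the logits for the last token only, a caller can sample greedily with a plain argmax; a minimal sketch (the examples use shared sampling helpers for anything fancier):

```cpp
#include <algorithm>
#include <vector>

// embd_w holds n_vocab logits for the last position only
int sample_greedy(const std::vector<float> & embd_w) {
    return (int) (std::max_element(embd_w.begin(), embd_w.end()) - embd_w.begin());
}
```
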
examples/talk/gpt-2.h
CHANGED
|
@@ -2,18 +2,12 @@
|
|
| 2 |
|
| 3 |
// TODO: Change to C-style API and move to ./examples for easy reuse.
|
| 4 |
|
|
|
|
|
|
|
| 5 |
#include <vector>
|
| 6 |
#include <map>
|
| 7 |
#include <string>
|
| 8 |
|
| 9 |
-
struct gpt_vocab {
|
| 10 |
-
using id = int32_t;
|
| 11 |
-
using token = std::string;
|
| 12 |
-
|
| 13 |
-
std::map<token, id> token_to_id;
|
| 14 |
-
std::map<id, token> id_to_token;
|
| 15 |
-
};
|
| 16 |
-
|
| 17 |
struct gpt2_context;
|
| 18 |
|
| 19 |
struct gpt2_context * gpt2_init(const char * path_model);
|
|
|
|
| 2 |
|
| 3 |
// TODO: Change to C-style API and move to ./examples for easy reuse.
|
| 4 |
|
| 5 |
+
#include "common.h"
|
| 6 |
+
|
| 7 |
#include <vector>
|
| 8 |
#include <map>
|
| 9 |
#include <string>
|
| 10 |
|
|
|
|
|
|
|
| 11 |
struct gpt2_context;
|
| 12 |
|
| 13 |
struct gpt2_context * gpt2_init(const char * path_model);
|
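
`gpt_vocab` is no longer declared in this header; it comes from the shared `common.h` included above. For reference, the type that moved is the same two-map struct shown in the removed lines, repeated here only as a sketch:

```cpp
#include <cstdint>
#include <map>
#include <string>

struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
};
```
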
examples/whisper.wasm/CMakeLists.txt
CHANGED
|
@@ -31,9 +31,9 @@ endif()
|
|
| 31 |
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
| 32 |
--bind \
|
| 33 |
-s USE_PTHREADS=1 \
|
| 34 |
-
-s
|
| 35 |
-
-s INITIAL_MEMORY=
|
| 36 |
-
-s TOTAL_MEMORY=
|
| 37 |
-s FORCE_FILESYSTEM=1 \
|
| 38 |
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
| 39 |
${EXTRA_FLAGS} \
|
|
|
|
| 31 |
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
| 32 |
--bind \
|
| 33 |
-s USE_PTHREADS=1 \
|
| 34 |
+
-s PTHREAD_POOL_SIZE_STRICT=0 \
|
| 35 |
+
-s INITIAL_MEMORY=2000MB \
|
| 36 |
+
-s TOTAL_MEMORY=2000MB \
|
| 37 |
-s FORCE_FILESYSTEM=1 \
|
| 38 |
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
| 39 |
${EXTRA_FLAGS} \
|
examples/whisper.wasm/emscripten.cpp
CHANGED
|
@@ -10,6 +10,12 @@ std::thread g_worker;
|
|
| 10 |
|
| 11 |
std::vector<struct whisper_context *> g_contexts(4, nullptr);
|
| 12 |
|
|
|
|
|
|
|
|
| 13 |
EMSCRIPTEN_BINDINGS(whisper) {
|
| 14 |
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
| 15 |
if (g_worker.joinable()) {
|
|
@@ -43,7 +49,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
|
|
| 43 |
}
|
| 44 |
}));
|
| 45 |
|
| 46 |
-
emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
|
| 47 |
if (g_worker.joinable()) {
|
| 48 |
g_worker.join();
|
| 49 |
}
|
|
@@ -66,7 +72,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
|
|
| 66 |
params.print_special = false;
|
| 67 |
params.translate = translate;
|
| 68 |
params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
|
| 69 |
-
params.n_threads = std::min(
|
| 70 |
params.offset_ms = 0;
|
| 71 |
|
| 72 |
std::vector<float> pcmf32;
|
|
|
|
| 10 |
|
| 11 |
std::vector<struct whisper_context *> g_contexts(4, nullptr);
|
| 12 |
|
| 13 |
+
static inline int mpow2(int n) {
|
| 14 |
+
int p = 1;
|
| 15 |
+
while (p <= n) p *= 2;
|
| 16 |
+
return p/2;
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
EMSCRIPTEN_BINDINGS(whisper) {
|
| 20 |
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
| 21 |
if (g_worker.joinable()) {
|
|
|
|
| 49 |
}
|
| 50 |
}));
|
| 51 |
|
| 52 |
+
emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, int nthreads, bool translate) {
|
| 53 |
if (g_worker.joinable()) {
|
| 54 |
g_worker.join();
|
| 55 |
}
|
|
|
|
| 72 |
params.print_special = false;
|
| 73 |
params.translate = translate;
|
| 74 |
params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
|
| 75 |
+
params.n_threads = std::min(nthreads, std::min(16, mpow2(std::thread::hardware_concurrency())));
|
| 76 |
params.offset_ms = 0;
|
| 77 |
|
| 78 |
std::vector<float> pcmf32;
|
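
The binding now takes the requested thread count from the page and clamps it with the `mpow2()` helper. A native sketch of the same clamping logic:

```cpp
#include <algorithm>
#include <thread>

// round n down to a power of two, e.g. 6 -> 4, 8 -> 8
static int mpow2(int n) {
    int p = 1;
    while (p <= n) p *= 2;
    return p/2;
}

// final thread count: what the page asked for, capped at 16 and at the
// largest power of two not exceeding the available hardware threads
int pick_threads(int requested) {
    return std::min(requested, std::min(16, mpow2((int) std::thread::hardware_concurrency())));
}
```
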
examples/whisper.wasm/index-tmpl.html
CHANGED
|
@@ -40,21 +40,42 @@
|
|
| 40 |
|
| 41 |
Note that the computation is quite heavy and may take a few seconds to complete.<br>
|
| 42 |
The transcription results will be displayed in the text area below.<br><br>
|
| 43 |
-
<b>Important
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
<div id="model">
|
| 48 |
-
Whisper
|
| 49 |
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
| 50 |
<button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
|
| 51 |
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
| 52 |
<button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
|
| 53 |
<button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
|
| 54 |
<button id="fetch-whisper-small" onclick="loadWhisper('small')">small (466 MB)</button>
|
| 55 |
-
<span id="fetch-whisper-progress"></span>
|
| 56 |
-
|
| 57 |
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
</div>
|
| 59 |
|
| 60 |
<br>
|
|
@@ -161,6 +182,12 @@
|
|
| 161 |
<option value="yi">Yiddish</option>
|
| 162 |
</select>
|
| 163 |
</td>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
<td>
|
| 165 |
<button onclick="onProcess(false);">Transcribe</button>
|
| 166 |
</td>
|
|
@@ -263,11 +290,13 @@
|
|
| 263 |
|
| 264 |
Module.FS_createDataFile("/", fname, buf, true, true);
|
| 265 |
|
| 266 |
-
model_whisper = fname;
|
| 267 |
|
| 268 |
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
|
| 269 |
|
| 270 |
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
|
|
|
|
|
|
|
| 271 |
}
|
| 272 |
|
| 273 |
function loadFile(event, fname) {
|
|
@@ -292,6 +321,17 @@
|
|
| 292 |
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
|
| 293 |
document.getElementById('fetch-whisper-base' ).style.display = 'none';
|
| 294 |
document.getElementById('fetch-whisper-small' ).style.display = 'none';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
document.getElementById('whisper-file' ).style.display = 'none';
|
| 296 |
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
|
| 297 |
}
|
|
@@ -304,6 +344,16 @@
|
|
| 304 |
'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
|
| 305 |
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
|
| 306 |
'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
};
|
| 308 |
|
| 309 |
let sizes = {
|
|
@@ -313,6 +363,16 @@
|
|
| 313 |
'base': 142,
|
| 314 |
'small.en': 466,
|
| 315 |
'small': 466,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
};
|
| 317 |
|
| 318 |
let url = urls[model];
|
|
@@ -327,8 +387,19 @@
|
|
| 327 |
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
|
| 328 |
document.getElementById('fetch-whisper-base' ).style.display = 'none';
|
| 329 |
document.getElementById('fetch-whisper-small' ).style.display = 'none';
|
| 330 |
-
|
| 331 |
-
document.getElementById('
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
cbProgress = function(p) {
|
| 334 |
let el = document.getElementById('fetch-whisper-progress');
|
|
@@ -337,14 +408,26 @@
|
|
| 337 |
|
| 338 |
cbCancel = function() {
|
| 339 |
var el;
|
|
|
|
| 340 |
el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
|
| 341 |
el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
|
| 342 |
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
|
| 343 |
el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
|
| 344 |
el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
|
| 345 |
el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block';
|
| 346 |
-
|
| 347 |
-
el = document.getElementById('
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
};
|
| 349 |
|
| 350 |
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
|
|
@@ -354,7 +437,8 @@
|
|
| 354 |
// audio file
|
| 355 |
//
|
| 356 |
|
| 357 |
-
const kMaxAudio_s =
|
|
|
|
| 358 |
const kSampleRate = 16000;
|
| 359 |
|
| 360 |
window.AudioContext = window.AudioContext || window.webkitAudioContext;
|
|
@@ -423,7 +507,7 @@
|
|
| 423 |
doRecording = false;
|
| 424 |
}
|
| 425 |
|
| 426 |
-
// record up to
|
| 427 |
// check if doRecording is false every 1000 ms and stop recording if so
|
| 428 |
// update progress information
|
| 429 |
function startRecording() {
|
|
@@ -479,9 +563,9 @@
|
|
| 479 |
printTextarea('js: audio recorded, size: ' + audio.length);
|
| 480 |
|
| 481 |
// truncate to first 30 seconds
|
| 482 |
-
if (audio.length >
|
| 483 |
-
audio = audio.slice(0,
|
| 484 |
-
printTextarea('js: truncated audio to first ' +
|
| 485 |
}
|
| 486 |
setAudio(audio);
|
| 487 |
});
|
|
@@ -509,24 +593,31 @@
|
|
| 509 |
});
|
| 510 |
}
|
| 511 |
|
| 512 |
-
document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/
|
| 513 |
-
document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/
|
| 514 |
}, 1000);
|
| 515 |
|
| 516 |
printTextarea('js: recording ...');
|
| 517 |
|
| 518 |
setTimeout(function() {
|
| 519 |
if (doRecording) {
|
| 520 |
-
printTextarea('js: recording stopped after ' +
|
| 521 |
stopRecording();
|
| 522 |
}
|
| 523 |
-
},
|
| 524 |
}
|
| 525 |
|
| 526 |
//
|
| 527 |
// transcribe
|
| 528 |
//
|
| 529 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 530 |
function onProcess(translate) {
|
| 531 |
if (!instance) {
|
| 532 |
instance = Module.init('whisper.bin');
|
|
@@ -553,7 +644,7 @@
|
|
| 553 |
printTextarea('');
|
| 554 |
|
| 555 |
setTimeout(function() {
|
| 556 |
-
var ret = Module.full_default(instance, audio, document.getElementById('language').value, translate);
|
| 557 |
console.log('js: full_default returned: ' + ret);
|
| 558 |
if (ret) {
|
| 559 |
printTextarea("js: whisper returned: " + ret);
|
|
|
|
| 40 |
|
| 41 |
Note that the computation is quite heavy and may take a few seconds to complete.<br>
|
| 42 |
The transcription results will be displayed in the text area below.<br><br>
|
| 43 |
+
<b>Important:</b>
|
| 44 |
+
<ul>
|
| 45 |
+
<li>your browser must support WASM SIMD instructions for this to work</li>
|
| 46 |
+
<li>Firefox cannot load files larger than 256 MB - use Chrome instead</li>
|
| 47 |
+
</ul>
|
| 48 |
|
| 49 |
+
<b>More examples:</b>
|
| 50 |
+
<a href="https://whisper.ggerganov.com/">main</a> |
|
| 51 |
+
<a href="https://whisper.ggerganov.com/bench">bench</a> |
|
| 52 |
+
<a href="https://whisper.ggerganov.com/stream">stream</a> |
|
| 53 |
+
<a href="https://whisper.ggerganov.com/command">command</a> |
|
| 54 |
+
<a href="https://whisper.ggerganov.com/talk">talk</a> |
|
| 55 |
+
|
| 56 |
+
<hr>
|
| 57 |
|
| 58 |
<div id="model">
|
| 59 |
+
Whisper models: <span id="model-whisper-status"></span><br><br>
|
| 60 |
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
| 61 |
<button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
|
| 62 |
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
| 63 |
<button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
|
| 64 |
<button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
|
| 65 |
<button id="fetch-whisper-small" onclick="loadWhisper('small')">small (466 MB)</button>
|
|
|
|
|
|
|
| 66 |
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
| 67 |
+
<br><br>
|
| 68 |
+
Quantized models:<br><br>
|
| 69 |
+
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
|
| 70 |
+
<button id="fetch-whisper-tiny-q5_1" onclick="loadWhisper('tiny-q5_1')">tiny (Q5_1, 31 MB)</button>
|
| 71 |
+
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
|
| 72 |
+
<button id="fetch-whisper-base-q5_1" onclick="loadWhisper('base-q5_1')">base (Q5_1, 57 MB)</button>
|
| 73 |
+
<button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
|
| 74 |
+
<button id="fetch-whisper-small-q5_1" onclick="loadWhisper('small-q5_1')">small (Q5_1, 182 MB)</button><br>
|
| 75 |
+
<button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
|
| 76 |
+
<button id="fetch-whisper-medium-q5_0" onclick="loadWhisper('medium-q5_0')">medium (Q5_0, 515 MB)</button>
|
| 77 |
+
<button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
|
| 78 |
+
<span id="fetch-whisper-progress"></span>
|
| 79 |
</div>
|
| 80 |
|
| 81 |
<br>
|
|
|
|
| 182 |
<option value="yi">Yiddish</option>
|
| 183 |
</select>
|
| 184 |
</td>
|
| 185 |
+
<!-- Slider to select number of threads between 1 and 16 -->
|
| 186 |
+
<td>
|
| 187 |
+
Threads:
|
| 188 |
+
<input type="range" id="threads" name="threads" min="1" max="16" value="8" onchange="changeThreads(this.value)" />
|
| 189 |
+
<span id="threads-value">8</span>
|
| 190 |
+
</td>
|
| 191 |
<td>
|
| 192 |
<button onclick="onProcess(false);">Transcribe</button>
|
| 193 |
</td>
|
|
|
|
| 290 |
|
| 291 |
Module.FS_createDataFile("/", fname, buf, true, true);
|
| 292 |
|
| 293 |
+
//model_whisper = fname;
|
| 294 |
|
| 295 |
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
|
| 296 |
|
| 297 |
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
|
| 298 |
+
|
| 299 |
+
document.getElementById('model').innerHTML = 'Model fetched: ' + model_whisper;
|
| 300 |
}
|
| 301 |
|
| 302 |
function loadFile(event, fname) {
|
|
|
|
| 321 |
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
|
| 322 |
document.getElementById('fetch-whisper-base' ).style.display = 'none';
|
| 323 |
document.getElementById('fetch-whisper-small' ).style.display = 'none';
|
| 324 |
+
|
| 325 |
+
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
|
| 326 |
+
document.getElementById('fetch-whisper-tiny-q5_1' ).style.display = 'none';
|
| 327 |
+
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
|
| 328 |
+
document.getElementById('fetch-whisper-base-q5_1' ).style.display = 'none';
|
| 329 |
+
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
|
| 330 |
+
document.getElementById('fetch-whisper-small-q5_1' ).style.display = 'none';
|
| 331 |
+
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
|
| 332 |
+
document.getElementById('fetch-whisper-medium-q5_0' ).style.display = 'none';
|
| 333 |
+
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
|
| 334 |
+
|
| 335 |
document.getElementById('whisper-file' ).style.display = 'none';
|
| 336 |
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
|
| 337 |
}
|
|
|
|
| 344 |
'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
|
| 345 |
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
|
| 346 |
'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
|
| 347 |
+
|
| 348 |
+
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
|
| 349 |
+
'tiny-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny-q5_1.bin',
|
| 350 |
+
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
|
| 351 |
+
'base-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base-q5_1.bin',
|
| 352 |
+
'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
|
| 353 |
+
'small-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small-q5_1.bin',
|
| 354 |
+
'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
|
| 355 |
+
'medium-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-medium-q5_0.bin',
|
| 356 |
+
'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
|
| 357 |
};
|
| 358 |
|
| 359 |
let sizes = {
|
|
|
|
| 363 |
'base': 142,
|
| 364 |
'small.en': 466,
|
| 365 |
'small': 466,
|
| 366 |
+
|
| 367 |
+
'tiny-en-q5_1': 31,
|
| 368 |
+
'tiny-q5_1': 31,
|
| 369 |
+
'base-en-q5_1': 57,
|
| 370 |
+
'base-q5_1': 57,
|
| 371 |
+
'small-en-q5_1': 182,
|
| 372 |
+
'small-q5_1': 182,
|
| 373 |
+
'medium-en-q5_0': 515,
|
| 374 |
+
'medium-q5_0': 515,
|
| 375 |
+
'large-q5_0': 1030,
|
| 376 |
};
|
| 377 |
|
| 378 |
let url = urls[model];
|
|
|
|
| 387 |
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
|
| 388 |
document.getElementById('fetch-whisper-base' ).style.display = 'none';
|
| 389 |
document.getElementById('fetch-whisper-small' ).style.display = 'none';
|
| 390 |
+
|
| 391 |
+
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
|
| 392 |
+
document.getElementById('fetch-whisper-tiny-q5_1' ).style.display = 'none';
|
| 393 |
+
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
|
| 394 |
+
document.getElementById('fetch-whisper-base-q5_1' ).style.display = 'none';
|
| 395 |
+
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
|
| 396 |
+
document.getElementById('fetch-whisper-small-q5_1' ).style.display = 'none';
|
| 397 |
+
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
|
| 398 |
+
document.getElementById('fetch-whisper-medium-q5_0' ).style.display = 'none';
|
| 399 |
+
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
|
| 400 |
+
|
| 401 |
+
document.getElementById('whisper-file' ).style.display = 'none';
|
| 402 |
+
document.getElementById('model-whisper-status').innerHTML = 'loading model: ' + model;
|
| 403 |
|
| 404 |
cbProgress = function(p) {
|
| 405 |
let el = document.getElementById('fetch-whisper-progress');
|
|
|
|
| 408 |
|
| 409 |
cbCancel = function() {
|
| 410 |
var el;
|
| 411 |
+
|
| 412 |
el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
|
| 413 |
el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
|
| 414 |
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
|
| 415 |
el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
|
| 416 |
el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
|
| 417 |
el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block';
|
| 418 |
+
|
| 419 |
+
el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
|
| 420 |
+
el = document.getElementById('fetch-whisper-tiny-q5_1' ); if (el) el.style.display = 'inline-block';
|
| 421 |
+
el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
|
| 422 |
+
el = document.getElementById('fetch-whisper-base-q5_1' ); if (el) el.style.display = 'inline-block';
|
| 423 |
+
el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
|
| 424 |
+
el = document.getElementById('fetch-whisper-small-q5_1' ); if (el) el.style.display = 'inline-block';
|
| 425 |
+
el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
|
| 426 |
+
el = document.getElementById('fetch-whisper-medium-q5_0' ); if (el) el.style.display = 'inline-block';
|
| 427 |
+
el = document.getElementById('fetch-whisper-large-q5_0' ); if (el) el.style.display = 'inline-block';
|
| 428 |
+
|
| 429 |
+
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
|
| 430 |
+
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
|
| 431 |
};
|
| 432 |
|
| 433 |
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
|
|
|
|
| 437 |
// audio file
|
| 438 |
//
|
| 439 |
|
| 440 |
+
const kMaxAudio_s = 30*60;
|
| 441 |
+
const kMaxRecording_s = 2*60;
|
| 442 |
const kSampleRate = 16000;
|
| 443 |
|
| 444 |
window.AudioContext = window.AudioContext || window.webkitAudioContext;
|
|
|
|
| 507 |
doRecording = false;
|
| 508 |
}
|
| 509 |
|
| 510 |
+
// record up to kMaxRecording_s seconds of audio from the microphone
|
| 511 |
// check if doRecording is false every 1000 ms and stop recording if so
|
| 512 |
// update progress information
|
| 513 |
function startRecording() {
|
|
|
|
| 563 |
printTextarea('js: audio recorded, size: ' + audio.length);
|
| 564 |
|
| 565 |
// truncate to first 30 seconds
|
| 566 |
+
if (audio.length > kMaxRecording_s*kSampleRate) {
|
| 567 |
+
audio = audio.slice(0, kMaxRecording_s*kSampleRate);
|
| 568 |
+
printTextarea('js: truncated audio to first ' + kMaxRecording_s + ' seconds');
|
| 569 |
}
|
| 570 |
setAudio(audio);
|
| 571 |
});
|
|
|
|
| 593 |
});
|
| 594 |
}
|
| 595 |
|
| 596 |
+
document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxRecording_s) + '%';
|
| 597 |
+
document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxRecording_s).toFixed(0) + '%';
|
| 598 |
}, 1000);
|
| 599 |
|
| 600 |
printTextarea('js: recording ...');
|
| 601 |
|
| 602 |
setTimeout(function() {
|
| 603 |
if (doRecording) {
|
| 604 |
+
printTextarea('js: recording stopped after ' + kMaxRecording_s + ' seconds');
|
| 605 |
stopRecording();
|
| 606 |
}
|
| 607 |
+
}, kMaxRecording_s*1000);
|
| 608 |
}
|
| 609 |
|
| 610 |
//
|
| 611 |
// transcribe
|
| 612 |
//
|
| 613 |
|
| 614 |
+
var nthreads = 8;
|
| 615 |
+
|
| 616 |
+
function changeThreads(value) {
|
| 617 |
+
nthreads = value;
|
| 618 |
+
document.getElementById('threads-value').innerHTML = nthreads;
|
| 619 |
+
}
|
| 620 |
+
|
| 621 |
function onProcess(translate) {
|
| 622 |
if (!instance) {
|
| 623 |
instance = Module.init('whisper.bin');
|
|
|
|
| 644 |
printTextarea('');
|
| 645 |
|
| 646 |
setTimeout(function() {
|
| 647 |
+
var ret = Module.full_default(instance, audio, document.getElementById('language').value, nthreads, translate);
|
| 648 |
console.log('js: full_default returned: ' + ret);
|
| 649 |
if (ret) {
|
| 650 |
printTextarea("js: whisper returned: " + ret);
|
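
On the native side, the call made by the page's `full_default` binding boils down to filling `whisper_full_params` and invoking `whisper_full`. A hedged sketch (the model path and audio buffer are placeholders, and it assumes the C API declared in `whisper.h`, including `whisper_init_from_file`):

```cpp
#include "whisper.h"
#include <vector>

int transcribe(const std::vector<float> & pcmf32 /* 16 kHz mono samples */) {
    struct whisper_context * ctx = whisper_init_from_file("ggml-base.en-q5_1.bin");
    if (ctx == nullptr) {
        return 1;
    }

    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    params.n_threads = 8;      // what the page's thread slider controls
    params.language  = "en";
    params.translate = false;

    const int ret = whisper_full(ctx, params, pcmf32.data(), (int) pcmf32.size());

    whisper_free(ctx);
    return ret;
}
```
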
extra/quantize-all.sh
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
printf "Usage: $0 <upload>"
|
| 4 |
+
|
| 5 |
+
if [ $# -ne 1 ]; then
|
| 6 |
+
printf "\nError: Invalid number of arguments\n"
|
| 7 |
+
exit 1
|
| 8 |
+
fi
|
| 9 |
+
|
| 10 |
+
qtype0="q5_0"
|
| 11 |
+
qtype1="q5_1"
|
| 12 |
+
upload="$1"
|
| 13 |
+
|
| 14 |
+
cd `dirname $0`
|
| 15 |
+
cd ../
|
| 16 |
+
|
| 17 |
+
./quantize ./models/ggml-tiny.en.bin ./models/ggml-tiny.en-${qtype1}.bin ${qtype1}
|
| 18 |
+
./quantize ./models/ggml-tiny.bin ./models/ggml-tiny-${qtype1}.bin ${qtype1}
|
| 19 |
+
|
| 20 |
+
./quantize ./models/ggml-base.en.bin ./models/ggml-base.en-${qtype1}.bin ${qtype1}
|
| 21 |
+
./quantize ./models/ggml-base.bin ./models/ggml-base-${qtype1}.bin ${qtype1}
|
| 22 |
+
|
| 23 |
+
./quantize ./models/ggml-small.en.bin ./models/ggml-small.en-${qtype1}.bin ${qtype1}
|
| 24 |
+
./quantize ./models/ggml-small.bin ./models/ggml-small-${qtype1}.bin ${qtype1}
|
| 25 |
+
|
| 26 |
+
./quantize ./models/ggml-medium.en.bin ./models/ggml-medium.en-${qtype0}.bin ${qtype0}
|
| 27 |
+
./quantize ./models/ggml-medium.bin ./models/ggml-medium-${qtype0}.bin ${qtype0}
|
| 28 |
+
|
| 29 |
+
./quantize ./models/ggml-large.bin ./models/ggml-large-${qtype0}.bin ${qtype0}
|
| 30 |
+
|
| 31 |
+
if [ "$upload" == "1" ]; then
|
| 32 |
+
scp ./models/ggml-tiny.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-tiny.en-${qtype1}.bin
|
| 33 |
+
scp ./models/ggml-tiny-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-tiny-${qtype1}.bin
|
| 34 |
+
|
| 35 |
+
scp ./models/ggml-base.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-base.en-${qtype1}.bin
|
| 36 |
+
scp ./models/ggml-base-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-base-${qtype1}.bin
|
| 37 |
+
|
| 38 |
+
scp ./models/ggml-small.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-small.en-${qtype1}.bin
|
| 39 |
+
scp ./models/ggml-small-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-small-${qtype1}.bin
|
| 40 |
+
|
| 41 |
+
scp ./models/ggml-medium.en-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-medium.en-${qtype0}.bin
|
| 42 |
+
scp ./models/ggml-medium-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-medium-${qtype0}.bin
|
| 43 |
+
|
| 44 |
+
scp ./models/ggml-large-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-large-${qtype0}.bin
|
| 45 |
+
fi
|
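
Usage note: the script expects the F16 `ggml-*.bin` models under `./models` and a `quantize` binary in the repository root. It takes exactly one argument; only when that argument is `1` are the results uploaded (the `scp` destinations are specific to the author's server), so `./extra/quantize-all.sh 0` quantizes locally without uploading.
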
extra/sync-ggml.sh
CHANGED
|
@@ -1,6 +1,10 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
-
cp -rpv ../ggml/src/ggml.c
|
| 4 |
-
cp -rpv ../ggml/src/ggml-cuda.cu
|
| 5 |
-
cp -rpv ../ggml/src/ggml-cuda.h
|
| 6 |
-
cp -rpv ../ggml/include/ggml/ggml.h
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
+
cp -rpv ../ggml/src/ggml.c ./ggml.c
|
| 4 |
+
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
|
| 5 |
+
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
|
| 6 |
+
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
|
| 7 |
+
cp -rpv ../ggml/examples/common.h ./examples/common.h
|
| 8 |
+
cp -rpv ../ggml/examples/common.cpp ./examples/common.cpp
|
| 9 |
+
cp -rpv ../ggml/examples/common-ggml.h ./examples/common-ggml.h
|
| 10 |
+
cp -rpv ../ggml/examples/common-ggml.cpp ./examples/common-ggml.cpp
|
ggml.c
CHANGED
|
@@ -330,7 +330,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
|
|
| 330 |
// precomputed f32 table for f16 (256 KB)
|
| 331 |
static float table_f32_f16[1 << 16];
|
| 332 |
|
| 333 |
-
#if defined(__ARM_NEON)
|
| 334 |
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
|
| 335 |
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
|
| 336 |
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
|
|
@@ -3180,6 +3180,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
| 3180 |
}
|
| 3181 |
|
| 3182 |
*s = vaddvq_f32(sumv);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3183 |
#elif defined(__AVX2__)
|
| 3184 |
// Initialize accumulator with zeros
|
| 3185 |
__m256 acc = _mm256_setzero_ps();
|
|
@@ -3311,6 +3377,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
| 3311 |
}
|
| 3312 |
|
| 3313 |
*s = vaddvq_f32(sumv) + summs;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3314 |
#elif defined(__AVX2__)
|
| 3315 |
// Initialize accumulator with zeros
|
| 3316 |
__m256 acc = _mm256_setzero_ps();
|
|
@@ -3827,6 +3964,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
|
|
| 3827 |
"DIAG_MASK_INF",
|
| 3828 |
"SOFT_MAX",
|
| 3829 |
"ROPE",
|
|
|
|
| 3830 |
"CONV_1D_1S",
|
| 3831 |
"CONV_1D_2S",
|
| 3832 |
|
|
@@ -3875,6 +4013,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
| 3875 |
"diag_mask_inf(x)",
|
| 3876 |
"soft_max(x)",
|
| 3877 |
"rope(x)",
|
|
|
|
| 3878 |
"conv_1d_1s(x)",
|
| 3879 |
"conv_1d_2s(x)",
|
| 3880 |
|
|
@@ -4055,6 +4194,27 @@ bool ggml_is_quantized(enum ggml_type type) {
|
|
| 4055 |
return GGML_IS_QUANTIZED[type];
|
| 4056 |
}
|
| 4057 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4058 |
static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
| 4059 |
return tensor->nb[0] > tensor->nb[1];
|
| 4060 |
}
|
|
|
|
| 330 |
// precomputed f32 table for f16 (256 KB)
|
| 331 |
static float table_f32_f16[1 << 16];
|
| 332 |
|
| 333 |
+
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
|
| 334 |
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
|
| 335 |
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
|
| 336 |
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
|
|
|
|
| 3180 |
}
|
| 3181 |
|
| 3182 |
*s = vaddvq_f32(sumv);
|
| 3183 |
+
#elif defined(__wasm_simd128__)
|
| 3184 |
+
v128_t sumv = wasm_f32x4_splat(0.0f);
|
| 3185 |
+
|
| 3186 |
+
uint64_t tmp[4];
|
| 3187 |
+
|
| 3188 |
+
for (int i = 0; i < nb; ++i) {
|
| 3189 |
+
const block_q5_0 * restrict x0 = &x[i];
|
| 3190 |
+
const block_q8_0 * restrict y0 = &y[i];
|
| 3191 |
+
|
| 3192 |
+
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
| 3193 |
+
const v128_t s16b = wasm_i8x16_splat(0x10);
|
| 3194 |
+
|
| 3195 |
+
// extract the 5th bit
|
| 3196 |
+
uint32_t qh;
|
| 3197 |
+
memcpy(&qh, x0->qh, sizeof(qh));
|
| 3198 |
+
|
| 3199 |
+
tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
|
| 3200 |
+
tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
|
| 3201 |
+
tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
|
| 3202 |
+
tmp[3] = table_b2b_u[(qh >> 24) ];
|
| 3203 |
+
|
| 3204 |
+
const v128_t qhl = wasm_v128_load(tmp + 0);
|
| 3205 |
+
const v128_t qhh = wasm_v128_load(tmp + 2);
|
| 3206 |
+
|
| 3207 |
+
const v128_t v0 = wasm_v128_load(x0->qs);
|
| 3208 |
+
|
| 3209 |
+
// 4-bit -> 8-bit
|
| 3210 |
+
const v128_t v0l = wasm_v128_and (v0, m4b);
|
| 3211 |
+
const v128_t v0h = wasm_u8x16_shr(v0, 4);
|
| 3212 |
+
|
| 3213 |
+
// interleave
|
| 3214 |
+
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
|
| 3215 |
+
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
|
| 3216 |
+
|
| 3217 |
+
// add high bit and sub 16
|
| 3218 |
+
const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
|
| 3219 |
+
const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
|
| 3220 |
+
|
| 3221 |
+
// load y
|
| 3222 |
+
const v128_t v1l = wasm_v128_load(y0->qs);
|
| 3223 |
+
const v128_t v1h = wasm_v128_load(y0->qs + 16);
|
| 3224 |
+
|
| 3225 |
+
// int8x16 -> int16x8
|
| 3226 |
+
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
|
| 3227 |
+
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
|
| 3228 |
+
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
|
| 3229 |
+
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
|
| 3230 |
+
|
| 3231 |
+
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
|
| 3232 |
+
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
|
| 3233 |
+
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
| 3234 |
+
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
| 3235 |
+
|
| 3236 |
+
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
| 3237 |
+
|
| 3238 |
+
// dot product
|
| 3239 |
+
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
|
| 3240 |
+
wasm_i32x4_add(
|
| 3241 |
+
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
| 3242 |
+
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
| 3243 |
+
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
| 3244 |
+
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
|
| 3245 |
+
}
|
| 3246 |
+
|
| 3247 |
+
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
| 3248 |
+
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
|
| 3249 |
#elif defined(__AVX2__)
|
| 3250 |
// Initialize accumulator with zeros
|
| 3251 |
__m256 acc = _mm256_setzero_ps();
|
|
|
|
| 3377 |
}
|
| 3378 |
|
| 3379 |
*s = vaddvq_f32(sumv) + summs;
|
| 3380 |
+
#elif defined(__wasm_simd128__)
|
| 3381 |
+
v128_t sumv = wasm_f32x4_splat(0.0f);
|
| 3382 |
+
|
| 3383 |
+
float summs = 0.0f;
|
| 3384 |
+
|
| 3385 |
+
uint64_t tmp[4];
|
| 3386 |
+
|
| 3387 |
+
for (int i = 0; i < nb; ++i) {
|
| 3388 |
+
const block_q5_1 * restrict x0 = &x[i];
|
| 3389 |
+
const block_q8_1 * restrict y0 = &y[i];
|
| 3390 |
+
|
| 3391 |
+
summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
|
| 3392 |
+
|
| 3393 |
+
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
| 3394 |
+
|
| 3395 |
+
// extract the 5th bit
|
| 3396 |
+
uint32_t qh;
|
| 3397 |
+
memcpy(&qh, x0->qh, sizeof(qh));
|
| 3398 |
+
|
| 3399 |
+
tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
|
| 3400 |
+
tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
|
| 3401 |
+
tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
|
| 3402 |
+
tmp[3] = table_b2b_u[(qh >> 24) ];
|
| 3403 |
+
|
| 3404 |
+
const v128_t qhl = wasm_v128_load(tmp + 0);
|
| 3405 |
+
const v128_t qhh = wasm_v128_load(tmp + 2);
|
| 3406 |
+
|
| 3407 |
+
const v128_t v0 = wasm_v128_load(x0->qs);
|
| 3408 |
+
|
| 3409 |
+
// 4-bit -> 8-bit
|
| 3410 |
+
const v128_t v0l = wasm_v128_and (v0, m4b);
|
| 3411 |
+
const v128_t v0h = wasm_u8x16_shr(v0, 4);
|
| 3412 |
+
|
| 3413 |
+
static bool x = true;
|
| 3414 |
+
|
| 3415 |
+
// interleave
|
| 3416 |
+
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
|
| 3417 |
+
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
|
| 3418 |
+
|
| 3419 |
+
// add high bit
|
| 3420 |
+
const v128_t v0lf = wasm_v128_or(v0lz, qhl);
|
| 3421 |
+
const v128_t v0hf = wasm_v128_or(v0hz, qhh);
|
| 3422 |
+
|
| 3423 |
+
// load y
|
| 3424 |
+
const v128_t v1l = wasm_v128_load(y0->qs);
|
| 3425 |
+
const v128_t v1h = wasm_v128_load(y0->qs + 16);
|
| 3426 |
+
|
| 3427 |
+
// int8x16 -> int16x8
|
| 3428 |
+
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
|
| 3429 |
+
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
|
| 3430 |
+
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
|
| 3431 |
+
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
|
| 3432 |
+
|
| 3433 |
+
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
|
| 3434 |
+
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
|
| 3435 |
+
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
| 3436 |
+
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
| 3437 |
+
|
| 3438 |
+
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
| 3439 |
+
|
| 3440 |
+
// dot product
|
| 3441 |
+
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
|
| 3442 |
+
wasm_i32x4_add(
|
| 3443 |
+
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
| 3444 |
+
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
| 3445 |
+
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
| 3446 |
+
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
|
| 3447 |
+
}
|
| 3448 |
+
|
| 3449 |
+
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
| 3450 |
+
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
|
| 3451 |
#elif defined(__AVX2__)
|
| 3452 |
// Initialize accumulator with zeros
|
| 3453 |
__m256 acc = _mm256_setzero_ps();
|
|
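
For readers who don't speak WASM SIMD, here is a scalar view of what each Q5_0 block contributes before the dot product: 32 weights packed as 16 nibble bytes, a 32-bit mask of 5th bits, and an fp16 scale. The bit layout is assumed to match ggml.c's own Q5_0 dequantization; this sketch is for illustration only:

```cpp
#include <cstdint>
#include <cstring>

// qs: 16 bytes of low nibbles, qh_bytes: 32 packed 5th bits,
// d: the block's fp16 scale already converted to float, y: 32 output weights
void dequantize_q5_0_block(const uint8_t qs[16], const uint8_t qh_bytes[4], float d, float y[32]) {
    uint32_t qh;
    std::memcpy(&qh, qh_bytes, sizeof(qh));

    for (int j = 0; j < 16; ++j) {
        const int xh0 = ((qh >> (j     )) & 1) << 4;  // 5th bit of weight j
        const int xh1 = ((qh >> (j + 16)) & 1) << 4;  // 5th bit of weight j + 16

        y[j     ] = (((qs[j] & 0x0F) | xh0) - 16)*d;  // recentre by -16, scale by d
        y[j + 16] = (((qs[j] >>   4) | xh1) - 16)*d;
    }
}
```

Q5_1 differs only in that the high bit is OR-ed in without the -16 recentring and a per-block fp16 minimum `m` is added back, which is why the kernel above accumulates `summs` separately.
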
|
|
| 3964 |
"DIAG_MASK_INF",
|
| 3965 |
"SOFT_MAX",
|
| 3966 |
"ROPE",
|
| 3967 |
+
"ALIBI",
|
| 3968 |
"CONV_1D_1S",
|
| 3969 |
"CONV_1D_2S",
|
| 3970 |
|
|
|
|
| 4013 |
"diag_mask_inf(x)",
|
| 4014 |
"soft_max(x)",
|
| 4015 |
"rope(x)",
|
| 4016 |
+
"alibi(x)",
|
| 4017 |
"conv_1d_1s(x)",
|
| 4018 |
"conv_1d_2s(x)",
|
| 4019 |
|
|
|
|
| 4194 |
return GGML_IS_QUANTIZED[type];
|
| 4195 |
}
|
| 4196 |
|
| 4197 |
+
enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
| 4198 |
+
enum ggml_type wtype = GGML_TYPE_COUNT;
|
| 4199 |
+
|
| 4200 |
+
switch (ftype) {
|
| 4201 |
+
case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
|
| 4202 |
+
case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
|
| 4203 |
+
case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
|
| 4204 |
+
case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
|
| 4205 |
+
case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
|
| 4206 |
+
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
|
| 4207 |
+
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
|
| 4208 |
+
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
| 4209 |
+
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
| 4210 |
+
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
| 4211 |
+
}
|
| 4212 |
+
|
| 4213 |
+
GGML_ASSERT(wtype != GGML_TYPE_COUNT);
|
| 4214 |
+
|
| 4215 |
+
return wtype;
|
| 4216 |
+
}
|
| 4217 |
+
|
| 4218 |
static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
| 4219 |
return tensor->nb[0] > tensor->nb[1];
|
| 4220 |
}
|
ggml.h
CHANGED
|
@@ -232,6 +232,20 @@ extern "C" {
|
|
| 232 |
GGML_TYPE_COUNT,
|
| 233 |
};
|
| 234 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
// available tensor operations:
|
| 236 |
enum ggml_op {
|
| 237 |
GGML_OP_NONE = 0,
|
|
@@ -385,6 +399,8 @@ extern "C" {
|
|
| 385 |
|
| 386 |
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
| 387 |
|
|
|
|
|
|
|
| 388 |
// main
|
| 389 |
|
| 390 |
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
|
|
|
| 232 |
GGML_TYPE_COUNT,
|
| 233 |
};
|
| 234 |
|
| 235 |
+
// model file types
|
| 236 |
+
enum ggml_ftype {
|
| 237 |
+
GGML_FTYPE_UNKNOWN = -1,
|
| 238 |
+
GGML_FTYPE_ALL_F32 = 0,
|
| 239 |
+
GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
| 240 |
+
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
| 241 |
+
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
| 242 |
+
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
| 243 |
+
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
| 244 |
+
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
| 245 |
+
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
| 246 |
+
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
| 247 |
+
};
|
| 248 |
+
|
| 249 |
// available tensor operations:
|
| 250 |
enum ggml_op {
|
| 251 |
GGML_OP_NONE = 0,
|
|
|
|
| 399 |
|
| 400 |
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
| 401 |
|
| 402 |
+
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
| 403 |
+
|
| 404 |
// main
|
| 405 |
|
| 406 |
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
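
A tiny sanity check of the new mapping API; `GGML_FTYPE_UNKNOWN` and `GGML_FTYPE_MOSTLY_Q4_1_SOME_F16` have no single tensor type and would trip the assert, so they are skipped in this sketch:

```cpp
#include "ggml.h"
#include <cstdio>

int main() {
    const enum ggml_ftype ftypes[] = {
        GGML_FTYPE_ALL_F32,     GGML_FTYPE_MOSTLY_F16,
        GGML_FTYPE_MOSTLY_Q4_0, GGML_FTYPE_MOSTLY_Q4_1,
        GGML_FTYPE_MOSTLY_Q5_0, GGML_FTYPE_MOSTLY_Q5_1,
        GGML_FTYPE_MOSTLY_Q8_0,
    };

    for (const auto ft : ftypes) {
        printf("ftype %d -> ggml_type %d\n", (int) ft, (int) ggml_ftype_to_ggml_type(ft));
    }
    return 0;
}
```
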
whisper.cpp
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
#define WHISPER_BUILD
|
| 2 |
#include "whisper.h"
|
| 3 |
#if WHISPER_USE_COREML
|
| 4 |
#include "coreml/whisper-encoder.h"
|
|
@@ -255,12 +254,70 @@ static const std::map<e_model, size_t> MEM_REQ_SCRATCH3 = {
|
|
| 255 |
{ MODEL_LARGE, 9ull*MB },
|
| 256 |
};
|
| 257 |
|
| 258 |
-
static const std::map<e_model, size_t
|
| 259 |
-
{
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
};
|
| 265 |
|
| 266 |
static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
|
|
@@ -370,7 +427,7 @@ struct whisper_hparams {
| 370 |      int32_t n_text_head = 6;
| 371 |      int32_t n_text_layer = 4;
| 372 |      int32_t n_mels = 80;
| 373 | -    int32_t
| 374 |  };
| 375 |  
| 376 |  // audio encoding layer
@@ -637,10 +694,11 @@ struct whisper_state {
| 637 |  };
| 638 |  
| 639 |  struct whisper_context {
| 640 | -    int64_t t_load_us
| 641 |      int64_t t_start_us = 0;
| 642 |  
| 643 | -    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32
| 644 |  
| 645 |      whisper_model model;
| 646 |      whisper_vocab vocab;
@@ -697,7 +755,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
| 697 |      const ggml_type wtype = cache.k->type;
| 698 |      WHISPER_ASSERT(wtype == cache.v->type);
| 699 |  
| 700 | -    WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*
| 701 |  
| 702 |      struct ggml_init_params params = {
| 703 |          /*.mem_size =*/ cache.buf.size(),
@@ -770,7 +828,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
| 770 |          read_safe(loader, hparams.n_text_head);
| 771 |          read_safe(loader, hparams.n_text_layer);
| 772 |          read_safe(loader, hparams.n_mels);
| 773 | -        read_safe(loader, hparams.
| 774 |  
| 775 |          assert(hparams.n_text_state == hparams.n_audio_state);
| 776 |  
@@ -794,11 +852,15 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
| 794 |              model.type = e_model::MODEL_LARGE;
| 795 |          }
| 796 |  
| 797 | -        // for the big tensors, we have the option to store the data in 16-bit floats
| 798 |          // in order to save memory and also to speed up the computation
| 799 | -        wctx.wtype = model.hparams.
| 800 |  
| 801 | -        const size_t scale = model.hparams.
| 802 |  
| 803 |          fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
| 804 |          fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
|
|
| 810 |
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
|
| 811 |
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
|
| 812 |
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
|
| 813 |
-
fprintf(stderr, "%s:
|
| 814 |
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
|
| 815 |
|
| 816 |
// print memory requirements
|
| 817 |
{
|
| 818 |
// this is the total memory required to run the inference
|
| 819 |
const size_t mem_required =
|
| 820 |
-
MEM_REQ_SCRATCH0.at
|
| 821 |
-
MEM_REQ_SCRATCH1.at
|
| 822 |
-
MEM_REQ_SCRATCH2.at
|
| 823 |
-
MEM_REQ_SCRATCH3.at
|
| 824 |
-
scale*MEM_REQ_MODEL.at
|
| 825 |
scale*MEM_REQ_KV_CROSS.at(model.type) +
|
| 826 |
scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
|
| 827 |
|
|
@@ -837,7 +899,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 837 |
// always have at least one decoder
|
| 838 |
|
| 839 |
wctx.model.buf = new std::vector<uint8_t>();
|
| 840 |
-
wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(model.type));
|
| 841 |
|
| 842 |
// we skip initialization of the state until it is needed
|
| 843 |
// because it might be that state will always be provided externally.
|
|
@@ -928,6 +990,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 928 |
size_t ctx_size = 0;
|
| 929 |
|
| 930 |
const ggml_type wtype = wctx.wtype;
|
|
|
|
| 931 |
|
| 932 |
{
|
| 933 |
const auto & hparams = model.hparams;
|
|
@@ -946,92 +1009,92 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
|
|
| 946 |
|
| 947 |
// encoder
|
| 948 |
{
|
| 949 |
-
ctx_size += n_audio_ctx*n_audio_state*
|
| 950 |
|
| 951 |
-
ctx_size += 3*n_mels*n_audio_state*
|
| 952 |
-
ctx_size += n_audio_state*
|
| 953 |
|
| 954 |
-
ctx_size += 3*n_audio_state*n_audio_state*
|
| 955 |
-
ctx_size += n_audio_state*
|
| 956 |
|
| 957 |
-
ctx_size += n_audio_state*
|
| 958 |
-
ctx_size += n_audio_state*
|
| 959 |
}
|
| 960 |
|
| 961 |
// decoder
|
| 962 |
{
|
| 963 |
-
ctx_size += n_text_ctx*n_text_state*
|
| 964 |
|
| 965 |
-
ctx_size += n_vocab*n_text_state*
|
| 966 |
|
| 967 |
-
ctx_size += n_text_state*
|
| 968 |
-
ctx_size += n_text_state*
|
| 969 |
}
|
| 970 |
|
| 971 |
// encoder layers
|
| 972 |
{
|
| 973 |
-
ctx_size += n_audio_layer*(n_audio_state*
|
| 974 |
-
ctx_size += n_audio_layer*(n_audio_state*
|
| 975 |
|
| 976 |
-
ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*
|
| 977 |
-
ctx_size += n_audio_layer*( 4*n_audio_state*
|
| 978 |
|
| 979 |
-
ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*
|
| 980 |
-
ctx_size += n_audio_layer*( n_audio_state*
|
| 981 |
|
| 982 |
-
ctx_size += n_audio_layer*(n_audio_state*
|
| 983 |
-
ctx_size += n_audio_layer*(n_audio_state*
|
| 984 |
|
| 985 |
-
ctx_size += n_audio_layer*(n_audio_state*n_audio_state*
|
| 986 |
-
ctx_size += n_audio_layer*( n_audio_state*
|
| 987 |
|
| 988 |
-
ctx_size += n_audio_layer*(n_audio_state*n_audio_state*
|
| 989 |
|
| 990 |
-
ctx_size += n_audio_layer*(n_audio_state*n_audio_state*
|
| 991 |
-
ctx_size += n_audio_layer*( n_audio_state*
|
| 992 |
|
| 993 |
-
ctx_size += n_audio_layer*(n_audio_state*n_audio_state*
|
| 994 |
-
ctx_size += n_audio_layer*( n_audio_state*
|
| 995 |
}
|
| 996 |
|
| 997 |
// decoder layers
|
| 998 |
{
|
| 999 |
-
ctx_size += n_text_layer*(n_text_state*
|
| 1000 |
-
ctx_size += n_text_layer*(n_text_state*
|
| 1001 |
|
| 1002 |
-
ctx_size += n_text_layer*(4*n_text_state*n_text_state*
|
| 1003 |
-
ctx_size += n_text_layer*( 4*n_text_state*
|
| 1004 |
|
| 1005 |
-
ctx_size += n_text_layer*(4*n_text_state*n_text_state*
|
| 1006 |
-
ctx_size += n_text_layer*( n_text_state*
|
| 1007 |
|
| 1008 |
-
ctx_size += n_text_layer*(n_text_state*
|
| 1009 |
-
ctx_size += n_text_layer*(n_text_state*
|
| 1010 |
|
| 1011 |
-
ctx_size += n_text_layer*(n_text_state*n_text_state*
|
| 1012 |
-
ctx_size += n_text_layer*( n_text_state*
|
| 1013 |
|
| 1014 |
-
ctx_size += n_text_layer*(n_text_state*n_text_state*
|
| 1015 |
|
| 1016 |
-
ctx_size += n_text_layer*(n_text_state*n_text_state*
|
| 1017 |
-
ctx_size += n_text_layer*( n_text_state*
|
| 1018 |
|
| 1019 |
-
ctx_size += n_text_layer*(n_text_state*n_text_state*
|
| 1020 |
-
ctx_size += n_text_layer*( n_text_state*
|
| 1021 |
//
|
| 1022 |
-
ctx_size += n_text_layer*(n_text_state*
|
| 1023 |
-
ctx_size += n_text_layer*(n_text_state*
|
| 1024 |
|
| 1025 |
-
ctx_size += n_text_layer*(n_text_state*n_text_state*
|
| 1026 |
-
ctx_size += n_text_layer*( n_text_state*
|
| 1027 |
|
| 1028 |
-
ctx_size += n_text_layer*(n_text_state*n_text_state*
|
| 1029 |
|
| 1030 |
-
ctx_size += n_text_layer*(n_text_state*n_text_state*
|
| 1031 |
-
ctx_size += n_text_layer*( n_text_state*
|
| 1032 |
|
| 1033 |
-
ctx_size += n_text_layer*(n_text_state*n_text_state*
|
| 1034 |
-
ctx_size += n_text_layer*( n_text_state*
|
| 1035 |
}
|
| 1036 |
|
| 1037 |
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
|
|
@@ -1077,175 +1140,175 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        [the pre-change encoder/decoder tensor-creation and name-mapping lines are truncated in this capture; the updated block, which allocates the 2-D weight matrices with the quantization-aware wtype (and the conv kernels with vtype), appears in full in the listing further below]
@@ -1259,11 +1322,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
| 1259 |      while (true) {
| 1260 |          int32_t n_dims;
| 1261 |          int32_t length;
| 1262 | -        int32_t
| 1263 |  
| 1264 |          read_safe(loader, n_dims);
| 1265 |          read_safe(loader, length);
| 1266 | -        read_safe(loader,
| 1267 |  
| 1268 |          if (loader->eof(loader->context)) {
| 1269 |              break;
@@ -1298,9 +1361,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
| 1298 |              return false;
| 1299 |          }
| 1300 |  
| 1301 | -        const size_t bpe = (
| 1302 |  
| 1303 | -        if (nelements*bpe != ggml_nbytes(tensor)) {
| 1304 |              fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
| 1305 |                      __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
| 1306 |              return false;
@@ -1309,7 +1372,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
| 1309 |          loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
| 1310 |          BYTESWAP_TENSOR(tensor);
| 1311 |  
| 1312 | -        //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2],
| 1313 |          total_size += ggml_nbytes(tensor);
| 1314 |          model.n_loaded++;
| 1315 |      }
@@ -1508,14 +1571,14 @@ static bool whisper_encode_internal(
| 1508 |                  ggml_permute(ctx0,
| 1509 |                          ggml_cpy(ctx0,
| 1510 |                              Qcur,
| 1511 | -                            ggml_new_tensor_3d(ctx0, wctx.
| 1512 |                          0, 2, 1, 3);
| 1513 |  
| 1514 |              struct ggml_tensor * K =
| 1515 |                  ggml_permute(ctx0,
| 1516 |                          ggml_cpy(ctx0,
| 1517 |                              Kcur,
| 1518 | -                            ggml_new_tensor_3d(ctx0, wctx.
| 1519 |                          0, 2, 1, 3);
| 1520 |  
| 1521 |              struct ggml_tensor * V =
@@ -1525,7 +1588,7 @@ static bool whisper_encode_internal(
| 1525 |                              Vcur,
| 1526 |                              n_state/n_head, n_head, n_ctx),
| 1527 |                          1, 2, 0, 3),
| 1528 | -                    ggml_new_tensor_3d(ctx0, wctx.
| 1529 |  
| 1530 |              struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
| 1531 |  #else
@@ -1540,7 +1603,7 @@ static bool whisper_encode_internal(
| 1540 |                  ggml_permute(ctx0,
| 1541 |                          ggml_cpy(ctx0,
| 1542 |                              Kcur,
| 1543 | -                            ggml_new_tensor_3d(ctx0, wctx.
| 1544 |                          0, 2, 1, 3);
| 1545 |  
| 1546 |              // K * Q
@@ -1561,7 +1624,7 @@ static bool whisper_encode_internal(
| 1561 |                              Vcur,
| 1562 |                              n_state/n_head, n_head, n_ctx),
| 1563 |                          1, 2, 0, 3),
| 1564 | -                    ggml_new_tensor_3d(ctx0, wctx.
| 1565 |                      );
| 1566 |  
| 1567 |              struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
@@ -1619,7 +1682,7 @@ static bool whisper_encode_internal(
| 1619 |          wstate.use_buf(ctx0, 0);
| 1620 |  
| 1621 |          cur = ggml_flash_ff(ctx0,
| 1622 | -                ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.
| 1623 |                  layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
| 1624 |  #else
| 1625 |          wstate.use_buf(ctx0, 0);
@@ -2537,9 +2600,9 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
| 2537 |  struct whisper_state * whisper_init_state(whisper_context * ctx) {
| 2538 |      whisper_state * state = new whisper_state;
| 2539 |  
| 2540 | -    const size_t scale = ctx->model.hparams.
| 2541 |  
| 2542 | -    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->
| 2543 |          fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
| 2544 |          delete state;
| 2545 |          return nullptr;
@@ -2550,7 +2613,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
| 2550 |          fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
| 2551 |      }
| 2552 |  
| 2553 | -    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->
| 2554 |          fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
| 2555 |          delete state;
| 2556 |          return nullptr;
@@ -3049,8 +3112,8 @@ int whisper_model_n_mels(struct whisper_context * ctx) {
| 3049 |      return ctx->model.hparams.n_mels;
| 3050 |  }
| 3051 |  
| 3052 | -int
| 3053 | -    return ctx->model.hparams.
| 3054 |  }
| 3055 |  
| 3056 |  int whisper_model_type(struct whisper_context * ctx) {
@@ -4825,23 +4888,32 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
| 4825 |      // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
| 4826 |      std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
| 4827 |  
| 4828 |      for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
| 4829 |  
| 4830 |      for (int j = 0; j < (int) sizes.size(); j++) {
| 4831 |          int n_fp16 = 0;
| 4832 |          int n_fp32 = 0;
| 4833 |  
| 4834 |          // GFLOPS/s
| 4835 |          double s_fp16 = 0.0;
| 4836 |          double s_fp32 = 0.0;
| 4837 |  
| 4838 |          const size_t N = sizes[j];
| 4839 |  
| 4840 | -        for (int k = 0; k <
| 4841 | -            const ggml_type wtype =
| 4842 |  
| 4843 | -            double & s = k == 0 ? s_fp16 : s_fp32;
| 4844 | -            int & n = k == 0 ? n_fp16
| 4845 |  
| 4846 |              struct ggml_init_params gparams = {
| 4847 |                  /*.mem_size =*/ buf.size(),
@@ -4885,8 +4957,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
| 4885 |              s = ((2.0*N*N*N*n)/tsum)*1e-9;
| 4886 |          }
| 4887 |  
| 4888 | -        snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %
| 4889 | -
| 4890 |          s += strbuf;
| 4891 |      }
|
| 1 |
#include "whisper.h"
|
| 2 |
#if WHISPER_USE_COREML
|
| 3 |
#include "coreml/whisper-encoder.h"
|
|
|
|
| 254 |
{ MODEL_LARGE, 9ull*MB },
|
| 255 |
};
|
| 256 |
|
| 257 |
+
static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
|
| 258 |
+
{ GGML_TYPE_F32,
|
| 259 |
+
{
|
| 260 |
+
{ MODEL_TINY, 74ull*MB },
|
| 261 |
+
{ MODEL_BASE, 142ull*MB },
|
| 262 |
+
{ MODEL_SMALL, 466ull*MB },
|
| 263 |
+
{ MODEL_MEDIUM, 1464ull*MB },
|
| 264 |
+
{ MODEL_LARGE, 2952ull*MB },
|
| 265 |
+
},
|
| 266 |
+
},
|
| 267 |
+
{ GGML_TYPE_F16,
|
| 268 |
+
{
|
| 269 |
+
{ MODEL_TINY, 74ull*MB },
|
| 270 |
+
{ MODEL_BASE, 142ull*MB },
|
| 271 |
+
{ MODEL_SMALL, 466ull*MB },
|
| 272 |
+
{ MODEL_MEDIUM, 1464ull*MB },
|
| 273 |
+
{ MODEL_LARGE, 2952ull*MB },
|
| 274 |
+
},
|
| 275 |
+
},
|
| 276 |
+
{ GGML_TYPE_Q4_0,
|
| 277 |
+
{
|
| 278 |
+
{ MODEL_TINY, 26ull*MB },
|
| 279 |
+
{ MODEL_BASE, 50ull*MB },
|
| 280 |
+
{ MODEL_SMALL, 154ull*MB },
|
| 281 |
+
{ MODEL_MEDIUM, 470ull*MB },
|
| 282 |
+
{ MODEL_LARGE, 940ull*MB },
|
| 283 |
+
},
|
| 284 |
+
},
|
| 285 |
+
{ GGML_TYPE_Q4_1,
|
| 286 |
+
{
|
| 287 |
+
{ MODEL_TINY, 31ull*MB },
|
| 288 |
+
{ MODEL_BASE, 57ull*MB },
|
| 289 |
+
{ MODEL_SMALL, 181ull*MB },
|
| 290 |
+
{ MODEL_MEDIUM, 559ull*MB },
|
| 291 |
+
{ MODEL_LARGE, 1122ull*MB },
|
| 292 |
+
},
|
| 293 |
+
},
|
| 294 |
+
{ GGML_TYPE_Q4_2,
|
| 295 |
+
{
|
| 296 |
+
{ MODEL_TINY, 26ull*MB },
|
| 297 |
+
{ MODEL_BASE, 50ull*MB },
|
| 298 |
+
{ MODEL_SMALL, 154ull*MB },
|
| 299 |
+
{ MODEL_MEDIUM, 470ull*MB },
|
| 300 |
+
{ MODEL_LARGE, 940ull*MB },
|
| 301 |
+
},
|
| 302 |
+
},
|
| 303 |
+
{ GGML_TYPE_Q5_0, // TODO: fix
|
| 304 |
+
{
|
| 305 |
+
{ MODEL_TINY, 31ull*MB },
|
| 306 |
+
{ MODEL_BASE, 57ull*MB },
|
| 307 |
+
{ MODEL_SMALL, 181ull*MB },
|
| 308 |
+
{ MODEL_MEDIUM, 559ull*MB },
|
| 309 |
+
{ MODEL_LARGE, 1122ull*MB },
|
| 310 |
+
},
|
| 311 |
+
},
|
| 312 |
+
{ GGML_TYPE_Q5_1,
|
| 313 |
+
{
|
| 314 |
+
{ MODEL_TINY, 31ull*MB },
|
| 315 |
+
{ MODEL_BASE, 57ull*MB },
|
| 316 |
+
{ MODEL_SMALL, 181ull*MB },
|
| 317 |
+
{ MODEL_MEDIUM, 559ull*MB },
|
| 318 |
+
{ MODEL_LARGE, 1122ull*MB },
|
| 319 |
+
},
|
| 320 |
+
},
|
| 321 |
};
|
| 322 |
|
| 323 |
static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
|
|
|
|
| 427 |
int32_t n_text_head = 6;
|
| 428 |
int32_t n_text_layer = 4;
|
| 429 |
int32_t n_mels = 80;
|
| 430 |
+
int32_t ftype = 1;
|
| 431 |
};
|
| 432 |
|
| 433 |
// audio encoding layer
|
|
|
|
| 694 |
};
|
| 695 |
|
| 696 |
struct whisper_context {
|
| 697 |
+
int64_t t_load_us = 0;
|
| 698 |
int64_t t_start_us = 0;
|
| 699 |
|
| 700 |
+
ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
|
| 701 |
+
ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
|
| 702 |
|
| 703 |
whisper_model model;
|
| 704 |
whisper_vocab vocab;
|
|
|
|
| 755 |
const ggml_type wtype = cache.k->type;
|
| 756 |
WHISPER_ASSERT(wtype == cache.v->type);
|
| 757 |
|
| 758 |
+
WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype));
|
| 759 |
|
| 760 |
struct ggml_init_params params = {
|
| 761 |
/*.mem_size =*/ cache.buf.size(),
|
|
|
|
| 828 |
read_safe(loader, hparams.n_text_head);
|
| 829 |
read_safe(loader, hparams.n_text_layer);
|
| 830 |
read_safe(loader, hparams.n_mels);
|
| 831 |
+
read_safe(loader, hparams.ftype);
|
| 832 |
|
| 833 |
assert(hparams.n_text_state == hparams.n_audio_state);
|
| 834 |
|
|
|
|
| 852 |
model.type = e_model::MODEL_LARGE;
|
| 853 |
}
|
| 854 |
|
| 855 |
+
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
| 856 |
// in order to save memory and also to speed up the computation
|
| 857 |
+
wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
| 858 |
+
if (wctx.wtype == GGML_TYPE_COUNT) {
|
| 859 |
+
fprintf(stderr, "%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
|
| 860 |
+
return false;
|
| 861 |
+
}
|
| 862 |
|
| 863 |
+
const size_t scale = model.hparams.ftype ? 1 : 2;
|
| 864 |
|
| 865 |
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
| 866 |
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
|
|
|
|
| 872 |
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
|
| 873 |
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
|
| 874 |
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
|
| 875 |
+
fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
|
| 876 |
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
|
| 877 |
|
| 878 |
// print memory requirements
|
| 879 |
{
|
| 880 |
// this is the total memory required to run the inference
|
| 881 |
const size_t mem_required =
|
| 882 |
+
MEM_REQ_SCRATCH0.at(model.type) +
|
| 883 |
+
MEM_REQ_SCRATCH1.at(model.type) +
|
| 884 |
+
MEM_REQ_SCRATCH2.at(model.type) +
|
| 885 |
+
MEM_REQ_SCRATCH3.at(model.type) +
|
| 886 |
+
scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type) +
|
| 887 |
scale*MEM_REQ_KV_CROSS.at(model.type) +
|
| 888 |
scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
|
| 889 |
|
|
|
|
| 899 |
// always have at least one decoder
|
| 900 |
|
| 901 |
wctx.model.buf = new std::vector<uint8_t>();
|
| 902 |
+
wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type));
|
| 903 |
|
| 904 |
// we skip initialization of the state until it is needed
|
| 905 |
// because it might be that state will always be provided externally.
|
|
|
|
| 990 |
size_t ctx_size = 0;
|
| 991 |
|
| 992 |
const ggml_type wtype = wctx.wtype;
|
| 993 |
+
const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
|
| 994 |
|
| 995 |
{
|
| 996 |
const auto & hparams = model.hparams;
|
|
|
|
| 1009 |
|
| 1010 |
// encoder
|
| 1011 |
{
|
| 1012 |
+
ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe;
|
| 1013 |
|
| 1014 |
+
ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype); // e_conv_1_w
|
| 1015 |
+
ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b
|
| 1016 |
|
| 1017 |
+
ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype); // e_conv_2_w
|
| 1018 |
+
ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b
|
| 1019 |
|
| 1020 |
+
ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w;
|
| 1021 |
+
ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b;
|
| 1022 |
}
|
| 1023 |
|
| 1024 |
// decoder
|
| 1025 |
{
|
| 1026 |
+
ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe;
|
| 1027 |
|
| 1028 |
+
ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te;
|
| 1029 |
|
| 1030 |
+
ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w;
|
| 1031 |
+
ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b;
|
| 1032 |
}
|
| 1033 |
|
| 1034 |
// encoder layers
|
| 1035 |
{
|
| 1036 |
+
ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
|
| 1037 |
+
ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
|
| 1038 |
|
| 1039 |
+
ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w
|
| 1040 |
+
ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
|
| 1041 |
|
| 1042 |
+
ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w
|
| 1043 |
+
ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
|
| 1044 |
|
| 1045 |
+
ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
|
| 1046 |
+
ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
|
| 1047 |
|
| 1048 |
+
ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w
|
| 1049 |
+
ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
|
| 1050 |
|
| 1051 |
+
ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w
|
| 1052 |
|
| 1053 |
+
ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w
|
| 1054 |
+
ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
|
| 1055 |
|
| 1056 |
+
ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w
|
| 1057 |
+
ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
|
| 1058 |
}
|
| 1059 |
|
| 1060 |
// decoder layers
|
| 1061 |
{
|
| 1062 |
+
ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
|
| 1063 |
+
ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
|
| 1064 |
|
| 1065 |
+
ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w
|
| 1066 |
+
ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
|
| 1067 |
|
| 1068 |
+
ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w
|
| 1069 |
+
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
|
| 1070 |
|
| 1071 |
+
ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
|
| 1072 |
+
ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
|
| 1073 |
|
| 1074 |
+
ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w
|
| 1075 |
+
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
|
| 1076 |
|
| 1077 |
+
ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w
|
| 1078 |
|
| 1079 |
+
ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_v_w
|
| 1080 |
+
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
|
| 1081 |
|
| 1082 |
+
ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w
|
| 1083 |
+
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
|
| 1084 |
//
|
| 1085 |
+
ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w
|
| 1086 |
+
ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b
|
| 1087 |
|
| 1088 |
+
ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w
|
| 1089 |
+
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b
|
| 1090 |
|
| 1091 |
+
ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w
|
| 1092 |
|
| 1093 |
+
ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w
|
| 1094 |
+
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b
|
| 1095 |
|
| 1096 |
+
ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w
|
| 1097 |
+
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
|
| 1098 |
}
|
| 1099 |
|
| 1100 |
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
|
|
|
|
| 1140 |
|
| 1141 |
// encoder
|
| 1142 |
{
|
| 1143 |
+
model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
|
| 1144 |
|
| 1145 |
+
model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
|
| 1146 |
model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
|
| 1147 |
|
| 1148 |
+
model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
|
| 1149 |
model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
|
| 1150 |
|
| 1151 |
+
model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1152 |
+
model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1153 |
|
| 1154 |
// map by name
|
| 1155 |
model.tensors["encoder.positional_embedding"] = model.e_pe;
|
| 1156 |
|
| 1157 |
+
model.tensors["encoder.conv1.weight"] = model.e_conv_1_w;
|
| 1158 |
+
model.tensors["encoder.conv1.bias"] = model.e_conv_1_b;
|
| 1159 |
|
| 1160 |
+
model.tensors["encoder.conv2.weight"] = model.e_conv_2_w;
|
| 1161 |
+
model.tensors["encoder.conv2.bias"] = model.e_conv_2_b;
|
| 1162 |
|
| 1163 |
+
model.tensors["encoder.ln_post.weight"] = model.e_ln_w;
|
| 1164 |
+
model.tensors["encoder.ln_post.bias"] = model.e_ln_b;
|
| 1165 |
|
| 1166 |
for (int i = 0; i < n_audio_layer; ++i) {
|
| 1167 |
auto & layer = model.layers_encoder[i];
|
| 1168 |
|
| 1169 |
+
layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1170 |
+
layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1171 |
|
| 1172 |
+
layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
|
| 1173 |
+
layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
|
| 1174 |
|
| 1175 |
+
layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
|
| 1176 |
+
layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1177 |
|
| 1178 |
+
layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1179 |
+
layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1180 |
|
| 1181 |
+
layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
|
| 1182 |
+
layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1183 |
|
| 1184 |
+
layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
|
| 1185 |
|
| 1186 |
+
layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
|
| 1187 |
+
layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1188 |
|
| 1189 |
+
layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
|
| 1190 |
+
layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
| 1191 |
|
| 1192 |
// map by name
|
| 1193 |
+
model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
|
| 1194 |
+
model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
|
| 1195 |
|
| 1196 |
+
model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
|
| 1197 |
+
model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
|
| 1198 |
|
| 1199 |
+
model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
|
| 1200 |
+
model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
|
| 1201 |
|
| 1202 |
+
model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
|
| 1203 |
+
model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
|
| 1204 |
|
| 1205 |
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
|
| 1206 |
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
|
| 1207 |
|
| 1208 |
+
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
|
| 1209 |
|
| 1210 |
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
|
| 1211 |
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
|
| 1212 |
|
| 1213 |
+
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
|
| 1214 |
+
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
|
| 1215 |
}
|
| 1216 |
}
|
| 1217 |
|
| 1218 |
// decoder
|
| 1219 |
{
|
| 1220 |
+
model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
|
| 1221 |
|
| 1222 |
+
model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
|
| 1223 |
|
| 1224 |
model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1225 |
model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1226 |
|
| 1227 |
// map by name
|
| 1228 |
+
model.tensors["decoder.positional_embedding"] = model.d_pe;
|
| 1229 |
|
| 1230 |
model.tensors["decoder.token_embedding.weight"] = model.d_te;
|
| 1231 |
|
| 1232 |
+
model.tensors["decoder.ln.weight"] = model.d_ln_w;
|
| 1233 |
+
model.tensors["decoder.ln.bias"] = model.d_ln_b;
|
| 1234 |
|
| 1235 |
for (int i = 0; i < n_text_layer; ++i) {
|
| 1236 |
auto & layer = model.layers_decoder[i];
|
| 1237 |
|
| 1238 |
+
layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1239 |
+
layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1240 |
|
| 1241 |
+
layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
|
| 1242 |
+
layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
|
| 1243 |
|
| 1244 |
+
layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
|
| 1245 |
+
layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1246 |
|
| 1247 |
+
layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1248 |
+
layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1249 |
|
| 1250 |
+
layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
|
| 1251 |
+
layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1252 |
|
| 1253 |
+
layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
|
| 1254 |
|
| 1255 |
+
layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
|
| 1256 |
+
layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1257 |
|
| 1258 |
+
layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
|
| 1259 |
+
layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1260 |
|
| 1261 |
+
layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1262 |
+
layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1263 |
|
| 1264 |
+
layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
|
| 1265 |
+
layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1266 |
|
| 1267 |
+
layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
|
| 1268 |
|
| 1269 |
+
layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
|
| 1270 |
+
layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1271 |
|
| 1272 |
+
layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
|
| 1273 |
+
layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
|
| 1274 |
|
| 1275 |
// map by name
|
| 1276 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
|
| 1277 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
|
| 1278 |
|
| 1279 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
|
| 1280 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
|
| 1281 |
|
| 1282 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
|
| 1283 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
|
| 1284 |
|
| 1285 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
|
| 1286 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
|
| 1287 |
|
| 1288 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
|
| 1289 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
|
| 1290 |
|
| 1291 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
|
| 1292 |
|
| 1293 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
|
| 1294 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
|
| 1295 |
|
| 1296 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
|
| 1297 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
|
| 1298 |
|
| 1299 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"] = layer.cross_attn_ln_0_w;
|
| 1300 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"] = layer.cross_attn_ln_0_b;
|
| 1301 |
|
| 1302 |
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.weight"] = layer.cross_attn_q_w;
|
| 1303 |
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.bias"] = layer.cross_attn_q_b;
|
| 1304 |
|
| 1305 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"] = layer.cross_attn_k_w;
|
| 1306 |
|
| 1307 |
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.weight"] = layer.cross_attn_v_w;
|
| 1308 |
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.bias"] = layer.cross_attn_v_b;
|
| 1309 |
|
| 1310 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"] = layer.cross_attn_ln_1_w;
|
| 1311 |
+
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"] = layer.cross_attn_ln_1_b;
|
| 1312 |
}
|
| 1313 |
}
|
| 1314 |
}
|
|
|
|
| 1322 |
while (true) {
|
| 1323 |
int32_t n_dims;
|
| 1324 |
int32_t length;
|
| 1325 |
+
int32_t ttype;
|
| 1326 |
|
| 1327 |
read_safe(loader, n_dims);
|
| 1328 |
read_safe(loader, length);
|
| 1329 |
+
read_safe(loader, ttype);
|
| 1330 |
|
| 1331 |
if (loader->eof(loader->context)) {
|
| 1332 |
break;
|
|
|
|
| 1361 |
return false;
|
| 1362 |
}
|
| 1363 |
|
| 1364 |
+
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
| 1365 |
|
| 1366 |
+
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
| 1367 |
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
| 1368 |
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
| 1369 |
return false;
|
|
|
|
| 1372 |
loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
|
| 1373 |
BYTESWAP_TENSOR(tensor);
|
| 1374 |
|
| 1375 |
+
//printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
|
| 1376 |
total_size += ggml_nbytes(tensor);
|
| 1377 |
model.n_loaded++;
|
| 1378 |
}
|
|
|
|
| 1571 |
ggml_permute(ctx0,
|
| 1572 |
ggml_cpy(ctx0,
|
| 1573 |
Qcur,
|
| 1574 |
+
ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
|
| 1575 |
0, 2, 1, 3);
|
| 1576 |
|
| 1577 |
struct ggml_tensor * K =
|
| 1578 |
ggml_permute(ctx0,
|
| 1579 |
ggml_cpy(ctx0,
|
| 1580 |
Kcur,
|
| 1581 |
+
ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
|
| 1582 |
0, 2, 1, 3);
|
| 1583 |
|
| 1584 |
struct ggml_tensor * V =
|
|
|
|
| 1588 |
Vcur,
|
| 1589 |
n_state/n_head, n_head, n_ctx),
|
| 1590 |
1, 2, 0, 3),
|
| 1591 |
+
ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
|
| 1592 |
|
| 1593 |
struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
|
| 1594 |
#else
|
|
|
|
| 1603 |
ggml_permute(ctx0,
|
| 1604 |
ggml_cpy(ctx0,
|
| 1605 |
Kcur,
|
| 1606 |
+
ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
|
| 1607 |
0, 2, 1, 3);
|
| 1608 |
|
| 1609 |
// K * Q
|
|
|
|
| 1624 |
Vcur,
|
| 1625 |
n_state/n_head, n_head, n_ctx),
|
| 1626 |
1, 2, 0, 3),
|
| 1627 |
+
ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
|
| 1628 |
);
|
| 1629 |
|
| 1630 |
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
|
|
|
| 1682 |
wstate.use_buf(ctx0, 0);
|
| 1683 |
|
| 1684 |
cur = ggml_flash_ff(ctx0,
|
| 1685 |
+
ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
|
| 1686 |
layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
|
| 1687 |
#else
|
| 1688 |
wstate.use_buf(ctx0, 0);
|
|
|
|
| 2600 |
struct whisper_state * whisper_init_state(whisper_context * ctx) {
|
| 2601 |
whisper_state * state = new whisper_state;
|
| 2602 |
|
| 2603 |
+
const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
|
| 2604 |
|
| 2605 |
+
if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
|
| 2606 |
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
| 2607 |
delete state;
|
| 2608 |
return nullptr;
|
|
|
|
| 2613 |
fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
| 2614 |
}
|
| 2615 |
|
| 2616 |
+
if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
|
| 2617 |
fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
|
| 2618 |
delete state;
|
| 2619 |
return nullptr;
|
|
|
|
| 3112 |
return ctx->model.hparams.n_mels;
|
| 3113 |
}
|
| 3114 |
|
| 3115 |
+
int whisper_model_ftype(struct whisper_context * ctx) {
|
| 3116 |
+
return ctx->model.hparams.ftype;
|
| 3117 |
}
|
| 3118 |
|
| 3119 |
int whisper_model_type(struct whisper_context * ctx) {
|
|
|
|
| 4888 |
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
| 4889 |
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
|
| 4890 |
|
| 4891 |
+
// put a bunch of random data in the buffer
|
| 4892 |
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
| 4893 |
|
| 4894 |
for (int j = 0; j < (int) sizes.size(); j++) {
|
| 4895 |
+
int n_q4_0 = 0;
|
| 4896 |
+
int n_q4_1 = 0;
|
| 4897 |
int n_fp16 = 0;
|
| 4898 |
int n_fp32 = 0;
|
| 4899 |
|
| 4900 |
// GFLOPS/s
|
| 4901 |
+
double s_q4_0 = 0.0;
|
| 4902 |
+
double s_q4_1 = 0.0;
|
| 4903 |
double s_fp16 = 0.0;
|
| 4904 |
double s_fp32 = 0.0;
|
| 4905 |
|
| 4906 |
const size_t N = sizes[j];
|
| 4907 |
|
| 4908 |
+
for (int k = 0; k < 4; ++k) {
|
| 4909 |
+
const ggml_type wtype =
|
| 4910 |
+
k == 0 ? GGML_TYPE_Q4_0 :
|
| 4911 |
+
k == 1 ? GGML_TYPE_Q4_1 :
|
| 4912 |
+
k == 2 ? GGML_TYPE_F16 :
|
| 4913 |
+
GGML_TYPE_F32;
|
| 4914 |
|
| 4915 |
+
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32;
|
| 4916 |
+
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32;
|
| 4917 |
|
| 4918 |
struct ggml_init_params gparams = {
|
| 4919 |
/*.mem_size =*/ buf.size(),
|
|
|
|
| 4957 |
s = ((2.0*N*N*N*n)/tsum)*1e-9;
|
| 4958 |
}
|
| 4959 |
|
| 4960 |
+
snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n",
|
| 4961 |
+
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32);
|
| 4962 |
s += strbuf;
|
| 4963 |
}
|
| 4964 |
|
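Note: a hedged sketch of the model-buffer sizing logic introduced above, with stand-in types so the snippet is self-contained (the real MEM_REQ_MODEL map and e_model enum are internal to whisper.cpp); the 142 MB / 57 MB figures are the "base" entries from the table in the diff.

    #include <cstddef>
    #include <cstdint>
    #include <map>

    enum fake_model { FAKE_MODEL_BASE };                // stand-in for whisper.cpp's e_model
    enum fake_type  { FAKE_TYPE_F16, FAKE_TYPE_Q5_0 };  // stand-in for ggml_type

    static const std::map<fake_type, std::map<fake_model, std::size_t>> kModelMem = {
        { FAKE_TYPE_F16,  { { FAKE_MODEL_BASE, 142ull*1024*1024 } } },
        { FAKE_TYPE_Q5_0, { { FAKE_MODEL_BASE,  57ull*1024*1024 } } },
    };

    // ftype == 0 marks an all-F32 model, which doubles the F16-based estimates,
    // mirroring "const size_t scale = model.hparams.ftype ? 1 : 2;" above.
    std::size_t model_buffer_bytes(fake_type wtype, fake_model type, std::int32_t ftype) {
        const std::size_t scale = ftype ? 1 : 2;
        return scale * kModelMem.at(wtype).at(type);
    }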
whisper.h
CHANGED
|
@@ -258,7 +258,7 @@ extern "C" {
| 258 |      WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
| 259 |      WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
| 260 |      WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
| 261 | -    WHISPER_API int
| 261 | +    WHISPER_API int whisper_model_ftype (struct whisper_context * ctx);
| 262 |      WHISPER_API int whisper_model_type (struct whisper_context * ctx);
| 263 |  
| 264 |      // Token logits obtained from the last call to whisper_decode()
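
Note: a minimal sketch of querying the new public API from user code. The model path is a placeholder, and whisper_init_from_file()/whisper_free() are assumed to be present in whisper.h of this checkout; verify against your header before relying on them.

    #include <cstdio>
    #include "whisper.h"

    int main() {
        // placeholder path to a quantized model produced by the new quantize tool
        struct whisper_context * ctx = whisper_init_from_file("models/ggml-base.en-q5_0.bin");
        if (ctx == nullptr) {
            return 1;
        }
        printf("model ftype = %d\n", whisper_model_ftype(ctx)); // e.g. 8 == GGML_FTYPE_MOSTLY_Q5_0
        whisper_free(ctx);
        return 0;
    }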