Spaces:
Running
Running
ggml : sync latest ggml lib
Browse files- examples/common-ggml.cpp +11 -0
- examples/common.cpp +246 -40
- examples/common.h +30 -2
- examples/main/main.cpp +37 -32
- examples/quantize/quantize.cpp +17 -15
- ggml-cuda.cu +0 -0
- ggml-cuda.h +18 -2
- ggml-opencl.h +11 -10
- ggml.c +0 -0
- ggml.h +387 -18
- whisper.cpp +6 -2
examples/common-ggml.cpp
CHANGED
|
@@ -52,6 +52,11 @@ bool ggml_common_quantize_0(
|
|
| 52 |
case GGML_FTYPE_ALL_F32:
|
| 53 |
case GGML_FTYPE_MOSTLY_F16:
|
| 54 |
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
{
|
| 56 |
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
| 57 |
return false;
|
|
@@ -187,6 +192,12 @@ bool ggml_common_quantize_0(
|
|
| 187 |
case GGML_TYPE_I16:
|
| 188 |
case GGML_TYPE_I32:
|
| 189 |
case GGML_TYPE_Q8_1:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
case GGML_TYPE_COUNT:
|
| 191 |
{
|
| 192 |
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
|
|
|
| 52 |
case GGML_FTYPE_ALL_F32:
|
| 53 |
case GGML_FTYPE_MOSTLY_F16:
|
| 54 |
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
| 55 |
+
case GGML_FTYPE_MOSTLY_Q2_K:
|
| 56 |
+
case GGML_FTYPE_MOSTLY_Q3_K:
|
| 57 |
+
case GGML_FTYPE_MOSTLY_Q4_K:
|
| 58 |
+
case GGML_FTYPE_MOSTLY_Q5_K:
|
| 59 |
+
case GGML_FTYPE_MOSTLY_Q6_K:
|
| 60 |
{
|
| 61 |
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
| 62 |
return false;
|
|
|
|
| 192 |
case GGML_TYPE_I16:
|
| 193 |
case GGML_TYPE_I32:
|
| 194 |
case GGML_TYPE_Q8_1:
|
| 195 |
+
case GGML_TYPE_Q2_K:
|
| 196 |
+
case GGML_TYPE_Q3_K:
|
| 197 |
+
case GGML_TYPE_Q4_K:
|
| 198 |
+
case GGML_TYPE_Q5_K:
|
| 199 |
+
case GGML_TYPE_Q6_K:
|
| 200 |
+
case GGML_TYPE_Q8_K:
|
| 201 |
case GGML_TYPE_COUNT:
|
| 202 |
{
|
| 203 |
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
examples/common.cpp
CHANGED
|
@@ -6,13 +6,21 @@
|
|
| 6 |
#include "dr_wav.h"
|
| 7 |
|
| 8 |
#include <cmath>
|
|
|
|
| 9 |
#include <fstream>
|
| 10 |
#include <regex>
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
#ifndef M_PI
|
| 13 |
#define M_PI 3.14159265358979323846
|
| 14 |
#endif
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
| 17 |
for (int i = 1; i < argc; i++) {
|
| 18 |
std::string arg = argv[i];
|
|
@@ -52,7 +60,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|
| 52 |
if (params.prompt.back() == '\n') {
|
| 53 |
params.prompt.pop_back();
|
| 54 |
}
|
| 55 |
-
} else {
|
|
|
|
|
|
|
|
|
|
| 56 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 57 |
gpt_print_usage(argc, argv, params);
|
| 58 |
exit(0);
|
|
@@ -73,6 +84,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|
| 73 |
fprintf(stderr, " prompt to start generation with (default: random)\n");
|
| 74 |
fprintf(stderr, " -f FNAME, --file FNAME\n");
|
| 75 |
fprintf(stderr, " load prompt from a file\n");
|
|
|
|
|
|
|
| 76 |
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
|
| 77 |
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
|
| 78 |
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
|
|
@@ -117,6 +130,10 @@ std::string replace(const std::string & s, const std::string & from, const std::
|
|
| 117 |
return result;
|
| 118 |
}
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
std::map<std::string, int32_t> json_parse(const std::string & fname) {
|
| 121 |
std::map<std::string, int32_t> result;
|
| 122 |
|
|
@@ -208,8 +225,28 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
|
|
| 208 |
return result;
|
| 209 |
}
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
}
|
| 214 |
|
| 215 |
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
|
|
@@ -218,63 +255,52 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
|
| 218 |
// first split the text into words
|
| 219 |
{
|
| 220 |
std::string str = text;
|
| 221 |
-
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
| 222 |
|
| 223 |
// Generate the subpattern from the special_tokens vector if it's not empty
|
| 224 |
if (!vocab.special_tokens.empty()) {
|
|
|
|
| 225 |
std::string special_tokens_subpattern;
|
| 226 |
for (const auto & token : vocab.special_tokens) {
|
| 227 |
if (!special_tokens_subpattern.empty()) {
|
| 228 |
special_tokens_subpattern += "|";
|
| 229 |
}
|
| 230 |
-
special_tokens_subpattern += token;
|
| 231 |
}
|
| 232 |
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
|
|
|
| 243 |
}
|
| 244 |
-
|
| 245 |
}
|
|
|
|
|
|
|
| 246 |
}
|
| 247 |
|
| 248 |
-
// find the longest
|
| 249 |
std::vector<gpt_vocab::id> tokens;
|
| 250 |
for (const auto & word : words) {
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
int j = n;
|
| 257 |
-
while (j > i) {
|
| 258 |
-
auto it = vocab.token_to_id.find(word.substr(i, j-i));
|
| 259 |
-
if (it != vocab.token_to_id.end()) {
|
| 260 |
tokens.push_back(it->second);
|
| 261 |
-
i = j;
|
| 262 |
-
j = n;
|
| 263 |
break;
|
| 264 |
}
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
break;
|
| 269 |
-
}
|
| 270 |
-
if (j == i) {
|
| 271 |
-
auto sub = word.substr(i, 1);
|
| 272 |
-
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
|
| 273 |
-
tokens.push_back(vocab.token_to_id.at(sub));
|
| 274 |
-
} else {
|
| 275 |
-
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
|
| 276 |
}
|
| 277 |
-
++i;
|
| 278 |
}
|
| 279 |
}
|
| 280 |
}
|
|
@@ -282,6 +308,70 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
|
| 282 |
return tokens;
|
| 283 |
}
|
| 284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
|
| 286 |
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
|
| 287 |
|
|
@@ -381,6 +471,122 @@ gpt_vocab::id gpt_sample_top_k_top_p(
|
|
| 381 |
return logits_id[idx].second;
|
| 382 |
}
|
| 383 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
|
| 385 |
drwav wav;
|
| 386 |
std::vector<uint8_t> wav_data; // used for pipe input from stdin
|
|
|
|
| 6 |
#include "dr_wav.h"
|
| 7 |
|
| 8 |
#include <cmath>
|
| 9 |
+
#include <cstring>
|
| 10 |
#include <fstream>
|
| 11 |
#include <regex>
|
| 12 |
+
#include <locale>
|
| 13 |
+
#include <codecvt>
|
| 14 |
+
#include <sstream>
|
| 15 |
|
| 16 |
#ifndef M_PI
|
| 17 |
#define M_PI 3.14159265358979323846
|
| 18 |
#endif
|
| 19 |
|
| 20 |
+
#if defined(_MSC_VER)
|
| 21 |
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
| 22 |
+
#endif
|
| 23 |
+
|
| 24 |
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
| 25 |
for (int i = 1; i < argc; i++) {
|
| 26 |
std::string arg = argv[i];
|
|
|
|
| 60 |
if (params.prompt.back() == '\n') {
|
| 61 |
params.prompt.pop_back();
|
| 62 |
}
|
| 63 |
+
} else if (arg == "-tt" || arg == "--token_test") {
|
| 64 |
+
params.token_test = argv[++i];
|
| 65 |
+
}
|
| 66 |
+
else {
|
| 67 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 68 |
gpt_print_usage(argc, argv, params);
|
| 69 |
exit(0);
|
|
|
|
| 84 |
fprintf(stderr, " prompt to start generation with (default: random)\n");
|
| 85 |
fprintf(stderr, " -f FNAME, --file FNAME\n");
|
| 86 |
fprintf(stderr, " load prompt from a file\n");
|
| 87 |
+
fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
|
| 88 |
+
fprintf(stderr, " test tokenization\n");
|
| 89 |
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
|
| 90 |
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
|
| 91 |
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
|
|
|
|
| 130 |
return result;
|
| 131 |
}
|
| 132 |
|
| 133 |
+
// Register `token` as a special token by appending it to `special_tokens`.
// No de-duplication is performed here.
void gpt_vocab::add_special_token(const std::string & token) {
    special_tokens.emplace_back(token);
}
|
| 136 |
+
|
| 137 |
std::map<std::string, int32_t> json_parse(const std::string & fname) {
|
| 138 |
std::map<std::string, int32_t> result;
|
| 139 |
|
|
|
|
| 225 |
return result;
|
| 226 |
}
|
| 227 |
|
| 228 |
+
// Encode a wide string as UTF-8 bytes using std::codecvt_utf8<wchar_t>.
std::string convert_to_utf8(const std::wstring & input) {
    using utf8_codec = std::codecvt_utf8<wchar_t>;

    std::wstring_convert<utf8_codec> wide_to_utf8;
    std::string encoded = wide_to_utf8.to_bytes(input);
    return encoded;
}
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
// Decode UTF-8 bytes into a wide string using std::codecvt_utf8<wchar_t>.
std::wstring convert_to_wstring(const std::string & input) {
    using utf8_codec = std::codecvt_utf8<wchar_t>;

    std::wstring_convert<utf8_codec> utf8_to_wide;
    std::wstring decoded = utf8_to_wide.from_bytes(input);
    return decoded;
}
|
| 238 |
+
|
| 239 |
+
// Split `str` into GPT-2 style "words" and append them to `words`.
//
// Every non-overlapping match of the pre-tokenization pattern, scanned left to
// right, becomes one entry; existing entries of `words` are kept. Text that the
// pattern does not match is dropped.
//
// The pattern is compiled once and the input is walked with a single regex
// iterator, so the remaining text is not copied after every match. The pattern
// has no capture groups and no anchors or word-boundary assertions, so iterating
// yields the same matches as repeatedly searching the remaining suffix.
void gpt_split_words(std::string str, std::vector<std::string>& words) {
    static const std::regex re(R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");

    const std::sregex_iterator last;
    for (std::sregex_iterator it(str.begin(), str.end(), re); it != last; ++it) {
        words.push_back(it->str());
    }
}
|
| 251 |
|
| 252 |
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
|
|
|
|
| 255 |
// first split the text into words
|
| 256 |
{
|
| 257 |
std::string str = text;
|
|
|
|
| 258 |
|
| 259 |
// Generate the subpattern from the special_tokens vector if it's not empty
|
| 260 |
if (!vocab.special_tokens.empty()) {
|
| 261 |
+
const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
|
| 262 |
std::string special_tokens_subpattern;
|
| 263 |
for (const auto & token : vocab.special_tokens) {
|
| 264 |
if (!special_tokens_subpattern.empty()) {
|
| 265 |
special_tokens_subpattern += "|";
|
| 266 |
}
|
| 267 |
+
special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
|
| 268 |
}
|
| 269 |
|
| 270 |
+
std::regex re(special_tokens_subpattern);
|
| 271 |
+
std::smatch m;
|
| 272 |
+
// Split the text by special tokens.
|
| 273 |
+
while (std::regex_search(str, m, re)) {
|
| 274 |
+
// Split the substrings in-between special tokens into words.
|
| 275 |
+
gpt_split_words(m.prefix(), words);
|
| 276 |
+
// Add matched special tokens as words.
|
| 277 |
+
for (auto x : m) {
|
| 278 |
+
words.push_back(x);
|
| 279 |
+
}
|
| 280 |
+
str = m.suffix();
|
| 281 |
}
|
| 282 |
+
// Remaining text without special tokens will be handled below.
|
| 283 |
}
|
| 284 |
+
|
| 285 |
+
gpt_split_words(str, words);
|
| 286 |
}
|
| 287 |
|
| 288 |
+
// find the longest token that forms each word in words:
|
| 289 |
std::vector<gpt_vocab::id> tokens;
|
| 290 |
for (const auto & word : words) {
|
| 291 |
+
for (int i = 0; i < (int) word.size(); ){
|
| 292 |
+
for (int j = word.size() - 1; j >= i; j--){
|
| 293 |
+
auto cand = word.substr(i, j-i+1);
|
| 294 |
+
auto it = vocab.token_to_id.find(cand);
|
| 295 |
+
if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
tokens.push_back(it->second);
|
| 297 |
+
i = j + 1;
|
|
|
|
| 298 |
break;
|
| 299 |
}
|
| 300 |
+
else if (j == i){ // word.substr(i, 1) has no matching
|
| 301 |
+
fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
|
| 302 |
+
i++;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
}
|
|
|
|
| 304 |
}
|
| 305 |
}
|
| 306 |
}
|
|
|
|
| 308 |
return tokens;
|
| 309 |
}
|
| 310 |
|
| 311 |
+
std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
|
| 312 |
+
std::vector<gpt_vocab::id> output;
|
| 313 |
+
std::stringstream ss(input);
|
| 314 |
+
std::string token;
|
| 315 |
+
|
| 316 |
+
while (std::getline(ss, token, delimiter)) {
|
| 317 |
+
output.push_back(std::stoi(token));
|
| 318 |
+
}
|
| 319 |
+
|
| 320 |
+
return output;
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
// Load tokenizer test cases from the text file at `fpath_test`.
//
// Each line of the form "<text> => <id>,<id>,..." yields one entry mapping <text>
// to the parsed ids; lines without the " => " separator are ignored. If the same
// <text> appears more than once, the last occurrence wins.
//
// Returns an empty map, after printing a message to stderr, when the path is
// empty or the file cannot be opened.
std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
    std::map<std::string, std::vector<gpt_vocab::id>> tests;

    if (fpath_test.empty()){
        fprintf(stderr, "%s : No test file found.\n", __func__);
        return tests;
    }

    std::ifstream fin(fpath_test, std::ios_base::in);
    if (!fin.is_open()) {
        // without this check an unreadable file is indistinguishable from a
        // file that contains zero test cases
        fprintf(stderr, "%s : failed to open test file '%s'\n", __func__, fpath_test.c_str());
        return tests;
    }

    const std::string separator = " => ";
    const char del_tok = ',';

    std::string line;
    while (std::getline(fin, line)) {
        const size_t sep_pos = line.find(separator);
        if (sep_pos == std::string::npos) {
            continue;
        }

        const std::string text     = line.substr(0, sep_pos);
        const std::string s_tokens = line.substr(sep_pos + separator.size());
        tests[text] = parse_tokens_from_string(s_tokens, del_tok);
    }

    return tests;
}
|
| 345 |
+
|
| 346 |
+
// Run gpt_tokenize over every case loaded from `fpath_test` and compare the
// result with the expected ids stored in the file.
//
// For each mismatch the text, the expected tokens ("hf") and the produced tokens
// ("ggml") are written to stderr; a failed/total summary is printed at the end.
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
    const std::map<std::string, std::vector<gpt_vocab::id>> cases = extract_tests_from_file(fpath_test);

    // writes "<token>(<id>), " for every id, followed by a newline
    const auto dump_tokens = [&vocab](const std::vector<gpt_vocab::id> & ids) {
        for (const auto & id : ids) {
            fprintf(stderr, "%s(%d), ", vocab.id_to_token[id].c_str(), id);
        }
        fprintf(stderr, "\n");
    };

    size_t n_fails = 0;

    for (const auto & entry : cases) {
        const std::string & text                    = entry.first;
        const std::vector<gpt_vocab::id> & expected = entry.second;

        const std::vector<gpt_vocab::id> produced = gpt_tokenize(vocab, text);
        if (!(produced != expected)) {
            continue;
        }

        ++n_fails;

        // print out failure cases
        fprintf(stderr, "%s : failed test: '%s'\n", __func__, text.c_str());

        fprintf(stderr, "%s : tokens in hf: ", __func__);
        dump_tokens(expected);

        fprintf(stderr, "%s : tokens in ggml: ", __func__);
        dump_tokens(produced);
    }

    fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, cases.size());
}
|
| 374 |
+
|
| 375 |
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
|
| 376 |
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
|
| 377 |
|
|
|
|
| 471 |
return logits_id[idx].second;
|
| 472 |
}
|
| 473 |
|
| 474 |
+
// Sample the next token id from `logits` using temperature, a repetition
// penalty, top-k and top-p filtering.
//
//   vocab                   - only its id_to_token.size() is used, as the number of logits
//   logits                  - one raw score per token id
//   last_n_tokens_data/size - recently generated token ids, most recent last
//   top_k                   - number of highest-scoring candidates to keep;
//                             clamped to [1, number of logits]
//   top_p                   - nucleus threshold; values >= 1 disable the filter
//   temp                    - temperature; <= 0 returns the arg-max token directly
//                             (no penalty and no sampling are applied in that case)
//   repeat_last_n           - how many of the most recent tokens are penalized;
//                             <= 0 disables the penalty, larger than the history
//                             means "the whole history"
//   repeat_penalty          - penalty factor applied to the scaled logit of a repeated token
//   rng                     - random engine used for the final draw
//
// Assumes the vocabulary is non-empty (logits[0] is read unconditionally).
gpt_vocab::id gpt_sample_top_k_top_p_repeat(
        const gpt_vocab & vocab,
        const float * logits,
        const int32_t * last_n_tokens_data,
        size_t last_n_tokens_data_size,
        int    top_k,
        double top_p,
        double temp,
        int    repeat_last_n,
        float  repeat_penalty,
        std::mt19937 & rng) {

    int n_logits = vocab.id_to_token.size();

    const auto * plogits = logits;

    if (temp <= 0) {
        // select the token with the highest logit directly
        float max_logit = plogits[0];
        gpt_vocab::id max_id = 0;

        for (int i = 1; i < n_logits; ++i) {
            if (plogits[i] > max_logit) {
                max_logit = plogits[i];
                max_id = i;
            }
        }
        return max_id;
    }

    // Window of recent tokens that receive the repetition penalty. The window is
    // limited to the available history so it can never start before the first
    // element, and the caller's buffer is scanned in place instead of being copied.
    size_t n_recent = 0;
    if (repeat_last_n > 0) {
        n_recent = std::min((size_t) repeat_last_n, last_n_tokens_data_size);
    }
    const int32_t * recent_end   = last_n_tokens_data + last_n_tokens_data_size;
    const int32_t * recent_begin = recent_end - n_recent;

    // Keep top_k inside [1, n_logits]: partial_sort needs its middle iterator to be
    // inside the range, and at least one candidate must survive for the final draw.
    top_k = std::max(1, std::min(top_k, n_logits));

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const float scale = 1.0f/temp;
        for (int i = 0; i < n_logits; ++i) {
            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
            if (std::find(recent_begin, recent_end, i) != recent_end) {
                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
                if (plogits[i] < 0.0f) {
                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
                } else {
                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                }
            } else {
                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
            }
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    // subtract the maximum before exponentiating to keep exp() in range
    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    if (top_p < 1.0f) {
        // keep the shortest prefix whose cumulative probability reaches top_p
        double cumsum = 0.0f;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        // renormalize the surviving candidates
        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

//    printf("\n");
//    for (int i = 0; i < (int) probs.size(); i++) {
//    for (int i = 0; i < 10; i++) {
//        printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
//    }

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;

}
|
| 589 |
+
|
| 590 |
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
|
| 591 |
drwav wav;
|
| 592 |
std::vector<uint8_t> wav_data; // used for pipe input from stdin
|
examples/common.h
CHANGED
|
@@ -26,8 +26,9 @@ struct gpt_params {
|
|
| 26 |
|
| 27 |
int32_t n_batch = 8; // batch size for prompt processing
|
| 28 |
|
| 29 |
-
std::string model
|
| 30 |
-
std::string prompt;
|
|
|
|
| 31 |
};
|
| 32 |
|
| 33 |
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
|
@@ -61,6 +62,12 @@ struct gpt_vocab {
|
|
| 61 |
// poor-man's JSON parsing
|
| 62 |
std::map<std::string, int32_t> json_parse(const std::string & fname);
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
// split text into tokens
|
| 65 |
//
|
| 66 |
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
|
|
@@ -73,6 +80,15 @@ std::map<std::string, int32_t> json_parse(const std::string & fname);
|
|
| 73 |
//
|
| 74 |
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
// load the tokens from encoder.json
|
| 77 |
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
|
| 78 |
|
|
@@ -92,6 +108,18 @@ gpt_vocab::id gpt_sample_top_k_top_p(
|
|
| 92 |
double temp,
|
| 93 |
std::mt19937 & rng);
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
//
|
| 96 |
// Audio utils
|
| 97 |
//
|
|
|
|
| 26 |
|
| 27 |
int32_t n_batch = 8; // batch size for prompt processing
|
| 28 |
|
| 29 |
+
std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
|
| 30 |
+
std::string prompt = "";
|
| 31 |
+
std::string token_test = "";
|
| 32 |
};
|
| 33 |
|
| 34 |
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
|
|
|
| 62 |
// poor-man's JSON parsing
|
| 63 |
std::map<std::string, int32_t> json_parse(const std::string & fname);
|
| 64 |
|
| 65 |
+
std::string convert_to_utf8(const std::wstring & input);
|
| 66 |
+
|
| 67 |
+
std::wstring convert_to_wstring(const std::string & input);
|
| 68 |
+
|
| 69 |
+
void gpt_split_words(std::string str, std::vector<std::string>& words);
|
| 70 |
+
|
| 71 |
// split text into tokens
|
| 72 |
//
|
| 73 |
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
|
|
|
|
| 80 |
//
|
| 81 |
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
|
| 82 |
|
| 83 |
+
// test outputs of gpt_tokenize
|
| 84 |
+
//
|
| 85 |
+
// - compare with tokens generated by the huggingface tokenizer
|
| 86 |
+
// - test cases are chosen based on the model's main language (under 'prompt' directory)
|
| 87 |
+
// - after all sentences are checked, print how many tests failed out of the total
|
| 88 |
+
// - otherwise, print sentence, huggingface tokens, ggml tokens
|
| 89 |
+
//
|
| 90 |
+
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);
|
| 91 |
+
|
| 92 |
// load the tokens from encoder.json
|
| 93 |
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
|
| 94 |
|
|
|
|
| 108 |
double temp,
|
| 109 |
std::mt19937 & rng);
|
| 110 |
|
| 111 |
+
gpt_vocab::id gpt_sample_top_k_top_p_repeat(
|
| 112 |
+
const gpt_vocab & vocab,
|
| 113 |
+
const float * logits,
|
| 114 |
+
const int32_t * last_n_tokens_data,
|
| 115 |
+
size_t last_n_tokens_data_size,
|
| 116 |
+
int top_k,
|
| 117 |
+
double top_p,
|
| 118 |
+
double temp,
|
| 119 |
+
int repeat_last_n,
|
| 120 |
+
float repeat_penalty,
|
| 121 |
+
std::mt19937 & rng);
|
| 122 |
+
|
| 123 |
//
|
| 124 |
// Audio utils
|
| 125 |
//
|
examples/main/main.cpp
CHANGED
|
@@ -10,6 +10,10 @@
|
|
| 10 |
#include <vector>
|
| 11 |
#include <cstring>
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
|
| 14 |
// Lowest is red, middle is yellow, highest is green.
|
| 15 |
const std::vector<std::string> k_colors = {
|
|
@@ -148,7 +152,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 148 |
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
|
| 149 |
else {
|
| 150 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 151 |
-
|
|
|
|
| 152 |
}
|
| 153 |
}
|
| 154 |
|
|
@@ -423,13 +428,13 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
|
|
| 423 |
indent++;
|
| 424 |
};
|
| 425 |
|
| 426 |
-
auto end_arr = [&](bool end
|
| 427 |
indent--;
|
| 428 |
doindent();
|
| 429 |
fout << (end ? "]\n" : "},\n");
|
| 430 |
};
|
| 431 |
|
| 432 |
-
auto start_obj = [&](const char *name
|
| 433 |
doindent();
|
| 434 |
if (name) {
|
| 435 |
fout << "\"" << name << "\": {\n";
|
|
@@ -439,7 +444,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
|
|
| 439 |
indent++;
|
| 440 |
};
|
| 441 |
|
| 442 |
-
auto end_obj = [&](bool end
|
| 443 |
indent--;
|
| 444 |
doindent();
|
| 445 |
fout << (end ? "}\n" : "},\n");
|
|
@@ -450,24 +455,24 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
|
|
| 450 |
fout << "\"" << name << "\": ";
|
| 451 |
};
|
| 452 |
|
| 453 |
-
auto value_s = [&](const char *name, const char *val, bool end
|
| 454 |
start_value(name);
|
| 455 |
char * val_escaped = escape_double_quotes_and_backslashes(val);
|
| 456 |
fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
|
| 457 |
free(val_escaped);
|
| 458 |
};
|
| 459 |
|
| 460 |
-
auto end_value = [&](bool end
|
| 461 |
fout << (end ? "\n" : ",\n");
|
| 462 |
};
|
| 463 |
|
| 464 |
-
auto value_i = [&](const char *name, const int64_t val, bool end
|
| 465 |
start_value(name);
|
| 466 |
fout << val;
|
| 467 |
end_value(end);
|
| 468 |
};
|
| 469 |
|
| 470 |
-
auto value_b = [&](const char *name, const bool val, bool end
|
| 471 |
start_value(name);
|
| 472 |
fout << (val ? "true" : "false");
|
| 473 |
end_value(end);
|
|
@@ -479,35 +484,35 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
|
|
| 479 |
}
|
| 480 |
|
| 481 |
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
| 482 |
-
start_obj();
|
| 483 |
-
value_s("systeminfo", whisper_print_system_info());
|
| 484 |
start_obj("model");
|
| 485 |
-
value_s("type", whisper_model_type_readable(ctx));
|
| 486 |
-
value_b("multilingual", whisper_is_multilingual(ctx));
|
| 487 |
-
value_i("vocab", whisper_model_n_vocab(ctx));
|
| 488 |
start_obj("audio");
|
| 489 |
-
value_i("ctx", whisper_model_n_audio_ctx(ctx));
|
| 490 |
-
value_i("state", whisper_model_n_audio_state(ctx));
|
| 491 |
-
value_i("head", whisper_model_n_audio_head(ctx));
|
| 492 |
value_i("layer", whisper_model_n_audio_layer(ctx), true);
|
| 493 |
-
end_obj();
|
| 494 |
start_obj("text");
|
| 495 |
-
value_i("ctx", whisper_model_n_text_ctx(ctx));
|
| 496 |
-
value_i("state", whisper_model_n_text_state(ctx));
|
| 497 |
-
value_i("head", whisper_model_n_text_head(ctx));
|
| 498 |
value_i("layer", whisper_model_n_text_layer(ctx), true);
|
| 499 |
-
end_obj();
|
| 500 |
-
value_i("mels", whisper_model_n_mels(ctx));
|
| 501 |
value_i("ftype", whisper_model_ftype(ctx), true);
|
| 502 |
-
end_obj();
|
| 503 |
start_obj("params");
|
| 504 |
-
value_s("model", params.model.c_str());
|
| 505 |
-
value_s("language", params.language.c_str());
|
| 506 |
value_b("translate", params.translate, true);
|
| 507 |
-
end_obj();
|
| 508 |
start_obj("result");
|
| 509 |
value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
|
| 510 |
-
end_obj();
|
| 511 |
start_arr("transcription");
|
| 512 |
|
| 513 |
const int n_segments = whisper_full_n_segments(ctx);
|
|
@@ -516,15 +521,15 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
|
|
| 516 |
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
| 517 |
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 518 |
|
| 519 |
-
start_obj();
|
| 520 |
start_obj("timestamps");
|
| 521 |
-
value_s("from", to_timestamp(t0, true).c_str());
|
| 522 |
value_s("to", to_timestamp(t1, true).c_str(), true);
|
| 523 |
-
end_obj();
|
| 524 |
start_obj("offsets");
|
| 525 |
-
value_i("from", t0 * 10);
|
| 526 |
value_i("to", t1 * 10, true);
|
| 527 |
-
end_obj();
|
| 528 |
value_s("text", text, true);
|
| 529 |
end_obj(i == (n_segments - 1));
|
| 530 |
}
|
|
|
|
| 10 |
#include <vector>
|
| 11 |
#include <cstring>
|
| 12 |
|
| 13 |
+
#if defined(_MSC_VER)
|
| 14 |
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
| 15 |
+
#endif
|
| 16 |
+
|
| 17 |
// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
|
| 18 |
// Lowest is red, middle is yellow, highest is green.
|
| 19 |
const std::vector<std::string> k_colors = {
|
|
|
|
| 152 |
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
|
| 153 |
else {
|
| 154 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 155 |
+
whisper_print_usage(argc, argv, params);
|
| 156 |
+
exit(0);
|
| 157 |
}
|
| 158 |
}
|
| 159 |
|
|
|
|
| 428 |
indent++;
|
| 429 |
};
|
| 430 |
|
| 431 |
+
auto end_arr = [&](bool end) {
|
| 432 |
indent--;
|
| 433 |
doindent();
|
| 434 |
fout << (end ? "]\n" : "},\n");
|
| 435 |
};
|
| 436 |
|
| 437 |
+
auto start_obj = [&](const char *name) {
|
| 438 |
doindent();
|
| 439 |
if (name) {
|
| 440 |
fout << "\"" << name << "\": {\n";
|
|
|
|
| 444 |
indent++;
|
| 445 |
};
|
| 446 |
|
| 447 |
+
auto end_obj = [&](bool end) {
|
| 448 |
indent--;
|
| 449 |
doindent();
|
| 450 |
fout << (end ? "}\n" : "},\n");
|
|
|
|
| 455 |
fout << "\"" << name << "\": ";
|
| 456 |
};
|
| 457 |
|
| 458 |
+
auto value_s = [&](const char *name, const char *val, bool end) {
|
| 459 |
start_value(name);
|
| 460 |
char * val_escaped = escape_double_quotes_and_backslashes(val);
|
| 461 |
fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
|
| 462 |
free(val_escaped);
|
| 463 |
};
|
| 464 |
|
| 465 |
+
auto end_value = [&](bool end) {
|
| 466 |
fout << (end ? "\n" : ",\n");
|
| 467 |
};
|
| 468 |
|
| 469 |
+
auto value_i = [&](const char *name, const int64_t val, bool end) {
|
| 470 |
start_value(name);
|
| 471 |
fout << val;
|
| 472 |
end_value(end);
|
| 473 |
};
|
| 474 |
|
| 475 |
+
auto value_b = [&](const char *name, const bool val, bool end) {
|
| 476 |
start_value(name);
|
| 477 |
fout << (val ? "true" : "false");
|
| 478 |
end_value(end);
|
|
|
|
| 484 |
}
|
| 485 |
|
| 486 |
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
| 487 |
+
start_obj(nullptr);
|
| 488 |
+
value_s("systeminfo", whisper_print_system_info(), false);
|
| 489 |
start_obj("model");
|
| 490 |
+
value_s("type", whisper_model_type_readable(ctx), false);
|
| 491 |
+
value_b("multilingual", whisper_is_multilingual(ctx), false);
|
| 492 |
+
value_i("vocab", whisper_model_n_vocab(ctx), false);
|
| 493 |
start_obj("audio");
|
| 494 |
+
value_i("ctx", whisper_model_n_audio_ctx(ctx), false);
|
| 495 |
+
value_i("state", whisper_model_n_audio_state(ctx), false);
|
| 496 |
+
value_i("head", whisper_model_n_audio_head(ctx), false);
|
| 497 |
value_i("layer", whisper_model_n_audio_layer(ctx), true);
|
| 498 |
+
end_obj(false);
|
| 499 |
start_obj("text");
|
| 500 |
+
value_i("ctx", whisper_model_n_text_ctx(ctx), false);
|
| 501 |
+
value_i("state", whisper_model_n_text_state(ctx), false);
|
| 502 |
+
value_i("head", whisper_model_n_text_head(ctx), false);
|
| 503 |
value_i("layer", whisper_model_n_text_layer(ctx), true);
|
| 504 |
+
end_obj(false);
|
| 505 |
+
value_i("mels", whisper_model_n_mels(ctx), false);
|
| 506 |
value_i("ftype", whisper_model_ftype(ctx), true);
|
| 507 |
+
end_obj(false);
|
| 508 |
start_obj("params");
|
| 509 |
+
value_s("model", params.model.c_str(), false);
|
| 510 |
+
value_s("language", params.language.c_str(), false);
|
| 511 |
value_b("translate", params.translate, true);
|
| 512 |
+
end_obj(false);
|
| 513 |
start_obj("result");
|
| 514 |
value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
|
| 515 |
+
end_obj(false);
|
| 516 |
start_arr("transcription");
|
| 517 |
|
| 518 |
const int n_segments = whisper_full_n_segments(ctx);
|
|
|
|
| 521 |
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
| 522 |
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 523 |
|
| 524 |
+
start_obj(nullptr);
|
| 525 |
start_obj("timestamps");
|
| 526 |
+
value_s("from", to_timestamp(t0, true).c_str(), false);
|
| 527 |
value_s("to", to_timestamp(t1, true).c_str(), true);
|
| 528 |
+
end_obj(false);
|
| 529 |
start_obj("offsets");
|
| 530 |
+
value_i("from", t0 * 10, false);
|
| 531 |
value_i("to", t1 * 10, true);
|
| 532 |
+
end_obj(false);
|
| 533 |
value_s("text", text, true);
|
| 534 |
end_obj(i == (n_segments - 1));
|
| 535 |
}
|
examples/quantize/quantize.cpp
CHANGED
|
@@ -99,17 +99,17 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
|
|
| 99 |
fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst);
|
| 100 |
fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
|
| 101 |
|
| 102 |
-
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
| 103 |
-
fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
|
| 104 |
-
fout.write((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
|
| 105 |
-
fout.write((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
|
| 106 |
-
fout.write((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
|
| 107 |
-
fout.write((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
|
| 108 |
-
fout.write((char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
|
| 109 |
-
fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
|
| 110 |
-
fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
|
| 111 |
-
fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
|
| 112 |
-
fout.write((char *) &ftype_dst, sizeof(hparams.ftype));
|
| 113 |
}
|
| 114 |
|
| 115 |
// load mel filters
|
|
@@ -138,15 +138,17 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
|
|
| 138 |
// return false;
|
| 139 |
//}
|
| 140 |
|
| 141 |
-
|
|
|
|
| 142 |
for (int i = 0; i < n_vocab; i++) {
|
| 143 |
uint32_t len;
|
| 144 |
finp.read ((char *) &len, sizeof(len));
|
| 145 |
fout.write((char *) &len, sizeof(len));
|
| 146 |
|
| 147 |
-
word
|
| 148 |
-
|
| 149 |
-
|
|
|
|
| 150 |
|
| 151 |
vocab.token_to_id[word] = i;
|
| 152 |
vocab.id_to_token[i] = word;
|
|
|
|
| 99 |
fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst);
|
| 100 |
fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
|
| 101 |
|
| 102 |
+
fout.write((const char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
| 103 |
+
fout.write((const char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
|
| 104 |
+
fout.write((const char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
|
| 105 |
+
fout.write((const char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
|
| 106 |
+
fout.write((const char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
|
| 107 |
+
fout.write((const char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
|
| 108 |
+
fout.write((const char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
|
| 109 |
+
fout.write((const char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
|
| 110 |
+
fout.write((const char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
|
| 111 |
+
fout.write((const char *) &hparams.n_mels, sizeof(hparams.n_mels));
|
| 112 |
+
fout.write((const char *) &ftype_dst, sizeof(hparams.ftype));
|
| 113 |
}
|
| 114 |
|
| 115 |
// load mel filters
|
|
|
|
| 138 |
// return false;
|
| 139 |
//}
|
| 140 |
|
| 141 |
+
char word[128];
|
| 142 |
+
|
| 143 |
for (int i = 0; i < n_vocab; i++) {
|
| 144 |
uint32_t len;
|
| 145 |
finp.read ((char *) &len, sizeof(len));
|
| 146 |
fout.write((char *) &len, sizeof(len));
|
| 147 |
|
| 148 |
+
word[len] = '\0';
|
| 149 |
+
|
| 150 |
+
finp.read ((char *) word, len);
|
| 151 |
+
fout.write((char *) word, len);
|
| 152 |
|
| 153 |
vocab.token_to_id[word] = i;
|
| 154 |
vocab.id_to_token[i] = word;
|
ggml-cuda.cu
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ggml-cuda.h
CHANGED
|
@@ -1,10 +1,19 @@
|
|
|
|
|
|
|
|
| 1 |
#include "ggml.h"
|
| 2 |
|
| 3 |
#ifdef __cplusplus
|
| 4 |
extern "C" {
|
| 5 |
#endif
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
void ggml_init_cublas(void);
|
|
|
|
| 8 |
|
| 9 |
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 10 |
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
|
@@ -15,8 +24,15 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
|
|
| 15 |
void * ggml_cuda_host_malloc(size_t size);
|
| 16 |
void ggml_cuda_host_free(void * ptr);
|
| 17 |
|
| 18 |
-
void
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
#ifdef __cplusplus
|
| 22 |
}
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
#include "ggml.h"
|
| 4 |
|
| 5 |
#ifdef __cplusplus
|
| 6 |
extern "C" {
|
| 7 |
#endif
|
| 8 |
|
| 9 |
+
#define GGML_CUDA_MAX_DEVICES 16
|
| 10 |
+
|
| 11 |
+
struct ggml_tensor_extra_gpu {
|
| 12 |
+
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
|
| 13 |
+
};
|
| 14 |
+
|
| 15 |
void ggml_init_cublas(void);
|
| 16 |
+
void ggml_cuda_set_tensor_split(const float * tensor_split);
|
| 17 |
|
| 18 |
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 19 |
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
|
|
|
| 24 |
void * ggml_cuda_host_malloc(size_t size);
|
| 25 |
void ggml_cuda_host_free(void * ptr);
|
| 26 |
|
| 27 |
+
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
|
| 28 |
+
|
| 29 |
+
void ggml_cuda_free_data(struct ggml_tensor * tensor);
|
| 30 |
+
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
|
| 31 |
+
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
|
| 32 |
+
void ggml_cuda_set_main_device(int main_device);
|
| 33 |
+
void ggml_cuda_set_scratch_size(size_t scratch_size);
|
| 34 |
+
void ggml_cuda_free_scratch(void);
|
| 35 |
+
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
| 36 |
|
| 37 |
#ifdef __cplusplus
|
| 38 |
}
|
ggml-opencl.h
CHANGED
|
@@ -1,23 +1,24 @@
|
|
| 1 |
#pragma once
|
| 2 |
|
|
|
|
|
|
|
| 3 |
#ifdef __cplusplus
|
| 4 |
extern "C" {
|
| 5 |
#endif
|
| 6 |
|
| 7 |
void ggml_cl_init(void);
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
-
GGML_BLAS_OP_N = 111,
|
| 16 |
-
GGML_BLAS_OP_T = 112,
|
| 17 |
-
GGML_BLAS_OP_C = 113,
|
| 18 |
-
};
|
| 19 |
|
| 20 |
-
void
|
| 21 |
|
| 22 |
#ifdef __cplusplus
|
| 23 |
}
|
|
|
|
| 1 |
#pragma once
|
| 2 |
|
| 3 |
+
#include "ggml.h"
|
| 4 |
+
|
| 5 |
#ifdef __cplusplus
|
| 6 |
extern "C" {
|
| 7 |
#endif
|
| 8 |
|
| 9 |
void ggml_cl_init(void);
|
| 10 |
|
| 11 |
+
void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 12 |
+
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 13 |
+
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 14 |
+
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
| 15 |
+
|
| 16 |
+
void * ggml_cl_host_malloc(size_t size);
|
| 17 |
+
void ggml_cl_host_free(void * ptr);
|
| 18 |
|
| 19 |
+
void ggml_cl_free_data(const struct ggml_tensor* tensor);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
+
void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
|
| 22 |
|
| 23 |
#ifdef __cplusplus
|
| 24 |
}
|
ggml.c
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ggml.h
CHANGED
|
@@ -198,6 +198,7 @@
|
|
| 198 |
#define GGML_MAX_PARAMS 256
|
| 199 |
#define GGML_MAX_CONTEXTS 64
|
| 200 |
#define GGML_MAX_OPT 4
|
|
|
|
| 201 |
#define GGML_DEFAULT_N_THREADS 4
|
| 202 |
|
| 203 |
#define GGML_ASSERT(x) \
|
|
@@ -240,6 +241,13 @@ extern "C" {
|
|
| 240 |
GGML_TYPE_Q5_1 = 7,
|
| 241 |
GGML_TYPE_Q8_0 = 8,
|
| 242 |
GGML_TYPE_Q8_1 = 9,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
GGML_TYPE_I8,
|
| 244 |
GGML_TYPE_I16,
|
| 245 |
GGML_TYPE_I32,
|
|
@@ -248,7 +256,8 @@ extern "C" {
|
|
| 248 |
|
| 249 |
enum ggml_backend {
|
| 250 |
GGML_BACKEND_CPU = 0,
|
| 251 |
-
|
|
|
|
| 252 |
};
|
| 253 |
|
| 254 |
// model file types
|
|
@@ -262,6 +271,11 @@ extern "C" {
|
|
| 262 |
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
| 263 |
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
| 264 |
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
};
|
| 266 |
|
| 267 |
// available tensor operations:
|
|
@@ -282,12 +296,14 @@ extern "C" {
|
|
| 282 |
GGML_OP_SUM_ROWS,
|
| 283 |
GGML_OP_MEAN,
|
| 284 |
GGML_OP_REPEAT,
|
|
|
|
| 285 |
GGML_OP_ABS,
|
| 286 |
GGML_OP_SGN,
|
| 287 |
GGML_OP_NEG,
|
| 288 |
GGML_OP_STEP,
|
| 289 |
GGML_OP_RELU,
|
| 290 |
GGML_OP_GELU,
|
|
|
|
| 291 |
GGML_OP_SILU,
|
| 292 |
GGML_OP_SILU_BACK,
|
| 293 |
GGML_OP_NORM, // normalize
|
|
@@ -295,6 +311,7 @@ extern "C" {
|
|
| 295 |
GGML_OP_RMS_NORM_BACK,
|
| 296 |
|
| 297 |
GGML_OP_MUL_MAT,
|
|
|
|
| 298 |
|
| 299 |
GGML_OP_SCALE,
|
| 300 |
GGML_OP_SET,
|
|
@@ -310,19 +327,31 @@ extern "C" {
|
|
| 310 |
GGML_OP_DIAG_MASK_INF,
|
| 311 |
GGML_OP_DIAG_MASK_ZERO,
|
| 312 |
GGML_OP_SOFT_MAX,
|
|
|
|
| 313 |
GGML_OP_ROPE,
|
| 314 |
GGML_OP_ROPE_BACK,
|
| 315 |
GGML_OP_ALIBI,
|
| 316 |
GGML_OP_CLAMP,
|
| 317 |
-
|
| 318 |
-
|
|
|
|
| 319 |
|
| 320 |
GGML_OP_FLASH_ATTN,
|
| 321 |
GGML_OP_FLASH_FF,
|
|
|
|
|
|
|
|
|
|
| 322 |
|
| 323 |
GGML_OP_MAP_UNARY,
|
| 324 |
GGML_OP_MAP_BINARY,
|
| 325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
GGML_OP_COUNT,
|
| 327 |
};
|
| 328 |
|
|
@@ -371,11 +400,15 @@ extern "C" {
|
|
| 371 |
|
| 372 |
void * data;
|
| 373 |
|
| 374 |
-
char name[
|
|
|
|
|
|
|
| 375 |
|
| 376 |
-
char padding[
|
| 377 |
};
|
| 378 |
|
|
|
|
|
|
|
| 379 |
// computation graph
|
| 380 |
struct ggml_cgraph {
|
| 381 |
int n_nodes;
|
|
@@ -409,6 +442,25 @@ extern "C" {
|
|
| 409 |
bool no_alloc; // don't allocate memory for the tensor data
|
| 410 |
};
|
| 411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
// misc
|
| 413 |
|
| 414 |
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
|
@@ -420,14 +472,17 @@ extern "C" {
|
|
| 420 |
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
| 421 |
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
| 422 |
|
| 423 |
-
GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
|
| 424 |
-
GGML_API
|
|
|
|
|
|
|
| 425 |
|
| 426 |
GGML_API int ggml_blck_size (enum ggml_type type);
|
| 427 |
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
| 428 |
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
| 429 |
|
| 430 |
GGML_API const char * ggml_type_name(enum ggml_type type);
|
|
|
|
| 431 |
|
| 432 |
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
| 433 |
|
|
@@ -436,14 +491,26 @@ extern "C" {
|
|
| 436 |
// TODO: temporary until model loading of ggml examples is refactored
|
| 437 |
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
| 438 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
// main
|
| 440 |
|
| 441 |
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
| 442 |
-
GGML_API void
|
| 443 |
|
| 444 |
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
| 445 |
|
| 446 |
-
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
|
| 448 |
GGML_API struct ggml_tensor * ggml_new_tensor(
|
| 449 |
struct ggml_context * ctx,
|
|
@@ -483,6 +550,8 @@ extern "C" {
|
|
| 483 |
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
| 484 |
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
|
| 485 |
|
|
|
|
|
|
|
| 486 |
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
| 487 |
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
| 488 |
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
|
@@ -496,8 +565,9 @@ extern "C" {
|
|
| 496 |
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
| 497 |
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
| 498 |
|
| 499 |
-
GGML_API const char *
|
| 500 |
-
GGML_API
|
|
|
|
| 501 |
|
| 502 |
//
|
| 503 |
// operations on tensors with backpropagation
|
|
@@ -522,6 +592,11 @@ extern "C" {
|
|
| 522 |
struct ggml_tensor * a,
|
| 523 |
struct ggml_tensor * b);
|
| 524 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
GGML_API struct ggml_tensor * ggml_acc(
|
| 526 |
struct ggml_context * ctx,
|
| 527 |
struct ggml_tensor * a,
|
|
@@ -545,24 +620,47 @@ extern "C" {
|
|
| 545 |
struct ggml_tensor * a,
|
| 546 |
struct ggml_tensor * b);
|
| 547 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
GGML_API struct ggml_tensor * ggml_mul(
|
| 549 |
struct ggml_context * ctx,
|
| 550 |
struct ggml_tensor * a,
|
| 551 |
struct ggml_tensor * b);
|
| 552 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
GGML_API struct ggml_tensor * ggml_div(
|
| 554 |
struct ggml_context * ctx,
|
| 555 |
struct ggml_tensor * a,
|
| 556 |
struct ggml_tensor * b);
|
| 557 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
GGML_API struct ggml_tensor * ggml_sqr(
|
| 559 |
struct ggml_context * ctx,
|
| 560 |
struct ggml_tensor * a);
|
| 561 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
GGML_API struct ggml_tensor * ggml_sqrt(
|
| 563 |
struct ggml_context * ctx,
|
| 564 |
struct ggml_tensor * a);
|
| 565 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
GGML_API struct ggml_tensor * ggml_log(
|
| 567 |
struct ggml_context * ctx,
|
| 568 |
struct ggml_tensor * a);
|
|
@@ -593,35 +691,76 @@ extern "C" {
|
|
| 593 |
struct ggml_tensor * a,
|
| 594 |
struct ggml_tensor * b);
|
| 595 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
GGML_API struct ggml_tensor * ggml_abs(
|
| 597 |
struct ggml_context * ctx,
|
| 598 |
struct ggml_tensor * a);
|
| 599 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 600 |
GGML_API struct ggml_tensor * ggml_sgn(
|
| 601 |
struct ggml_context * ctx,
|
| 602 |
struct ggml_tensor * a);
|
| 603 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
GGML_API struct ggml_tensor * ggml_neg(
|
| 605 |
struct ggml_context * ctx,
|
| 606 |
struct ggml_tensor * a);
|
| 607 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
GGML_API struct ggml_tensor * ggml_step(
|
| 609 |
struct ggml_context * ctx,
|
| 610 |
struct ggml_tensor * a);
|
| 611 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 612 |
GGML_API struct ggml_tensor * ggml_relu(
|
| 613 |
struct ggml_context * ctx,
|
| 614 |
struct ggml_tensor * a);
|
| 615 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 616 |
// TODO: double-check this computation is correct
|
| 617 |
GGML_API struct ggml_tensor * ggml_gelu(
|
| 618 |
struct ggml_context * ctx,
|
| 619 |
struct ggml_tensor * a);
|
| 620 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
GGML_API struct ggml_tensor * ggml_silu(
|
| 622 |
struct ggml_context * ctx,
|
| 623 |
struct ggml_tensor * a);
|
| 624 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 625 |
// a - x
|
| 626 |
// b - dy
|
| 627 |
GGML_API struct ggml_tensor * ggml_silu_back(
|
|
@@ -635,10 +774,18 @@ extern "C" {
|
|
| 635 |
struct ggml_context * ctx,
|
| 636 |
struct ggml_tensor * a);
|
| 637 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
GGML_API struct ggml_tensor * ggml_rms_norm(
|
| 639 |
struct ggml_context * ctx,
|
| 640 |
struct ggml_tensor * a);
|
| 641 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
// a - x
|
| 643 |
// b - dy
|
| 644 |
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
|
@@ -646,14 +793,22 @@ extern "C" {
|
|
| 646 |
struct ggml_tensor * a,
|
| 647 |
struct ggml_tensor * b);
|
| 648 |
|
| 649 |
-
// A:
|
| 650 |
-
// B:
|
| 651 |
// result is m columns, p rows
|
| 652 |
GGML_API struct ggml_tensor * ggml_mul_mat(
|
| 653 |
struct ggml_context * ctx,
|
| 654 |
struct ggml_tensor * a,
|
| 655 |
struct ggml_tensor * b);
|
| 656 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 657 |
//
|
| 658 |
// operations on tensors without backpropagation
|
| 659 |
//
|
|
@@ -864,6 +1019,17 @@ extern "C" {
|
|
| 864 |
struct ggml_context * ctx,
|
| 865 |
struct ggml_tensor * a);
|
| 866 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 867 |
// rotary position embedding
|
| 868 |
// if mode & 1 == 1, skip n_past elements
|
| 869 |
// if mode & 2 != 0, GPT-NeoX style
|
|
@@ -909,16 +1075,55 @@ extern "C" {
|
|
| 909 |
float min,
|
| 910 |
float max);
|
| 911 |
|
| 912 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 913 |
// TODO: we don't support extra parameters for now
|
| 914 |
// that's why we are hard-coding the stride, padding, and dilation
|
| 915 |
// not great ..
|
| 916 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 917 |
struct ggml_context * ctx,
|
| 918 |
struct ggml_tensor * a,
|
| 919 |
struct ggml_tensor * b);
|
| 920 |
|
| 921 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 922 |
struct ggml_context * ctx,
|
| 923 |
struct ggml_tensor * a,
|
| 924 |
struct ggml_tensor * b);
|
|
@@ -930,6 +1135,14 @@ extern "C" {
|
|
| 930 |
struct ggml_tensor * v,
|
| 931 |
bool masked);
|
| 932 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 933 |
GGML_API struct ggml_tensor * ggml_flash_ff(
|
| 934 |
struct ggml_context * ctx,
|
| 935 |
struct ggml_tensor * a,
|
|
@@ -938,21 +1151,106 @@ extern "C" {
|
|
| 938 |
struct ggml_tensor * c0,
|
| 939 |
struct ggml_tensor * c1);
|
| 940 |
|
| 941 |
-
//
|
| 942 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 943 |
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
| 944 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 945 |
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
| 946 |
struct ggml_context * ctx,
|
| 947 |
struct ggml_tensor * a,
|
| 948 |
ggml_unary_op_f32_t fun);
|
| 949 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 950 |
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
| 951 |
struct ggml_context * ctx,
|
| 952 |
struct ggml_tensor * a,
|
| 953 |
struct ggml_tensor * b,
|
| 954 |
ggml_binary_op_f32_t fun);
|
| 955 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 956 |
//
|
| 957 |
// automatic differentiation
|
| 958 |
//
|
|
@@ -969,6 +1267,11 @@ extern "C" {
|
|
| 969 |
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
| 970 |
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
| 971 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 972 |
// print info and performance information for the graph
|
| 973 |
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
| 974 |
|
|
@@ -1042,6 +1345,8 @@ extern "C" {
|
|
| 1042 |
struct {
|
| 1043 |
int n_iter;
|
| 1044 |
|
|
|
|
|
|
|
| 1045 |
float alpha; // learning rate
|
| 1046 |
float beta1;
|
| 1047 |
float beta2;
|
|
@@ -1066,6 +1371,49 @@ extern "C" {
|
|
| 1066 |
} lbfgs;
|
| 1067 |
};
|
| 1068 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1069 |
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
| 1070 |
|
| 1071 |
// optimize the function defined by the tensor f
|
|
@@ -1074,6 +1422,27 @@ extern "C" {
|
|
| 1074 |
struct ggml_opt_params params,
|
| 1075 |
struct ggml_tensor * f);
|
| 1076 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1077 |
//
|
| 1078 |
// quantization
|
| 1079 |
//
|
|
|
|
| 198 |
#define GGML_MAX_PARAMS 256
|
| 199 |
#define GGML_MAX_CONTEXTS 64
|
| 200 |
#define GGML_MAX_OPT 4
|
| 201 |
+
#define GGML_MAX_NAME 32
|
| 202 |
#define GGML_DEFAULT_N_THREADS 4
|
| 203 |
|
| 204 |
#define GGML_ASSERT(x) \
|
|
|
|
| 241 |
GGML_TYPE_Q5_1 = 7,
|
| 242 |
GGML_TYPE_Q8_0 = 8,
|
| 243 |
GGML_TYPE_Q8_1 = 9,
|
| 244 |
+
// k-quantizations
|
| 245 |
+
GGML_TYPE_Q2_K = 10,
|
| 246 |
+
GGML_TYPE_Q3_K = 11,
|
| 247 |
+
GGML_TYPE_Q4_K = 12,
|
| 248 |
+
GGML_TYPE_Q5_K = 13,
|
| 249 |
+
GGML_TYPE_Q6_K = 14,
|
| 250 |
+
GGML_TYPE_Q8_K = 15,
|
| 251 |
GGML_TYPE_I8,
|
| 252 |
GGML_TYPE_I16,
|
| 253 |
GGML_TYPE_I32,
|
|
|
|
| 256 |
|
| 257 |
enum ggml_backend {
|
| 258 |
GGML_BACKEND_CPU = 0,
|
| 259 |
+
GGML_BACKEND_GPU = 10,
|
| 260 |
+
GGML_BACKEND_GPU_SPLIT = 20,
|
| 261 |
};
|
| 262 |
|
| 263 |
// model file types
|
|
|
|
| 271 |
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
| 272 |
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
| 273 |
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
| 274 |
+
GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
|
| 275 |
+
GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
|
| 276 |
+
GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
|
| 277 |
+
GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
|
| 278 |
+
GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
|
| 279 |
};
|
| 280 |
|
| 281 |
// available tensor operations:
|
|
|
|
| 296 |
GGML_OP_SUM_ROWS,
|
| 297 |
GGML_OP_MEAN,
|
| 298 |
GGML_OP_REPEAT,
|
| 299 |
+
GGML_OP_REPEAT_BACK,
|
| 300 |
GGML_OP_ABS,
|
| 301 |
GGML_OP_SGN,
|
| 302 |
GGML_OP_NEG,
|
| 303 |
GGML_OP_STEP,
|
| 304 |
GGML_OP_RELU,
|
| 305 |
GGML_OP_GELU,
|
| 306 |
+
GGML_OP_GELU_QUICK,
|
| 307 |
GGML_OP_SILU,
|
| 308 |
GGML_OP_SILU_BACK,
|
| 309 |
GGML_OP_NORM, // normalize
|
|
|
|
| 311 |
GGML_OP_RMS_NORM_BACK,
|
| 312 |
|
| 313 |
GGML_OP_MUL_MAT,
|
| 314 |
+
GGML_OP_OUT_PROD,
|
| 315 |
|
| 316 |
GGML_OP_SCALE,
|
| 317 |
GGML_OP_SET,
|
|
|
|
| 327 |
GGML_OP_DIAG_MASK_INF,
|
| 328 |
GGML_OP_DIAG_MASK_ZERO,
|
| 329 |
GGML_OP_SOFT_MAX,
|
| 330 |
+
GGML_OP_SOFT_MAX_BACK,
|
| 331 |
GGML_OP_ROPE,
|
| 332 |
GGML_OP_ROPE_BACK,
|
| 333 |
GGML_OP_ALIBI,
|
| 334 |
GGML_OP_CLAMP,
|
| 335 |
+
GGML_OP_CONV_1D_S1_PH,
|
| 336 |
+
GGML_OP_CONV_1D_S2_PH,
|
| 337 |
+
GGML_OP_CONV_2D_SK_P0,
|
| 338 |
|
| 339 |
GGML_OP_FLASH_ATTN,
|
| 340 |
GGML_OP_FLASH_FF,
|
| 341 |
+
GGML_OP_FLASH_ATTN_BACK,
|
| 342 |
+
GGML_OP_WIN_PART,
|
| 343 |
+
GGML_OP_WIN_UNPART,
|
| 344 |
|
| 345 |
GGML_OP_MAP_UNARY,
|
| 346 |
GGML_OP_MAP_BINARY,
|
| 347 |
|
| 348 |
+
GGML_OP_MAP_CUSTOM1,
|
| 349 |
+
GGML_OP_MAP_CUSTOM2,
|
| 350 |
+
GGML_OP_MAP_CUSTOM3,
|
| 351 |
+
|
| 352 |
+
GGML_OP_CROSS_ENTROPY_LOSS,
|
| 353 |
+
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
|
| 354 |
+
|
| 355 |
GGML_OP_COUNT,
|
| 356 |
};
|
| 357 |
|
|
|
|
| 400 |
|
| 401 |
void * data;
|
| 402 |
|
| 403 |
+
char name[GGML_MAX_NAME];
|
| 404 |
+
|
| 405 |
+
void * extra; // extra things e.g. for ggml-cuda.cu
|
| 406 |
|
| 407 |
+
char padding[4];
|
| 408 |
};
|
| 409 |
|
| 410 |
+
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
| 411 |
+
|
| 412 |
// computation graph
|
| 413 |
struct ggml_cgraph {
|
| 414 |
int n_nodes;
|
|
|
|
| 442 |
bool no_alloc; // don't allocate memory for the tensor data
|
| 443 |
};
|
| 444 |
|
| 445 |
+
|
| 446 |
+
// compute types
|
| 447 |
+
enum ggml_task_type {
|
| 448 |
+
GGML_TASK_INIT = 0,
|
| 449 |
+
GGML_TASK_COMPUTE,
|
| 450 |
+
GGML_TASK_FINALIZE,
|
| 451 |
+
};
|
| 452 |
+
|
| 453 |
+
struct ggml_compute_params {
|
| 454 |
+
enum ggml_task_type type;
|
| 455 |
+
|
| 456 |
+
// ith = thread index, nth = number of threads
|
| 457 |
+
int ith, nth;
|
| 458 |
+
|
| 459 |
+
// work buffer for all threads
|
| 460 |
+
size_t wsize;
|
| 461 |
+
void * wdata;
|
| 462 |
+
};
|
| 463 |
+
|
| 464 |
// misc
|
| 465 |
|
| 466 |
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
|
|
|
| 472 |
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
| 473 |
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
| 474 |
|
| 475 |
+
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
| 476 |
+
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
| 477 |
+
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
| 478 |
+
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
|
| 479 |
|
| 480 |
GGML_API int ggml_blck_size (enum ggml_type type);
|
| 481 |
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
| 482 |
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
| 483 |
|
| 484 |
GGML_API const char * ggml_type_name(enum ggml_type type);
|
| 485 |
+
GGML_API const char * ggml_op_name (enum ggml_op op);
|
| 486 |
|
| 487 |
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
| 488 |
|
|
|
|
| 491 |
// TODO: temporary until model loading of ggml examples is refactored
|
| 492 |
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
| 493 |
|
| 494 |
+
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
| 495 |
+
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
|
| 496 |
+
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
| 497 |
+
|
| 498 |
+
// use this to compute the memory overhead of a tensor
|
| 499 |
+
GGML_API size_t ggml_tensor_overhead(void);
|
| 500 |
+
|
| 501 |
// main
|
| 502 |
|
| 503 |
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
| 504 |
+
GGML_API void ggml_free(struct ggml_context * ctx);
|
| 505 |
|
| 506 |
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
| 507 |
|
| 508 |
+
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
| 509 |
+
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
| 510 |
+
|
| 511 |
+
GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
|
| 512 |
+
GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
|
| 513 |
+
GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
|
| 514 |
|
| 515 |
GGML_API struct ggml_tensor * ggml_new_tensor(
|
| 516 |
struct ggml_context * ctx,
|
|
|
|
| 550 |
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
| 551 |
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
|
| 552 |
|
| 553 |
+
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
| 554 |
+
|
| 555 |
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
| 556 |
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
| 557 |
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
|
|
|
| 565 |
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
| 566 |
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
| 567 |
|
| 568 |
+
GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
|
| 569 |
+
GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
|
| 570 |
+
GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
|
| 571 |
|
| 572 |
//
|
| 573 |
// operations on tensors with backpropagation
|
|
|
|
| 592 |
struct ggml_tensor * a,
|
| 593 |
struct ggml_tensor * b);
|
| 594 |
|
| 595 |
+
GGML_API struct ggml_tensor * ggml_add1_inplace(
|
| 596 |
+
struct ggml_context * ctx,
|
| 597 |
+
struct ggml_tensor * a,
|
| 598 |
+
struct ggml_tensor * b);
|
| 599 |
+
|
| 600 |
GGML_API struct ggml_tensor * ggml_acc(
|
| 601 |
struct ggml_context * ctx,
|
| 602 |
struct ggml_tensor * a,
|
|
|
|
| 620 |
struct ggml_tensor * a,
|
| 621 |
struct ggml_tensor * b);
|
| 622 |
|
| 623 |
+
GGML_API struct ggml_tensor * ggml_sub_inplace(
|
| 624 |
+
struct ggml_context * ctx,
|
| 625 |
+
struct ggml_tensor * a,
|
| 626 |
+
struct ggml_tensor * b);
|
| 627 |
+
|
| 628 |
GGML_API struct ggml_tensor * ggml_mul(
|
| 629 |
struct ggml_context * ctx,
|
| 630 |
struct ggml_tensor * a,
|
| 631 |
struct ggml_tensor * b);
|
| 632 |
|
| 633 |
+
GGML_API struct ggml_tensor * ggml_mul_inplace(
|
| 634 |
+
struct ggml_context * ctx,
|
| 635 |
+
struct ggml_tensor * a,
|
| 636 |
+
struct ggml_tensor * b);
|
| 637 |
+
|
| 638 |
GGML_API struct ggml_tensor * ggml_div(
|
| 639 |
struct ggml_context * ctx,
|
| 640 |
struct ggml_tensor * a,
|
| 641 |
struct ggml_tensor * b);
|
| 642 |
|
| 643 |
+
GGML_API struct ggml_tensor * ggml_div_inplace(
|
| 644 |
+
struct ggml_context * ctx,
|
| 645 |
+
struct ggml_tensor * a,
|
| 646 |
+
struct ggml_tensor * b);
|
| 647 |
+
|
| 648 |
GGML_API struct ggml_tensor * ggml_sqr(
|
| 649 |
struct ggml_context * ctx,
|
| 650 |
struct ggml_tensor * a);
|
| 651 |
|
| 652 |
+
GGML_API struct ggml_tensor * ggml_sqr_inplace(
|
| 653 |
+
struct ggml_context * ctx,
|
| 654 |
+
struct ggml_tensor * a);
|
| 655 |
+
|
| 656 |
GGML_API struct ggml_tensor * ggml_sqrt(
|
| 657 |
struct ggml_context * ctx,
|
| 658 |
struct ggml_tensor * a);
|
| 659 |
|
| 660 |
+
GGML_API struct ggml_tensor * ggml_sqrt_inplace(
|
| 661 |
+
struct ggml_context * ctx,
|
| 662 |
+
struct ggml_tensor * a);
|
| 663 |
+
|
| 664 |
GGML_API struct ggml_tensor * ggml_log(
|
| 665 |
struct ggml_context * ctx,
|
| 666 |
struct ggml_tensor * a);
|
|
|
|
| 691 |
struct ggml_tensor * a,
|
| 692 |
struct ggml_tensor * b);
|
| 693 |
|
| 694 |
+
GGML_API struct ggml_tensor * ggml_repeat_back(
|
| 695 |
+
struct ggml_context * ctx,
|
| 696 |
+
struct ggml_tensor * a,
|
| 697 |
+
struct ggml_tensor * b);
|
| 698 |
+
|
| 699 |
GGML_API struct ggml_tensor * ggml_abs(
|
| 700 |
struct ggml_context * ctx,
|
| 701 |
struct ggml_tensor * a);
|
| 702 |
|
| 703 |
+
GGML_API struct ggml_tensor * ggml_abs_inplace(
|
| 704 |
+
struct ggml_context * ctx,
|
| 705 |
+
struct ggml_tensor * a);
|
| 706 |
+
|
| 707 |
GGML_API struct ggml_tensor * ggml_sgn(
|
| 708 |
struct ggml_context * ctx,
|
| 709 |
struct ggml_tensor * a);
|
| 710 |
|
| 711 |
+
GGML_API struct ggml_tensor * ggml_sgn_inplace(
|
| 712 |
+
struct ggml_context * ctx,
|
| 713 |
+
struct ggml_tensor * a);
|
| 714 |
+
|
| 715 |
GGML_API struct ggml_tensor * ggml_neg(
|
| 716 |
struct ggml_context * ctx,
|
| 717 |
struct ggml_tensor * a);
|
| 718 |
|
| 719 |
+
GGML_API struct ggml_tensor * ggml_neg_inplace(
|
| 720 |
+
struct ggml_context * ctx,
|
| 721 |
+
struct ggml_tensor * a);
|
| 722 |
+
|
| 723 |
GGML_API struct ggml_tensor * ggml_step(
|
| 724 |
struct ggml_context * ctx,
|
| 725 |
struct ggml_tensor * a);
|
| 726 |
|
| 727 |
+
GGML_API struct ggml_tensor * ggml_step_inplace(
|
| 728 |
+
struct ggml_context * ctx,
|
| 729 |
+
struct ggml_tensor * a);
|
| 730 |
+
|
| 731 |
GGML_API struct ggml_tensor * ggml_relu(
|
| 732 |
struct ggml_context * ctx,
|
| 733 |
struct ggml_tensor * a);
|
| 734 |
|
| 735 |
+
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
| 736 |
+
struct ggml_context * ctx,
|
| 737 |
+
struct ggml_tensor * a);
|
| 738 |
+
|
| 739 |
// TODO: double-check this computation is correct
|
| 740 |
GGML_API struct ggml_tensor * ggml_gelu(
|
| 741 |
struct ggml_context * ctx,
|
| 742 |
struct ggml_tensor * a);
|
| 743 |
|
| 744 |
+
GGML_API struct ggml_tensor * ggml_gelu_inplace(
|
| 745 |
+
struct ggml_context * ctx,
|
| 746 |
+
struct ggml_tensor * a);
|
| 747 |
+
|
| 748 |
+
GGML_API struct ggml_tensor * ggml_gelu_quick(
|
| 749 |
+
struct ggml_context * ctx,
|
| 750 |
+
struct ggml_tensor * a);
|
| 751 |
+
|
| 752 |
+
GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
|
| 753 |
+
struct ggml_context * ctx,
|
| 754 |
+
struct ggml_tensor * a);
|
| 755 |
+
|
| 756 |
GGML_API struct ggml_tensor * ggml_silu(
|
| 757 |
struct ggml_context * ctx,
|
| 758 |
struct ggml_tensor * a);
|
| 759 |
|
| 760 |
+
GGML_API struct ggml_tensor * ggml_silu_inplace(
|
| 761 |
+
struct ggml_context * ctx,
|
| 762 |
+
struct ggml_tensor * a);
|
| 763 |
+
|
| 764 |
// a - x
|
| 765 |
// b - dy
|
| 766 |
GGML_API struct ggml_tensor * ggml_silu_back(
|
|
|
|
| 774 |
struct ggml_context * ctx,
|
| 775 |
struct ggml_tensor * a);
|
| 776 |
|
| 777 |
+
GGML_API struct ggml_tensor * ggml_norm_inplace(
|
| 778 |
+
struct ggml_context * ctx,
|
| 779 |
+
struct ggml_tensor * a);
|
| 780 |
+
|
| 781 |
GGML_API struct ggml_tensor * ggml_rms_norm(
|
| 782 |
struct ggml_context * ctx,
|
| 783 |
struct ggml_tensor * a);
|
| 784 |
|
| 785 |
+
GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
|
| 786 |
+
struct ggml_context * ctx,
|
| 787 |
+
struct ggml_tensor * a);
|
| 788 |
+
|
| 789 |
// a - x
|
| 790 |
// b - dy
|
| 791 |
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
|
|
|
| 793 |
struct ggml_tensor * a,
|
| 794 |
struct ggml_tensor * b);
|
| 795 |
|
| 796 |
+
// A: n columns, m rows
|
| 797 |
+
// B: n columns, p rows (i.e. we transpose it internally)
|
| 798 |
// result is m columns, p rows
|
| 799 |
GGML_API struct ggml_tensor * ggml_mul_mat(
|
| 800 |
struct ggml_context * ctx,
|
| 801 |
struct ggml_tensor * a,
|
| 802 |
struct ggml_tensor * b);
|
| 803 |
|
| 804 |
+
// A: m columns, n rows,
|
| 805 |
+
// B: p columns, n rows,
|
| 806 |
+
// result is m columns, p rows
|
| 807 |
+
GGML_API struct ggml_tensor * ggml_out_prod(
|
| 808 |
+
struct ggml_context * ctx,
|
| 809 |
+
struct ggml_tensor * a,
|
| 810 |
+
struct ggml_tensor * b);
|
| 811 |
+
|
| 812 |
//
|
| 813 |
// operations on tensors without backpropagation
|
| 814 |
//
|
|
|
|
| 1019 |
struct ggml_context * ctx,
|
| 1020 |
struct ggml_tensor * a);
|
| 1021 |
|
| 1022 |
+
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
| 1023 |
+
struct ggml_context * ctx,
|
| 1024 |
+
struct ggml_tensor * a,
|
| 1025 |
+
struct ggml_tensor * b);
|
| 1026 |
+
|
| 1027 |
+
// in-place, returns view(a)
|
| 1028 |
+
GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
|
| 1029 |
+
struct ggml_context * ctx,
|
| 1030 |
+
struct ggml_tensor * a,
|
| 1031 |
+
struct ggml_tensor * b);
|
| 1032 |
+
|
| 1033 |
// rotary position embedding
|
| 1034 |
// if mode & 1 == 1, skip n_past elements
|
| 1035 |
// if (mode & 2) != 0, GPT-NeoX style
|
|
|
|
| 1075 |
float min,
|
| 1076 |
float max);
|
| 1077 |
|
| 1078 |
+
// TODO: implement general-purpose convolutions
|
| 1079 |
+
// GGML_API struct ggml_tensor * ggml_conv_1d(
|
| 1080 |
+
// struct ggml_context * ctx,
|
| 1081 |
+
// struct ggml_tensor * a,
|
| 1082 |
+
// struct ggml_tensor * b,
|
| 1083 |
+
// int s0,
|
| 1084 |
+
// int p0,
|
| 1085 |
+
// int d0);
|
| 1086 |
+
//
|
| 1087 |
+
// GGML_API struct ggml_tensor * ggml_conv_2d(
|
| 1088 |
+
// struct ggml_context * ctx,
|
| 1089 |
+
// struct ggml_tensor * a,
|
| 1090 |
+
// struct ggml_tensor * b,
|
| 1091 |
+
// int s0,
|
| 1092 |
+
// int s1,
|
| 1093 |
+
// int p0,
|
| 1094 |
+
// int p1,
|
| 1095 |
+
// int d0,
|
| 1096 |
+
// int d1);
|
| 1097 |
+
|
| 1098 |
+
// padding = half
|
| 1099 |
// TODO: we don't support extra parameters for now
|
| 1100 |
// that's why we are hard-coding the stride, padding, and dilation
|
| 1101 |
// not great ..
|
| 1102 |
+
// example:
|
| 1103 |
+
// a: 3 80 768 1
|
| 1104 |
+
// b: 3000 80 1 1
|
| 1105 |
+
// res: 3000 768 1 1
|
| 1106 |
+
// used in whisper
|
| 1107 |
+
GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
|
| 1108 |
struct ggml_context * ctx,
|
| 1109 |
struct ggml_tensor * a,
|
| 1110 |
struct ggml_tensor * b);
|
| 1111 |
|
| 1112 |
+
// used in whisper
|
| 1113 |
+
GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
|
| 1114 |
+
struct ggml_context * ctx,
|
| 1115 |
+
struct ggml_tensor * a,
|
| 1116 |
+
struct ggml_tensor * b);
|
| 1117 |
+
|
| 1118 |
+
// kernel size is a->ne[0] x a->ne[1]
|
| 1119 |
+
// stride is equal to kernel size
|
| 1120 |
+
// padding is zero
|
| 1121 |
+
// example:
|
| 1122 |
+
// a: 16 16 3 768
|
| 1123 |
+
// b: 1024 1024 3 1
|
| 1124 |
+
// res: 64 64 768 1
|
| 1125 |
+
// used in sam
|
| 1126 |
+
GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
|
| 1127 |
struct ggml_context * ctx,
|
| 1128 |
struct ggml_tensor * a,
|
| 1129 |
struct ggml_tensor * b);
|
|
|
|
| 1135 |
struct ggml_tensor * v,
|
| 1136 |
bool masked);
|
| 1137 |
|
| 1138 |
+
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
| 1139 |
+
struct ggml_context * ctx,
|
| 1140 |
+
struct ggml_tensor * q,
|
| 1141 |
+
struct ggml_tensor * k,
|
| 1142 |
+
struct ggml_tensor * v,
|
| 1143 |
+
struct ggml_tensor * d,
|
| 1144 |
+
bool masked);
|
| 1145 |
+
|
| 1146 |
GGML_API struct ggml_tensor * ggml_flash_ff(
|
| 1147 |
struct ggml_context * ctx,
|
| 1148 |
struct ggml_tensor * a,
|
|
|
|
| 1151 |
struct ggml_tensor * c0,
|
| 1152 |
struct ggml_tensor * c1);
|
| 1153 |
|
| 1154 |
+
// partition into non-overlapping windows with padding if needed
|
| 1155 |
+
// example:
|
| 1156 |
+
// a: 768 64 64 1
|
| 1157 |
+
// w: 14
|
| 1158 |
+
// res: 768 14 14 25
|
| 1159 |
+
// used in sam
|
| 1160 |
+
GGML_API struct ggml_tensor * ggml_win_part(
|
| 1161 |
+
struct ggml_context * ctx,
|
| 1162 |
+
struct ggml_tensor * a,
|
| 1163 |
+
int w);
|
| 1164 |
+
|
| 1165 |
+
// reverse of ggml_win_part
|
| 1166 |
+
// used in sam
|
| 1167 |
+
GGML_API struct ggml_tensor * ggml_win_unpart(
|
| 1168 |
+
struct ggml_context * ctx,
|
| 1169 |
+
struct ggml_tensor * a,
|
| 1170 |
+
int w0,
|
| 1171 |
+
int h0,
|
| 1172 |
+
int w);
|
| 1173 |
+
|
| 1174 |
+
// custom operators
|
| 1175 |
+
|
| 1176 |
+
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
| 1177 |
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
| 1178 |
|
| 1179 |
+
typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
|
| 1180 |
+
typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
|
| 1181 |
+
typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
|
| 1182 |
+
|
| 1183 |
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
| 1184 |
struct ggml_context * ctx,
|
| 1185 |
struct ggml_tensor * a,
|
| 1186 |
ggml_unary_op_f32_t fun);
|
| 1187 |
|
| 1188 |
+
GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
|
| 1189 |
+
struct ggml_context * ctx,
|
| 1190 |
+
struct ggml_tensor * a,
|
| 1191 |
+
ggml_unary_op_f32_t fun);
|
| 1192 |
+
|
| 1193 |
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
| 1194 |
struct ggml_context * ctx,
|
| 1195 |
struct ggml_tensor * a,
|
| 1196 |
struct ggml_tensor * b,
|
| 1197 |
ggml_binary_op_f32_t fun);
|
| 1198 |
|
| 1199 |
+
GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
|
| 1200 |
+
struct ggml_context * ctx,
|
| 1201 |
+
struct ggml_tensor * a,
|
| 1202 |
+
struct ggml_tensor * b,
|
| 1203 |
+
ggml_binary_op_f32_t fun);
|
| 1204 |
+
|
| 1205 |
+
GGML_API struct ggml_tensor * ggml_map_custom1_f32(
|
| 1206 |
+
struct ggml_context * ctx,
|
| 1207 |
+
struct ggml_tensor * a,
|
| 1208 |
+
ggml_custom1_op_f32_t fun);
|
| 1209 |
+
|
| 1210 |
+
GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
|
| 1211 |
+
struct ggml_context * ctx,
|
| 1212 |
+
struct ggml_tensor * a,
|
| 1213 |
+
ggml_custom1_op_f32_t fun);
|
| 1214 |
+
|
| 1215 |
+
GGML_API struct ggml_tensor * ggml_map_custom2_f32(
|
| 1216 |
+
struct ggml_context * ctx,
|
| 1217 |
+
struct ggml_tensor * a,
|
| 1218 |
+
struct ggml_tensor * b,
|
| 1219 |
+
ggml_custom2_op_f32_t fun);
|
| 1220 |
+
|
| 1221 |
+
GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
|
| 1222 |
+
struct ggml_context * ctx,
|
| 1223 |
+
struct ggml_tensor * a,
|
| 1224 |
+
struct ggml_tensor * b,
|
| 1225 |
+
ggml_custom2_op_f32_t fun);
|
| 1226 |
+
|
| 1227 |
+
GGML_API struct ggml_tensor * ggml_map_custom3_f32(
|
| 1228 |
+
struct ggml_context * ctx,
|
| 1229 |
+
struct ggml_tensor * a,
|
| 1230 |
+
struct ggml_tensor * b,
|
| 1231 |
+
struct ggml_tensor * c,
|
| 1232 |
+
ggml_custom3_op_f32_t fun);
|
| 1233 |
+
|
| 1234 |
+
GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
|
| 1235 |
+
struct ggml_context * ctx,
|
| 1236 |
+
struct ggml_tensor * a,
|
| 1237 |
+
struct ggml_tensor * b,
|
| 1238 |
+
struct ggml_tensor * c,
|
| 1239 |
+
ggml_custom3_op_f32_t fun);
|
| 1240 |
+
|
| 1241 |
+
// loss function
|
| 1242 |
+
|
| 1243 |
+
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
|
| 1244 |
+
struct ggml_context * ctx,
|
| 1245 |
+
struct ggml_tensor * a,
|
| 1246 |
+
struct ggml_tensor * b);
|
| 1247 |
+
|
| 1248 |
+
GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
|
| 1249 |
+
struct ggml_context * ctx,
|
| 1250 |
+
struct ggml_tensor * a,
|
| 1251 |
+
struct ggml_tensor * b,
|
| 1252 |
+
struct ggml_tensor * c);
|
| 1253 |
+
|
| 1254 |
//
|
| 1255 |
// automatic differentiation
|
| 1256 |
//
|
|
|
|
| 1267 |
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
| 1268 |
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
| 1269 |
|
| 1270 |
+
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
| 1271 |
+
|
| 1272 |
+
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
| 1273 |
+
GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
| 1274 |
+
|
| 1275 |
// print info and performance information for the graph
|
| 1276 |
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
| 1277 |
|
|
|
|
| 1345 |
struct {
|
| 1346 |
int n_iter;
|
| 1347 |
|
| 1348 |
+
float sched; // schedule multiplier (fixed, decay or warmup)
|
| 1349 |
+
float decay; // weight decay for AdamW, use 0.0f to disable
|
| 1350 |
float alpha; // learning rate
|
| 1351 |
float beta1;
|
| 1352 |
float beta2;
|
|
|
|
| 1371 |
} lbfgs;
|
| 1372 |
};
|
| 1373 |
|
| 1374 |
+
struct ggml_opt_context {
|
| 1375 |
+
struct ggml_context * ctx;
|
| 1376 |
+
struct ggml_opt_params params;
|
| 1377 |
+
|
| 1378 |
+
int iter;
|
| 1379 |
+
int64_t nx; // number of parameter elements
|
| 1380 |
+
|
| 1381 |
+
bool just_initialized;
|
| 1382 |
+
|
| 1383 |
+
struct {
|
| 1384 |
+
struct ggml_tensor * x; // view of the parameters
|
| 1385 |
+
struct ggml_tensor * g1; // gradient
|
| 1386 |
+
struct ggml_tensor * g2; // gradient squared
|
| 1387 |
+
struct ggml_tensor * m; // first moment
|
| 1388 |
+
struct ggml_tensor * v; // second moment
|
| 1389 |
+
struct ggml_tensor * mh; // first moment hat
|
| 1390 |
+
struct ggml_tensor * vh; // second moment hat
|
| 1391 |
+
struct ggml_tensor * pf; // past function values
|
| 1392 |
+
float fx_best;
|
| 1393 |
+
float fx_prev;
|
| 1394 |
+
int n_no_improvement;
|
| 1395 |
+
} adam;
|
| 1396 |
+
|
| 1397 |
+
struct {
|
| 1398 |
+
struct ggml_tensor * x; // current parameters
|
| 1399 |
+
struct ggml_tensor * xp; // previous parameters
|
| 1400 |
+
struct ggml_tensor * g; // current gradient
|
| 1401 |
+
struct ggml_tensor * gp; // previous gradient
|
| 1402 |
+
struct ggml_tensor * d; // search direction
|
| 1403 |
+
struct ggml_tensor * pf; // past function values
|
| 1404 |
+
struct ggml_tensor * lmal; // the L-BFGS memory alpha
|
| 1405 |
+
struct ggml_tensor * lmys; // the L-BFGS memory ys
|
| 1406 |
+
struct ggml_tensor * lms; // the L-BFGS memory s
|
| 1407 |
+
struct ggml_tensor * lmy; // the L-BFGS memory y
|
| 1408 |
+
float fx_best;
|
| 1409 |
+
float step;
|
| 1410 |
+
int j;
|
| 1411 |
+
int k;
|
| 1412 |
+
int end;
|
| 1413 |
+
int n_no_improvement;
|
| 1414 |
+
} lbfgs;
|
| 1415 |
+
};
|
| 1416 |
+
|
| 1417 |
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
| 1418 |
|
| 1419 |
// optimize the function defined by the tensor f
|
|
|
|
| 1422 |
struct ggml_opt_params params,
|
| 1423 |
struct ggml_tensor * f);
|
| 1424 |
|
| 1425 |
+
// initialize optimizer context
|
| 1426 |
+
GGML_API void ggml_opt_init(
|
| 1427 |
+
struct ggml_context * ctx,
|
| 1428 |
+
struct ggml_opt_context * opt,
|
| 1429 |
+
struct ggml_opt_params params,
|
| 1430 |
+
int64_t nx);
|
| 1431 |
+
|
| 1432 |
+
// continue optimizing the function defined by the tensor f
|
| 1433 |
+
GGML_API enum ggml_opt_result ggml_opt_resume(
|
| 1434 |
+
struct ggml_context * ctx,
|
| 1435 |
+
struct ggml_opt_context * opt,
|
| 1436 |
+
struct ggml_tensor * f);
|
| 1437 |
+
|
| 1438 |
+
// continue optimizing the function defined by the tensor f, using the caller-supplied graphs gf and gb
|
| 1439 |
+
GGML_API enum ggml_opt_result ggml_opt_resume_g(
|
| 1440 |
+
struct ggml_context * ctx,
|
| 1441 |
+
struct ggml_opt_context * opt,
|
| 1442 |
+
struct ggml_tensor * f,
|
| 1443 |
+
struct ggml_cgraph * gf,
|
| 1444 |
+
struct ggml_cgraph * gb);
|
| 1445 |
+
|
| 1446 |
//
|
| 1447 |
// quantization
|
| 1448 |
//
|
whisper.cpp
CHANGED
|
@@ -19,6 +19,10 @@
|
|
| 19 |
#include <regex>
|
| 20 |
#include <random>
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
#if defined(GGML_BIG_ENDIAN)
|
| 23 |
#include <bit>
|
| 24 |
|
|
@@ -1468,7 +1472,7 @@ static bool whisper_encode_internal(
|
|
| 1468 |
{
|
| 1469 |
wstate.use_buf(ctx0, 1);
|
| 1470 |
|
| 1471 |
-
cur =
|
| 1472 |
cur = ggml_add(ctx0,
|
| 1473 |
ggml_repeat(ctx0,
|
| 1474 |
model.e_conv_1_b,
|
|
@@ -1479,7 +1483,7 @@ static bool whisper_encode_internal(
|
|
| 1479 |
|
| 1480 |
wstate.use_buf(ctx0, 0);
|
| 1481 |
|
| 1482 |
-
cur =
|
| 1483 |
cur = ggml_add(ctx0,
|
| 1484 |
ggml_repeat(ctx0,
|
| 1485 |
model.e_conv_2_b,
|
|
|
|
| 19 |
#include <regex>
|
| 20 |
#include <random>
|
| 21 |
|
| 22 |
+
#if defined(_MSC_VER)
|
| 23 |
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
| 24 |
+
#endif
|
| 25 |
+
|
| 26 |
#if defined(GGML_BIG_ENDIAN)
|
| 27 |
#include <bit>
|
| 28 |
|
|
|
|
| 1472 |
{
|
| 1473 |
wstate.use_buf(ctx0, 1);
|
| 1474 |
|
| 1475 |
+
cur = ggml_conv_1d_s1_ph(ctx0, model.e_conv_1_w, mel);
|
| 1476 |
cur = ggml_add(ctx0,
|
| 1477 |
ggml_repeat(ctx0,
|
| 1478 |
model.e_conv_1_b,
|
|
|
|
| 1483 |
|
| 1484 |
wstate.use_buf(ctx0, 0);
|
| 1485 |
|
| 1486 |
+
cur = ggml_conv_1d_s2_ph(ctx0, model.e_conv_2_w, cur);
|
| 1487 |
cur = ggml_add(ctx0,
|
| 1488 |
ggml_repeat(ctx0,
|
| 1489 |
model.e_conv_2_b,
|