ggerganov commited on
Commit
a100c9a
·
unverified ·
1 Parent(s): 19d7f69

ggml : sync latest ggml lib

Browse files
examples/common-ggml.cpp CHANGED
@@ -52,6 +52,11 @@ bool ggml_common_quantize_0(
52
  case GGML_FTYPE_ALL_F32:
53
  case GGML_FTYPE_MOSTLY_F16:
54
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
 
 
 
 
 
55
  {
56
  fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
57
  return false;
@@ -187,6 +192,12 @@ bool ggml_common_quantize_0(
187
  case GGML_TYPE_I16:
188
  case GGML_TYPE_I32:
189
  case GGML_TYPE_Q8_1:
 
 
 
 
 
 
190
  case GGML_TYPE_COUNT:
191
  {
192
  fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
 
52
  case GGML_FTYPE_ALL_F32:
53
  case GGML_FTYPE_MOSTLY_F16:
54
  case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
55
+ case GGML_FTYPE_MOSTLY_Q2_K:
56
+ case GGML_FTYPE_MOSTLY_Q3_K:
57
+ case GGML_FTYPE_MOSTLY_Q4_K:
58
+ case GGML_FTYPE_MOSTLY_Q5_K:
59
+ case GGML_FTYPE_MOSTLY_Q6_K:
60
  {
61
  fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
62
  return false;
 
192
  case GGML_TYPE_I16:
193
  case GGML_TYPE_I32:
194
  case GGML_TYPE_Q8_1:
195
+ case GGML_TYPE_Q2_K:
196
+ case GGML_TYPE_Q3_K:
197
+ case GGML_TYPE_Q4_K:
198
+ case GGML_TYPE_Q5_K:
199
+ case GGML_TYPE_Q6_K:
200
+ case GGML_TYPE_Q8_K:
201
  case GGML_TYPE_COUNT:
202
  {
203
  fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
examples/common.cpp CHANGED
@@ -6,13 +6,21 @@
6
  #include "dr_wav.h"
7
 
8
  #include <cmath>
 
9
  #include <fstream>
10
  #include <regex>
 
 
 
11
 
12
  #ifndef M_PI
13
  #define M_PI 3.14159265358979323846
14
  #endif
15
 
 
 
 
 
16
  bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
17
  for (int i = 1; i < argc; i++) {
18
  std::string arg = argv[i];
@@ -52,7 +60,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
52
  if (params.prompt.back() == '\n') {
53
  params.prompt.pop_back();
54
  }
55
- } else {
 
 
 
56
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
57
  gpt_print_usage(argc, argv, params);
58
  exit(0);
@@ -73,6 +84,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
73
  fprintf(stderr, " prompt to start generation with (default: random)\n");
74
  fprintf(stderr, " -f FNAME, --file FNAME\n");
75
  fprintf(stderr, " load prompt from a file\n");
 
 
76
  fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
77
  fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
78
  fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
@@ -117,6 +130,10 @@ std::string replace(const std::string & s, const std::string & from, const std::
117
  return result;
118
  }
119
 
 
 
 
 
120
  std::map<std::string, int32_t> json_parse(const std::string & fname) {
121
  std::map<std::string, int32_t> result;
122
 
@@ -208,8 +225,28 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
208
  return result;
209
  }
210
 
211
- void gpt_vocab::add_special_token(const std::string & token) {
212
- special_tokens.push_back(token);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  }
214
 
215
  std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
@@ -218,63 +255,52 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
218
  // first split the text into words
219
  {
220
  std::string str = text;
221
- std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
222
 
223
  // Generate the subpattern from the special_tokens vector if it's not empty
224
  if (!vocab.special_tokens.empty()) {
 
225
  std::string special_tokens_subpattern;
226
  for (const auto & token : vocab.special_tokens) {
227
  if (!special_tokens_subpattern.empty()) {
228
  special_tokens_subpattern += "|";
229
  }
230
- special_tokens_subpattern += token;
231
  }
232
 
233
- // Modify the regex pattern with the generated special tokens subpattern
234
- pat = special_tokens_subpattern + "|" + pat;
235
- }
236
-
237
- std::regex re(pat);
238
- std::smatch m;
239
-
240
- while (std::regex_search(str, m, re)) {
241
- for (auto x : m) {
242
- words.push_back(x);
 
243
  }
244
- str = m.suffix();
245
  }
 
 
246
  }
247
 
248
- // find the longest tokens that form the words:
249
  std::vector<gpt_vocab::id> tokens;
250
  for (const auto & word : words) {
251
- if (word.size() == 0) continue;
252
-
253
- int i = 0;
254
- int n = word.size();
255
- while (i < n) {
256
- int j = n;
257
- while (j > i) {
258
- auto it = vocab.token_to_id.find(word.substr(i, j-i));
259
- if (it != vocab.token_to_id.end()) {
260
  tokens.push_back(it->second);
261
- i = j;
262
- j = n;
263
  break;
264
  }
265
- --j;
266
- }
267
- if (i == n) {
268
- break;
269
- }
270
- if (j == i) {
271
- auto sub = word.substr(i, 1);
272
- if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
273
- tokens.push_back(vocab.token_to_id.at(sub));
274
- } else {
275
- fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
276
  }
277
- ++i;
278
  }
279
  }
280
  }
@@ -282,6 +308,70 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
282
  return tokens;
283
  }
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
286
  printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
287
 
@@ -381,6 +471,122 @@ gpt_vocab::id gpt_sample_top_k_top_p(
381
  return logits_id[idx].second;
382
  }
383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
385
  drwav wav;
386
  std::vector<uint8_t> wav_data; // used for pipe input from stdin
 
6
  #include "dr_wav.h"
7
 
8
  #include <cmath>
9
+ #include <cstring>
10
  #include <fstream>
11
  #include <regex>
12
+ #include <locale>
13
+ #include <codecvt>
14
+ #include <sstream>
15
 
16
  #ifndef M_PI
17
  #define M_PI 3.14159265358979323846
18
  #endif
19
 
20
+ #if defined(_MSC_VER)
21
+ #pragma warning(disable: 4244 4267) // possible loss of data
22
+ #endif
23
+
24
  bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
25
  for (int i = 1; i < argc; i++) {
26
  std::string arg = argv[i];
 
60
  if (params.prompt.back() == '\n') {
61
  params.prompt.pop_back();
62
  }
63
+ } else if (arg == "-tt" || arg == "--token_test") {
64
+ params.token_test = argv[++i];
65
+ }
66
+ else {
67
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
68
  gpt_print_usage(argc, argv, params);
69
  exit(0);
 
84
  fprintf(stderr, " prompt to start generation with (default: random)\n");
85
  fprintf(stderr, " -f FNAME, --file FNAME\n");
86
  fprintf(stderr, " load prompt from a file\n");
87
+ fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
88
+ fprintf(stderr, " test tokenization\n");
89
  fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
90
  fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
91
  fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
 
130
  return result;
131
  }
132
 
133
+ void gpt_vocab::add_special_token(const std::string & token) {
134
+ special_tokens.push_back(token);
135
+ }
136
+
137
  std::map<std::string, int32_t> json_parse(const std::string & fname) {
138
  std::map<std::string, int32_t> result;
139
 
 
225
  return result;
226
  }
227
 
228
// Encode a wide string as a UTF-8 byte string.
//
// The conversion is delegated to std::wstring_convert with a std::codecvt_utf8<wchar_t>
// facet; any failure reported by the converter propagates to the caller as an exception.
std::string convert_to_utf8(const std::wstring & input) {
    using utf8_facet = std::codecvt_utf8<wchar_t>;

    std::wstring_convert<utf8_facet> wide_to_bytes;
    std::string encoded = wide_to_bytes.to_bytes(input);
    return encoded;
}
232
+
233
+
234
// Decode a UTF-8 byte string into a wide string.
//
// The conversion is delegated to std::wstring_convert with a std::codecvt_utf8<wchar_t>
// facet; any failure reported by the converter propagates to the caller as an exception.
std::wstring convert_to_wstring(const std::string & input) {
    using utf8_facet = std::codecvt_utf8<wchar_t>;

    std::wstring_convert<utf8_facet> bytes_to_wide;
    std::wstring decoded = bytes_to_wide.from_bytes(input);
    return decoded;
}
238
+
239
// Split `str` into pre-tokenization "words" and append them, in order, to `words`.
//
// A word is one match of the pattern below, tried in this order at each position:
// an English contraction suffix ('s, 't, 're, 've, 'm, 'll, 'd), an optionally
// space-prefixed run of letters, of digits, or of other non-space characters,
// and finally a run of whitespace. Existing contents of `words` are kept.
void gpt_split_words(std::string str, std::vector<std::string>& words) {
    // Building a std::regex is expensive, so compile the (constant) pattern only once.
    static const std::regex re(
        R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");

    // Iterate over the successive non-overlapping matches in place instead of
    // re-copying the unmatched tail of the string after every match (which made
    // the split quadratic in the input length). The pattern has no capture
    // groups, so every match contributes exactly one word.
    const std::sregex_iterator last;
    for (std::sregex_iterator it(str.begin(), str.end(), re); it != last; ++it) {
        words.push_back(it->str());
    }
}
251
 
252
  std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
 
255
  // first split the text into words
256
  {
257
  std::string str = text;
 
258
 
259
  // Generate the subpattern from the special_tokens vector if it's not empty
260
  if (!vocab.special_tokens.empty()) {
261
+ const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
262
  std::string special_tokens_subpattern;
263
  for (const auto & token : vocab.special_tokens) {
264
  if (!special_tokens_subpattern.empty()) {
265
  special_tokens_subpattern += "|";
266
  }
267
+ special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
268
  }
269
 
270
+ std::regex re(special_tokens_subpattern);
271
+ std::smatch m;
272
+ // Split the text by special tokens.
273
+ while (std::regex_search(str, m, re)) {
274
+ // Split the substrings in-between special tokens into words.
275
+ gpt_split_words(m.prefix(), words);
276
+ // Add matched special tokens as words.
277
+ for (auto x : m) {
278
+ words.push_back(x);
279
+ }
280
+ str = m.suffix();
281
  }
282
+ // Remaining text without special tokens will be handled below.
283
  }
284
+
285
+ gpt_split_words(str, words);
286
  }
287
 
288
+ // find the longest token that forms each word in words:
289
  std::vector<gpt_vocab::id> tokens;
290
  for (const auto & word : words) {
291
+ for (int i = 0; i < (int) word.size(); ){
292
+ for (int j = word.size() - 1; j >= i; j--){
293
+ auto cand = word.substr(i, j-i+1);
294
+ auto it = vocab.token_to_id.find(cand);
295
+ if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
 
 
 
 
296
  tokens.push_back(it->second);
297
+ i = j + 1;
 
298
  break;
299
  }
300
+ else if (j == i){ // word.substr(i, 1) has no matching
301
+ fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
302
+ i++;
 
 
 
 
 
 
 
 
303
  }
 
304
  }
305
  }
306
  }
 
308
  return tokens;
309
  }
310
 
311
+ std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
312
+ std::vector<gpt_vocab::id> output;
313
+ std::stringstream ss(input);
314
+ std::string token;
315
+
316
+ while (std::getline(ss, token, delimiter)) {
317
+ output.push_back(std::stoi(token));
318
+ }
319
+
320
+ return output;
321
+ }
322
+
323
+ std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
324
+ if (fpath_test.empty()){
325
+ fprintf(stderr, "%s : No test file found.\n", __func__);
326
+ return std::map<std::string, std::vector<gpt_vocab::id>>();
327
+ }
328
+
329
+ std::map<std::string, std::vector<gpt_vocab::id>> tests;
330
+
331
+ auto fin = std::ifstream(fpath_test, std::ios_base::in);
332
+ const char * delimeter = " => ";
333
+ const char del_tok = ',';
334
+ std::string line;
335
+ while (std::getline(fin, line)) {
336
+ size_t delimiterPos = line.find(delimeter);
337
+ if (delimiterPos != std::string::npos) {
338
+ std::string text = line.substr(0, delimiterPos);
339
+ std::string s_tokens = line.substr(delimiterPos + std::strlen(delimeter));
340
+ tests[text] = parse_tokens_from_string(s_tokens, del_tok);
341
+ }
342
+ }
343
+ return tests;
344
+ }
345
+
346
+ void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
347
+ std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);
348
+
349
+ size_t n_fails = 0;
350
+
351
+ for (const auto & test : tests) {
352
+ std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, test.first);
353
+
354
+ if (tokens != test.second){
355
+ n_fails++;
356
+
357
+ // print out failure cases
358
+ fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str());
359
+ fprintf(stderr, "%s : tokens in hf: ", __func__);
360
+ for (const auto & t : test.second) {
361
+ fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
362
+ }
363
+ fprintf(stderr, "\n");
364
+ fprintf(stderr, "%s : tokens in ggml: ", __func__);
365
+ for (const auto & t : tokens) {
366
+ fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
367
+ }
368
+ fprintf(stderr, "\n");
369
+ }
370
+ }
371
+
372
+ fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
373
+ }
374
+
375
  bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
376
  printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
377
 
 
471
  return logits_id[idx].second;
472
  }
473
 
474
+ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
475
+ const gpt_vocab & vocab,
476
+ const float * logits,
477
+ const int32_t * last_n_tokens_data,
478
+ size_t last_n_tokens_data_size,
479
+ int top_k,
480
+ double top_p,
481
+ double temp,
482
+ int repeat_last_n,
483
+ float repeat_penalty,
484
+ std::mt19937 & rng) {
485
+
486
+ int n_logits = vocab.id_to_token.size();
487
+
488
+ const auto * plogits = logits;
489
+
490
+ const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size);
491
+
492
+ if (temp <= 0) {
493
+ // select the token with the highest logit directly
494
+ float max_logit = plogits[0];
495
+ gpt_vocab::id max_id = 0;
496
+
497
+ for (int i = 1; i < n_logits; ++i) {
498
+ if (plogits[i] > max_logit) {
499
+ max_logit = plogits[i];
500
+ max_id = i;
501
+ }
502
+ }
503
+ return max_id;
504
+ }
505
+
506
+
507
+ std::vector<std::pair<double, gpt_vocab::id>> logits_id;
508
+ logits_id.reserve(n_logits);
509
+
510
+ {
511
+ const float scale = 1.0f/temp;
512
+ for (int i = 0; i < n_logits; ++i) {
513
+ // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
514
+ // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
515
+ if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) {
516
+ // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
517
+ if (plogits[i] < 0.0f) {
518
+ logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
519
+ } else {
520
+ logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
521
+ }
522
+ } else {
523
+ logits_id.push_back(std::make_pair(plogits[i]*scale, i));
524
+ }
525
+ }
526
+ }
527
+
528
+ // find the top K tokens
529
+ std::partial_sort(
530
+ logits_id.begin(),
531
+ logits_id.begin() + top_k, logits_id.end(),
532
+ [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
533
+ return a.first > b.first;
534
+ });
535
+
536
+ logits_id.resize(top_k);
537
+
538
+ double maxl = -INFINITY;
539
+ for (const auto & kv : logits_id) {
540
+ maxl = std::max(maxl, kv.first);
541
+ }
542
+
543
+ // compute probs for the top K tokens
544
+ std::vector<double> probs;
545
+ probs.reserve(logits_id.size());
546
+
547
+ double sum = 0.0;
548
+ for (const auto & kv : logits_id) {
549
+ double p = exp(kv.first - maxl);
550
+ probs.push_back(p);
551
+ sum += p;
552
+ }
553
+
554
+ // normalize the probs
555
+ for (auto & p : probs) {
556
+ p /= sum;
557
+ }
558
+
559
+ if (top_p < 1.0f) {
560
+ double cumsum = 0.0f;
561
+ for (int i = 0; i < top_k; i++) {
562
+ cumsum += probs[i];
563
+ if (cumsum >= top_p) {
564
+ top_k = i + 1;
565
+ probs.resize(top_k);
566
+ logits_id.resize(top_k);
567
+ break;
568
+ }
569
+ }
570
+
571
+ cumsum = 1.0/cumsum;
572
+ for (int i = 0; i < (int) probs.size(); i++) {
573
+ probs[i] *= cumsum;
574
+ }
575
+ }
576
+
577
+ // printf("\n");
578
+ // for (int i = 0; i < (int) probs.size(); i++) {
579
+ // for (int i = 0; i < 10; i++) {
580
+ // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
581
+ // }
582
+
583
+ std::discrete_distribution<> dist(probs.begin(), probs.end());
584
+ int idx = dist(rng);
585
+
586
+ return logits_id[idx].second;
587
+
588
+ }
589
+
590
  bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
591
  drwav wav;
592
  std::vector<uint8_t> wav_data; // used for pipe input from stdin
examples/common.h CHANGED
@@ -26,8 +26,9 @@ struct gpt_params {
26
 
27
  int32_t n_batch = 8; // batch size for prompt processing
28
 
29
- std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
30
- std::string prompt;
 
31
  };
32
 
33
  bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -61,6 +62,12 @@ struct gpt_vocab {
61
  // poor-man's JSON parsing
62
  std::map<std::string, int32_t> json_parse(const std::string & fname);
63
 
 
 
 
 
 
 
64
  // split text into tokens
65
  //
66
  // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
@@ -73,6 +80,15 @@ std::map<std::string, int32_t> json_parse(const std::string & fname);
73
  //
74
  std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
75
 
 
 
 
 
 
 
 
 
 
76
  // load the tokens from encoder.json
77
  bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
78
 
@@ -92,6 +108,18 @@ gpt_vocab::id gpt_sample_top_k_top_p(
92
  double temp,
93
  std::mt19937 & rng);
94
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  //
96
  // Audio utils
97
  //
 
26
 
27
  int32_t n_batch = 8; // batch size for prompt processing
28
 
29
+ std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
30
+ std::string prompt = "";
31
+ std::string token_test = "";
32
  };
33
 
34
  bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
 
62
  // poor-man's JSON parsing
63
  std::map<std::string, int32_t> json_parse(const std::string & fname);
64
 
65
+ std::string convert_to_utf8(const std::wstring & input);
66
+
67
+ std::wstring convert_to_wstring(const std::string & input);
68
+
69
+ void gpt_split_words(std::string str, std::vector<std::string>& words);
70
+
71
  // split text into tokens
72
  //
73
  // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
 
80
  //
81
  std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
82
 
83
+ // test outputs of gpt_tokenize
84
+ //
85
+ // - compare with tokens generated by the huggingface tokenizer
86
+ // - test cases are chosen based on the model's main language (under 'prompt' directory)
87
+ // - always print a summary of how many tests failed out of the total number of tests
88
+ // - for every sentence that is tokenized differently, print the sentence, the huggingface tokens and the ggml tokens
89
+ //
90
+ void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);
91
+
92
  // load the tokens from encoder.json
93
  bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
94
 
 
108
  double temp,
109
  std::mt19937 & rng);
110
 
111
+ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
112
+ const gpt_vocab & vocab,
113
+ const float * logits,
114
+ const int32_t * last_n_tokens_data,
115
+ size_t last_n_tokens_data_size,
116
+ int top_k,
117
+ double top_p,
118
+ double temp,
119
+ int repeat_last_n,
120
+ float repeat_penalty,
121
+ std::mt19937 & rng);
122
+
123
  //
124
  // Audio utils
125
  //
examples/main/main.cpp CHANGED
@@ -10,6 +10,10 @@
10
  #include <vector>
11
  #include <cstring>
12
 
 
 
 
 
13
  // Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
14
  // Lowest is red, middle is yellow, highest is green.
15
  const std::vector<std::string> k_colors = {
@@ -148,7 +152,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
148
  else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
149
  else {
150
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
151
- return false;
 
152
  }
153
  }
154
 
@@ -423,13 +428,13 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
423
  indent++;
424
  };
425
 
426
- auto end_arr = [&](bool end = false) {
427
  indent--;
428
  doindent();
429
  fout << (end ? "]\n" : "},\n");
430
  };
431
 
432
- auto start_obj = [&](const char *name = nullptr) {
433
  doindent();
434
  if (name) {
435
  fout << "\"" << name << "\": {\n";
@@ -439,7 +444,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
439
  indent++;
440
  };
441
 
442
- auto end_obj = [&](bool end = false) {
443
  indent--;
444
  doindent();
445
  fout << (end ? "}\n" : "},\n");
@@ -450,24 +455,24 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
450
  fout << "\"" << name << "\": ";
451
  };
452
 
453
- auto value_s = [&](const char *name, const char *val, bool end = false) {
454
  start_value(name);
455
  char * val_escaped = escape_double_quotes_and_backslashes(val);
456
  fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
457
  free(val_escaped);
458
  };
459
 
460
- auto end_value = [&](bool end = false) {
461
  fout << (end ? "\n" : ",\n");
462
  };
463
 
464
- auto value_i = [&](const char *name, const int64_t val, bool end = false) {
465
  start_value(name);
466
  fout << val;
467
  end_value(end);
468
  };
469
 
470
- auto value_b = [&](const char *name, const bool val, bool end = false) {
471
  start_value(name);
472
  fout << (val ? "true" : "false");
473
  end_value(end);
@@ -479,35 +484,35 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
479
  }
480
 
481
  fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
482
- start_obj();
483
- value_s("systeminfo", whisper_print_system_info());
484
  start_obj("model");
485
- value_s("type", whisper_model_type_readable(ctx));
486
- value_b("multilingual", whisper_is_multilingual(ctx));
487
- value_i("vocab", whisper_model_n_vocab(ctx));
488
  start_obj("audio");
489
- value_i("ctx", whisper_model_n_audio_ctx(ctx));
490
- value_i("state", whisper_model_n_audio_state(ctx));
491
- value_i("head", whisper_model_n_audio_head(ctx));
492
  value_i("layer", whisper_model_n_audio_layer(ctx), true);
493
- end_obj();
494
  start_obj("text");
495
- value_i("ctx", whisper_model_n_text_ctx(ctx));
496
- value_i("state", whisper_model_n_text_state(ctx));
497
- value_i("head", whisper_model_n_text_head(ctx));
498
  value_i("layer", whisper_model_n_text_layer(ctx), true);
499
- end_obj();
500
- value_i("mels", whisper_model_n_mels(ctx));
501
  value_i("ftype", whisper_model_ftype(ctx), true);
502
- end_obj();
503
  start_obj("params");
504
- value_s("model", params.model.c_str());
505
- value_s("language", params.language.c_str());
506
  value_b("translate", params.translate, true);
507
- end_obj();
508
  start_obj("result");
509
  value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
510
- end_obj();
511
  start_arr("transcription");
512
 
513
  const int n_segments = whisper_full_n_segments(ctx);
@@ -516,15 +521,15 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
516
  const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
517
  const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
518
 
519
- start_obj();
520
  start_obj("timestamps");
521
- value_s("from", to_timestamp(t0, true).c_str());
522
  value_s("to", to_timestamp(t1, true).c_str(), true);
523
- end_obj();
524
  start_obj("offsets");
525
- value_i("from", t0 * 10);
526
  value_i("to", t1 * 10, true);
527
- end_obj();
528
  value_s("text", text, true);
529
  end_obj(i == (n_segments - 1));
530
  }
 
10
  #include <vector>
11
  #include <cstring>
12
 
13
+ #if defined(_MSC_VER)
14
+ #pragma warning(disable: 4244 4267) // possible loss of data
15
+ #endif
16
+
17
  // Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
18
  // Lowest is red, middle is yellow, highest is green.
19
  const std::vector<std::string> k_colors = {
 
152
  else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
153
  else {
154
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
155
+ whisper_print_usage(argc, argv, params);
156
+ exit(0);
157
  }
158
  }
159
 
 
428
  indent++;
429
  };
430
 
431
+ auto end_arr = [&](bool end) {
432
  indent--;
433
  doindent();
434
  fout << (end ? "]\n" : "},\n");
435
  };
436
 
437
+ auto start_obj = [&](const char *name) {
438
  doindent();
439
  if (name) {
440
  fout << "\"" << name << "\": {\n";
 
444
  indent++;
445
  };
446
 
447
+ auto end_obj = [&](bool end) {
448
  indent--;
449
  doindent();
450
  fout << (end ? "}\n" : "},\n");
 
455
  fout << "\"" << name << "\": ";
456
  };
457
 
458
+ auto value_s = [&](const char *name, const char *val, bool end) {
459
  start_value(name);
460
  char * val_escaped = escape_double_quotes_and_backslashes(val);
461
  fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
462
  free(val_escaped);
463
  };
464
 
465
+ auto end_value = [&](bool end) {
466
  fout << (end ? "\n" : ",\n");
467
  };
468
 
469
+ auto value_i = [&](const char *name, const int64_t val, bool end) {
470
  start_value(name);
471
  fout << val;
472
  end_value(end);
473
  };
474
 
475
+ auto value_b = [&](const char *name, const bool val, bool end) {
476
  start_value(name);
477
  fout << (val ? "true" : "false");
478
  end_value(end);
 
484
  }
485
 
486
  fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
487
+ start_obj(nullptr);
488
+ value_s("systeminfo", whisper_print_system_info(), false);
489
  start_obj("model");
490
+ value_s("type", whisper_model_type_readable(ctx), false);
491
+ value_b("multilingual", whisper_is_multilingual(ctx), false);
492
+ value_i("vocab", whisper_model_n_vocab(ctx), false);
493
  start_obj("audio");
494
+ value_i("ctx", whisper_model_n_audio_ctx(ctx), false);
495
+ value_i("state", whisper_model_n_audio_state(ctx), false);
496
+ value_i("head", whisper_model_n_audio_head(ctx), false);
497
  value_i("layer", whisper_model_n_audio_layer(ctx), true);
498
+ end_obj(false);
499
  start_obj("text");
500
+ value_i("ctx", whisper_model_n_text_ctx(ctx), false);
501
+ value_i("state", whisper_model_n_text_state(ctx), false);
502
+ value_i("head", whisper_model_n_text_head(ctx), false);
503
  value_i("layer", whisper_model_n_text_layer(ctx), true);
504
+ end_obj(false);
505
+ value_i("mels", whisper_model_n_mels(ctx), false);
506
  value_i("ftype", whisper_model_ftype(ctx), true);
507
+ end_obj(false);
508
  start_obj("params");
509
+ value_s("model", params.model.c_str(), false);
510
+ value_s("language", params.language.c_str(), false);
511
  value_b("translate", params.translate, true);
512
+ end_obj(false);
513
  start_obj("result");
514
  value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
515
+ end_obj(false);
516
  start_arr("transcription");
517
 
518
  const int n_segments = whisper_full_n_segments(ctx);
 
521
  const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
522
  const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
523
 
524
+ start_obj(nullptr);
525
  start_obj("timestamps");
526
+ value_s("from", to_timestamp(t0, true).c_str(), false);
527
  value_s("to", to_timestamp(t1, true).c_str(), true);
528
+ end_obj(false);
529
  start_obj("offsets");
530
+ value_i("from", t0 * 10, false);
531
  value_i("to", t1 * 10, true);
532
+ end_obj(false);
533
  value_s("text", text, true);
534
  end_obj(i == (n_segments - 1));
535
  }
examples/quantize/quantize.cpp CHANGED
@@ -99,17 +99,17 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
99
  fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst);
100
  fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
101
 
102
- fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
103
- fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
104
- fout.write((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
105
- fout.write((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
106
- fout.write((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
107
- fout.write((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
108
- fout.write((char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
109
- fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
110
- fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
111
- fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
112
- fout.write((char *) &ftype_dst, sizeof(hparams.ftype));
113
  }
114
 
115
  // load mel filters
@@ -138,15 +138,17 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
138
  // return false;
139
  //}
140
 
141
- std::string word;
 
142
  for (int i = 0; i < n_vocab; i++) {
143
  uint32_t len;
144
  finp.read ((char *) &len, sizeof(len));
145
  fout.write((char *) &len, sizeof(len));
146
 
147
- word.resize(len);
148
- finp.read ((char *) word.data(), len);
149
- fout.write((char *) word.data(), len);
 
150
 
151
  vocab.token_to_id[word] = i;
152
  vocab.id_to_token[i] = word;
 
99
  fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst);
100
  fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
101
 
102
+ fout.write((const char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
103
+ fout.write((const char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
104
+ fout.write((const char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
105
+ fout.write((const char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
106
+ fout.write((const char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
107
+ fout.write((const char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
108
+ fout.write((const char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
109
+ fout.write((const char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
110
+ fout.write((const char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
111
+ fout.write((const char *) &hparams.n_mels, sizeof(hparams.n_mels));
112
+ fout.write((const char *) &ftype_dst, sizeof(hparams.ftype));
113
  }
114
 
115
  // load mel filters
 
138
  // return false;
139
  //}
140
 
141
+ char word[128];
142
+
143
  for (int i = 0; i < n_vocab; i++) {
144
  uint32_t len;
145
  finp.read ((char *) &len, sizeof(len));
146
  fout.write((char *) &len, sizeof(len));
147
 
148
+ word[len] = '\0';
149
+
150
+ finp.read ((char *) word, len);
151
+ fout.write((char *) word, len);
152
 
153
  vocab.token_to_id[word] = i;
154
  vocab.id_to_token[i] = word;
ggml-cuda.cu CHANGED
The diff for this file is too large to render. See raw diff
 
ggml-cuda.h CHANGED
@@ -1,10 +1,19 @@
 
 
1
  #include "ggml.h"
2
 
3
  #ifdef __cplusplus
4
  extern "C" {
5
  #endif
6
 
 
 
 
 
 
 
7
  void ggml_init_cublas(void);
 
8
 
9
  void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
10
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
@@ -15,8 +24,15 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
15
  void * ggml_cuda_host_malloc(size_t size);
16
  void ggml_cuda_host_free(void * ptr);
17
 
18
- void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
19
- void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
 
 
 
 
 
 
 
20
 
21
  #ifdef __cplusplus
22
  }
 
1
+ #pragma once
2
+
3
  #include "ggml.h"
4
 
5
  #ifdef __cplusplus
6
  extern "C" {
7
  #endif
8
 
9
+ #define GGML_CUDA_MAX_DEVICES 16
10
+
11
+ struct ggml_tensor_extra_gpu {
12
+ void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
13
+ };
14
+
15
  void ggml_init_cublas(void);
16
+ void ggml_cuda_set_tensor_split(const float * tensor_split);
17
 
18
  void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
19
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 
24
  void * ggml_cuda_host_malloc(size_t size);
25
  void ggml_cuda_host_free(void * ptr);
26
 
27
+ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
28
+
29
+ void ggml_cuda_free_data(struct ggml_tensor * tensor);
30
+ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
31
+ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
32
+ void ggml_cuda_set_main_device(int main_device);
33
+ void ggml_cuda_set_scratch_size(size_t scratch_size);
34
+ void ggml_cuda_free_scratch(void);
35
+ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
36
 
37
  #ifdef __cplusplus
38
  }
ggml-opencl.h CHANGED
@@ -1,23 +1,24 @@
1
  #pragma once
2
 
 
 
3
  #ifdef __cplusplus
4
  extern "C" {
5
  #endif
6
 
7
  void ggml_cl_init(void);
8
 
9
- enum ggml_blas_order {
10
- GGML_BLAS_ORDER_ROW_MAJOR = 101,
11
- GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
12
- };
 
 
 
13
 
14
- enum ggml_blas_op {
15
- GGML_BLAS_OP_N = 111,
16
- GGML_BLAS_OP_T = 112,
17
- GGML_BLAS_OP_C = 113,
18
- };
19
 
20
- void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
21
 
22
  #ifdef __cplusplus
23
  }
 
1
  #pragma once
2
 
3
+ #include "ggml.h"
4
+
5
  #ifdef __cplusplus
6
  extern "C" {
7
  #endif
8
 
9
  void ggml_cl_init(void);
10
 
11
+ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
12
+ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
13
+ size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
14
+ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
15
+
16
+ void * ggml_cl_host_malloc(size_t size);
17
+ void ggml_cl_host_free(void * ptr);
18
 
19
+ void ggml_cl_free_data(const struct ggml_tensor* tensor);
 
 
 
 
20
 
21
+ void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
22
 
23
  #ifdef __cplusplus
24
  }
ggml.c CHANGED
The diff for this file is too large to render. See raw diff
 
ggml.h CHANGED
@@ -198,6 +198,7 @@
198
  #define GGML_MAX_PARAMS 256
199
  #define GGML_MAX_CONTEXTS 64
200
  #define GGML_MAX_OPT 4
 
201
  #define GGML_DEFAULT_N_THREADS 4
202
 
203
  #define GGML_ASSERT(x) \
@@ -240,6 +241,13 @@ extern "C" {
240
  GGML_TYPE_Q5_1 = 7,
241
  GGML_TYPE_Q8_0 = 8,
242
  GGML_TYPE_Q8_1 = 9,
 
 
 
 
 
 
 
243
  GGML_TYPE_I8,
244
  GGML_TYPE_I16,
245
  GGML_TYPE_I32,
@@ -248,7 +256,8 @@ extern "C" {
248
 
249
  enum ggml_backend {
250
  GGML_BACKEND_CPU = 0,
251
- GGML_BACKEND_CUDA = 1,
 
252
  };
253
 
254
  // model file types
@@ -262,6 +271,11 @@ extern "C" {
262
  GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
263
  GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
264
  GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
 
 
 
 
 
265
  };
266
 
267
  // available tensor operations:
@@ -282,12 +296,14 @@ extern "C" {
282
  GGML_OP_SUM_ROWS,
283
  GGML_OP_MEAN,
284
  GGML_OP_REPEAT,
 
285
  GGML_OP_ABS,
286
  GGML_OP_SGN,
287
  GGML_OP_NEG,
288
  GGML_OP_STEP,
289
  GGML_OP_RELU,
290
  GGML_OP_GELU,
 
291
  GGML_OP_SILU,
292
  GGML_OP_SILU_BACK,
293
  GGML_OP_NORM, // normalize
@@ -295,6 +311,7 @@ extern "C" {
295
  GGML_OP_RMS_NORM_BACK,
296
 
297
  GGML_OP_MUL_MAT,
 
298
 
299
  GGML_OP_SCALE,
300
  GGML_OP_SET,
@@ -310,19 +327,31 @@ extern "C" {
310
  GGML_OP_DIAG_MASK_INF,
311
  GGML_OP_DIAG_MASK_ZERO,
312
  GGML_OP_SOFT_MAX,
 
313
  GGML_OP_ROPE,
314
  GGML_OP_ROPE_BACK,
315
  GGML_OP_ALIBI,
316
  GGML_OP_CLAMP,
317
- GGML_OP_CONV_1D_1S,
318
- GGML_OP_CONV_1D_2S,
 
319
 
320
  GGML_OP_FLASH_ATTN,
321
  GGML_OP_FLASH_FF,
 
 
 
322
 
323
  GGML_OP_MAP_UNARY,
324
  GGML_OP_MAP_BINARY,
325
 
 
 
 
 
 
 
 
326
  GGML_OP_COUNT,
327
  };
328
 
@@ -371,11 +400,15 @@ extern "C" {
371
 
372
  void * data;
373
 
374
- char name[32];
 
 
375
 
376
- char padding[16];
377
  };
378
 
 
 
379
  // computation graph
380
  struct ggml_cgraph {
381
  int n_nodes;
@@ -409,6 +442,25 @@ extern "C" {
409
  bool no_alloc; // don't allocate memory for the tensor data
410
  };
411
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  // misc
413
 
414
  GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@@ -420,14 +472,17 @@ extern "C" {
420
  GGML_API void ggml_print_object (const struct ggml_object * obj);
421
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);
422
 
423
- GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
424
- GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
 
 
425
 
426
  GGML_API int ggml_blck_size (enum ggml_type type);
427
  GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
428
  GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
429
 
430
  GGML_API const char * ggml_type_name(enum ggml_type type);
 
431
 
432
  GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
433
 
@@ -436,14 +491,26 @@ extern "C" {
436
  // TODO: temporary until model loading of ggml examples is refactored
437
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
438
 
 
 
 
 
 
 
 
439
  // main
440
 
441
  GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
442
- GGML_API void ggml_free(struct ggml_context * ctx);
443
 
444
  GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
445
 
446
- GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
 
 
 
 
447
 
448
  GGML_API struct ggml_tensor * ggml_new_tensor(
449
  struct ggml_context * ctx,
@@ -483,6 +550,8 @@ extern "C" {
483
  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
484
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
485
 
 
 
486
  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
487
  GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
488
  GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
@@ -496,8 +565,9 @@ extern "C" {
496
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
497
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
498
 
499
- GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
500
- GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
 
501
 
502
  //
503
  // operations on tensors with backpropagation
@@ -522,6 +592,11 @@ extern "C" {
522
  struct ggml_tensor * a,
523
  struct ggml_tensor * b);
524
 
 
 
 
 
 
525
  GGML_API struct ggml_tensor * ggml_acc(
526
  struct ggml_context * ctx,
527
  struct ggml_tensor * a,
@@ -545,24 +620,47 @@ extern "C" {
545
  struct ggml_tensor * a,
546
  struct ggml_tensor * b);
547
 
 
 
 
 
 
548
  GGML_API struct ggml_tensor * ggml_mul(
549
  struct ggml_context * ctx,
550
  struct ggml_tensor * a,
551
  struct ggml_tensor * b);
552
 
 
 
 
 
 
553
  GGML_API struct ggml_tensor * ggml_div(
554
  struct ggml_context * ctx,
555
  struct ggml_tensor * a,
556
  struct ggml_tensor * b);
557
 
 
 
 
 
 
558
  GGML_API struct ggml_tensor * ggml_sqr(
559
  struct ggml_context * ctx,
560
  struct ggml_tensor * a);
561
 
 
 
 
 
562
  GGML_API struct ggml_tensor * ggml_sqrt(
563
  struct ggml_context * ctx,
564
  struct ggml_tensor * a);
565
 
 
 
 
 
566
  GGML_API struct ggml_tensor * ggml_log(
567
  struct ggml_context * ctx,
568
  struct ggml_tensor * a);
@@ -593,35 +691,76 @@ extern "C" {
593
  struct ggml_tensor * a,
594
  struct ggml_tensor * b);
595
 
 
 
 
 
 
596
  GGML_API struct ggml_tensor * ggml_abs(
597
  struct ggml_context * ctx,
598
  struct ggml_tensor * a);
599
 
 
 
 
 
600
  GGML_API struct ggml_tensor * ggml_sgn(
601
  struct ggml_context * ctx,
602
  struct ggml_tensor * a);
603
 
 
 
 
 
604
  GGML_API struct ggml_tensor * ggml_neg(
605
  struct ggml_context * ctx,
606
  struct ggml_tensor * a);
607
 
 
 
 
 
608
  GGML_API struct ggml_tensor * ggml_step(
609
  struct ggml_context * ctx,
610
  struct ggml_tensor * a);
611
 
 
 
 
 
612
  GGML_API struct ggml_tensor * ggml_relu(
613
  struct ggml_context * ctx,
614
  struct ggml_tensor * a);
615
 
 
 
 
 
616
  // TODO: double-check this computation is correct
617
  GGML_API struct ggml_tensor * ggml_gelu(
618
  struct ggml_context * ctx,
619
  struct ggml_tensor * a);
620
 
 
 
 
 
 
 
 
 
 
 
 
 
621
  GGML_API struct ggml_tensor * ggml_silu(
622
  struct ggml_context * ctx,
623
  struct ggml_tensor * a);
624
 
 
 
 
 
625
  // a - x
626
  // b - dy
627
  GGML_API struct ggml_tensor * ggml_silu_back(
@@ -635,10 +774,18 @@ extern "C" {
635
  struct ggml_context * ctx,
636
  struct ggml_tensor * a);
637
 
 
 
 
 
638
  GGML_API struct ggml_tensor * ggml_rms_norm(
639
  struct ggml_context * ctx,
640
  struct ggml_tensor * a);
641
 
 
 
 
 
642
  // a - x
643
  // b - dy
644
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -646,14 +793,22 @@ extern "C" {
646
  struct ggml_tensor * a,
647
  struct ggml_tensor * b);
648
 
649
- // A: m rows, n columns
650
- // B: p rows, n columns (i.e. we transpose it internally)
651
  // result is m columns, p rows
652
  GGML_API struct ggml_tensor * ggml_mul_mat(
653
  struct ggml_context * ctx,
654
  struct ggml_tensor * a,
655
  struct ggml_tensor * b);
656
 
 
 
 
 
 
 
 
 
657
  //
658
  // operations on tensors without backpropagation
659
  //
@@ -864,6 +1019,17 @@ extern "C" {
864
  struct ggml_context * ctx,
865
  struct ggml_tensor * a);
866
 
 
 
 
 
 
 
 
 
 
 
 
867
  // rotary position embedding
868
  // if mode & 1 == 1, skip n_past elements
869
  // if mode & 2 == 2, GPT-NeoX style
@@ -909,16 +1075,55 @@ extern "C" {
909
  float min,
910
  float max);
911
 
912
- // padding = 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
913
  // TODO: we don't support extra parameters for now
914
  // that's why we are hard-coding the stride, padding, and dilation
915
  // not great ..
916
- GGML_API struct ggml_tensor * ggml_conv_1d_1s(
 
 
 
 
 
917
  struct ggml_context * ctx,
918
  struct ggml_tensor * a,
919
  struct ggml_tensor * b);
920
 
921
- GGML_API struct ggml_tensor * ggml_conv_1d_2s(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
922
  struct ggml_context * ctx,
923
  struct ggml_tensor * a,
924
  struct ggml_tensor * b);
@@ -930,6 +1135,14 @@ extern "C" {
930
  struct ggml_tensor * v,
931
  bool masked);
932
 
 
 
 
 
 
 
 
 
933
  GGML_API struct ggml_tensor * ggml_flash_ff(
934
  struct ggml_context * ctx,
935
  struct ggml_tensor * a,
@@ -938,21 +1151,106 @@ extern "C" {
938
  struct ggml_tensor * c0,
939
  struct ggml_tensor * c1);
940
 
941
- // Mapping operations
942
- typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
943
  typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
944
 
 
 
 
 
945
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
946
  struct ggml_context * ctx,
947
  struct ggml_tensor * a,
948
  ggml_unary_op_f32_t fun);
949
 
 
 
 
 
 
950
  GGML_API struct ggml_tensor * ggml_map_binary_f32(
951
  struct ggml_context * ctx,
952
  struct ggml_tensor * a,
953
  struct ggml_tensor * b,
954
  ggml_binary_op_f32_t fun);
955
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
956
  //
957
  // automatic differentiation
958
  //
@@ -969,6 +1267,11 @@ extern "C" {
969
  GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
970
  GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
971
 
 
 
 
 
 
972
  // print info and performance information for the graph
973
  GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
974
 
@@ -1042,6 +1345,8 @@ extern "C" {
1042
  struct {
1043
  int n_iter;
1044
 
 
 
1045
  float alpha; // learning rate
1046
  float beta1;
1047
  float beta2;
@@ -1066,6 +1371,49 @@ extern "C" {
1066
  } lbfgs;
1067
  };
1068
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1069
  GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
1070
 
1071
  // optimize the function defined by the tensor f
@@ -1074,6 +1422,27 @@ extern "C" {
1074
  struct ggml_opt_params params,
1075
  struct ggml_tensor * f);
1076
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1077
  //
1078
  // quantization
1079
  //
 
198
  #define GGML_MAX_PARAMS 256
199
  #define GGML_MAX_CONTEXTS 64
200
  #define GGML_MAX_OPT 4
201
+ #define GGML_MAX_NAME 32
202
  #define GGML_DEFAULT_N_THREADS 4
203
 
204
  #define GGML_ASSERT(x) \
 
241
  GGML_TYPE_Q5_1 = 7,
242
  GGML_TYPE_Q8_0 = 8,
243
  GGML_TYPE_Q8_1 = 9,
244
+ // k-quantizations
245
+ GGML_TYPE_Q2_K = 10,
246
+ GGML_TYPE_Q3_K = 11,
247
+ GGML_TYPE_Q4_K = 12,
248
+ GGML_TYPE_Q5_K = 13,
249
+ GGML_TYPE_Q6_K = 14,
250
+ GGML_TYPE_Q8_K = 15,
251
  GGML_TYPE_I8,
252
  GGML_TYPE_I16,
253
  GGML_TYPE_I32,
 
256
 
257
  enum ggml_backend {
258
  GGML_BACKEND_CPU = 0,
259
+ GGML_BACKEND_GPU = 10,
260
+ GGML_BACKEND_GPU_SPLIT = 20,
261
  };
262
 
263
  // model file types
 
271
  GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
272
  GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
273
  GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
274
+ GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
275
+ GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
276
+ GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
277
+ GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
278
+ GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
279
  };
280
 
281
  // available tensor operations:
 
296
  GGML_OP_SUM_ROWS,
297
  GGML_OP_MEAN,
298
  GGML_OP_REPEAT,
299
+ GGML_OP_REPEAT_BACK,
300
  GGML_OP_ABS,
301
  GGML_OP_SGN,
302
  GGML_OP_NEG,
303
  GGML_OP_STEP,
304
  GGML_OP_RELU,
305
  GGML_OP_GELU,
306
+ GGML_OP_GELU_QUICK,
307
  GGML_OP_SILU,
308
  GGML_OP_SILU_BACK,
309
  GGML_OP_NORM, // normalize
 
311
  GGML_OP_RMS_NORM_BACK,
312
 
313
  GGML_OP_MUL_MAT,
314
+ GGML_OP_OUT_PROD,
315
 
316
  GGML_OP_SCALE,
317
  GGML_OP_SET,
 
327
  GGML_OP_DIAG_MASK_INF,
328
  GGML_OP_DIAG_MASK_ZERO,
329
  GGML_OP_SOFT_MAX,
330
+ GGML_OP_SOFT_MAX_BACK,
331
  GGML_OP_ROPE,
332
  GGML_OP_ROPE_BACK,
333
  GGML_OP_ALIBI,
334
  GGML_OP_CLAMP,
335
+ GGML_OP_CONV_1D_S1_PH,
336
+ GGML_OP_CONV_1D_S2_PH,
337
+ GGML_OP_CONV_2D_SK_P0,
338
 
339
  GGML_OP_FLASH_ATTN,
340
  GGML_OP_FLASH_FF,
341
+ GGML_OP_FLASH_ATTN_BACK,
342
+ GGML_OP_WIN_PART,
343
+ GGML_OP_WIN_UNPART,
344
 
345
  GGML_OP_MAP_UNARY,
346
  GGML_OP_MAP_BINARY,
347
 
348
+ GGML_OP_MAP_CUSTOM1,
349
+ GGML_OP_MAP_CUSTOM2,
350
+ GGML_OP_MAP_CUSTOM3,
351
+
352
+ GGML_OP_CROSS_ENTROPY_LOSS,
353
+ GGML_OP_CROSS_ENTROPY_LOSS_BACK,
354
+
355
  GGML_OP_COUNT,
356
  };
357
 
 
400
 
401
  void * data;
402
 
403
+ char name[GGML_MAX_NAME];
404
+
405
+ void * extra; // extra things e.g. for ggml-cuda.cu
406
 
407
+ char padding[4];
408
  };
409
 
410
+ static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
411
+
412
  // computation graph
413
  struct ggml_cgraph {
414
  int n_nodes;
 
442
  bool no_alloc; // don't allocate memory for the tensor data
443
  };
444
 
445
+
446
+ // compute types
447
+ enum ggml_task_type {
448
+ GGML_TASK_INIT = 0,
449
+ GGML_TASK_COMPUTE,
450
+ GGML_TASK_FINALIZE,
451
+ };
452
+
453
+ struct ggml_compute_params {
454
+ enum ggml_task_type type;
455
+
456
+ // ith = thread index, nth = number of threads
457
+ int ith, nth;
458
+
459
+ // work buffer for all threads
460
+ size_t wsize;
461
+ void * wdata;
462
+ };
463
+
464
  // misc
465
 
466
  GGML_API void ggml_time_init(void); // call this once at the beginning of the program
 
472
  GGML_API void ggml_print_object (const struct ggml_object * obj);
473
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);
474
 
475
+ GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
476
+ GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
477
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
478
+ GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
479
 
480
  GGML_API int ggml_blck_size (enum ggml_type type);
481
  GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
482
  GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
483
 
484
  GGML_API const char * ggml_type_name(enum ggml_type type);
485
+ GGML_API const char * ggml_op_name (enum ggml_op op);
486
 
487
  GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
488
 
 
491
  // TODO: temporary until model loading of ggml examples is refactored
492
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
493
 
494
+ GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
495
+ GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
496
+ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
497
+
498
+ // use this to compute the memory overhead of a tensor
499
+ GGML_API size_t ggml_tensor_overhead(void);
500
+
501
  // main
502
 
503
  GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
504
+ GGML_API void ggml_free(struct ggml_context * ctx);
505
 
506
  GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
507
 
508
+ GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
509
+ GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
510
+
511
+ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
512
+ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
513
+ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
514
 
515
  GGML_API struct ggml_tensor * ggml_new_tensor(
516
  struct ggml_context * ctx,
 
550
  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
551
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
552
 
553
+ GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
554
+
555
  GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
556
  GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
557
  GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
 
565
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
566
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
567
 
568
+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
569
+ GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
570
+ GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
571
 
572
  //
573
  // operations on tensors with backpropagation
 
592
  struct ggml_tensor * a,
593
  struct ggml_tensor * b);
594
 
595
+ GGML_API struct ggml_tensor * ggml_add1_inplace(
596
+ struct ggml_context * ctx,
597
+ struct ggml_tensor * a,
598
+ struct ggml_tensor * b);
599
+
600
  GGML_API struct ggml_tensor * ggml_acc(
601
  struct ggml_context * ctx,
602
  struct ggml_tensor * a,
 
620
  struct ggml_tensor * a,
621
  struct ggml_tensor * b);
622
 
623
+ GGML_API struct ggml_tensor * ggml_sub_inplace(
624
+ struct ggml_context * ctx,
625
+ struct ggml_tensor * a,
626
+ struct ggml_tensor * b);
627
+
628
  GGML_API struct ggml_tensor * ggml_mul(
629
  struct ggml_context * ctx,
630
  struct ggml_tensor * a,
631
  struct ggml_tensor * b);
632
 
633
+ GGML_API struct ggml_tensor * ggml_mul_inplace(
634
+ struct ggml_context * ctx,
635
+ struct ggml_tensor * a,
636
+ struct ggml_tensor * b);
637
+
638
  GGML_API struct ggml_tensor * ggml_div(
639
  struct ggml_context * ctx,
640
  struct ggml_tensor * a,
641
  struct ggml_tensor * b);
642
 
643
+ GGML_API struct ggml_tensor * ggml_div_inplace(
644
+ struct ggml_context * ctx,
645
+ struct ggml_tensor * a,
646
+ struct ggml_tensor * b);
647
+
648
  GGML_API struct ggml_tensor * ggml_sqr(
649
  struct ggml_context * ctx,
650
  struct ggml_tensor * a);
651
 
652
+ GGML_API struct ggml_tensor * ggml_sqr_inplace(
653
+ struct ggml_context * ctx,
654
+ struct ggml_tensor * a);
655
+
656
  GGML_API struct ggml_tensor * ggml_sqrt(
657
  struct ggml_context * ctx,
658
  struct ggml_tensor * a);
659
 
660
+ GGML_API struct ggml_tensor * ggml_sqrt_inplace(
661
+ struct ggml_context * ctx,
662
+ struct ggml_tensor * a);
663
+
664
  GGML_API struct ggml_tensor * ggml_log(
665
  struct ggml_context * ctx,
666
  struct ggml_tensor * a);
 
691
  struct ggml_tensor * a,
692
  struct ggml_tensor * b);
693
 
694
+ GGML_API struct ggml_tensor * ggml_repeat_back(
695
+ struct ggml_context * ctx,
696
+ struct ggml_tensor * a,
697
+ struct ggml_tensor * b);
698
+
699
  GGML_API struct ggml_tensor * ggml_abs(
700
  struct ggml_context * ctx,
701
  struct ggml_tensor * a);
702
 
703
+ GGML_API struct ggml_tensor * ggml_abs_inplace(
704
+ struct ggml_context * ctx,
705
+ struct ggml_tensor * a);
706
+
707
  GGML_API struct ggml_tensor * ggml_sgn(
708
  struct ggml_context * ctx,
709
  struct ggml_tensor * a);
710
 
711
+ GGML_API struct ggml_tensor * ggml_sgn_inplace(
712
+ struct ggml_context * ctx,
713
+ struct ggml_tensor * a);
714
+
715
  GGML_API struct ggml_tensor * ggml_neg(
716
  struct ggml_context * ctx,
717
  struct ggml_tensor * a);
718
 
719
+ GGML_API struct ggml_tensor * ggml_neg_inplace(
720
+ struct ggml_context * ctx,
721
+ struct ggml_tensor * a);
722
+
723
  GGML_API struct ggml_tensor * ggml_step(
724
  struct ggml_context * ctx,
725
  struct ggml_tensor * a);
726
 
727
+ GGML_API struct ggml_tensor * ggml_step_inplace(
728
+ struct ggml_context * ctx,
729
+ struct ggml_tensor * a);
730
+
731
  GGML_API struct ggml_tensor * ggml_relu(
732
  struct ggml_context * ctx,
733
  struct ggml_tensor * a);
734
 
735
+ GGML_API struct ggml_tensor * ggml_relu_inplace(
736
+ struct ggml_context * ctx,
737
+ struct ggml_tensor * a);
738
+
739
  // TODO: double-check this computation is correct
740
  GGML_API struct ggml_tensor * ggml_gelu(
741
  struct ggml_context * ctx,
742
  struct ggml_tensor * a);
743
 
744
+ GGML_API struct ggml_tensor * ggml_gelu_inplace(
745
+ struct ggml_context * ctx,
746
+ struct ggml_tensor * a);
747
+
748
+ GGML_API struct ggml_tensor * ggml_gelu_quick(
749
+ struct ggml_context * ctx,
750
+ struct ggml_tensor * a);
751
+
752
+ GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
753
+ struct ggml_context * ctx,
754
+ struct ggml_tensor * a);
755
+
756
  GGML_API struct ggml_tensor * ggml_silu(
757
  struct ggml_context * ctx,
758
  struct ggml_tensor * a);
759
 
760
+ GGML_API struct ggml_tensor * ggml_silu_inplace(
761
+ struct ggml_context * ctx,
762
+ struct ggml_tensor * a);
763
+
764
  // a - x
765
  // b - dy
766
  GGML_API struct ggml_tensor * ggml_silu_back(
 
774
  struct ggml_context * ctx,
775
  struct ggml_tensor * a);
776
 
777
+ GGML_API struct ggml_tensor * ggml_norm_inplace(
778
+ struct ggml_context * ctx,
779
+ struct ggml_tensor * a);
780
+
781
  GGML_API struct ggml_tensor * ggml_rms_norm(
782
  struct ggml_context * ctx,
783
  struct ggml_tensor * a);
784
 
785
+ GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
786
+ struct ggml_context * ctx,
787
+ struct ggml_tensor * a);
788
+
789
  // a - x
790
  // b - dy
791
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
 
793
  struct ggml_tensor * a,
794
  struct ggml_tensor * b);
795
 
796
+ // A: n columns, m rows
797
+ // B: n columns, p rows (i.e. we transpose it internally)
798
  // result is m columns, p rows
799
  GGML_API struct ggml_tensor * ggml_mul_mat(
800
  struct ggml_context * ctx,
801
  struct ggml_tensor * a,
802
  struct ggml_tensor * b);
803
 
804
+ // A: m columns, n rows,
805
+ // B: p columns, n rows,
806
+ // result is m columns, p rows
807
+ GGML_API struct ggml_tensor * ggml_out_prod(
808
+ struct ggml_context * ctx,
809
+ struct ggml_tensor * a,
810
+ struct ggml_tensor * b);
811
+
812
  //
813
  // operations on tensors without backpropagation
814
  //
 
1019
  struct ggml_context * ctx,
1020
  struct ggml_tensor * a);
1021
 
1022
+ GGML_API struct ggml_tensor * ggml_soft_max_back(
1023
+ struct ggml_context * ctx,
1024
+ struct ggml_tensor * a,
1025
+ struct ggml_tensor * b);
1026
+
1027
+ // in-place, returns view(a)
1028
+ GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
1029
+ struct ggml_context * ctx,
1030
+ struct ggml_tensor * a,
1031
+ struct ggml_tensor * b);
1032
+
1033
  // rotary position embedding
1034
  // if mode & 1 == 1, skip n_past elements
1035
  // if mode & 2 == 2, GPT-NeoX style
 
1075
  float min,
1076
  float max);
1077
 
1078
+ // TODO: implement general-purpose convolutions
1079
+ // GGML_API struct ggml_tensor * ggml_conv_1d(
1080
+ // struct ggml_context * ctx,
1081
+ // struct ggml_tensor * a,
1082
+ // struct ggml_tensor * b,
1083
+ // int s0,
1084
+ // int p0,
1085
+ // int d0);
1086
+ //
1087
+ // GGML_API struct ggml_tensor * ggml_conv_2d(
1088
+ // struct ggml_context * ctx,
1089
+ // struct ggml_tensor * a,
1090
+ // struct ggml_tensor * b,
1091
+ // int s0,
1092
+ // int s1,
1093
+ // int p0,
1094
+ // int p1,
1095
+ // int d0,
1096
+ // int d1);
1097
+
1098
+ // padding = half
1099
  // TODO: we don't support extra parameters for now
1100
  // that's why we are hard-coding the stride, padding, and dilation
1101
  // not great ..
1102
+ // example:
1103
+ // a: 3 80 768 1
1104
+ // b: 3000 80 1 1
1105
+ // res: 3000 768 1 1
1106
+ // used in whisper
1107
+ GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
1108
  struct ggml_context * ctx,
1109
  struct ggml_tensor * a,
1110
  struct ggml_tensor * b);
1111
 
1112
+ // used in whisper
1113
+ GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
1114
+ struct ggml_context * ctx,
1115
+ struct ggml_tensor * a,
1116
+ struct ggml_tensor * b);
1117
+
1118
+ // kernel size is a->ne[0] x a->ne[1]
1119
+ // stride is equal to kernel size
1120
+ // padding is zero
1121
+ // example:
1122
+ // a: 16 16 3 768
1123
+ // b: 1024 1024 3 1
1124
+ // res: 64 64 768 1
1125
+ // used in sam
1126
+ GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
1127
  struct ggml_context * ctx,
1128
  struct ggml_tensor * a,
1129
  struct ggml_tensor * b);
 
1135
  struct ggml_tensor * v,
1136
  bool masked);
1137
 
1138
+ GGML_API struct ggml_tensor * ggml_flash_attn_back(
1139
+ struct ggml_context * ctx,
1140
+ struct ggml_tensor * q,
1141
+ struct ggml_tensor * k,
1142
+ struct ggml_tensor * v,
1143
+ struct ggml_tensor * d,
1144
+ bool masked);
1145
+
1146
  GGML_API struct ggml_tensor * ggml_flash_ff(
1147
  struct ggml_context * ctx,
1148
  struct ggml_tensor * a,
 
1151
  struct ggml_tensor * c0,
1152
  struct ggml_tensor * c1);
1153
 
1154
+ // partition into non-overlapping windows with padding if needed
1155
+ // example:
1156
+ // a: 768 64 64 1
1157
+ // w: 14
1158
+ // res: 768 14 14 25
1159
+ // used in sam
1160
+ GGML_API struct ggml_tensor * ggml_win_part(
1161
+ struct ggml_context * ctx,
1162
+ struct ggml_tensor * a,
1163
+ int w);
1164
+
1165
+ // reverse of ggml_win_part
1166
+ // used in sam
1167
+ GGML_API struct ggml_tensor * ggml_win_unpart(
1168
+ struct ggml_context * ctx,
1169
+ struct ggml_tensor * a,
1170
+ int w0,
1171
+ int h0,
1172
+ int w);
1173
+
1174
+ // custom operators
1175
+
1176
+ typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
1177
  typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
1178
 
1179
+ typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
1180
+ typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1181
+ typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1182
+
1183
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
1184
  struct ggml_context * ctx,
1185
  struct ggml_tensor * a,
1186
  ggml_unary_op_f32_t fun);
1187
 
1188
+ GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
1189
+ struct ggml_context * ctx,
1190
+ struct ggml_tensor * a,
1191
+ ggml_unary_op_f32_t fun);
1192
+
1193
  GGML_API struct ggml_tensor * ggml_map_binary_f32(
1194
  struct ggml_context * ctx,
1195
  struct ggml_tensor * a,
1196
  struct ggml_tensor * b,
1197
  ggml_binary_op_f32_t fun);
1198
 
1199
+ GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
1200
+ struct ggml_context * ctx,
1201
+ struct ggml_tensor * a,
1202
+ struct ggml_tensor * b,
1203
+ ggml_binary_op_f32_t fun);
1204
+
1205
+ GGML_API struct ggml_tensor * ggml_map_custom1_f32(
1206
+ struct ggml_context * ctx,
1207
+ struct ggml_tensor * a,
1208
+ ggml_custom1_op_f32_t fun);
1209
+
1210
+ GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
1211
+ struct ggml_context * ctx,
1212
+ struct ggml_tensor * a,
1213
+ ggml_custom1_op_f32_t fun);
1214
+
1215
+ GGML_API struct ggml_tensor * ggml_map_custom2_f32(
1216
+ struct ggml_context * ctx,
1217
+ struct ggml_tensor * a,
1218
+ struct ggml_tensor * b,
1219
+ ggml_custom2_op_f32_t fun);
1220
+
1221
+ GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
1222
+ struct ggml_context * ctx,
1223
+ struct ggml_tensor * a,
1224
+ struct ggml_tensor * b,
1225
+ ggml_custom2_op_f32_t fun);
1226
+
1227
+ GGML_API struct ggml_tensor * ggml_map_custom3_f32(
1228
+ struct ggml_context * ctx,
1229
+ struct ggml_tensor * a,
1230
+ struct ggml_tensor * b,
1231
+ struct ggml_tensor * c,
1232
+ ggml_custom3_op_f32_t fun);
1233
+
1234
+ GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
1235
+ struct ggml_context * ctx,
1236
+ struct ggml_tensor * a,
1237
+ struct ggml_tensor * b,
1238
+ struct ggml_tensor * c,
1239
+ ggml_custom3_op_f32_t fun);
1240
+
1241
+ // loss function
1242
+
1243
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
1244
+ struct ggml_context * ctx,
1245
+ struct ggml_tensor * a,
1246
+ struct ggml_tensor * b);
1247
+
1248
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
1249
+ struct ggml_context * ctx,
1250
+ struct ggml_tensor * a,
1251
+ struct ggml_tensor * b,
1252
+ struct ggml_tensor * c);
1253
+
1254
  //
1255
  // automatic differentiation
1256
  //
 
1267
  GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
1268
  GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
1269
 
1270
+ GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
1271
+
1272
+ GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
1273
+ GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
1274
+
1275
  // print info and performance information for the graph
1276
  GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
1277
 
 
1345
  struct {
1346
  int n_iter;
1347
 
1348
+ float sched; // schedule multiplier (fixed, decay or warmup)
1349
+ float decay; // weight decay for AdamW, use 0.0f to disable
1350
  float alpha; // learning rate
1351
  float beta1;
1352
  float beta2;
 
1371
  } lbfgs;
1372
  };
1373
 
1374
+ struct ggml_opt_context {
1375
+ struct ggml_context * ctx;
1376
+ struct ggml_opt_params params;
1377
+
1378
+ int iter;
1379
+ int64_t nx; // number of parameter elements
1380
+
1381
+ bool just_initialized;
1382
+
1383
+ struct {
1384
+ struct ggml_tensor * x; // view of the parameters
1385
+ struct ggml_tensor * g1; // gradient
1386
+ struct ggml_tensor * g2; // gradient squared
1387
+ struct ggml_tensor * m; // first moment
1388
+ struct ggml_tensor * v; // second moment
1389
+ struct ggml_tensor * mh; // first moment hat
1390
+ struct ggml_tensor * vh; // second moment hat
1391
+ struct ggml_tensor * pf; // past function values
1392
+ float fx_best;
1393
+ float fx_prev;
1394
+ int n_no_improvement;
1395
+ } adam;
1396
+
1397
+ struct {
1398
+ struct ggml_tensor * x; // current parameters
1399
+ struct ggml_tensor * xp; // previous parameters
1400
+ struct ggml_tensor * g; // current gradient
1401
+ struct ggml_tensor * gp; // previous gradient
1402
+ struct ggml_tensor * d; // search direction
1403
+ struct ggml_tensor * pf; // past function values
1404
+ struct ggml_tensor * lmal; // the L-BFGS memory alpha
1405
+ struct ggml_tensor * lmys; // the L-BFGS memory ys
1406
+ struct ggml_tensor * lms; // the L-BFGS memory s
1407
+ struct ggml_tensor * lmy; // the L-BFGS memory y
1408
+ float fx_best;
1409
+ float step;
1410
+ int j;
1411
+ int k;
1412
+ int end;
1413
+ int n_no_improvement;
1414
+ } lbfgs;
1415
+ };
1416
+
1417
  GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
1418
 
1419
  // optimize the function defined by the tensor f
 
1422
  struct ggml_opt_params params,
1423
  struct ggml_tensor * f);
1424
 
1425
+ // initialize optimizer context
1426
+ GGML_API void ggml_opt_init(
1427
+ struct ggml_context * ctx,
1428
+ struct ggml_opt_context * opt,
1429
+ struct ggml_opt_params params,
1430
+ int64_t nx);
1431
+
1432
+ // continue optimizing the function defined by the tensor f
1433
+ GGML_API enum ggml_opt_result ggml_opt_resume(
1434
+ struct ggml_context * ctx,
1435
+ struct ggml_opt_context * opt,
1436
+ struct ggml_tensor * f);
1437
+
1438
+ // continue optimizing the function defined by the tensor f
1439
+ GGML_API enum ggml_opt_result ggml_opt_resume_g(
1440
+ struct ggml_context * ctx,
1441
+ struct ggml_opt_context * opt,
1442
+ struct ggml_tensor * f,
1443
+ struct ggml_cgraph * gf,
1444
+ struct ggml_cgraph * gb);
1445
+
1446
  //
1447
  // quantization
1448
  //
whisper.cpp CHANGED
@@ -19,6 +19,10 @@
19
  #include <regex>
20
  #include <random>
21
 
 
 
 
 
22
  #if defined(GGML_BIG_ENDIAN)
23
  #include <bit>
24
 
@@ -1468,7 +1472,7 @@ static bool whisper_encode_internal(
1468
  {
1469
  wstate.use_buf(ctx0, 1);
1470
 
1471
- cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel);
1472
  cur = ggml_add(ctx0,
1473
  ggml_repeat(ctx0,
1474
  model.e_conv_1_b,
@@ -1479,7 +1483,7 @@ static bool whisper_encode_internal(
1479
 
1480
  wstate.use_buf(ctx0, 0);
1481
 
1482
- cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur);
1483
  cur = ggml_add(ctx0,
1484
  ggml_repeat(ctx0,
1485
  model.e_conv_2_b,
 
19
  #include <regex>
20
  #include <random>
21
 
22
+ #if defined(_MSC_VER)
23
+ #pragma warning(disable: 4244 4267) // possible loss of data
24
+ #endif
25
+
26
  #if defined(GGML_BIG_ENDIAN)
27
  #include <bit>
28
 
 
1472
  {
1473
  wstate.use_buf(ctx0, 1);
1474
 
1475
+ cur = ggml_conv_1d_s1_ph(ctx0, model.e_conv_1_w, mel);
1476
  cur = ggml_add(ctx0,
1477
  ggml_repeat(ctx0,
1478
  model.e_conv_1_b,
 
1483
 
1484
  wstate.use_buf(ctx0, 0);
1485
 
1486
+ cur = ggml_conv_1d_s2_ph(ctx0, model.e_conv_2_w, cur);
1487
  cur = ggml_add(ctx0,
1488
  ggml_repeat(ctx0,
1489
  model.e_conv_2_b,