ggerganov committed
Commit c4fb34c · 1 Parent(s): 3d08664

talk-llama : sync llama.cpp

examples/talk-llama/llama-sampling.cpp CHANGED
@@ -1396,19 +1396,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
 // penalties
 
 struct llama_sampler_penalties {
-    const int32_t     n_vocab;
-    const llama_token special_eos_id;
-    const llama_token linefeed_id;
-
     const int32_t penalty_last_n;
     const float   penalty_repeat;
     const float   penalty_freq;
     const float   penalty_present;
 
-    const bool penalize_nl;
-    const bool ignore_eos;
-
     ring_buffer<llama_token> prev;
+
+    // a frequency map to count token occurrences
+    std::unordered_map<llama_token, int> token_count;
 };
 
 static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1421,76 +1417,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
         return;
     }
 
-    ctx->prev.push_back(token);
-}
-
-static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+    ctx->token_count[token]++;
 
-    if (ctx->ignore_eos) {
-        assert(ctx->special_eos_id >= 0);
+    // if the ring buffer is full, remove the oldest token
+    if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+        const auto old = ctx->prev.front();
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
-            cur_p->data[ctx->special_eos_id].logit = -INFINITY;
-        } else {
-            // else, search for the special EOS token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->special_eos_id) {
-                    cur_p->data[i].logit = -INFINITY;
-                    break;
-                }
-            }
+        ctx->token_count[old]--;
+        if (ctx->token_count[old] == 0) {
+            ctx->token_count.erase(old);
         }
     }
 
-    if ((ctx->penalty_last_n == 0) ||
-        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
-        return;
-    }
-
-    bool nl_found = false;
-    size_t nl_idx = 0;
-    float nl_logit = -INFINITY;
-    if (!ctx->penalize_nl) {
-        assert(ctx->linefeed_id >= 0);
+    ctx->prev.push_back(token);
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
-            nl_found = true;
-            nl_idx = ctx->linefeed_id;
-            nl_logit = cur_p->data[ctx->linefeed_id].logit;
-        } else {
-            // else, search for the linefeed token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->linefeed_id) {
-                    nl_found = true;
-                    nl_idx = i;
-                    nl_logit = cur_p->data[i].logit;
-                    break;
-                }
-            }
-        }
+#if 0
+    // sanity check
+    std::unordered_map<llama_token, int> tmp;
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+        tmp[ctx->prev.rat(i)]++;
     }
 
-    // Create a frequency map to count occurrences of each token in last_tokens
-    // TODO: optimize this by maintaining the token count in the sampler context
-    using llama_token_cnt = std::unordered_map<llama_token, int>;
-    llama_token_cnt token_count;
+    assert(ctx->token_count == tmp);
+#endif
+}
 
-    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
-        token_count[ctx->prev.rat(i)]++;
-    }
+static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+
+    if ((ctx->penalty_last_n == 0) ||
+        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
+        return;
+    }
 
     // Apply frequency and presence penalties to the cur_p
     for (size_t i = 0; i < cur_p->size; ++i) {
-        const auto token_iter = token_count.find(cur_p->data[i].id);
-        if (token_iter == token_count.end()) {
+        const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+        if (token_iter == ctx->token_count.end()) {
            continue;
        }
 
        const int count = token_iter->second;
 
+       assert(count > 0 && count <= ctx->penalty_last_n);
+
        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
        if (cur_p->data[i].logit <= 0) {
@@ -1503,30 +1473,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
     }
 
     cur_p->sorted = false;
-
-    if (!ctx->penalize_nl && nl_found) {
-        // restore the logit of the newline token if it was penalized
-        cur_p->data[nl_idx].logit = nl_logit;
-    }
 }
 
 static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_penalties *) smpl->ctx;
     ctx->prev.clear();
+    ctx->token_count.clear();
 }
 
 static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
     auto * result = llama_sampler_init_penalties(
-        ctx->n_vocab,
-        ctx->special_eos_id,
-        ctx->linefeed_id,
         ctx->penalty_last_n,
         ctx->penalty_repeat,
         ctx->penalty_freq,
-        ctx->penalty_present,
-        ctx->penalize_nl,
-        ctx->ignore_eos);
+        ctx->penalty_present);
 
     // copy the state
     {
@@ -1552,38 +1513,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
 };
 
 struct llama_sampler * llama_sampler_init_penalties(
-        int32_t n_vocab,
-        llama_token special_eos_id,
-        llama_token linefeed_id,
         int32_t penalty_last_n,
         float penalty_repeat,
         float penalty_freq,
-        float penalty_present,
-        bool penalize_nl,
-        bool ignore_eos) {
-    if (linefeed_id == LLAMA_TOKEN_NULL) {
-        penalize_nl = true;
-    }
-
-    if (special_eos_id == LLAMA_TOKEN_NULL) {
-        ignore_eos = false;
-    }
-
+        float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);
 
     return new llama_sampler {
        /* .iface = */ &llama_sampler_penalties_i,
        /* .ctx   = */ new llama_sampler_penalties {
-           /* .n_vocab         = */ n_vocab,
-           /* .special_eos_id  = */ special_eos_id,
-           /* .linefeed_id     = */ linefeed_id,
            /* .penalty_last_n  = */ penalty_last_n,
            /* .penalty_repeat  = */ penalty_repeat,
            /* .penalty_freq    = */ penalty_freq,
            /* .penalty_present = */ penalty_present,
-           /* .penalize_nl     = */ penalize_nl,
-           /* .ignore_eos      = */ ignore_eos,
            /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
+           /* .token_count     = */ {},
        },
    };
 }
@@ -1611,7 +1555,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
        if (word.find(str) != std::string::npos) {
            token_sequences.emplace(token_id, std::vector<llama_token>());
        } else {
-           size_t word_len = word.size(), str_len = str.size();
+           size_t word_len = word.size();
+           size_t str_len = str.size();
            size_t pos = -1;
            while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
                bool match = true;
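
Note: with this sync, llama_sampler_init_penalties() no longer takes the vocabulary size, the EOS/linefeed token ids, or the penalize_nl/ignore_eos flags, so callers must be updated for the reduced signature shown above. A minimal sketch of how a caller might construct the sampler after this change — the llama_sampler_chain_* and llama_sampler_init_greedy helpers are the existing llama.h sampler-chain API, and the parameter values are illustrative only:

    // hypothetical example values; the chain helpers come from llama.h
    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
    struct llama_sampler * smpl = llama_sampler_chain_init(sparams);

    // new signature: no vocab, EOS/linefeed ids, penalize_nl or ignore_eos arguments
    llama_sampler_chain_add(smpl, llama_sampler_init_penalties(
        /* penalty_last_n  = */ 64,     // penalize the last 64 tokens
        /* penalty_repeat  = */ 1.1f,   // 1.0 = disabled
        /* penalty_freq    = */ 0.0f,   // 0.0 = disabled
        /* penalty_present = */ 0.0f)); // 0.0 = disabled

    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
    // sample with llama_sampler_sample(smpl, ctx, -1) and release with llama_sampler_free(smpl)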
examples/talk-llama/llama-vocab.cpp CHANGED
@@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
             case LLAMA_VOCAB_PRE_TYPE_EXAONE:
+            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -737,7 +738,7 @@ struct llm_tokenizer_wpm_session {
         std::vector<std::string> words(1, "");
 
         for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
+            const auto flags = unicode_cpt_flags_from_cpt(cpt);
 
             if (flags.is_whitespace) {
                 if (words.back().size()) { // finish previous word if any
examples/talk-llama/llama.cpp CHANGED
@@ -163,6 +163,7 @@ enum llm_arch {
163
  LLM_ARCH_QWEN,
164
  LLM_ARCH_QWEN2,
165
  LLM_ARCH_QWEN2MOE,
 
166
  LLM_ARCH_PHI2,
167
  LLM_ARCH_PHI3,
168
  LLM_ARCH_PLAMO,
@@ -183,6 +184,7 @@ enum llm_arch {
183
  LLM_ARCH_OLMOE,
184
  LLM_ARCH_OPENELM,
185
  LLM_ARCH_ARCTIC,
 
186
  LLM_ARCH_DEEPSEEK2,
187
  LLM_ARCH_CHATGLM,
188
  LLM_ARCH_BITNET,
@@ -217,6 +219,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
217
  { LLM_ARCH_QWEN, "qwen" },
218
  { LLM_ARCH_QWEN2, "qwen2" },
219
  { LLM_ARCH_QWEN2MOE, "qwen2moe" },
 
220
  { LLM_ARCH_PHI2, "phi2" },
221
  { LLM_ARCH_PHI3, "phi3" },
222
  { LLM_ARCH_PLAMO, "plamo" },
@@ -237,6 +240,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
237
  { LLM_ARCH_OLMOE, "olmoe" },
238
  { LLM_ARCH_OPENELM, "openelm" },
239
  { LLM_ARCH_ARCTIC, "arctic" },
 
240
  { LLM_ARCH_DEEPSEEK2, "deepseek2" },
241
  { LLM_ARCH_CHATGLM, "chatglm" },
242
  { LLM_ARCH_BITNET, "bitnet" },
@@ -308,6 +312,7 @@ enum llm_kv {
308
  LLM_KV_ATTENTION_SCALE,
309
 
310
  LLM_KV_ROPE_DIMENSION_COUNT,
 
311
  LLM_KV_ROPE_FREQ_BASE,
312
  LLM_KV_ROPE_SCALE_LINEAR,
313
  LLM_KV_ROPE_SCALING_TYPE,
@@ -424,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
424
  { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
425
 
426
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
 
427
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
428
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
429
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
@@ -898,6 +904,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
898
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
899
  },
900
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
901
  {
902
  LLM_ARCH_QWEN2MOE,
903
  {
@@ -1288,6 +1311,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
1288
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1289
  },
1290
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1291
  {
1292
  LLM_ARCH_DEEPSEEK2,
1293
  {
@@ -1562,6 +1612,7 @@ enum llm_chat_template {
1562
  LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
1563
  LLM_CHAT_TEMPLATE_MISTRAL_V7,
1564
  LLM_CHAT_TEMPLATE_PHI_3,
 
1565
  LLM_CHAT_TEMPLATE_ZEPHYR,
1566
  LLM_CHAT_TEMPLATE_MONARCH,
1567
  LLM_CHAT_TEMPLATE_GEMMA,
@@ -1579,6 +1630,7 @@ enum llm_chat_template {
1579
  LLM_CHAT_TEMPLATE_EXAONE_3,
1580
  LLM_CHAT_TEMPLATE_RWKV_WORLD,
1581
  LLM_CHAT_TEMPLATE_GRANITE,
 
1582
  LLM_CHAT_TEMPLATE_UNKNOWN,
1583
  };
1584
 
@@ -1593,6 +1645,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
1593
  { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
1594
  { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
1595
  { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
 
1596
  { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
1597
  { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
1598
  { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
@@ -1610,6 +1663,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
1610
  { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
1611
  { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
1612
  { "granite", LLM_CHAT_TEMPLATE_GRANITE },
 
1613
  };
1614
 
1615
  static llm_arch llm_arch_from_string(const std::string & name) {
@@ -1794,7 +1848,7 @@ private:
1794
  DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
1795
  NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
1796
  if (!bufLen) {
1797
- ret = format("Win32 error code: %s", error_code);
1798
  } else {
1799
  ret = lpMsgBuf;
1800
  LocalFree(lpMsgBuf);
@@ -2132,7 +2186,7 @@ struct llama_mmap {
2132
  HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
2133
 
2134
  // may fail on pre-Windows 8 systems
2135
- pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
2136
 
2137
  if (pPrefetchVirtualMemory) {
2138
  // advise the kernel to preload the mapped memory
@@ -2474,11 +2528,12 @@ struct llama_hparams {
2474
  uint32_t time_decay_extra_dim = 0;
2475
  uint32_t wkv_head_size = 0;
2476
 
2477
- float rope_attn_factor = 1.0f;
2478
- float rope_freq_base_train;
2479
- float rope_freq_scale_train;
2480
- uint32_t n_ctx_orig_yarn;
2481
- float rope_yarn_log_mul;
 
2482
 
2483
  // for State Space Models
2484
  uint32_t ssm_d_conv = 0;
@@ -2535,6 +2590,9 @@ struct llama_hparams {
2535
 
2536
  if (this->rope_finetuned != other.rope_finetuned) return true;
2537
  if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
 
 
 
2538
 
2539
  if (this->ssm_d_conv != other.ssm_d_conv) return true;
2540
  if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -3378,6 +3436,11 @@ struct llama_context {
3378
  // whether we are computing encoder output or decoder output
3379
  bool is_encoding = false;
3380
 
 
 
 
 
 
3381
  // output of the encoder part of the encoder-decoder models
3382
  std::vector<float> embd_enc;
3383
  std::vector<std::set<llama_seq_id>> seq_ids_enc;
@@ -4578,9 +4641,6 @@ struct llama_model_loader {
4578
  case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
4579
  case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
4580
  case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
4581
- case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
4582
- case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
4583
- case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
4584
  default:
4585
  {
4586
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -5344,9 +5404,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
5344
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
5345
  case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
5346
  case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
5347
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
5348
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
5349
- case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
5350
 
5351
  default: return "unknown, may not work";
5352
  }
@@ -5753,6 +5810,13 @@ static void llm_load_hparams(
5753
  default: model.type = e_model::MODEL_UNKNOWN;
5754
  }
5755
  } break;
 
 
 
 
 
 
 
5756
  case LLM_ARCH_QWEN2:
5757
  {
5758
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6063,6 +6127,19 @@ static void llm_load_hparams(
6063
  model.type = e_model::MODEL_UNKNOWN;
6064
  }
6065
  } break;
 
 
 
 
 
 
 
 
 
 
 
 
 
6066
  case LLM_ARCH_DEEPSEEK2:
6067
  {
6068
  bool is_lite = (hparams.n_layer == 27);
@@ -6398,6 +6475,11 @@ static void llm_load_vocab(
6398
  } else if (
6399
  tokenizer_pre == "falcon") {
6400
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
 
 
 
 
 
6401
  } else if (
6402
  tokenizer_pre == "mpt") {
6403
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
@@ -6409,6 +6491,7 @@ static void llm_load_vocab(
6409
  tokenizer_pre == "phi-2" ||
6410
  tokenizer_pre == "jina-es" ||
6411
  tokenizer_pre == "jina-de" ||
 
6412
  tokenizer_pre == "jina-v1-en" ||
6413
  tokenizer_pre == "jina-v2-es" ||
6414
  tokenizer_pre == "jina-v2-de" ||
@@ -6479,6 +6562,9 @@ static void llm_load_vocab(
6479
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
6480
  vocab.tokenizer_add_bos = true;
6481
  vocab.tokenizer_clean_spaces = false;
 
 
 
6482
  } else {
6483
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
6484
  }
@@ -7057,6 +7143,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
7057
 
7058
  LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
7059
 
 
 
 
 
 
 
 
7060
  if (model.arch == LLM_ARCH_DEEPSEEK2) {
7061
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7062
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -8170,6 +8263,7 @@ static bool llm_load_tensors(
8170
  }
8171
  } break;
8172
  case LLM_ARCH_QWEN2:
 
8173
  {
8174
  model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
8175
 
@@ -8830,6 +8924,55 @@ static bool llm_load_tensors(
8830
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
8831
  }
8832
  } break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8833
  case LLM_ARCH_DEEPSEEK2:
8834
  {
8835
  const bool is_lite = (hparams.n_layer == 27);
@@ -12559,6 +12702,124 @@ struct llm_build_context {
12559
  return gf;
12560
  }
12561
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12562
  struct ggml_cgraph * build_qwen2moe() {
12563
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
12564
 
@@ -15066,6 +15327,161 @@ struct llm_build_context {
15066
  return gf;
15067
  }
15068
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15069
  struct ggml_cgraph * build_deepseek2() {
15070
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15071
 
@@ -16660,6 +17076,11 @@ static struct ggml_cgraph * llama_build_graph(
16660
  {
16661
  result = llm.build_qwen2();
16662
  } break;
 
 
 
 
 
16663
  case LLM_ARCH_QWEN2MOE:
16664
  {
16665
  result = llm.build_qwen2moe();
@@ -16748,6 +17169,10 @@ static struct ggml_cgraph * llama_build_graph(
16748
  {
16749
  result = llm.build_arctic();
16750
  } break;
 
 
 
 
16751
  case LLM_ARCH_DEEPSEEK2:
16752
  {
16753
  result = llm.build_deepseek2();
@@ -16878,8 +17303,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
16878
 
16879
  if (ubatch.pos && lctx.inp_pos) {
16880
  const int64_t n_tokens = ubatch.n_tokens;
16881
-
16882
- ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
16883
  }
16884
 
16885
  if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
@@ -18364,10 +18789,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18364
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
18365
  new_type = GGML_TYPE_IQ3_S;
18366
  }
18367
- else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
18368
- new_type == GGML_TYPE_Q4_0_8_8) {
18369
- new_type = GGML_TYPE_Q4_0;
18370
- }
18371
  else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
18372
  new_type = GGML_TYPE_Q4_K;
18373
  }
@@ -18690,9 +19111,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
18690
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
18691
  case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
18692
  case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
18693
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
18694
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
18695
- case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
18696
 
18697
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
18698
  }
@@ -19031,14 +19449,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
19031
  f32_data = (float *) f32_conv_buf.data();
19032
  }
19033
 
19034
- int chunk_size_multiplier = 1;
19035
- if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
19036
- if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
19037
- else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
19038
- if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
19039
- else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
19040
- }
19041
-
19042
  LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
19043
  fflush(stdout);
19044
 
@@ -19051,8 +19461,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
19051
  const int64_t nrows = tensor->ne[1];
19052
 
19053
  static const int64_t min_chunk_size = 32 * 512;
19054
- const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
19055
- chunk_size_multiplier;
19056
 
19057
  const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
19058
  const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@@ -19995,6 +20404,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
19995
  case LLM_ARCH_COMMAND_R:
19996
  case LLM_ARCH_OLMO:
19997
  case LLM_ARCH_ARCTIC:
 
19998
  case LLM_ARCH_DEEPSEEK2:
19999
  case LLM_ARCH_CHATGLM:
20000
  case LLM_ARCH_GRANITE:
@@ -20028,6 +20438,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
20028
  case LLM_ARCH_MINICPM3:
20029
  return LLAMA_ROPE_TYPE_NEOX;
20030
 
 
 
 
20031
  // all model arches should be listed explicitly here
20032
  case LLM_ARCH_UNKNOWN:
20033
  GGML_ABORT("unknown architecture");
@@ -21596,7 +22009,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
21596
  throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
21597
  }
21598
  } else if ((size_t) i >= ctx->output_ids.size()) {
21599
- throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
21600
  } else {
21601
  j = ctx->output_ids[i];
21602
  }
@@ -21813,6 +22226,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
21813
  }
21814
  } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
21815
  return LLM_CHAT_TEMPLATE_PHI_3;
 
 
21816
  } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
21817
  return LLM_CHAT_TEMPLATE_ZEPHYR;
21818
  } else if (tmpl_contains("bos_token + message['role']")) {
@@ -21857,6 +22272,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
21857
  return LLM_CHAT_TEMPLATE_RWKV_WORLD;
21858
  } else if (tmpl_contains("<|start_of_role|>")) {
21859
  return LLM_CHAT_TEMPLATE_GRANITE;
 
 
21860
  }
21861
  return LLM_CHAT_TEMPLATE_UNKNOWN;
21862
  }
@@ -21963,6 +22380,15 @@ static int32_t llama_chat_apply_template_internal(
21963
  if (add_ass) {
21964
  ss << "<|assistant|>\n";
21965
  }
 
 
 
 
 
 
 
 
 
21966
  } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
21967
  // zephyr template
21968
  for (auto message : chat) {
@@ -22180,6 +22606,32 @@ static int32_t llama_chat_apply_template_internal(
22180
  if (add_ass) {
22181
  ss << "<|start_of_role|>assistant<|end_of_role|>\n";
22182
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22183
  } else {
22184
  // template not supported
22185
  return -1;
 
163
  LLM_ARCH_QWEN,
164
  LLM_ARCH_QWEN2,
165
  LLM_ARCH_QWEN2MOE,
166
+ LLM_ARCH_QWEN2VL,
167
  LLM_ARCH_PHI2,
168
  LLM_ARCH_PHI3,
169
  LLM_ARCH_PLAMO,
 
184
  LLM_ARCH_OLMOE,
185
  LLM_ARCH_OPENELM,
186
  LLM_ARCH_ARCTIC,
187
+ LLM_ARCH_DEEPSEEK,
188
  LLM_ARCH_DEEPSEEK2,
189
  LLM_ARCH_CHATGLM,
190
  LLM_ARCH_BITNET,
 
219
  { LLM_ARCH_QWEN, "qwen" },
220
  { LLM_ARCH_QWEN2, "qwen2" },
221
  { LLM_ARCH_QWEN2MOE, "qwen2moe" },
222
+ { LLM_ARCH_QWEN2VL, "qwen2vl" },
223
  { LLM_ARCH_PHI2, "phi2" },
224
  { LLM_ARCH_PHI3, "phi3" },
225
  { LLM_ARCH_PLAMO, "plamo" },
 
240
  { LLM_ARCH_OLMOE, "olmoe" },
241
  { LLM_ARCH_OPENELM, "openelm" },
242
  { LLM_ARCH_ARCTIC, "arctic" },
243
+ { LLM_ARCH_DEEPSEEK, "deepseek" },
244
  { LLM_ARCH_DEEPSEEK2, "deepseek2" },
245
  { LLM_ARCH_CHATGLM, "chatglm" },
246
  { LLM_ARCH_BITNET, "bitnet" },
 
312
  LLM_KV_ATTENTION_SCALE,
313
 
314
  LLM_KV_ROPE_DIMENSION_COUNT,
315
+ LLM_KV_ROPE_DIMENSION_SECTIONS,
316
  LLM_KV_ROPE_FREQ_BASE,
317
  LLM_KV_ROPE_SCALE_LINEAR,
318
  LLM_KV_ROPE_SCALING_TYPE,
 
429
  { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
430
 
431
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
432
+ { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
433
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
434
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
435
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
 
904
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
905
  },
906
  },
907
+ {
908
+ LLM_ARCH_QWEN2VL,
909
+ {
910
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
911
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
912
+ { LLM_TENSOR_OUTPUT, "output" },
913
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
914
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
915
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
916
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
917
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
918
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
919
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
920
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
921
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
922
+ },
923
+ },
924
  {
925
  LLM_ARCH_QWEN2MOE,
926
  {
 
1311
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1312
  },
1313
  },
1314
+ {
1315
+ LLM_ARCH_DEEPSEEK,
1316
+ {
1317
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1318
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1319
+ { LLM_TENSOR_OUTPUT, "output" },
1320
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
1321
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1322
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1323
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1324
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1325
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1326
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
1327
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
1328
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1329
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1330
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1331
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1332
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
1333
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
1334
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1335
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
1336
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
1337
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
1338
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
1339
+ },
1340
+ },
1341
  {
1342
  LLM_ARCH_DEEPSEEK2,
1343
  {
 
1612
  LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
1613
  LLM_CHAT_TEMPLATE_MISTRAL_V7,
1614
  LLM_CHAT_TEMPLATE_PHI_3,
1615
+ LLM_CHAT_TEMPLATE_FALCON_3,
1616
  LLM_CHAT_TEMPLATE_ZEPHYR,
1617
  LLM_CHAT_TEMPLATE_MONARCH,
1618
  LLM_CHAT_TEMPLATE_GEMMA,
 
1630
  LLM_CHAT_TEMPLATE_EXAONE_3,
1631
  LLM_CHAT_TEMPLATE_RWKV_WORLD,
1632
  LLM_CHAT_TEMPLATE_GRANITE,
1633
+ LLM_CHAT_TEMPLATE_GIGACHAT,
1634
  LLM_CHAT_TEMPLATE_UNKNOWN,
1635
  };
1636
 
 
1645
  { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
1646
  { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
1647
  { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
1648
+ { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
1649
  { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
1650
  { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
1651
  { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
 
1663
  { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
1664
  { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
1665
  { "granite", LLM_CHAT_TEMPLATE_GRANITE },
1666
+ { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
1667
  };
1668
 
1669
  static llm_arch llm_arch_from_string(const std::string & name) {
 
1848
  DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
1849
  NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
1850
  if (!bufLen) {
1851
+ ret = format("Win32 error code: %lx", error_code);
1852
  } else {
1853
  ret = lpMsgBuf;
1854
  LocalFree(lpMsgBuf);
 
2186
  HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
2187
 
2188
  // may fail on pre-Windows 8 systems
2189
+ pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
2190
 
2191
  if (pPrefetchVirtualMemory) {
2192
  // advise the kernel to preload the mapped memory
 
2528
  uint32_t time_decay_extra_dim = 0;
2529
  uint32_t wkv_head_size = 0;
2530
 
2531
+ float rope_attn_factor = 1.0f;
2532
+ float rope_freq_base_train;
2533
+ float rope_freq_scale_train;
2534
+ uint32_t n_ctx_orig_yarn;
2535
+ float rope_yarn_log_mul;
2536
+ int rope_sections[4];
2537
 
2538
  // for State Space Models
2539
  uint32_t ssm_d_conv = 0;
 
2590
 
2591
  if (this->rope_finetuned != other.rope_finetuned) return true;
2592
  if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
2593
+ if (std::equal(std::begin(this->rope_sections),
2594
+ std::end(this->rope_sections),
2595
+ std::begin(other.rope_sections))) return true;
2596
 
2597
  if (this->ssm_d_conv != other.ssm_d_conv) return true;
2598
  if (this->ssm_d_inner != other.ssm_d_inner) return true;
 
3436
  // whether we are computing encoder output or decoder output
3437
  bool is_encoding = false;
3438
 
3439
+ // TODO: find a better way to accommodate mutli-dimension position encoding methods
3440
+ // number of position id each token get, 1 for each token in most cases.
3441
+ // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
3442
+ int n_pos_per_token = 1;
3443
+
3444
  // output of the encoder part of the encoder-decoder models
3445
  std::vector<float> embd_enc;
3446
  std::vector<std::set<llama_seq_id>> seq_ids_enc;
 
4641
  case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
4642
  case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
4643
  case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
 
 
 
4644
  default:
4645
  {
4646
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
 
5404
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
5405
  case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
5406
  case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
 
 
 
5407
 
5408
  default: return "unknown, may not work";
5409
  }
 
5810
  default: model.type = e_model::MODEL_UNKNOWN;
5811
  }
5812
  } break;
5813
+ case LLM_ARCH_QWEN2VL:
5814
+ {
5815
+ std::array<int, 4> section_dims;
5816
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
5817
+ std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
5818
+ }
5819
+ // fall through
5820
  case LLM_ARCH_QWEN2:
5821
  {
5822
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
6127
  model.type = e_model::MODEL_UNKNOWN;
6128
  }
6129
  } break;
6130
+ case LLM_ARCH_DEEPSEEK:
6131
+ {
6132
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
6133
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
6134
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
6135
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
6136
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
6137
+
6138
+ switch (hparams.n_layer) {
6139
+ case 28: model.type = e_model::MODEL_20B; break;
6140
+ default: model.type = e_model::MODEL_UNKNOWN;
6141
+ }
6142
+ } break;
6143
  case LLM_ARCH_DEEPSEEK2:
6144
  {
6145
  bool is_lite = (hparams.n_layer == 27);
 
6475
  } else if (
6476
  tokenizer_pre == "falcon") {
6477
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
6478
+ } else if (
6479
+ tokenizer_pre == "falcon3") {
6480
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
6481
+ vocab.tokenizer_ignore_merges = true;
6482
+ vocab.tokenizer_add_bos = true;
6483
  } else if (
6484
  tokenizer_pre == "mpt") {
6485
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
 
6491
  tokenizer_pre == "phi-2" ||
6492
  tokenizer_pre == "jina-es" ||
6493
  tokenizer_pre == "jina-de" ||
6494
+ tokenizer_pre == "gigachat" ||
6495
  tokenizer_pre == "jina-v1-en" ||
6496
  tokenizer_pre == "jina-v2-es" ||
6497
  tokenizer_pre == "jina-v2-de" ||
 
6562
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
6563
  vocab.tokenizer_add_bos = true;
6564
  vocab.tokenizer_clean_spaces = false;
6565
+ } else if (
6566
+ tokenizer_pre == "minerva-7b") {
6567
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
6568
  } else {
6569
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
6570
  }
 
7143
 
7144
  LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
7145
 
7146
+ if (model.arch == LLM_ARCH_DEEPSEEK) {
7147
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7148
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7149
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7150
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7151
+ }
7152
+
7153
  if (model.arch == LLM_ARCH_DEEPSEEK2) {
7154
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7155
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
 
8263
  }
8264
  } break;
8265
  case LLM_ARCH_QWEN2:
8266
+ case LLM_ARCH_QWEN2VL:
8267
  {
8268
  model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
8269
 
 
8924
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
8925
  }
8926
  } break;
8927
+ case LLM_ARCH_DEEPSEEK:
8928
+ {
8929
+
8930
+ const int64_t n_ff_exp = hparams.n_ff_exp;
8931
+ const int64_t n_expert_shared = hparams.n_expert_shared;
8932
+
8933
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
8934
+
8935
+ // output
8936
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
8937
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
8938
+
8939
+ for (int i = 0; i < n_layer; ++i) {
8940
+ auto & layer = model.layers[i];
8941
+
8942
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
8943
+
8944
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
8945
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
8946
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
8947
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
8948
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
8949
+
8950
+ if (i < (int) hparams.n_layer_dense_lead) {
8951
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
8952
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
8953
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
8954
+ } else {
8955
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
8956
+
8957
+ if (n_expert == 0) {
8958
+ throw std::runtime_error("n_expert must be > 0");
8959
+ }
8960
+ if (n_expert_used == 0) {
8961
+ throw std::runtime_error("n_expert_used must be > 0");
8962
+ }
8963
+
8964
+ // MoE branch
8965
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
8966
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
8967
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
8968
+
8969
+ // Shared expert branch
8970
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
8971
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
8972
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
8973
+ }
8974
+ }
8975
+ } break;
8976
  case LLM_ARCH_DEEPSEEK2:
8977
  {
8978
  const bool is_lite = (hparams.n_layer == 27);
 
12702
  return gf;
12703
  }
12704
 
12705
+ struct ggml_cgraph * build_qwen2vl() {
12706
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
12707
+ const int64_t n_embd_head = hparams.n_embd_head_v;
12708
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
12709
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
12710
+
12711
+ struct ggml_tensor * cur;
12712
+ struct ggml_tensor * inpL;
12713
+
12714
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
12715
+
12716
+ // inp_pos - contains the positions
12717
+ lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
12718
+ cb(lctx.inp_pos, "inp_pos", -1);
12719
+ ggml_set_input(lctx.inp_pos);
12720
+ struct ggml_tensor * inp_pos = lctx.inp_pos;
12721
+
12722
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
12723
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
12724
+ int sections[4];
12725
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
12726
+
12727
+ for (int il = 0; il < n_layer; ++il) {
12728
+ struct ggml_tensor * inpSA = inpL;
12729
+
12730
+ // norm
12731
+ cur = llm_build_norm(ctx0, inpL, hparams,
12732
+ model.layers[il].attn_norm, NULL,
12733
+ LLM_NORM_RMS, cb, il);
12734
+ cb(cur, "attn_norm", il);
12735
+
12736
+ // self-attention
12737
+ {
12738
+ // compute Q and K and RoPE them
12739
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
12740
+ cb(Qcur, "Qcur", il);
12741
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
12742
+ cb(Qcur, "Qcur", il);
12743
+
12744
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
12745
+ cb(Kcur, "Kcur", il);
12746
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
12747
+ cb(Kcur, "Kcur", il);
12748
+
12749
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
12750
+ cb(Vcur, "Vcur", il);
12751
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
12752
+ cb(Vcur, "Vcur", il);
12753
+
12754
+ Qcur = ggml_rope_multi(
12755
+ ctx0,
12756
+ ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
12757
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
12758
+ ext_factor, attn_factor, beta_fast, beta_slow
12759
+ );
12760
+ cb(Qcur, "Qcur", il);
12761
+
12762
+ Kcur = ggml_rope_multi(
12763
+ ctx0,
12764
+ ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
12765
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
12766
+ ext_factor, attn_factor, beta_fast, beta_slow
12767
+ );
12768
+ cb(Kcur, "Kcur", il);
12769
+
12770
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
12771
+ model.layers[il].wo, model.layers[il].bo,
12772
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
12773
+ }
12774
+
12775
+ if (il == n_layer - 1) {
12776
+ // skip computing output for unused tokens
12777
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
12778
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12779
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12780
+ }
12781
+
12782
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12783
+ cb(ffn_inp, "ffn_inp", il);
12784
+
12785
+ // feed-forward network
12786
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
12787
+ model.layers[il].ffn_norm, NULL,
12788
+ LLM_NORM_RMS, cb, il);
12789
+ cb(cur, "ffn_norm", il);
12790
+
12791
+ cur = llm_build_ffn(ctx0, lctx, cur,
12792
+ model.layers[il].ffn_up, NULL, NULL,
12793
+ model.layers[il].ffn_gate, NULL, NULL,
12794
+ model.layers[il].ffn_down, NULL, NULL,
12795
+ NULL,
12796
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
12797
+ cb(cur, "ffn_out", il);
12798
+
12799
+ cur = ggml_add(ctx0, cur, ffn_inp);
12800
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
12801
+ cb(cur, "l_out", il);
12802
+
12803
+ // input for next layer
12804
+ inpL = cur;
12805
+ }
12806
+
12807
+ cur = inpL;
12808
+
12809
+ cur = llm_build_norm(ctx0, cur, hparams,
12810
+ model.output_norm, NULL,
12811
+ LLM_NORM_RMS, cb, -1);
12812
+ cb(cur, "result_norm", -1);
12813
+
12814
+ // lm_head
12815
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
12816
+ cb(cur, "result_output", -1);
12817
+
12818
+ ggml_build_forward_expand(gf, cur);
12819
+
12820
+ return gf;
12821
+ }
12822
+
12823
  struct ggml_cgraph * build_qwen2moe() {
12824
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
12825
 
 
15327
  return gf;
15328
  }
15329
 
15330
+ struct ggml_cgraph * build_deepseek() {
15331
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15332
+
15333
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
15334
+ int32_t n_tokens = this->n_tokens;
15335
+
15336
+ const int64_t n_embd_head = hparams.n_embd_head_v;
15337
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
15338
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
15339
+
15340
+ struct ggml_tensor * cur;
15341
+ struct ggml_tensor * inpL;
15342
+
15343
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
15344
+
15345
+ // inp_pos - contains the positions
15346
+ struct ggml_tensor * inp_pos = build_inp_pos();
15347
+
15348
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
15349
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
15350
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15351
+ for (int il = 0; il < n_layer; ++il) {
15352
+ struct ggml_tensor * inpSA = inpL;
15353
+
15354
+ // norm
15355
+ cur = llm_build_norm(ctx0, inpL, hparams,
15356
+ model.layers[il].attn_norm, NULL,
15357
+ LLM_NORM_RMS, cb, il);
15358
+ cb(cur, "attn_norm", il);
15359
+
15360
+ // self-attention
15361
+ {
15362
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
15363
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
15364
+
15365
+ // compute Q and K and RoPE them
15366
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
15367
+ cb(Qcur, "Qcur", il);
15368
+ if (model.layers[il].bq) {
15369
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
15370
+ cb(Qcur, "Qcur", il);
15371
+ }
15372
+
15373
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
15374
+ cb(Kcur, "Kcur", il);
15375
+ if (model.layers[il].bk) {
15376
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
15377
+ cb(Kcur, "Kcur", il);
15378
+ }
15379
+
15380
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
15381
+ cb(Vcur, "Vcur", il);
15382
+ if (model.layers[il].bv) {
15383
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
15384
+ cb(Vcur, "Vcur", il);
15385
+ }
15386
+
15387
+ Qcur = ggml_rope_ext(
15388
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
15389
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15390
+ ext_factor, attn_factor, beta_fast, beta_slow
15391
+ );
15392
+ cb(Qcur, "Qcur", il);
15393
+
15394
+ Kcur = ggml_rope_ext(
15395
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
15396
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15397
+ ext_factor, attn_factor, beta_fast, beta_slow
15398
+ );
15399
+ cb(Kcur, "Kcur", il);
15400
+
15401
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
15402
+ model.layers[il].wo, model.layers[il].bo,
15403
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
15404
+ }
15405
+
15406
+ if (il == n_layer - 1) {
15407
+ // skip computing output for unused tokens
15408
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
15409
+ n_tokens = n_outputs;
15410
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
15411
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
15412
+ }
15413
+
15414
+
15415
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
15416
+ cb(ffn_inp, "ffn_inp", il);
15417
+
15418
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
15419
+ model.layers[il].ffn_norm, NULL,
15420
+ LLM_NORM_RMS, cb, il);
15421
+ cb(cur, "ffn_norm", il);
15422
+
15423
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
15424
+ cur = llm_build_ffn(ctx0, lctx, cur,
15425
+ model.layers[il].ffn_up, NULL, NULL,
15426
+ model.layers[il].ffn_gate, NULL, NULL,
15427
+ model.layers[il].ffn_down, NULL, NULL,
15428
+ NULL,
15429
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
15430
+ cb(cur, "ffn_out", il);
15431
+ } else {
15432
+ // MoE branch
15433
+ ggml_tensor * moe_out =
15434
+ llm_build_moe_ffn(ctx0, lctx, cur,
15435
+ model.layers[il].ffn_gate_inp,
15436
+ model.layers[il].ffn_up_exps,
15437
+ model.layers[il].ffn_gate_exps,
15438
+ model.layers[il].ffn_down_exps,
15439
+ n_expert, n_expert_used,
15440
+ LLM_FFN_SILU, false,
15441
+ false, hparams.expert_weights_scale,
15442
+ cb, il);
15443
+ cb(moe_out, "ffn_moe_out", il);
15444
+
15445
+ // FFN shared expert
15446
+ {
15447
+ ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
15448
+ model.layers[il].ffn_up_shexp, NULL, NULL,
15449
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
15450
+ model.layers[il].ffn_down_shexp, NULL, NULL,
15451
+ NULL,
15452
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
15453
+ cb(ffn_shexp, "ffn_shexp", il);
15454
+
15455
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
15456
+ cb(cur, "ffn_out", il);
15457
+ }
15458
+ }
15459
+
15460
+ cur = ggml_add(ctx0, cur, ffn_inp);
15461
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
15462
+ cb(cur, "l_out", il);
15463
+
15464
+ // input for next layer
15465
+ inpL = cur;
15466
+ }
15467
+
15468
+ cur = inpL;
15469
+
15470
+ cur = llm_build_norm(ctx0, cur, hparams,
15471
+ model.output_norm, NULL,
15472
+ LLM_NORM_RMS, cb, -1);
15473
+ cb(cur, "result_norm", -1);
15474
+
15475
+ // lm_head
15476
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
15477
+
15478
+ cb(cur, "result_output", -1);
15479
+
15480
+ ggml_build_forward_expand(gf, cur);
15481
+
15482
+ return gf;
15483
+ }
15484
+
15485
  struct ggml_cgraph * build_deepseek2() {
15486
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15487
 
 
17076
  {
17077
  result = llm.build_qwen2();
17078
  } break;
17079
+ case LLM_ARCH_QWEN2VL:
17080
+ {
17081
+ lctx.n_pos_per_token = 4;
17082
+ result = llm.build_qwen2vl();
17083
+ } break;
17084
  case LLM_ARCH_QWEN2MOE:
17085
  {
17086
  result = llm.build_qwen2moe();
 
17169
  {
17170
  result = llm.build_arctic();
17171
  } break;
17172
+ case LLM_ARCH_DEEPSEEK:
17173
+ {
17174
+ result = llm.build_deepseek();
17175
+ } break;
17176
  case LLM_ARCH_DEEPSEEK2:
17177
  {
17178
  result = llm.build_deepseek2();
 
17303
 
17304
  if (ubatch.pos && lctx.inp_pos) {
17305
  const int64_t n_tokens = ubatch.n_tokens;
17306
+ auto n_pos = lctx.n_pos_per_token;
17307
+ ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
17308
  }
17309
 
17310
  if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
 
18789
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
18790
  new_type = GGML_TYPE_IQ3_S;
18791
  }
 
 
 
 
18792
  else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
18793
  new_type = GGML_TYPE_Q4_K;
18794
  }
 
19111
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
19112
  case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
19113
  case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
 
 
 
19114
 
19115
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
19116
  }
 
19449
  f32_data = (float *) f32_conv_buf.data();
19450
  }
19451
 
 
 
 
 
 
 
 
 
19452
  LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
19453
  fflush(stdout);
19454
 
 
19461
  const int64_t nrows = tensor->ne[1];
19462
 
19463
  static const int64_t min_chunk_size = 32 * 512;
19464
+ const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
 
19465
 
19466
  const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
19467
  const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
 
20404
  case LLM_ARCH_COMMAND_R:
20405
  case LLM_ARCH_OLMO:
20406
  case LLM_ARCH_ARCTIC:
20407
+ case LLM_ARCH_DEEPSEEK:
20408
  case LLM_ARCH_DEEPSEEK2:
20409
  case LLM_ARCH_CHATGLM:
20410
  case LLM_ARCH_GRANITE:
 
20438
  case LLM_ARCH_MINICPM3:
20439
  return LLAMA_ROPE_TYPE_NEOX;
20440
 
20441
+ case LLM_ARCH_QWEN2VL:
20442
+ return LLAMA_ROPE_TYPE_MROPE;
20443
+
20444
  // all model arches should be listed explicitly here
20445
  case LLM_ARCH_UNKNOWN:
20446
  GGML_ABORT("unknown architecture");
 
22009
  throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
22010
  }
22011
  } else if ((size_t) i >= ctx->output_ids.size()) {
22012
+ throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
22013
  } else {
22014
  j = ctx->output_ids[i];
22015
  }
 
22226
  }
22227
  } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
22228
  return LLM_CHAT_TEMPLATE_PHI_3;
22229
+ } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
22230
+ return LLM_CHAT_TEMPLATE_FALCON_3;
22231
  } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
22232
  return LLM_CHAT_TEMPLATE_ZEPHYR;
22233
  } else if (tmpl_contains("bos_token + message['role']")) {
 
22272
  return LLM_CHAT_TEMPLATE_RWKV_WORLD;
22273
  } else if (tmpl_contains("<|start_of_role|>")) {
22274
  return LLM_CHAT_TEMPLATE_GRANITE;
22275
+ } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
22276
+ return LLM_CHAT_TEMPLATE_GIGACHAT;
22277
  }
22278
  return LLM_CHAT_TEMPLATE_UNKNOWN;
22279
  }
 
22380
  if (add_ass) {
22381
  ss << "<|assistant|>\n";
22382
  }
22383
+ } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
22384
+ // Falcon 3
22385
+ for (auto message : chat) {
22386
+ std::string role(message->role);
22387
+ ss << "<|" << role << "|>\n" << message->content << "\n";
22388
+ }
22389
+ if (add_ass) {
22390
+ ss << "<|assistant|>\n";
22391
+ }
22392
  } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
22393
  // zephyr template
22394
  for (auto message : chat) {
 
22606
  if (add_ass) {
22607
  ss << "<|start_of_role|>assistant<|end_of_role|>\n";
22608
  }
22609
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
22610
+ // GigaChat template
22611
+ bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
22612
+
22613
+ // Handle system message if present
22614
+ if (has_system) {
22615
+ ss << "<s>" << chat[0]->content << "<|message_sep|>";
22616
+ } else {
22617
+ ss << "<s>";
22618
+ }
22619
+
22620
+ // Process remaining messages
22621
+ for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
22622
+ std::string role(chat[i]->role);
22623
+ if (role == "user") {
22624
+ ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
22625
+ << "available functions<|role_sep|>[]<|message_sep|>";
22626
+ } else if (role == "assistant") {
22627
+ ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
22628
+ }
22629
+ }
22630
+
22631
+ // Add generation prompt if needed
22632
+ if (add_ass) {
22633
+ ss << "assistant<|role_sep|>";
22634
+ }
22635
  } else {
22636
  // template not supported
22637
  return -1;
examples/talk-llama/llama.h CHANGED
@@ -104,12 +104,15 @@ extern "C" {
104
  LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
105
  LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
106
  LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
 
107
  };
108
 
109
  enum llama_rope_type {
110
- LLAMA_ROPE_TYPE_NONE = -1,
111
- LLAMA_ROPE_TYPE_NORM = 0,
112
- LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
 
 
113
  };
114
 
115
  enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -171,9 +174,9 @@ extern "C" {
171
  LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
172
  LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
173
  LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
174
- LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
175
- LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
176
- LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
177
  LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
178
  LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
179
 
@@ -455,6 +458,7 @@ extern "C" {
455
  // Functions to access the model's GGUF metadata scalar values
456
  // - The functions return the length of the string on success, or -1 on failure
457
  // - The output string is always null-terminated and cleared on failure
 
458
  // - GGUF array values are not supported by these functions
459
 
460
  // Get metadata value as a string by key name
@@ -1135,16 +1139,12 @@ extern "C" {
1135
  const char * grammar_str,
1136
  const char * grammar_root);
1137
 
 
1138
  LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
1139
- int32_t n_vocab, // llama_n_vocab()
1140
- llama_token special_eos_id, // llama_token_eos()
1141
- llama_token linefeed_id, // llama_token_nl()
1142
- int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
1143
- float penalty_repeat, // 1.0 = disabled
1144
- float penalty_freq, // 0.0 = disabled
1145
- float penalty_present, // 0.0 = disabled
1146
- bool penalize_nl, // consider newlines as a repeatable token
1147
- bool ignore_eos); // ignore the end-of-sequence token
1148
 
1149
  /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
1150
  LLAMA_API struct llama_sampler * llama_sampler_init_dry(
 
104
  LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
105
  LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
106
  LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
107
+ LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
108
  };
109
 
110
  enum llama_rope_type {
111
+ LLAMA_ROPE_TYPE_NONE = -1,
112
+ LLAMA_ROPE_TYPE_NORM = 0,
113
+ LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
114
+ LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
115
+ LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
116
  };
117
 
118
  enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
 
174
  LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
175
  LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
176
  LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
177
+ //LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
178
+ //LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
179
+ //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
180
  LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
181
  LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
182
 
 
458
  // Functions to access the model's GGUF metadata scalar values
459
  // - The functions return the length of the string on success, or -1 on failure
460
  // - The output string is always null-terminated and cleared on failure
461
+ // - When retrieving a string, an extra byte must be allocated to account for the null terminator
462
  // - GGUF array values are not supported by these functions
463
 
464
  // Get metadata value as a string by key name
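Note: per the comment added above, a buffer passed to these getters should reserve one byte beyond the value length for the terminating '\0'. A minimal sketch of the pattern, assuming a loaded llama_model * model and the existing llama_model_meta_val_str() getter (neither is part of this diff):

    char buf[256];  // value length + 1 for the null terminator must fit here
    const int32_t len = llama_model_meta_val_str(model, "general.name", buf, sizeof(buf));
    if (len >= 0) {
        // buf now holds a null-terminated copy of the value (truncated if it did not fit)
    }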
 
1139
  const char * grammar_str,
1140
  const char * grammar_root);
1141
 
1142
+ /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
1143
  LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
1144
+ int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
1145
+ float penalty_repeat, // 1.0 = disabled
1146
+ float penalty_freq, // 0.0 = disabled
1147
+ float penalty_present); // 0.0 = disabled
 
1148
 
1149
  /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
1150
  LLAMA_API struct llama_sampler * llama_sampler_init_dry(
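Note: llama_sampler_init_penalties() drops the vocab/EOS/newline arguments; repetition penalties are now configured with only the four parameters shown above, and the new NOTE recommends narrowing the candidate set before they run. A minimal usage sketch under those assumptions (the chain helpers and the parameter values are illustrative, not from this diff):

    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));           // shrink the candidate list first
    llama_sampler_chain_add(chain, llama_sampler_init_penalties(
        /*penalty_last_n =*/ 64,      // last n tokens to penalize (-1 = context size)
        /*penalty_repeat =*/ 1.1f,    // 1.0 = disabled
        /*penalty_freq   =*/ 0.0f,    // 0.0 = disabled
        /*penalty_present=*/ 0.0f));  // 0.0 = disabled
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));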
examples/talk-llama/unicode.cpp CHANGED
@@ -71,15 +71,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
71
  throw std::invalid_argument("failed to convert utf8 to codepoint");
72
  }
73
 
74
- //static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
75
  // std::vector<uint16_t> result;
76
- // if (/* 0x0000 <= cp && */ cp <= 0xffff) {
77
- // result.emplace_back(cp);
78
  // return result;
79
  // }
80
- // if (0x10000 <= cp && cp <= 0x10ffff) {
81
- // result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
82
- // result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
83
  // return result;
84
  // }
85
  // throw std::invalid_argument("failed to convert codepoint to utf16");
@@ -120,8 +120,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
120
  // return result;
121
  //}
122
 
123
- static std::vector<codepoint_flags> unicode_cpt_flags_array() {
124
- std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
125
 
126
  assert (unicode_ranges_flags.begin()[0].first == 0);
127
  assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
@@ -253,8 +253,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
253
  return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
254
  };
255
 
256
- auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
257
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
258
  };
259
 
260
  size_t _prev_end = offset_ini;
@@ -371,8 +371,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
371
  return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
372
  };
373
 
374
- auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
375
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
376
  };
377
 
378
  size_t _prev_end = offset_ini;
@@ -572,29 +572,29 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
572
  // interface
573
  //
574
 
575
- std::string unicode_cpt_to_utf8(uint32_t cp) {
576
  std::string result;
577
 
578
- if (/* 0x00 <= cp && */ cp <= 0x7f) {
579
- result.push_back(cp);
580
  return result;
581
  }
582
- if (0x80 <= cp && cp <= 0x7ff) {
583
- result.push_back(0xc0 | ((cp >> 6) & 0x1f));
584
- result.push_back(0x80 | (cp & 0x3f));
585
  return result;
586
  }
587
- if (0x800 <= cp && cp <= 0xffff) {
588
- result.push_back(0xe0 | ((cp >> 12) & 0x0f));
589
- result.push_back(0x80 | ((cp >> 6) & 0x3f));
590
- result.push_back(0x80 | (cp & 0x3f));
591
  return result;
592
  }
593
- if (0x10000 <= cp && cp <= 0x10ffff) {
594
- result.push_back(0xf0 | ((cp >> 18) & 0x07));
595
- result.push_back(0x80 | ((cp >> 12) & 0x3f));
596
- result.push_back(0x80 | ((cp >> 6) & 0x3f));
597
- result.push_back(0x80 | (cp & 0x3f));
598
  return result;
599
  }
600
 
@@ -624,19 +624,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
624
  return result;
625
  }
626
 
627
- codepoint_flags unicode_cpt_flags(const uint32_t cp) {
628
- static const codepoint_flags undef(codepoint_flags::UNDEFINED);
629
  static const auto cpt_flags = unicode_cpt_flags_array();
630
- return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
631
  }
632
 
633
- codepoint_flags unicode_cpt_flags(const std::string & utf8) {
634
- static const codepoint_flags undef(codepoint_flags::UNDEFINED);
635
  if (utf8.empty()) {
636
  return undef; // undefined
637
  }
638
  size_t offset = 0;
639
- return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
640
  }
641
 
642
  std::string unicode_byte_to_utf8(uint8_t byte) {
@@ -649,41 +649,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
649
  return map.at(utf8);
650
  }
651
 
652
- uint32_t unicode_tolower(uint32_t cp) {
653
  // binary search
654
- auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
655
  [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
656
  return pair.first < value;
657
  });
658
- if (it != unicode_map_lowercase.end() && it->first == cp) {
659
  return it->second;
660
  }
661
- return cp; // Return the original code point if no lowercase mapping is found
662
  }
663
 
664
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
665
  // unicode categories
666
  static const std::map<std::string, int> k_ucat_enum = {
667
- { "\\p{N}", codepoint_flags::NUMBER },
668
- { "\\p{L}", codepoint_flags::LETTER },
669
- { "\\p{P}", codepoint_flags::PUNCTUATION },
670
  };
671
 
672
  static const std::map<int, int> k_ucat_cpt = {
673
- { codepoint_flags::NUMBER, 0xD1 },
674
- { codepoint_flags::LETTER, 0xD2 },
675
- { codepoint_flags::PUNCTUATION, 0xD3 },
676
  };
677
 
678
  static const std::map<int, std::string> k_ucat_map = {
679
- { codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9
680
- { codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
681
- { codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
682
  };
683
 
684
  // compute collapsed codepoints only if needed by at least one regex
685
  bool need_collapse = false;
686
- for (auto & regex_expr : regex_exprs) {
687
  // search for unicode categories
688
  for (const auto & ucat : k_ucat_enum) {
689
  if (std::string::npos != regex_expr.find(ucat.first)) {
@@ -709,7 +709,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
709
  continue;
710
  }
711
 
712
- const auto flags = unicode_cpt_flags(cpts[i]);
713
 
714
  if (flags.is_whitespace) {
715
  //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
@@ -725,7 +725,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
725
 
726
  std::vector<size_t> bpe_offsets = { cpts.size() };
727
 
728
- for (auto & regex_expr : regex_exprs) {
729
  // first, see if we have an efficient custom regex implementation
730
  auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
731
 
@@ -739,7 +739,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
739
  // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
740
  // with the corresponding collapsed representation
741
  bool use_collapsed = false;
742
- for (auto & ucat : k_ucat_enum) {
743
  if (std::string::npos != regex_expr.find(ucat.first)) {
744
  use_collapsed = true;
745
  break;
@@ -805,7 +805,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
805
  // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
806
  std::wstring wtext(cpts.begin(), cpts.end());
807
  for (size_t i = 0; i < wtext.size(); ++i) {
808
- if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
809
  wtext[i] = 0x0B;
810
  }
811
  }
 
71
  throw std::invalid_argument("failed to convert utf8 to codepoint");
72
  }
73
 
74
+ //static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
75
  // std::vector<uint16_t> result;
76
+ // if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
77
+ // result.emplace_back(cpt);
78
  // return result;
79
  // }
80
+ // if (0x10000 <= cpt && cpt <= 0x10ffff) {
81
+ // result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
82
+ // result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
83
  // return result;
84
  // }
85
  // throw std::invalid_argument("failed to convert codepoint to utf16");
 
120
  // return result;
121
  //}
122
 
123
+ static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
124
+ std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);
125
 
126
  assert (unicode_ranges_flags.begin()[0].first == 0);
127
  assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
 
253
  return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
254
  };
255
 
256
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
257
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
258
  };
259
 
260
  size_t _prev_end = offset_ini;
 
371
  return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
372
  };
373
 
374
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
375
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
376
  };
377
 
378
  size_t _prev_end = offset_ini;
 
572
  // interface
573
  //
574
 
575
+ std::string unicode_cpt_to_utf8(uint32_t cpt) {
576
  std::string result;
577
 
578
+ if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
579
+ result.push_back(cpt);
580
  return result;
581
  }
582
+ if (0x80 <= cpt && cpt <= 0x7ff) {
583
+ result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
584
+ result.push_back(0x80 | (cpt & 0x3f));
585
  return result;
586
  }
587
+ if (0x800 <= cpt && cpt <= 0xffff) {
588
+ result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
589
+ result.push_back(0x80 | ((cpt >> 6) & 0x3f));
590
+ result.push_back(0x80 | (cpt & 0x3f));
591
  return result;
592
  }
593
+ if (0x10000 <= cpt && cpt <= 0x10ffff) {
594
+ result.push_back(0xf0 | ((cpt >> 18) & 0x07));
595
+ result.push_back(0x80 | ((cpt >> 12) & 0x3f));
596
+ result.push_back(0x80 | ((cpt >> 6) & 0x3f));
597
+ result.push_back(0x80 | (cpt & 0x3f));
598
  return result;
599
  }
600
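Note: a few hand-checked values for the branches of unicode_cpt_to_utf8() above (worked examples, not part of the diff); the snippet assumes unicode.h and <cassert> are included:

    assert(unicode_cpt_to_utf8(0x61)    == "a");                 // <= 0x7f           : 1 byte
    assert(unicode_cpt_to_utf8(0xE9)    == "\xC3\xA9");          // 0x80..0x7ff       : 2 bytes
    assert(unicode_cpt_to_utf8(0x20AC)  == "\xE2\x82\xAC");      // 0x800..0xffff     : 3 bytes
    assert(unicode_cpt_to_utf8(0x1F600) == "\xF0\x9F\x98\x80");  // 0x10000..0x10ffff : 4 bytes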
 
 
624
  return result;
625
  }
626
 
627
+ unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
628
+ static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
629
  static const auto cpt_flags = unicode_cpt_flags_array();
630
+ return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
631
  }
632
 
633
+ unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
634
+ static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
635
  if (utf8.empty()) {
636
  return undef; // undefined
637
  }
638
  size_t offset = 0;
639
+ return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
640
  }
641
 
642
  std::string unicode_byte_to_utf8(uint8_t byte) {
 
649
  return map.at(utf8);
650
  }
651
 
652
+ uint32_t unicode_tolower(uint32_t cpt) {
653
  // binary search
654
+ auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
655
  [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
656
  return pair.first < value;
657
  });
658
+ if (it != unicode_map_lowercase.end() && it->first == cpt) {
659
  return it->second;
660
  }
661
+ return cpt; // Return the original code point if no lowercase mapping is found
662
  }
663
 
664
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
665
  // unicode categories
666
  static const std::map<std::string, int> k_ucat_enum = {
667
+ { "\\p{N}", unicode_cpt_flags::NUMBER },
668
+ { "\\p{L}", unicode_cpt_flags::LETTER },
669
+ { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
670
  };
671
 
672
  static const std::map<int, int> k_ucat_cpt = {
673
+ { unicode_cpt_flags::NUMBER, 0xD1 },
674
+ { unicode_cpt_flags::LETTER, 0xD2 },
675
+ { unicode_cpt_flags::PUNCTUATION, 0xD3 },
676
  };
677
 
678
  static const std::map<int, std::string> k_ucat_map = {
679
+ { unicode_cpt_flags::NUMBER, "\x30-\x39" }, // 0-9
680
+ { unicode_cpt_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
681
+ { unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
682
  };
683
 
684
  // compute collapsed codepoints only if needed by at least one regex
685
  bool need_collapse = false;
686
+ for (const auto & regex_expr : regex_exprs) {
687
  // search for unicode categories
688
  for (const auto & ucat : k_ucat_enum) {
689
  if (std::string::npos != regex_expr.find(ucat.first)) {
 
709
  continue;
710
  }
711
 
712
+ const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);
713
 
714
  if (flags.is_whitespace) {
715
  //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
 
725
 
726
  std::vector<size_t> bpe_offsets = { cpts.size() };
727
 
728
+ for (const auto & regex_expr : regex_exprs) {
729
  // first, see if we have an efficient custom regex implementation
730
  auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
731
 
 
739
  // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
740
  // with the corresponding collapsed representation
741
  bool use_collapsed = false;
742
+ for (const auto & ucat : k_ucat_enum) {
743
  if (std::string::npos != regex_expr.find(ucat.first)) {
744
  use_collapsed = true;
745
  break;
 
805
  // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
806
  std::wstring wtext(cpts.begin(), cpts.end());
807
  for (size_t i = 0; i < wtext.size(); ++i) {
808
+ if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
809
  wtext[i] = 0x0B;
810
  }
811
  }
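Note: the unicode.cpp changes are essentially a rename with no visible behavior change: struct codepoint_flags becomes unicode_cpt_flags, and the two unicode_cpt_flags() overloads become unicode_cpt_flags_from_cpt() and unicode_cpt_flags_from_utf8(), freeing the old function name for the struct. A sketch of updating a hypothetical out-of-tree call site (caller code assumed, not part of the diff):

    // before:
    //   const codepoint_flags f = unicode_cpt_flags(cpt);
    // after:
    const unicode_cpt_flags f = unicode_cpt_flags_from_cpt(cpt);
    if (f.is_whitespace) {
        // ...
    }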
examples/talk-llama/unicode.h CHANGED
@@ -4,9 +4,7 @@
4
  #include <string>
5
  #include <vector>
6
 
7
- // TODO: prefix all symbols with "llama_"
8
-
9
- struct codepoint_flags {
10
  enum {
11
  UNDEFINED = 0x0001,
12
  NUMBER = 0x0002, // regex: \p{N}
@@ -35,7 +33,7 @@ struct codepoint_flags {
35
  uint16_t is_nfd : 1;
36
 
37
  // decode from uint16
38
- inline codepoint_flags(const uint16_t flags=0) {
39
  *reinterpret_cast<uint16_t*>(this) = flags;
40
  }
41
 
@@ -50,18 +48,19 @@ struct codepoint_flags {
50
 
51
  size_t unicode_len_utf8(char src);
52
 
53
- std::string unicode_cpt_to_utf8(uint32_t cp);
54
- uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
 
55
  std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
56
 
57
  std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
58
 
59
- codepoint_flags unicode_cpt_flags(const uint32_t cp);
60
- codepoint_flags unicode_cpt_flags(const std::string & utf8);
61
 
62
  std::string unicode_byte_to_utf8(uint8_t byte);
63
- uint8_t unicode_utf8_to_byte(const std::string & utf8);
64
 
65
- uint32_t unicode_tolower(uint32_t cp);
66
 
67
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
 
4
  #include <string>
5
  #include <vector>
6
 
7
+ struct unicode_cpt_flags {
 
 
8
  enum {
9
  UNDEFINED = 0x0001,
10
  NUMBER = 0x0002, // regex: \p{N}
 
33
  uint16_t is_nfd : 1;
34
 
35
  // decode from uint16
36
+ inline unicode_cpt_flags(const uint16_t flags = 0) {
37
  *reinterpret_cast<uint16_t*>(this) = flags;
38
  }
39
 
 
48
 
49
  size_t unicode_len_utf8(char src);
50
 
51
+ std::string unicode_cpt_to_utf8 (uint32_t cpt);
52
+ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
53
+
54
  std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
55
 
56
  std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
57
 
58
+ unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
59
+ unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
60
 
61
  std::string unicode_byte_to_utf8(uint8_t byte);
62
+ uint8_t unicode_utf8_to_byte(const std::string & utf8);
63
 
64
+ uint32_t unicode_tolower(uint32_t cpt);
65
 
66
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
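Note: the declarations above keep the encode/decode pair together; a hypothetical round-trip sketch over them (caller code assumed, not part of the diff):

    const std::string s = "héllo";  // any well-formed UTF-8 input
    std::string back;
    for (const uint32_t cpt : unicode_cpts_from_utf8(s)) {
        back += unicode_cpt_to_utf8(cpt);
    }
    // back == s: decoding to codepoints and re-encoding is lossless for valid UTF-8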