ggerganov committed
Commit 05fda4a · Parent(s): 6d29e32

talk-llama : sync llama.cpp

examples/talk-llama/llama-arch.cpp CHANGED
@@ -19,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,           "refact"         },
     { LLM_ARCH_BERT,             "bert"           },
     { LLM_ARCH_NOMIC_BERT,       "nomic-bert"     },
+    { LLM_ARCH_NOMIC_BERT_MOE,   "nomic-bert-moe" },
     { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"   },
     { LLM_ARCH_BLOOM,            "bloom"          },
     { LLM_ARCH_STABLELM,         "stablelm"       },
@@ -106,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,       "%s.expert_weights_scale"       },
     { LLM_KV_EXPERT_WEIGHTS_NORM,        "%s.expert_weights_norm"        },
     { LLM_KV_EXPERT_GATING_FUNC,         "%s.expert_gating_func"         },
+    { LLM_KV_MOE_EVERY_N_LAYERS,         "%s.moe_every_n_layers"         },
     { LLM_KV_POOLING_TYPE,               "%s.pooling_type"               },
     { LLM_KV_LOGIT_SCALE,                "%s.logit_scale"                },
     { LLM_KV_DECODER_START_TOKEN_ID,     "%s.decoder_start_token_id"     },
@@ -472,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_NOMIC_BERT_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {
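For orientation, the per-architecture maps above pair each llm_tensor enumerator with a printf-style name pattern such as "blk.%d.ffn_up_exps". A minimal standalone sketch of how such a pattern expands into a concrete GGUF tensor name for a given layer; the enum and the expand() helper below are illustrative stand-ins, not the library's actual tn() builder:

    #include <cstdio>
    #include <map>
    #include <string>

    // Hypothetical stand-ins for two of the enumerators used in LLM_TENSOR_NAMES.
    enum llm_tensor_demo { DEMO_FFN_UP_EXPS, DEMO_FFN_DOWN_EXPS };

    // Expand a "blk.%d.*" pattern for a layer index and append the usual ".weight" suffix.
    static std::string expand(const char * pattern, int layer) {
        char buf[64];
        std::snprintf(buf, sizeof(buf), pattern, layer);
        return std::string(buf) + ".weight";
    }

    int main() {
        // Subset of the nomic-bert-moe entry added in this commit.
        const std::map<llm_tensor_demo, const char *> names = {
            { DEMO_FFN_UP_EXPS,   "blk.%d.ffn_up_exps"   },
            { DEMO_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
        };
        std::printf("%s\n", expand(names.at(DEMO_FFN_UP_EXPS), 3).c_str()); // blk.3.ffn_up_exps.weight
        return 0;
    }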
examples/talk-llama/llama-arch.h CHANGED
@@ -23,6 +23,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -110,6 +111,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
examples/talk-llama/llama-chat.cpp CHANGED
@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "deepseek3",          LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
     { "command-r",          LLM_CHAT_TEMPLATE_COMMAND_R },
     { "llama3",             LLM_CHAT_TEMPLATE_LLAMA_3 },
-    { "chatglm3",           LLM_CHAT_TEMPLATE_CHATGML_3 },
-    { "chatglm4",           LLM_CHAT_TEMPLATE_CHATGML_4 },
+    { "chatglm3",           LLM_CHAT_TEMPLATE_CHATGLM_3 },
+    { "chatglm4",           LLM_CHAT_TEMPLATE_CHATGLM_4 },
     { "glmedge",            LLM_CHAT_TEMPLATE_GLMEDGE },
     { "minicpm",            LLM_CHAT_TEMPLATE_MINICPM },
     { "exaone3",            LLM_CHAT_TEMPLATE_EXAONE_3 },
@@ -122,6 +122,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGLM_4;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
         return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
@@ -154,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA_3;
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGML_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGML_4;
+        return LLM_CHAT_TEMPLATE_CHATGLM_3;
     } else if (tmpl_contains(LU8("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;
@@ -437,7 +437,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {
@@ -447,7 +447,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
@@ -456,14 +456,6 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>" << "\n" << message->content;
-        }
-        if (add_ass) {
-            ss << "<|assistant|>";
-        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         for (auto message : chat) {
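The detection change above is order-sensitive: the "[gMASK]<sop>" probe now runs before the generic "<|assistant|>"/"<|user|>" probe, so ChatGLM-4 style templates are no longer shadowed by that branch, and GLM-Edge reuses the ChatGLM-4 rendering path. A self-contained sketch, with plain std::string checks standing in for tmpl_contains(), showing why the ordering matters:

    #include <iostream>
    #include <string>

    // Illustrative only: whichever substring test runs first decides the template.
    static bool contains(const std::string & hay, const std::string & needle) {
        return hay.find(needle) != std::string::npos;
    }

    int main() {
        const std::string tmpl = "[gMASK]<sop><|user|>{{ content }}<|assistant|>";

        if (contains(tmpl, "[gMASK]<sop>")) {
            std::cout << "chatglm4\n";           // matched first -> correct template
        } else if (contains(tmpl, "<|assistant|>") && contains(tmpl, "<|user|>")) {
            std::cout << "falcon3 or glmedge\n"; // would have won before the reorder
        }
        return 0;
    }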
examples/talk-llama/llama-chat.h CHANGED
@@ -29,8 +29,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
examples/talk-llama/llama-context.cpp CHANGED
@@ -114,7 +114,7 @@ llama_context::llama_context(
     }
 
     if (n_ctx_per_seq > hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
         float freq_base,
-        float freq_scale,
-        ggml_backend_buffer * bbuf) const {
+        float freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
 
     const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
 
-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
-        tmp = ggml_rope_ext_inplace(ctx0, tmp,
+        tmp = ggml_rope_ext(ctx0, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
 
@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
                 ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
                 0);
 
-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -1547,8 +1536,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     // set all ids as invalid (negative)
     std::fill(output_ids.begin(), output_ids.end(), -1);
 
-    ggml_backend_buffer_clear(buf_output.get(), 0);
-
     this->n_outputs     = 0;
     this->n_outputs_max = n_outputs_max;
 
examples/talk-llama/llama-context.h CHANGED
@@ -170,8 +170,7 @@ private:
             ggml_tensor * shift,
             ggml_tensor * factors,
             float freq_base,
-            float freq_scale,
-            ggml_backend_buffer * bbuf) const;
+            float freq_scale) const;
 
     llm_graph_result_ptr build_kv_self_shift(
             ggml_context * ctx0,
examples/talk-llama/llama-graph.cpp CHANGED
@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;
 
-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd == 4) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the 3 first dims are the same, and 4th dim is all 0
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[               i] = ubatch->pos[i];
+                pos_data[    n_tokens + i] = ubatch->pos[i];
+                pos_data[2 * n_tokens + i] = ubatch->pos[i];
+                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }
 
@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
                 ) * f_attn_temp_scale + 1.0;
         }
 
-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }
 
@@ -592,7 +606,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }
 
-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }
 
@@ -803,6 +817,10 @@ ggml_tensor * llm_graph_context::build_ffn(
 
     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }
 
     if (down_b) {
@@ -910,28 +928,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);
 
-    ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(gate, "ffn_moe_gate", il);
+    ggml_tensor * experts = nullptr;
+    if (gate_exps) {
+        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate", il);
+    } else {
+        cur = up;
+    }
 
     switch (type_op) {
         case LLM_FFN_SILU:
             {
-                gate = ggml_silu(ctx0, gate);
-                cb(gate, "ffn_moe_silu", il);
+                cur = ggml_silu(ctx0, cur);
+                cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
            {
-                gate = ggml_gelu(ctx0, gate);
-                cb(gate, "ffn_moe_gelu", il);
+                cur = ggml_gelu(ctx0, cur);
+                cb(cur, "ffn_moe_gelu", il);
            } break;
        default:
            GGML_ABORT("fatal error");
    }
 
-    ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
-    cb(par, "ffn_moe_gate_par", il);
+    if (gate_exps) {
+        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate_par", il);
+    }
 
-    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
     if (!weight_before_ffn) {
@@ -1014,11 +1039,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
 
     auto & cur = inp->pos;
 
-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
@@ -1027,11 +1052,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
 
     auto & cur = inp->attn_scale;
 
-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this need to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
    ggml_set_input(cur);
 
    res->add_input(std::move(inp));
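The new branch in llm_graph_input_pos::set_input expands 1-D text positions into the 4-D layout expected by M-RoPE models (n_pos_per_embd == 4): the first three dimensions repeat the token position and the fourth is zero. A standalone sketch of just that expansion, detached from the ggml tensor plumbing; llama_pos is aliased locally so the snippet compiles on its own:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using llama_pos = int32_t;

    // Expand 1-D positions to the 4 x n_tokens layout used for M-RoPE text tokens:
    // dims 0..2 repeat the position, dim 3 is zero.
    std::vector<llama_pos> expand_mrope_positions(const std::vector<llama_pos> & pos) {
        const size_t n_tokens = pos.size();
        std::vector<llama_pos> pos_data(4 * n_tokens);
        for (size_t i = 0; i < n_tokens; ++i) {
            pos_data[               i] = pos[i];
            pos_data[    n_tokens + i] = pos[i];
            pos_data[2 * n_tokens + i] = pos[i];
            pos_data[3 * n_tokens + i] = 0;
        }
        return pos_data;
    }

    int main() {
        const auto out = expand_mrope_positions({5, 6, 7});
        std::printf("%zu values (3 tokens x 4 position dims)\n", out.size()); // 12
        return 0;
    }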
examples/talk-llama/llama-graph.h CHANGED
@@ -90,29 +90,27 @@ public:
 
 class llm_graph_input_pos : public llm_graph_input_i {
 public:
-    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+    llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
     virtual ~llm_graph_input_pos() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * pos = nullptr; // I32 [n_batch]
 
-    const int64_t n_pos_per_token = 1;
+    const int64_t n_pos_per_embd = 1;
 };
 
 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
     virtual ~llm_graph_input_attn_temp() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
 
-    const int64_t n_pos_per_token = 1;
-
     const uint32_t n_attn_temp_floor_scale;
     const float    f_attn_temp_scale;
 };
@@ -419,7 +417,7 @@ struct llm_graph_context {
 
     llm_graph_context(const llm_graph_params & params);
 
-    int64_t n_pos_per_token() const;
+    int64_t n_pos_per_embd() const;
 
     void cb(ggml_tensor * cur, const char * name, int il) const;
 
examples/talk-llama/llama-hparams.h CHANGED
@@ -66,6 +66,7 @@ struct llama_hparams {
     float    expert_weights_scale = 0.0;
     bool     expert_weights_norm  = false;
     uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+    uint32_t moe_every_n_layers   = 0;
 
     float f_norm_eps;
     float f_norm_rms_eps;
examples/talk-llama/llama-model.cpp CHANGED
@@ -40,14 +40,17 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_335M:          return "335M";
         case LLM_TYPE_410M:          return "410M";
         case LLM_TYPE_450M:          return "450M";
+        case LLM_TYPE_475M:          return "475M";
         case LLM_TYPE_770M:          return "770M";
         case LLM_TYPE_780M:          return "780M";
         case LLM_TYPE_0_5B:          return "0.5B";
+        case LLM_TYPE_0_6B:          return "0.6B";
         case LLM_TYPE_1B:            return "1B";
         case LLM_TYPE_1_3B:          return "1.3B";
         case LLM_TYPE_1_4B:          return "1.4B";
         case LLM_TYPE_1_5B:          return "1.5B";
         case LLM_TYPE_1_6B:          return "1.6B";
+        case LLM_TYPE_1_7B:          return "1.7B";
         case LLM_TYPE_1_8B:          return "1.8B";
         case LLM_TYPE_2B:            return "2B";
         case LLM_TYPE_2_8B:          return "2.8B";
@@ -66,6 +69,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_15B:           return "15B";
         case LLM_TYPE_16B:           return "16B";
         case LLM_TYPE_20B:           return "20B";
+        case LLM_TYPE_27B:           return "27B";
         case LLM_TYPE_30B:           return "30B";
         case LLM_TYPE_32B:           return "32B";
         case LLM_TYPE_34B:           return "34B";
@@ -74,6 +78,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_65B:           return "65B";
         case LLM_TYPE_70B:           return "70B";
         case LLM_TYPE_236B:          return "236B";
+        case LLM_TYPE_290B:          return "290B";
         case LLM_TYPE_314B:          return "314B";
         case LLM_TYPE_671B:          return "671B";
         case LLM_TYPE_SMALL:         return "0.1B";
@@ -88,10 +93,10 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16x3_8B:       return "16x3.8B";
         case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
         case LLM_TYPE_57B_A14B:      return "57B.A14B";
-        case LLM_TYPE_27B:           return "27B";
-        case LLM_TYPE_290B:          return "290B";
         case LLM_TYPE_17B_16E:       return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E:      return "17Bx128E (Maverick)";
+        case LLM_TYPE_30B_A3B:       return "30B.A3B";
+        case LLM_TYPE_235B_A22B:     return "235B.A22B";
         default:                     return "?B";
     }
 }
@@ -695,13 +700,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL,        hparams.causal_attn);
                 ml.get_key(LLM_KV_POOLING_TYPE,            hparams.pooling_type);
+                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,      hparams.moe_every_n_layers, 0);
 
                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
-                    type = LLM_TYPE_137M;
+                    if (arch == LLM_ARCH_NOMIC_BERT) {
+                        type = LLM_TYPE_137M;
+                    } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
+                        type = LLM_TYPE_475M;
+                    }
                 }
             } break;
         case LLM_ARCH_BLOOM:
@@ -791,6 +802,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -800,6 +815,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -2057,6 +2074,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_BERT:
             case LLM_ARCH_NOMIC_BERT:
+            case LLM_ARCH_NOMIC_BERT_MOE:
                {
                    tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
@@ -2090,20 +2108,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                    }
 
+                   if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                       layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+                   }
+
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
                    layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                    layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd}, 0);
 
-                   layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                   layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
-                   if (arch == LLM_ARCH_BERT) {
+                   if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-                       layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
-                       layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                       layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
+                       layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
+                       layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                    } else {
-                       layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                       layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                       layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+                       if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                           layer.bo         = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+                           layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, 0);
+                           layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                       } else {
+                           layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                       }
                    }
 
                    layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
@@ -5730,6 +5759,11 @@ struct llm_build_bert : public llm_graph_context {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
 
+                if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }
+
                 Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                 Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -5782,13 +5816,29 @@ struct llm_build_bert : public llm_graph_context {
         cb(ffn_inp, "ffn_inp", il);
 
         // feed-forward network
-        if (model.arch == LLM_ARCH_BERT) {
+        if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+            // MoE branch
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    nullptr,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    hparams.n_expert,
+                    hparams.n_expert_used,
+                    LLM_FFN_GELU,
+                    false, false,
+                    0.0f,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+            cb(cur, "ffn_moe_out", il);
+        } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                     NULL,                      NULL,                        NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
                     LLM_FFN_GELU, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
         } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up, NULL, NULL,
@@ -5796,6 +5846,7 @@ struct llm_build_bert : public llm_graph_context {
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
                     LLM_FFN_GELU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
         } else {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up, NULL, NULL,
@@ -5803,8 +5854,8 @@ struct llm_build_bert : public llm_graph_context {
                     model.layers[il].ffn_down, NULL, NULL,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
         }
-        cb(cur, "ffn_out", il);
 
         // attentions bypass the intermediate layer
         cur = ggml_add(ctx0, cur, ffn_inp);
@@ -12842,6 +12893,7 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
            {
                llm = std::make_unique<llm_build_bert>(*this, params, gf);
            } break;
@@ -13200,6 +13252,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
+       case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
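With the new moe_every_n_layers hyperparameter, both the tensor loader and llm_build_bert treat a layer i as an expert layer when i % moe_every_n_layers == 1; all other layers keep the dense FFN. A small illustrative check of that layer selection; the value 2 is the setting implied by the 475M nomic-bert-moe case above, not a confirmed config constant:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t moe_every_n_layers = 2; // assumed value for the 475M variant
        const int      n_layer            = 12;

        for (int i = 0; i < n_layer; ++i) {
            // Mirrors the condition used in load_tensors() and llm_build_bert above.
            const bool is_moe = moe_every_n_layers > 0 && i % moe_every_n_layers == 1;
            std::printf("layer %2d: %s\n", i, is_moe ? "moe ffn" : "dense ffn");
        }
        return 0;
    }

With these assumptions, the odd-numbered layers (1, 3, 5, ...) get the ffn_*_exps expert tensors and the even-numbered layers keep the biased dense up/down projections.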
examples/talk-llama/llama-model.h CHANGED
@@ -36,14 +36,17 @@ enum llm_type {
     LLM_TYPE_335M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
+    LLM_TYPE_475M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
+    LLM_TYPE_0_6B,
     LLM_TYPE_1B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_7B,
     LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
@@ -62,6 +65,7 @@ enum llm_type {
     LLM_TYPE_15B,
     LLM_TYPE_16B,
     LLM_TYPE_20B,
+    LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
     LLM_TYPE_34B,
@@ -70,6 +74,7 @@ enum llm_type {
     LLM_TYPE_65B,
     LLM_TYPE_70B,
     LLM_TYPE_236B,
+    LLM_TYPE_290B,
     LLM_TYPE_314B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
@@ -84,10 +89,10 @@ enum llm_type {
     LLM_TYPE_16x3_8B,
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
-    LLM_TYPE_27B,
-    LLM_TYPE_290B,
     LLM_TYPE_17B_16E,  // llama4 Scout
     LLM_TYPE_17B_128E, // llama4 Maverick
+    LLM_TYPE_30B_A3B,
+    LLM_TYPE_235B_A22B,
 };
 
 struct llama_layer_posnet {
examples/talk-llama/llama-sampling.cpp CHANGED
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }
 
     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }
 
     k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
         cur_p->sorted = true;
     }
+
     cur_p->size = k;
 }
 
examples/talk-llama/llama.h CHANGED
@@ -1232,6 +1232,7 @@ extern "C" {
                 "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
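To close, the sampler change documented above: with k <= 0, llama_sampler_top_k_impl now returns early and leaves the candidate list untouched, where it previously clamped k to the full candidate count. A simplified model of the new control flow, with llama_token_data_array reduced to a plain vector of logits; this is a sketch under that simplification, not the library's implementation:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <vector>

    // Keep only the k largest logits; with k <= 0 the list is left as-is (noop).
    static void top_k_impl(std::vector<float> & cand, int32_t k) {
        if (k <= 0) {
            return; // noop, matching the new behavior documented in llama.h
        }
        k = std::min<int32_t>(k, (int32_t) cand.size());
        std::partial_sort(cand.begin(), cand.begin() + k, cand.end(), std::greater<float>());
        cand.resize(k);
    }

    int main() {
        std::vector<float> logits = {0.1f, 2.0f, 1.5f, -0.3f};
        top_k_impl(logits, 0);                          // unchanged: still 4 candidates
        std::printf("k=0 -> %zu candidates\n", logits.size());
        top_k_impl(logits, 2);                          // keeps the two largest logits
        std::printf("k=2 -> %zu candidates\n", logits.size());
        return 0;
    }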