ggerganov committed · Commit 542accf (unverified) · 1 Parent(s): b3c9e81

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -197,6 +197,7 @@ enum llm_arch {
  LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
+ LLM_ARCH_NOMIC_BERT,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
@@ -211,27 +212,28 @@ enum llm_arch {
  };
 
  static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
- { LLM_ARCH_LLAMA, "llama" },
- { LLM_ARCH_FALCON, "falcon" },
- { LLM_ARCH_GPT2, "gpt2" },
- { LLM_ARCH_GPTJ, "gptj" },
- { LLM_ARCH_GPTNEOX, "gptneox" },
- { LLM_ARCH_MPT, "mpt" },
- { LLM_ARCH_BAICHUAN, "baichuan" },
- { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
- { LLM_ARCH_REFACT, "refact" },
- { LLM_ARCH_BERT, "bert" },
- { LLM_ARCH_BLOOM, "bloom" },
- { LLM_ARCH_STABLELM, "stablelm" },
- { LLM_ARCH_QWEN, "qwen" },
- { LLM_ARCH_QWEN2, "qwen2" },
- { LLM_ARCH_PHI2, "phi2" },
- { LLM_ARCH_PLAMO, "plamo" },
- { LLM_ARCH_CODESHELL, "codeshell" },
- { LLM_ARCH_ORION, "orion" },
- { LLM_ARCH_INTERNLM2, "internlm2" },
- { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
+ { LLM_ARCH_STARCODER, "starcoder" },
+ { LLM_ARCH_PERSIMMON, "persimmon" },
+ { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BERT, "bert" },
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+ { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
+ { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
+ { LLM_ARCH_INTERNLM2, "internlm2" },
+ { LLM_ARCH_MINICPM, "minicpm" },
  };
 
  enum llm_kv {
@@ -254,6 +256,7 @@ enum llm_kv {
  LLM_KV_TENSOR_DATA_LAYOUT,
  LLM_KV_EXPERT_COUNT,
  LLM_KV_EXPERT_USED_COUNT,
+ LLM_KV_POOLING_TYPE,
 
  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -311,6 +314,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
  { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
 
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -373,6 +377,7 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_OUT,
  LLM_TENSOR_ATTN_NORM,
  LLM_TENSOR_ATTN_NORM_2,
+ LLM_TENSOR_ATTN_OUT_NORM,
  LLM_TENSOR_ATTN_ROT_EMBD,
  LLM_TENSOR_FFN_GATE_INP,
  LLM_TENSOR_FFN_NORM,
@@ -385,6 +390,7 @@ enum llm_tensor {
  LLM_TENSOR_FFN_UP_EXP,
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_LAYER_OUT_NORM,
  };
 
  static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -550,12 +556,27 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
  { LLM_TENSOR_TOKEN_TYPES, "token_types" },
  { LLM_TENSOR_POS_EMBD, "position_embd" },
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
- { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_NOMIC_BERT,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
@@ -772,22 +793,37 @@ struct LLM_TN {
  llm_arch arch;
 
  std::string operator()(llm_tensor tensor) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return LLM_TENSOR_NAMES[arch].at(tensor);
  }
 
  std::string operator()(llm_tensor tensor, const std::string & suffix) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
  }
 
  std::string operator()(llm_tensor tensor, int bid) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
  }
 
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
  }
 
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
  }
  };
@@ -998,7 +1034,7 @@ struct llama_mmap {
998
  int fd = fileno(file->fp);
999
  int flags = MAP_SHARED;
1000
  // prefetch/readahead impairs performance on NUMA systems
1001
- if (numa) { prefetch = 0; }
1002
  #ifdef __linux__
1003
  // advise the kernel to read the file sequentially (increases readahead)
1004
  if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
@@ -1468,6 +1504,7 @@ enum e_model {
1468
  MODEL_22M,
1469
  MODEL_33M,
1470
  MODEL_109M,
 
1471
  MODEL_335M,
1472
  MODEL_0_5B,
1473
  MODEL_1B,
@@ -1520,11 +1557,13 @@ struct llama_hparams {
1520
  uint32_t n_yarn_orig_ctx;
1521
  int32_t rope_scaling_type_train;
1522
 
1523
- float f_clamp_kqv;
1524
- float f_max_alibi_bias;
1525
 
1526
  bool causal_attn = true;
 
1527
 
 
1528
 
1529
  bool operator!=(const llama_hparams & other) const {
1530
  if (this->vocab_only != other.vocab_only) return true;
@@ -1586,6 +1625,7 @@ struct llama_cparams {
1586
 
1587
  bool mul_mat_q;
1588
  bool offload_kqv;
 
1589
 
1590
  ggml_backend_sched_eval_callback cb_eval;
1591
  void * cb_eval_user_data;
@@ -1601,6 +1641,8 @@ struct llama_layer {
1601
  struct ggml_tensor * attn_q_norm_b;
1602
  struct ggml_tensor * attn_k_norm;
1603
  struct ggml_tensor * attn_k_norm_b;
 
 
1604
 
1605
  // attention
1606
  struct ggml_tensor * wq;
@@ -1619,6 +1661,8 @@ struct llama_layer {
1619
  // normalization
1620
  struct ggml_tensor * ffn_norm;
1621
  struct ggml_tensor * ffn_norm_b;
 
 
1622
 
1623
  // ff
1624
  struct ggml_tensor * ffn_gate; // w1
@@ -1880,8 +1924,10 @@ struct llama_context {
1880
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
1881
  struct ggml_tensor * inp_pos; // I32 [n_batch]
1882
  struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
 
1883
  struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
1884
- struct ggml_tensor * inp_sum; // F32 [1, n_batch]
 
1885
 
1886
  #ifdef GGML_USE_MPI
1887
  ggml_mpi_context * ctx_mpi = NULL;
@@ -2480,6 +2526,7 @@ struct llama_model_loader {
2480
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2481
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2482
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
 
2483
  default:
2484
  {
2485
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2829,6 +2876,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2829
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2830
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
2831
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
 
2832
 
2833
  default: return "unknown, may not work";
2834
  }
@@ -2836,6 +2884,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2836
 
2837
  static const char * llama_model_type_name(e_model type) {
2838
  switch (type) {
 
 
 
 
 
2839
  case MODEL_1B: return "1B";
2840
  case MODEL_2B: return "2B";
2841
  case MODEL_3B: return "3B";
@@ -3005,6 +3058,11 @@ static void llm_load_hparams(
3005
  case 40: model.type = e_model::MODEL_13B; break;
3006
  default: model.type = e_model::MODEL_UNKNOWN;
3007
  }
 
 
 
 
 
3008
  } break;
3009
  case LLM_ARCH_STARCODER:
3010
  {
@@ -3032,12 +3090,16 @@ static void llm_load_hparams(
3032
  case 32: model.type = e_model::MODEL_1B; break;
3033
  default: model.type = e_model::MODEL_UNKNOWN;
3034
  }
 
 
 
3035
  } break;
3036
  case LLM_ARCH_BERT:
3037
  {
3038
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3039
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3040
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
 
3041
 
3042
  switch (hparams.n_layer) {
3043
  case 3:
@@ -3053,6 +3115,17 @@ static void llm_load_hparams(
3053
  model.type = e_model::MODEL_335M; break; // bge-large
3054
  }
3055
  } break;
3056
  case LLM_ARCH_BLOOM:
3057
  {
3058
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3065,11 +3138,12 @@ static void llm_load_hparams(
3065
  case 4096: model.type = e_model::MODEL_7B; break;
3066
  } break;
3067
  }
 
 
 
3068
  } break;
3069
  case LLM_ARCH_MPT:
3070
  {
3071
- hparams.f_clamp_kqv = 0.0f;
3072
-
3073
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3074
  ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
3075
  ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
@@ -3171,6 +3245,10 @@ static void llm_load_hparams(
3171
  }
3172
 
3173
  model.ftype = ml.ftype;
 
 
 
 
3174
  }
3175
 
3176
  // TODO: This should probably be in llama.h
@@ -3294,7 +3372,12 @@ static void llm_load_vocab(
3294
 
3295
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
3296
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
3297
- vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
 
 
 
 
 
3298
  } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
3299
  vocab.linefeed_id = vocab.special_pad_id;
3300
  } else {
@@ -3850,10 +3933,14 @@ static bool llm_load_tensors(
3850
  }
3851
  } break;
3852
  case LLM_ARCH_BERT:
 
3853
  {
3854
- model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3855
- model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
3856
- model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
 
 
 
3857
  model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3858
  model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3859
 
@@ -3863,29 +3950,38 @@ static bool llm_load_tensors(
3863
 
3864
  auto & layer = model.layers[i];
3865
 
3866
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3867
- layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
 
3868
 
3869
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3870
- layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3871
 
3872
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3873
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
 
 
 
3874
 
3875
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3876
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
3877
 
3878
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3879
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
3880
 
3881
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3882
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3883
 
3884
- layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3885
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
 
3886
 
3887
- layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3888
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
 
 
 
 
 
3889
  }
3890
  } break;
3891
  case LLM_ARCH_BLOOM:
@@ -4364,9 +4460,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
4364
 
4365
  model.hparams.vocab_only = params.vocab_only;
4366
 
4367
- llm_load_arch (ml, model);
4368
- llm_load_hparams(ml, model);
4369
- llm_load_vocab (ml, model);
4370
 
4371
  llm_load_print_meta(ml, model);
4372
 
@@ -4683,10 +4791,10 @@ static struct ggml_tensor * llm_build_kqv(
4683
  struct ggml_tensor * wo_b,
4684
  struct ggml_tensor * q_cur,
4685
  struct ggml_tensor * kq_mask,
 
4686
  int64_t n_ctx,
4687
  int32_t n_tokens,
4688
  int32_t n_kv,
4689
- float max_alibi_bias,
4690
  float kq_scale,
4691
  const llm_build_cb & cb,
4692
  int il) {
@@ -4716,26 +4824,26 @@ static struct ggml_tensor * llm_build_kqv(
4716
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4717
  }
4718
 
4719
- if (max_alibi_bias > 0.0f) {
4720
- // temporary branch until we figure out how to handle ggml_alibi through ggml_add
 
 
 
4721
  kq = ggml_scale(ctx, kq, kq_scale);
4722
  cb(kq, "kq_scaled", il);
4723
 
4724
- if (max_alibi_bias > 0.0f) {
4725
- // TODO: n_head or n_head_kv
4726
- // TODO: K-shift is likely not working
4727
- // TODO: change to ggml_add
4728
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
4729
- cb(kq, "kq_scaled_alibi", il);
4730
- }
4731
 
4732
  kq = ggml_add(ctx, kq, kq_mask);
4733
  cb(kq, "kq_masked", il);
4734
 
4735
  kq = ggml_soft_max(ctx, kq);
4736
  cb(kq, "kq_soft_max", il);
4737
- } else {
4738
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
 
 
4739
  cb(kq, "kq_soft_max_ext", il);
4740
  }
4741
 
@@ -4783,11 +4891,11 @@ static struct ggml_tensor * llm_build_kv(
4783
  struct ggml_tensor * v_cur,
4784
  struct ggml_tensor * q_cur,
4785
  struct ggml_tensor * kq_mask,
 
4786
  int64_t n_ctx,
4787
  int32_t n_tokens,
4788
  int32_t kv_head,
4789
  int32_t n_kv,
4790
- float max_alibi_bias,
4791
  float kq_scale,
4792
  const llm_build_cb & cb,
4793
  int il) {
@@ -4801,9 +4909,8 @@ static struct ggml_tensor * llm_build_kv(
4801
  llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
4802
 
4803
  struct ggml_tensor * cur;
4804
- cur = llm_build_kqv(ctx, model, hparams, kv, graph,
4805
- wo, wo_b,
4806
- q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
4807
  cb(cur, "kqv_out", il);
4808
 
4809
  return cur;
@@ -4844,7 +4951,7 @@ struct llm_build_context {
4844
  const int32_t n_orig_ctx;
4845
 
4846
  const bool do_rope_shift;
4847
- const bool causal_attn;
4848
 
4849
  const llm_build_cb & cb;
4850
 
@@ -4888,7 +4995,7 @@ struct llm_build_context {
4888
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
4889
  n_orig_ctx (cparams.n_yarn_orig_ctx),
4890
  do_rope_shift (worst_case || kv_self.has_shift),
4891
- causal_attn (hparams.causal_attn),
4892
  cb (cb),
4893
  buf_compute_meta (lctx.buf_compute_meta) {
4894
  // all initializations should be done in init()
@@ -4971,7 +5078,7 @@ struct llm_build_context {
4971
  }
4972
 
4973
  Qcur = ggml_rope_custom(
4974
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
4975
  hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
4976
  ext_factor, attn_factor, beta_fast, beta_slow
4977
  );
@@ -4986,7 +5093,7 @@ struct llm_build_context {
4986
 
4987
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
4988
  model.layers[il].wo, model.layers[il].bo,
4989
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4990
  cb(cur, "kqv_out", il);
4991
  }
4992
 
@@ -5116,6 +5223,10 @@ struct llm_build_context {
5116
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5117
  cb(KQ_mask, "KQ_mask", -1);
5118
 
 
 
 
 
5119
  // shift the entire K-cache if needed
5120
  if (do_rope_shift) {
5121
  llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
@@ -5164,12 +5275,9 @@ struct llm_build_context {
5164
  cb(Kcur, "Kcur", il);
5165
 
5166
 
5167
- // apply ALiBi for 13B model
5168
- const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
5169
-
5170
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5171
  model.layers[il].wo, NULL,
5172
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5173
  cb(cur, "kqv_out", il);
5174
  }
5175
 
@@ -5293,7 +5401,7 @@ struct llm_build_context {
5293
 
5294
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5295
  model.layers[il].wo, NULL,
5296
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5297
  cb(cur, "kqv_out", il);
5298
  }
5299
 
@@ -5392,7 +5500,7 @@ struct llm_build_context {
5392
 
5393
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5394
  model.layers[il].wo, model.layers[il].bo,
5395
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5396
  cb(cur, "kqv_out", il);
5397
  }
5398
 
@@ -5597,7 +5705,7 @@ struct llm_build_context {
5597
 
5598
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5599
  model.layers[il].wo, model.layers[il].bo,
5600
- Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5601
  cb(cur, "kqv_out", il);
5602
  }
5603
 
@@ -5659,6 +5767,10 @@ struct llm_build_context {
5659
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5660
  cb(KQ_mask, "KQ_mask", -1);
5661
 
 
 
 
 
5662
  for (int il = 0; il < n_layer; ++il) {
5663
  struct ggml_tensor * inpSA = inpL;
5664
 
@@ -5686,7 +5798,7 @@ struct llm_build_context {
5686
 
5687
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5688
  model.layers[il].wo, NULL,
5689
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5690
  cb(cur, "kqv_out", il);
5691
  }
5692
 
@@ -5736,22 +5848,27 @@ struct llm_build_context {
5736
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5737
 
5738
  const int64_t n_embd_head = hparams.n_embd_head_v;
 
5739
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5740
- GGML_ASSERT(n_embd_head == hparams.n_rot);
5741
 
5742
  struct ggml_tensor * cur;
5743
  struct ggml_tensor * inpL;
5744
 
5745
  // get input vectors with right size
 
5746
  struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5747
- struct ggml_tensor * inp_sum = ggml_view_1d(ctx0, lctx.inp_sum, n_tokens, 0);
 
5748
 
5749
  // construct input embeddings (token, type, position)
5750
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
 
5751
  // token types are hardcoded to zero ("Sentence A")
5752
  struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
5753
  inpL = ggml_add(ctx0, inpL, type_row0);
5754
- inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
 
 
5755
  cb(inpL, "inp_embd", -1);
5756
 
5757
  // embed layer norm
@@ -5767,7 +5884,7 @@ struct llm_build_context {
5767
  struct ggml_tensor * cur = inpL;
5768
 
5769
  // self-attention
5770
- {
5771
  struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
5772
  cb(Qcur, "Qcur", il);
5773
 
@@ -5782,7 +5899,38 @@ struct llm_build_context {
5782
 
5783
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5784
  model.layers[il].wo, model.layers[il].bo,
5785
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5786
  cb(cur, "kqv_out", il);
5787
  }
5788
 
@@ -5790,25 +5938,34 @@ struct llm_build_context {
5790
  cur = ggml_add(ctx0, cur, inpL);
5791
 
5792
  // attention layer norm
5793
- cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
5794
 
5795
  struct ggml_tensor * ffn_inp = cur;
5796
  cb(ffn_inp, "ffn_inp", il);
5797
 
5798
  // feed-forward network
5799
- cur = llm_build_ffn(ctx0, cur,
5800
- model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5801
- NULL, NULL,
5802
- model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5803
- NULL,
5804
- LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5805
  cb(cur, "ffn_out", il);
5806
 
5807
  // attentions bypass the intermediate layer
5808
  cur = ggml_add(ctx0, cur, ffn_inp);
5809
 
5810
  // output layer norm
5811
- cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
5812
 
5813
  // input for next layer
5814
  inpL = cur;
@@ -5817,9 +5974,15 @@ struct llm_build_context {
5817
  // final output
5818
  cur = inpL;
5819
 
5820
- // pooling
5821
- cur = ggml_mul_mat(ctx0, inp_sum, ggml_cont(ctx0, ggml_transpose(ctx0, cur)));
5822
- cb(cur, "result_embed", -1);
5823
 
5824
  ggml_build_forward_expand(gf, cur);
5825
 
@@ -5843,6 +6006,10 @@ struct llm_build_context {
5843
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5844
  cb(KQ_mask, "KQ_mask", -1);
5845
 
 
 
 
 
5846
  inpL = llm_build_norm(ctx0, inpL, hparams,
5847
  model.tok_norm,
5848
  model.tok_norm_b,
@@ -5876,7 +6043,7 @@ struct llm_build_context {
5876
 
5877
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5878
  model.layers[il].wo, model.layers[il].bo,
5879
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5880
  cb(cur, "kqv_out", il);
5881
  }
5882
 
@@ -5936,6 +6103,10 @@ struct llm_build_context {
5936
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5937
  cb(KQ_mask, "KQ_mask", -1);
5938
 
 
 
 
 
5939
  for (int il = 0; il < n_layer; ++il) {
5940
  struct ggml_tensor * attn_norm;
5941
 
@@ -5969,7 +6140,7 @@ struct llm_build_context {
5969
 
5970
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5971
  model.layers[il].wo, NULL,
5972
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5973
  cb(cur, "kqv_out", il);
5974
  }
5975
 
@@ -6091,7 +6262,7 @@ struct llm_build_context {
6091
 
6092
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6093
  model.layers[il].wo, NULL,
6094
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6095
  cb(cur, "kqv_out", il);
6096
  }
6097
 
@@ -6206,7 +6377,7 @@ struct llm_build_context {
6206
 
6207
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6208
  model.layers[il].wo, NULL,
6209
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6210
  cb(cur, "kqv_out", il);
6211
  }
6212
 
@@ -6327,7 +6498,7 @@ struct llm_build_context {
6327
 
6328
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6329
  model.layers[il].wo, model.layers[il].bo,
6330
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6331
  cb(cur, "kqv_out", il);
6332
  }
6333
 
@@ -6454,7 +6625,7 @@ struct llm_build_context {
6454
 
6455
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6456
  model.layers[il].wo, model.layers[il].bo,
6457
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
6458
  cb(cur, "kqv_out", il);
6459
  }
6460
 
@@ -6557,7 +6728,7 @@ struct llm_build_context {
6557
 
6558
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6559
  model.layers[il].wo, NULL,
6560
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6561
  cb(cur, "kqv_out", il);
6562
  }
6563
  struct ggml_tensor * sa_out = cur;
@@ -6656,7 +6827,7 @@ struct llm_build_context {
6656
 
6657
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6658
  model.layers[il].wo, model.layers[il].bo,
6659
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6660
  cb(cur, "kqv_out", il);
6661
  }
6662
 
@@ -6765,7 +6936,7 @@ struct llm_build_context {
6765
 
6766
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6767
  model.layers[il].wo, model.layers[il].bo,
6768
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6769
  cb(cur, "kqv_out", il);
6770
  }
6771
 
@@ -6883,7 +7054,7 @@ struct llm_build_context {
6883
 
6884
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6885
  model.layers[il].wo, NULL,
6886
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6887
  cb(cur, "kqv_out", il);
6888
  }
6889
 
@@ -7002,7 +7173,7 @@ struct llm_build_context {
7002
 
7003
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7004
  model.layers[il].wo, model.layers[il].bo,
7005
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7006
  cb(cur, "kqv_out", il);
7007
  }
7008
 
@@ -7134,7 +7305,7 @@ struct llm_build_context {
7134
 
7135
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7136
  model.layers[il].wo, model.layers[il].bo,
7137
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7138
  cb(cur, "kqv_out", il);
7139
  }
7140
 
@@ -7249,6 +7420,7 @@ static struct ggml_cgraph * llama_build_graph(
7249
  result = llm.build_refact();
7250
  } break;
7251
  case LLM_ARCH_BERT:
 
7252
  {
7253
  result = llm.build_bert();
7254
  } break;
@@ -7352,7 +7524,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7352
 
7353
  for (int i = 0; i < n_kv; ++i) {
7354
  float f;
7355
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
 
7356
  f = -INFINITY;
7357
  } else {
7358
  f = 0;
@@ -7363,13 +7536,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7363
  }
7364
  }
7365
 
 
 
7366
 
7367
- {
7368
- assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
7369
- float * data = (float *) lctx.inp_sum->data;
7370
 
7371
- for (int i = 0; i < batch.n_tokens; ++i) {
7372
- data[i] = 1.0f/float(batch.n_tokens);
7373
  }
7374
  }
7375
 
@@ -7384,6 +7559,49 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7384
  data[i] = lctx.kv_self.cells[i].delta;
7385
  }
7386
  }
7387
  }
7388
 
7389
  // decode a batch of tokens by evaluating the transformer
@@ -7495,7 +7713,7 @@ static int llama_decode_internal(
7495
  embeddings = gf->nodes[gf->n_nodes - 3];
7496
  GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7497
  }
7498
- } else if (strcmp(res->name, "result_embed") == 0) {
7499
  embeddings = res;
7500
  res = nullptr;
7501
  } else {
@@ -7615,11 +7833,12 @@ static int llama_decode_internal(
7615
  if (!lctx.embedding.empty()) {
7616
  auto & embedding_out = lctx.embedding;
7617
 
7618
- const int64_t embed_pos = res ? n_embd * (n_tokens-1) : 0;
 
7619
 
7620
- embedding_out.resize(n_embd);
7621
  ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
7622
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embed_pos*sizeof(float), n_embd*sizeof(float));
7623
  ggml_backend_synchronize(embeddings_backend);
7624
  }
7625
 
@@ -7696,7 +7915,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
7696
  switch (llama_vocab_get_type(vocab)) {
7697
  case LLAMA_VOCAB_TYPE_SPM: {
7698
  const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
7699
- return vocab.token_to_id.at(buf);
7700
  }
7701
  case LLAMA_VOCAB_TYPE_WPM:
7702
  case LLAMA_VOCAB_TYPE_BPE: {
@@ -7744,7 +7969,7 @@ struct llm_bigram_spm {
7744
  };
7745
 
7746
  struct llm_tokenizer_spm {
7747
- llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
7748
 
7749
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
7750
  // split string into utf8 chars
@@ -7819,6 +8044,7 @@ private:
7819
 
7820
  if (p == rev_merge.end()) {
7821
  // output any symbols that did not form tokens as bytes.
 
7822
  for (int j = 0; j < (int)symbol.n; ++j) {
7823
  llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
7824
  output.push_back(token_id);
@@ -8381,17 +8607,18 @@ struct fragment_buffer_variant {
8381
  token(_token),
8382
  raw_text(_dummy),
8383
  offset(0),
8384
- length(0){}
 
8385
  fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
8386
  :
8387
  type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
8388
- token((llama_vocab::id)-1),
8389
  raw_text(_raw_text),
8390
  offset(_offset),
8391
  length(_length){
8392
- GGML_ASSERT( _offset >= 0 );
8393
- GGML_ASSERT( _length >= 1 );
8394
- GGML_ASSERT( offset + length <= raw_text.length() );
8395
  }
8396
 
8397
  const FRAGMENT_BUFFER_VARIANT_TYPE type;
@@ -8515,14 +8742,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
8515
  }
8516
 
8517
  std::forward_list<fragment_buffer_variant> fragment_buffer;
8518
- fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
8519
 
8520
- if (special) tokenizer_st_partition( vocab, fragment_buffer );
8521
 
8522
  switch (vocab.type) {
8523
  case LLAMA_VOCAB_TYPE_SPM:
8524
  {
8525
- for (const auto & fragment: fragment_buffer) {
8526
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
8527
  // without adding this leading whitespace, we do not get the same results as the original tokenizer
8528
 
@@ -8550,7 +8777,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
8550
  } break;
8551
  case LLAMA_VOCAB_TYPE_BPE:
8552
  {
8553
- for (const auto & fragment: fragment_buffer) {
8554
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
8555
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
8556
 
@@ -8566,7 +8793,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
8566
  } break;
8567
  case LLAMA_VOCAB_TYPE_WPM:
8568
  {
8569
- for (const auto & fragment: fragment_buffer) {
8570
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
8571
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
8572
 
@@ -10087,20 +10314,20 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10087
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
10088
  new_type = GGML_TYPE_Q8_0;
10089
  }
10090
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
10091
  new_type = GGML_TYPE_Q5_K;
10092
  }
10093
  else if (new_type != GGML_TYPE_Q8_0) {
10094
  new_type = GGML_TYPE_Q6_K;
10095
  }
10096
  } else if (name == "token_embd.weight") {
10097
- if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
10098
  new_type = GGML_TYPE_Q2_K;
10099
  }
10100
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10101
  new_type = GGML_TYPE_Q4_K;
10102
  }
10103
- } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
10104
  if (name.find("attn_v.weight") != std::string::npos) {
10105
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
10106
  else new_type = GGML_TYPE_Q2_K;
@@ -10110,6 +10337,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10110
  if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
10111
  ++qs.i_ffn_down;
10112
  }
 
 
 
10113
  } else if (name.find("attn_v.weight") != std::string::npos) {
10114
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
10115
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -10227,6 +10457,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10227
  }
10228
  ++qs.i_ffn_up;
10229
  }
 
10230
  // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
10231
  //}
10232
  // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
@@ -10242,7 +10473,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10242
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
10243
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
10244
  new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
10245
- new_type == GGML_TYPE_IQ3_XXS) {
10246
  int nx = tensor->ne[0];
10247
  int ny = tensor->ne[1];
10248
  if (nx % QK_K != 0) {
@@ -10257,6 +10488,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10257
  case GGML_TYPE_IQ2_XXS:
10258
  case GGML_TYPE_IQ2_XS:
10259
  case GGML_TYPE_IQ3_XXS:
 
10260
  case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
10261
  case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
10262
  case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
@@ -10286,19 +10518,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10286
 
10287
  // K-quants
10288
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
10289
- case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
10290
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
10291
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
10292
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
10293
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
10294
  case LLAMA_FTYPE_MOSTLY_Q4_K_S:
10295
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
10296
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
10297
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
10298
- case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
10299
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
10300
- case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
10301
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
 
10302
 
10303
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
10304
  }
@@ -10428,7 +10661,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10428
  quantize &= !params->only_copy;
10429
 
10430
  // do not quantize expert gating tensors
10431
- quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
 
 
 
 
10432
 
10433
  enum ggml_type new_type;
10434
  void * new_data;
@@ -10468,6 +10705,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
10468
  }
10469
  if ((new_type == GGML_TYPE_IQ2_XXS ||
10470
  new_type == GGML_TYPE_IQ2_XS ||
 
10471
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
10472
  LLAMA_LOG_ERROR("\n\n============================================================\n");
10473
  LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@@ -10702,7 +10940,7 @@ static int llama_apply_lora_from_file_internal(
10702
  {
10703
  LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
10704
  __func__, ftype);
10705
- return false;
10706
  }
10707
  }
10708
 
@@ -10930,6 +11168,7 @@ struct llama_context_params llama_context_default_params() {
10930
  /*.logits_all =*/ false,
10931
  /*.embedding =*/ false,
10932
  /*.offload_kqv =*/ true,
 
10933
  };
10934
 
10935
  return result;
@@ -10990,7 +11229,7 @@ bool llama_mlock_supported(void) {
10990
  return llama_supports_mlock();
10991
  }
10992
 
10993
- void llama_backend_init(bool numa) {
10994
  ggml_time_init();
10995
 
10996
  // needed to initialize f16 tables
@@ -11000,15 +11239,17 @@ void llama_backend_init(bool numa) {
11000
  ggml_free(ctx);
11001
  }
11002
 
11003
- if (numa) {
11004
- ggml_numa_init();
11005
- }
11006
-
11007
  #ifdef GGML_USE_MPI
11008
  ggml_mpi_backend_init();
11009
  #endif
11010
  }
11011
 
 
 
 
 
 
 
11012
  void llama_backend_free(void) {
11013
  #ifdef GGML_USE_MPI
11014
  ggml_mpi_backend_free();
@@ -11085,6 +11326,7 @@ struct llama_context * llama_new_context_with_model(
11085
  cparams.yarn_beta_slow = params.yarn_beta_slow;
11086
  cparams.mul_mat_q = params.mul_mat_q;
11087
  cparams.offload_kqv = params.offload_kqv;
 
11088
 
11089
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
11090
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -11232,14 +11474,14 @@ struct llama_context * llama_new_context_with_model(
11232
  // resized during inference, reserve maximum
11233
  ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
11234
 
11235
- if (params.embedding){
11236
  ctx->embedding.resize(hparams.n_embd);
11237
  }
11238
 
11239
  // graph inputs
11240
  {
11241
  ggml_init_params init_params = {
11242
- /* .mem_size */ ggml_tensor_overhead()*7,
11243
  /* .mem_buffer */ nullptr,
11244
  /* .no_alloc */ true,
11245
  };
@@ -11249,15 +11491,19 @@ struct llama_context * llama_new_context_with_model(
11249
  ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
11250
  ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
11251
  ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
 
11252
  ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
11253
- ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch);
 
11254
 
11255
  ggml_set_name(ctx->inp_tokens, "inp_tokens");
11256
  ggml_set_name(ctx->inp_embd, "inp_embd");
11257
  ggml_set_name(ctx->inp_pos, "inp_pos");
11258
  ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
 
11259
  ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
11260
- ggml_set_name(ctx->inp_sum, "inp_sum");
 
11261
 
11262
  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
11263
 
@@ -12108,6 +12354,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
12108
  return ctx->embedding.data();
12109
  }
12110
 
 
 
 
 
12111
  const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
12112
  return model->vocab.id_to_token[token].text.c_str();
12113
  }
@@ -12258,6 +12508,123 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
12258
  return 0;
12259
  }
12261
  struct llama_timings llama_get_timings(struct llama_context * ctx) {
12262
  struct llama_timings result = {
12263
  /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
 
 
1034
  int fd = fileno(file->fp);
1035
  int flags = MAP_SHARED;
1036
  // prefetch/readahead impairs performance on NUMA systems
1037
+ if (numa) { prefetch = 0; }
1038
  #ifdef __linux__
1039
  // advise the kernel to read the file sequentially (increases readahead)
1040
  if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
 
1504
  MODEL_22M,
1505
  MODEL_33M,
1506
  MODEL_109M,
1507
+ MODEL_137M,
1508
  MODEL_335M,
1509
  MODEL_0_5B,
1510
  MODEL_1B,
 
1557
  uint32_t n_yarn_orig_ctx;
1558
  int32_t rope_scaling_type_train;
1559
 
1560
+ float f_clamp_kqv = 0.0f;
1561
+ float f_max_alibi_bias = 0.0f;
1562
 
1563
  bool causal_attn = true;
1564
+ bool need_kq_pos = false;
1565
 
1566
+ uint32_t pooling_type = LLAMA_POOLING_NONE;
1567
 
1568
  bool operator!=(const llama_hparams & other) const {
1569
  if (this->vocab_only != other.vocab_only) return true;
 
1625
 
1626
  bool mul_mat_q;
1627
  bool offload_kqv;
1628
+ bool do_pooling;
1629
 
1630
  ggml_backend_sched_eval_callback cb_eval;
1631
  void * cb_eval_user_data;
 
1641
  struct ggml_tensor * attn_q_norm_b;
1642
  struct ggml_tensor * attn_k_norm;
1643
  struct ggml_tensor * attn_k_norm_b;
1644
+ struct ggml_tensor * attn_out_norm;
1645
+ struct ggml_tensor * attn_out_norm_b;
1646
 
1647
  // attention
1648
  struct ggml_tensor * wq;
 
1661
  // normalization
1662
  struct ggml_tensor * ffn_norm;
1663
  struct ggml_tensor * ffn_norm_b;
1664
+ struct ggml_tensor * layer_out_norm;
1665
+ struct ggml_tensor * layer_out_norm_b;
1666
 
1667
  // ff
1668
  struct ggml_tensor * ffn_gate; // w1
 
1924
  struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
1925
  struct ggml_tensor * inp_pos; // I32 [n_batch]
1926
  struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
1927
+ struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
1928
  struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
1929
+ struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
1930
+ struct ggml_tensor * inp_cls; // I32 [n_batch]
1931
 
1932
  #ifdef GGML_USE_MPI
1933
  ggml_mpi_context * ctx_mpi = NULL;
 
2526
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2527
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2528
  case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
2529
+ case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
2530
  default:
2531
  {
2532
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
 
2876
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2877
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
2878
  case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
2879
+ case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
2880
 
2881
  default: return "unknown, may not work";
2882
  }
 
2884
 
2885
  static const char * llama_model_type_name(e_model type) {
2886
  switch (type) {
2887
+ case MODEL_22M: return "22M";
2888
+ case MODEL_33M: return "33M";
2889
+ case MODEL_109M: return "109M";
2890
+ case MODEL_137M: return "137M";
2891
+ case MODEL_0_5B: return "0.5B";
2892
  case MODEL_1B: return "1B";
2893
  case MODEL_2B: return "2B";
2894
  case MODEL_3B: return "3B";
 
3058
  case 40: model.type = e_model::MODEL_13B; break;
3059
  default: model.type = e_model::MODEL_UNKNOWN;
3060
  }
3061
+
3062
+ if (model.type == e_model::MODEL_13B) {
3063
+ // TODO: become GGUF KV parameter
3064
+ hparams.f_max_alibi_bias = 8.0f;
3065
+ }
3066
  } break;
3067
  case LLM_ARCH_STARCODER:
3068
  {
 
3090
  case 32: model.type = e_model::MODEL_1B; break;
3091
  default: model.type = e_model::MODEL_UNKNOWN;
3092
  }
3093
+
3094
+ // TODO: become GGUF KV parameter
3095
+ hparams.f_max_alibi_bias = 8.0f;
3096
  } break;
3097
  case LLM_ARCH_BERT:
3098
  {
3099
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3100
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3101
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3102
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3103
 
3104
  switch (hparams.n_layer) {
3105
  case 3:
 
3115
  model.type = e_model::MODEL_335M; break; // bge-large
3116
  }
3117
  } break;
3118
+ case LLM_ARCH_NOMIC_BERT:
3119
+ {
3120
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3121
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3122
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3123
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3124
+
3125
+ if (hparams.n_layer == 12 && hparams.n_embd == 768) {
3126
+ model.type = e_model::MODEL_137M;
3127
+ }
3128
+ } break;
3129
  case LLM_ARCH_BLOOM:
3130
  {
3131
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
3138
  case 4096: model.type = e_model::MODEL_7B; break;
3139
  } break;
3140
  }
3141
+
3142
+ // TODO: become GGUF KV parameter
3143
+ hparams.f_max_alibi_bias = 8.0f;
3144
  } break;
3145
  case LLM_ARCH_MPT:
3146
  {
 
 
3147
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3148
  ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
3149
  ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
 
3245
  }
3246
 
3247
  model.ftype = ml.ftype;
3248
+
3249
+ if (hparams.f_max_alibi_bias > 0.0f) {
3250
+ hparams.need_kq_pos = true;
3251
+ }
3252
  }
3253
 
3254
  // TODO: This should probably be in llama.h
 
3372
 
3373
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
3374
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
3375
+ try {
3376
+ vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
3377
+ } catch (const std::exception & e) {
3378
+ LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
3379
+ vocab.linefeed_id = vocab.special_pad_id;
3380
+ }
3381
  } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
3382
  vocab.linefeed_id = vocab.special_pad_id;
3383
  } else {
 
3933
  }
3934
  } break;
3935
  case LLM_ARCH_BERT:
3936
+ case LLM_ARCH_NOMIC_BERT:
3937
  {
3938
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3939
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
3940
+ if (model.arch == LLM_ARCH_BERT) {
3941
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
3942
+ }
3943
+
3944
  model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3945
  model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3946
 
 
3950
 
3951
  auto & layer = model.layers[i];
3952
 
3953
+ if (model.arch == LLM_ARCH_BERT) {
3954
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3955
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
3956
 
3957
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3958
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
3959
 
3960
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3961
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
3962
+ } else {
3963
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3964
+ }
3965
 
3966
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
3967
 
3968
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
3969
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
3970
 
3971
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3972
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3973
 
3974
+ if (model.arch == LLM_ARCH_BERT) {
3975
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3976
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3977
 
3978
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3979
+ } else {
3980
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
3981
+ }
3982
+
3983
+ layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
3984
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
3985
  }
3986
  } break;
3987
  case LLM_ARCH_BLOOM:
 
4460
 
4461
  model.hparams.vocab_only = params.vocab_only;
4462
 
4463
+ try {
4464
+ llm_load_arch(ml, model);
4465
+ } catch(const std::exception & e) {
4466
+ throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
4467
+ }
4468
+ try {
4469
+ llm_load_hparams(ml, model);
4470
+ } catch(const std::exception & e) {
4471
+ throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
4472
+ }
4473
+ try {
4474
+ llm_load_vocab(ml, model);
4475
+ } catch(const std::exception & e) {
4476
+ throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
4477
+ }
4478
 
4479
  llm_load_print_meta(ml, model);
4480
 
 
4791
  struct ggml_tensor * wo_b,
4792
  struct ggml_tensor * q_cur,
4793
  struct ggml_tensor * kq_mask,
4794
+ struct ggml_tensor * kq_pos,
4795
  int64_t n_ctx,
4796
  int32_t n_tokens,
4797
  int32_t n_kv,
 
4798
  float kq_scale,
4799
  const llm_build_cb & cb,
4800
  int il) {
 
4824
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4825
  }
4826
 
4827
+ #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
4828
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
4829
+ #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
4830
+ #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
4831
+ if (hparams.f_max_alibi_bias > 0.0f) {
4832
  kq = ggml_scale(ctx, kq, kq_scale);
4833
  cb(kq, "kq_scaled", il);
4834
 
4835
+ kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
4836
+ cb(kq, "kq_scaled_alibi", il);
 
 
 
 
 
4837
 
4838
  kq = ggml_add(ctx, kq, kq_mask);
4839
  cb(kq, "kq_masked", il);
4840
 
4841
  kq = ggml_soft_max(ctx, kq);
4842
  cb(kq, "kq_soft_max", il);
4843
+ } else
4844
+ #endif
4845
+ {
4846
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
4847
  cb(kq, "kq_soft_max_ext", il);
4848
  }
4849
 
 
4891
  struct ggml_tensor * v_cur,
4892
  struct ggml_tensor * q_cur,
4893
  struct ggml_tensor * kq_mask,
4894
+ struct ggml_tensor * kq_pos,
4895
  int64_t n_ctx,
4896
  int32_t n_tokens,
4897
  int32_t kv_head,
4898
  int32_t n_kv,
 
4899
  float kq_scale,
4900
  const llm_build_cb & cb,
4901
  int il) {
 
4909
  llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
4910
 
4911
  struct ggml_tensor * cur;
4912
+ cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
4913
+ q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
 
4914
  cb(cur, "kqv_out", il);
4915
 
4916
  return cur;
 
4951
  const int32_t n_orig_ctx;
4952
 
4953
  const bool do_rope_shift;
4954
+ const uint32_t pooling_type;
4955
 
4956
  const llm_build_cb & cb;
4957
 
 
4995
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
4996
  n_orig_ctx (cparams.n_yarn_orig_ctx),
4997
  do_rope_shift (worst_case || kv_self.has_shift),
4998
+ pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
4999
  cb (cb),
5000
  buf_compute_meta (lctx.buf_compute_meta) {
5001
  // all initializations should be done in init()
 
5078
  }
5079
 
5080
  Qcur = ggml_rope_custom(
5081
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5082
  hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
5083
  ext_factor, attn_factor, beta_fast, beta_slow
5084
  );
 
5093
 
5094
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5095
  model.layers[il].wo, model.layers[il].bo,
5096
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5097
  cb(cur, "kqv_out", il);
5098
  }
5099
 
 
5223
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5224
  cb(KQ_mask, "KQ_mask", -1);
5225
 
5226
+ // positions of the tokens in the KV cache
5227
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5228
+ cb(KQ_pos, "KQ_pos", -1);
5229
+
5230
  // shift the entire K-cache if needed
5231
  if (do_rope_shift) {
5232
  llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
 
5275
  cb(Kcur, "Kcur", il);
5276
 
5277
 
5278
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5279
  model.layers[il].wo, NULL,
5280
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5281
  cb(cur, "kqv_out", il);
5282
  }
5283
 
 
5401
 
5402
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5403
  model.layers[il].wo, NULL,
5404
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5405
  cb(cur, "kqv_out", il);
5406
  }
5407
 
 
5500
 
5501
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5502
  model.layers[il].wo, model.layers[il].bo,
5503
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5504
  cb(cur, "kqv_out", il);
5505
  }
5506
 
 
5705
 
5706
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5707
  model.layers[il].wo, model.layers[il].bo,
5708
+ Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5709
  cb(cur, "kqv_out", il);
5710
  }
5711
 
 
5767
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5768
  cb(KQ_mask, "KQ_mask", -1);
5769
 
5770
+ // positions of the tokens in the KV cache
5771
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
5772
+ cb(KQ_pos, "KQ_pos", -1);
5773
+
5774
  for (int il = 0; il < n_layer; ++il) {
5775
  struct ggml_tensor * inpSA = inpL;
5776
 
 
5798
 
5799
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5800
  model.layers[il].wo, NULL,
5801
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5802
  cb(cur, "kqv_out", il);
5803
  }
5804
 
 
5848
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5849
 
5850
  const int64_t n_embd_head = hparams.n_embd_head_v;
5851
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5852
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
5853
 
5854
  struct ggml_tensor * cur;
5855
  struct ggml_tensor * inpL;
5856
 
5857
  // get input vectors with the right size
5858
+ const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5859
  struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5860
+ struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5861
+ struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5862
 
5863
  // construct input embeddings (token, type, position)
5864
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5865
+
5866
  // token types are hardcoded to zero ("Sentence A")
5867
  struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
5868
  inpL = ggml_add(ctx0, inpL, type_row0);
5869
+ if (model.arch == LLM_ARCH_BERT) {
5870
+ inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
5871
+ }
5872
  cb(inpL, "inp_embd", -1);
5873
 
5874
  // embed layer norm
 
5884
  struct ggml_tensor * cur = inpL;
5885
 
5886
  // self-attention
5887
+ if (model.arch == LLM_ARCH_BERT) {
5888
  struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
5889
  cb(Qcur, "Qcur", il);
5890
 
 
5899
 
5900
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5901
  model.layers[il].wo, model.layers[il].bo,
5902
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5903
+ cb(cur, "kqv_out", il);
5904
+ } else {
5905
+ // compute Q and K and RoPE them
5906
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5907
+ cb(cur, "wqkv", il);
5908
+
5909
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5910
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5911
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5912
+
5913
+ cb(Qcur, "Qcur", il);
5914
+ cb(Kcur, "Kcur", il);
5915
+ cb(Vcur, "Vcur", il);
5916
+
5917
+ Qcur = ggml_rope_custom(
5918
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5919
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
5920
+ ext_factor, attn_factor, beta_fast, beta_slow
5921
+ );
5922
+ cb(Qcur, "Qcur", il);
5923
+
5924
+ Kcur = ggml_rope_custom(
5925
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5926
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
5927
+ ext_factor, attn_factor, beta_fast, beta_slow
5928
+ );
5929
+ cb(Kcur, "Kcur", il);
5930
+
5931
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5932
+ model.layers[il].wo, model.layers[il].bo,
5933
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5934
  cb(cur, "kqv_out", il);
5935
  }
5936
 
 
5938
  cur = ggml_add(ctx0, cur, inpL);
5939
 
5940
  // attention layer norm
5941
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
5942
 
5943
  struct ggml_tensor * ffn_inp = cur;
5944
  cb(ffn_inp, "ffn_inp", il);
5945
 
5946
  // feed-forward network
5947
+ if (model.arch == LLM_ARCH_BERT) {
5948
+ cur = llm_build_ffn(ctx0, cur,
5949
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5950
+ NULL, NULL,
5951
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5952
+ NULL,
5953
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5954
+ } else {
5955
+ cur = llm_build_ffn(ctx0, cur,
5956
+ model.layers[il].ffn_up, NULL,
5957
+ model.layers[il].ffn_gate, NULL,
5958
+ model.layers[il].ffn_down, NULL,
5959
+ NULL,
5960
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5961
+ }
5962
  cb(cur, "ffn_out", il);
5963
 
5964
  // attentions bypass the intermediate layer
5965
  cur = ggml_add(ctx0, cur, ffn_inp);
5966
 
5967
  // output layer norm
5968
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
5969
 
5970
  // input for next layer
5971
  inpL = cur;
 
5974
  // final output
5975
  cur = inpL;
5976
 
5977
+ // pooling layer
5978
+ if (pooling_type == LLAMA_POOLING_MEAN) {
5979
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
5980
+ } else if (pooling_type == LLAMA_POOLING_CLS) {
5981
+ cur = ggml_get_rows(ctx0, cur, inp_cls);
5982
+ } else {
5983
+ GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
5984
+ }
5985
+ cb(cur, "result_embd", -1);
5986
 
5987
  ggml_build_forward_expand(gf, cur);
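// Equivalent host-side view of the LLAMA_POOLING_MEAN branch above: the mat-mul with
// inp_mean averages each sequence's token embeddings. Self-contained sketch; the names
// embd, seq_of_token and n_seq are illustrative, not llama.cpp symbols.
#include <cstddef>
#include <vector>

static std::vector<std::vector<float>> mean_pool(
        const std::vector<std::vector<float>> & embd,         // [n_tokens][n_embd]
        const std::vector<int>                & seq_of_token,  // sequence id per token
        int n_seq, int n_embd) {
    std::vector<std::vector<float>> out(n_seq, std::vector<float>(n_embd, 0.0f));
    std::vector<int> count(n_seq, 0);
    for (size_t t = 0; t < embd.size(); ++t) {
        const int s = seq_of_token[t];
        count[s] += 1;
        for (int e = 0; e < n_embd; ++e) {
            out[s][e] += embd[t][e];
        }
    }
    for (int s = 0; s < n_seq; ++s) {
        for (int e = 0; count[s] > 0 && e < n_embd; ++e) {
            out[s][e] /= float(count[s]); // same effect as the 1/count weights stored in inp_mean
        }
    }
    return out;
}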
5988
 
 
6006
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6007
  cb(KQ_mask, "KQ_mask", -1);
6008
 
6009
+ // positions of the tokens in the KV cache
6010
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
6011
+ cb(KQ_pos, "KQ_pos", -1);
6012
+
6013
  inpL = llm_build_norm(ctx0, inpL, hparams,
6014
  model.tok_norm,
6015
  model.tok_norm_b,
 
6043
 
6044
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6045
  model.layers[il].wo, model.layers[il].bo,
6046
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6047
  cb(cur, "kqv_out", il);
6048
  }
6049
 
 
6103
  struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6104
  cb(KQ_mask, "KQ_mask", -1);
6105
 
6106
+ // positions of the tokens in the KV cache
6107
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
6108
+ cb(KQ_pos, "KQ_pos", -1);
6109
+
6110
  for (int il = 0; il < n_layer; ++il) {
6111
  struct ggml_tensor * attn_norm;
6112
 
 
6140
 
6141
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6142
  model.layers[il].wo, NULL,
6143
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6144
  cb(cur, "kqv_out", il);
6145
  }
6146
 
 
6262
 
6263
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6264
  model.layers[il].wo, NULL,
6265
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6266
  cb(cur, "kqv_out", il);
6267
  }
6268
 
 
6377
 
6378
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6379
  model.layers[il].wo, NULL,
6380
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6381
  cb(cur, "kqv_out", il);
6382
  }
6383
 
 
6498
 
6499
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6500
  model.layers[il].wo, model.layers[il].bo,
6501
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6502
  cb(cur, "kqv_out", il);
6503
  }
6504
 
 
6625
 
6626
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6627
  model.layers[il].wo, model.layers[il].bo,
6628
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
6629
  cb(cur, "kqv_out", il);
6630
  }
6631
 
 
6728
 
6729
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6730
  model.layers[il].wo, NULL,
6731
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6732
  cb(cur, "kqv_out", il);
6733
  }
6734
  struct ggml_tensor * sa_out = cur;
 
6827
 
6828
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6829
  model.layers[il].wo, model.layers[il].bo,
6830
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6831
  cb(cur, "kqv_out", il);
6832
  }
6833
 
 
6936
 
6937
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6938
  model.layers[il].wo, model.layers[il].bo,
6939
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6940
  cb(cur, "kqv_out", il);
6941
  }
6942
 
 
7054
 
7055
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7056
  model.layers[il].wo, NULL,
7057
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7058
  cb(cur, "kqv_out", il);
7059
  }
7060
 
 
7173
 
7174
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7175
  model.layers[il].wo, model.layers[il].bo,
7176
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7177
  cb(cur, "kqv_out", il);
7178
  }
7179
 
 
7305
 
7306
  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7307
  model.layers[il].wo, model.layers[il].bo,
7308
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7309
  cb(cur, "kqv_out", il);
7310
  }
7311
 
 
7420
  result = llm.build_refact();
7421
  } break;
7422
  case LLM_ARCH_BERT:
7423
+ case LLM_ARCH_NOMIC_BERT:
7424
  {
7425
  result = llm.build_bert();
7426
  } break;
 
7524
 
7525
  for (int i = 0; i < n_kv; ++i) {
7526
  float f;
7527
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
7528
+ (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
7529
  f = -INFINITY;
7530
  } else {
7531
  f = 0;
 
7536
  }
7537
  }
7538
 
7539
+ if (hparams.need_kq_pos) {
7540
+ const int64_t n_kv = kv_self.n;
7541
 
7542
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
7543
+
7544
+ float * data = (float *) lctx.inp_KQ_pos->data;
7545
 
7546
+ for (int i = 0; i < n_kv; ++i) {
7547
+ data[i] = float(lctx.kv_self.cells[i].pos);
7548
  }
7549
  }
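// (inp_KQ_pos supplies the per-cell KV positions that llm_build_kqv() passes as kq_pos to
//  ggml_soft_max_ext(), which uses them to compute the ALiBi bias when f_max_alibi_bias > 0.)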
7550
 
 
7559
  data[i] = lctx.kv_self.cells[i].delta;
7560
  }
7561
  }
7562
+
7563
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
7564
+ const int64_t n_tokens = batch.n_tokens;
7565
+
7566
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
7567
+ float * data = (float *) lctx.inp_mean->data;
7568
+
7569
+ memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
7570
+
7571
+ std::vector<uint64_t> sum(n_tokens, 0);
7572
+ for (int i = 0; i < n_tokens; ++i) {
7573
+ const llama_seq_id seq_id = batch.seq_id[i][0];
7574
+ sum[seq_id] += 1;
7575
+ }
7576
+
7577
+ std::vector<float> div(n_tokens, 0.0f);
7578
+ for (int i = 0; i < n_tokens; ++i) {
7579
+ const uint64_t s = sum[i];
7580
+ if (s > 0) {
7581
+ div[i] = 1.0f/float(s);
7582
+ }
7583
+ }
7584
+
7585
+ for (int i = 0; i < n_tokens; ++i) {
7586
+ const llama_seq_id seq_id = batch.seq_id[i][0];
7587
+ data[seq_id*n_tokens + i] = div[seq_id];
7588
+ }
7589
+ }
7590
+
7591
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
7592
+ const int64_t n_tokens = batch.n_tokens;
7593
+
7594
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
7595
+ uint32_t * data = (uint32_t *) lctx.inp_cls->data;
7596
+
7597
+ for (int i = 0; i < n_tokens; ++i) {
7598
+ const llama_seq_id seq_id = batch.seq_id[i][0];
7599
+ const llama_pos pos = batch.pos[i];
7600
+ if (pos == 0) {
7601
+ data[seq_id] = i;
7602
+ }
7603
+ }
7604
+ }
7605
  }
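// Worked example for the inp_mean fill above (illustrative batch): 5 tokens where tokens
// 0-2 carry seq_id 0 and tokens 3-4 carry seq_id 1, so sum = {3, 2, 0, 0, 0} and
// div = {1/3, 1/2, 0, 0, 0}. The resulting n_tokens x n_tokens matrix (row = seq_id,
// column = token index) is:
//
//   row 0:  1/3  1/3  1/3   0    0
//   row 1:   0    0    0   1/2  1/2
//   rows 2-4: all zeros
//
// Multiplying the token embeddings by this matrix in the BERT graph yields one mean
// embedding per sequence; inp_cls instead records the index of each sequence's pos == 0 token.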
7606
 
7607
  // decode a batch of tokens by evaluating the transformer
 
7713
  embeddings = gf->nodes[gf->n_nodes - 3];
7714
  GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7715
  }
7716
+ } else if (strcmp(res->name, "result_embd") == 0) {
7717
  embeddings = res;
7718
  res = nullptr;
7719
  } else {
 
7833
  if (!lctx.embedding.empty()) {
7834
  auto & embedding_out = lctx.embedding;
7835
 
7836
+ const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
7837
+ const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
7838
 
7839
+ embedding_out.resize(embd_size);
7840
  ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
7841
+ ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
7842
  ggml_backend_synchronize(embeddings_backend);
7843
  }
7844
 
 
7915
  switch (llama_vocab_get_type(vocab)) {
7916
  case LLAMA_VOCAB_TYPE_SPM: {
7917
  const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
7918
+ auto token = vocab.token_to_id.find(buf);
7919
+ if (token != vocab.token_to_id.end()) {
7920
+ return (*token).second;
7921
+ }
7922
+ // Try to fall back to just the byte as a string
7923
+ const char buf2[2] = { (char)ch, 0 };
7924
+ return vocab.token_to_id.at(buf2);
7925
  }
7926
  case LLAMA_VOCAB_TYPE_WPM:
7927
  case LLAMA_VOCAB_TYPE_BPE: {
 
7969
  };
7970
 
7971
  struct llm_tokenizer_spm {
7972
+ llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
7973
 
7974
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
7975
  // split string into utf8 chars
 
8044
 
8045
  if (p == rev_merge.end()) {
8046
  // output any symbols that did not form tokens as bytes.
8047
+ output.reserve(output.size() + symbol.n);
8048
  for (int j = 0; j < (int)symbol.n; ++j) {
8049
  llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
8050
  output.push_back(token_id);
 
8607
  token(_token),
8608
  raw_text(_dummy),
8609
  offset(0),
8610
+ length(0) {}
8611
+
8612
  fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
8613
  :
8614
  type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
8615
+ token((llama_vocab::id) - 1),
8616
  raw_text(_raw_text),
8617
  offset(_offset),
8618
  length(_length){
8619
+ GGML_ASSERT(_offset >= 0);
8620
+ GGML_ASSERT(_length >= 1);
8621
+ GGML_ASSERT(offset + length <= raw_text.length());
8622
  }
8623
 
8624
  const FRAGMENT_BUFFER_VARIANT_TYPE type;
 
8742
  }
8743
 
8744
  std::forward_list<fragment_buffer_variant> fragment_buffer;
8745
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
8746
 
8747
+ if (special) tokenizer_st_partition(vocab, fragment_buffer);
8748
 
8749
  switch (vocab.type) {
8750
  case LLAMA_VOCAB_TYPE_SPM:
8751
  {
8752
+ for (const auto & fragment : fragment_buffer) {
8753
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
8754
  // without adding this leading whitespace, we do not get the same results as the original tokenizer
8755
 
 
8777
  } break;
8778
  case LLAMA_VOCAB_TYPE_BPE:
8779
  {
8780
+ for (const auto & fragment : fragment_buffer) {
8781
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
8782
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
8783
 
 
8793
  } break;
8794
  case LLAMA_VOCAB_TYPE_WPM:
8795
  {
8796
+ for (const auto & fragment : fragment_buffer) {
8797
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
8798
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
8799
 
 
10314
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
10315
  new_type = GGML_TYPE_Q8_0;
10316
  }
10317
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10318
  new_type = GGML_TYPE_Q5_K;
10319
  }
10320
  else if (new_type != GGML_TYPE_Q8_0) {
10321
  new_type = GGML_TYPE_Q6_K;
10322
  }
10323
  } else if (name == "token_embd.weight") {
10324
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10325
  new_type = GGML_TYPE_Q2_K;
10326
  }
10327
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10328
  new_type = GGML_TYPE_Q4_K;
10329
  }
10330
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10331
  if (name.find("attn_v.weight") != std::string::npos) {
10332
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
10333
  else new_type = GGML_TYPE_Q2_K;
 
10337
  if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
10338
  ++qs.i_ffn_down;
10339
  }
10340
+ else if (name.find("attn_output.weight") != std::string::npos) {
10341
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
10342
+ }
10343
  } else if (name.find("attn_v.weight") != std::string::npos) {
10344
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
10345
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
 
10457
  }
10458
  ++qs.i_ffn_up;
10459
  }
10460
+
10461
  // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
10462
  //}
10463
  // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
 
10473
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
10474
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
10475
  new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
10476
+ new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
10477
  int nx = tensor->ne[0];
10478
  int ny = tensor->ne[1];
10479
  if (nx % QK_K != 0) {
 
10488
  case GGML_TYPE_IQ2_XXS:
10489
  case GGML_TYPE_IQ2_XS:
10490
  case GGML_TYPE_IQ3_XXS:
10491
+ case GGML_TYPE_IQ1_S:
10492
  case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
10493
  case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
10494
  case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
 
10518
 
10519
  // K-quants
10520
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
10521
+ case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
10522
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
10523
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
10524
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
10525
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
10526
  case LLAMA_FTYPE_MOSTLY_Q4_K_S:
10527
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
10528
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
10529
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
10530
+ case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
10531
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
10532
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
10533
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
10534
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S ; break;
10535
 
10536
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
10537
  }
 
10661
  quantize &= !params->only_copy;
10662
 
10663
  // do not quantize expert gating tensors
10664
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
10665
+
10666
+ // do not quantize positional embeddings and token types (BERT)
10667
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
10668
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
10669
 
10670
  enum ggml_type new_type;
10671
  void * new_data;
 
10705
  }
10706
  if ((new_type == GGML_TYPE_IQ2_XXS ||
10707
  new_type == GGML_TYPE_IQ2_XS ||
10708
+ new_type == GGML_TYPE_IQ1_S ||
10709
  (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
10710
  LLAMA_LOG_ERROR("\n\n============================================================\n");
10711
  LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
 
10940
  {
10941
  LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
10942
  __func__, ftype);
10943
+ return 1;
10944
  }
10945
  }
10946
 
 
11168
  /*.logits_all =*/ false,
11169
  /*.embedding =*/ false,
11170
  /*.offload_kqv =*/ true,
11171
+ /*.do_pooling =*/ true,
11172
  };
11173
 
11174
  return result;
 
11229
  return llama_supports_mlock();
11230
  }
11231
 
11232
+ void llama_backend_init(void) {
11233
  ggml_time_init();
11234
 
11235
  // needed to initialize f16 tables
 
11239
  ggml_free(ctx);
11240
  }
11241
 
 
11242
  #ifdef GGML_USE_MPI
11243
  ggml_mpi_backend_init();
11244
  #endif
11245
  }
11246
 
11247
+ void llama_numa_init(enum ggml_numa_strategy numa) {
11248
+ if (numa != GGML_NUMA_STRATEGY_DISABLED) {
11249
+ ggml_numa_init(numa);
11250
+ }
11251
+ }
11252
+
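// Migration sketch for the llama_backend_init() signature change, for callers that used to
// pass a bool numa flag. GGML_NUMA_STRATEGY_DISTRIBUTE is assumed from ggml's
// ggml_numa_strategy enum; only GGML_NUMA_STRATEGY_DISABLED appears in this diff.
static void backend_init_compat(bool numa) {
    llama_backend_init();
    llama_numa_init(numa ? GGML_NUMA_STRATEGY_DISTRIBUTE : GGML_NUMA_STRATEGY_DISABLED);
}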
11253
  void llama_backend_free(void) {
11254
  #ifdef GGML_USE_MPI
11255
  ggml_mpi_backend_free();
 
11326
  cparams.yarn_beta_slow = params.yarn_beta_slow;
11327
  cparams.mul_mat_q = params.mul_mat_q;
11328
  cparams.offload_kqv = params.offload_kqv;
11329
+ cparams.do_pooling = params.do_pooling;
11330
 
11331
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
11332
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
 
11474
  // resized during inference, reserve maximum
11475
  ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
11476
 
11477
+ if (params.embedding) {
11478
  ctx->embedding.resize(hparams.n_embd);
11479
  }
11480
 
11481
  // graph inputs
11482
  {
11483
  ggml_init_params init_params = {
11484
+ /* .mem_size */ ggml_tensor_overhead()*8,
11485
  /* .mem_buffer */ nullptr,
11486
  /* .no_alloc */ true,
11487
  };
 
11491
  ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
11492
  ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
11493
  ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
11494
+ ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
11495
  ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
11496
+ ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
11497
+ ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
11498
 
11499
  ggml_set_name(ctx->inp_tokens, "inp_tokens");
11500
  ggml_set_name(ctx->inp_embd, "inp_embd");
11501
  ggml_set_name(ctx->inp_pos, "inp_pos");
11502
  ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
11503
+ ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos");
11504
  ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
11505
+ ggml_set_name(ctx->inp_mean, "inp_mean");
11506
+ ggml_set_name(ctx->inp_cls, "inp_cls");
11507
 
11508
  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
11509
 
 
12354
  return ctx->embedding.data();
12355
  }
12356
 
12357
+ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
12358
+ return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
12359
+ }
12360
+
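// Usage sketch for llama_get_embeddings_ith(), assuming an embedding model loaded with
// embedding = true and do_pooling = true, and that sequence ids 0..n_seq-1 were decoded in
// the last batch. The helper name is illustrative, not part of the API (uses sqrtf from <cmath>).
static float llama_seq_embedding_norm_example(struct llama_context * ctx, int32_t seq) {
    const int     n_embd = llama_n_embd(llama_get_model(ctx));
    const float * emb    = llama_get_embeddings_ith(ctx, seq); // n_embd floats for this sequence
    float sum = 0.0f;
    for (int i = 0; i < n_embd; ++i) {
        sum += emb[i]*emb[i];
    }
    return sqrtf(sum);
}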
12361
  const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
12362
  return model->vocab.id_to_token[token].text.c_str();
12363
  }
 
12508
  return 0;
12509
  }
12510
 
12511
+ // trim whitespace from the beginning and end of a string
12512
+ static std::string trim(const std::string & str) {
12513
+ size_t start = 0;
12514
+ size_t end = str.size();
12515
+ while (start < end && isspace(str[start])) {
12516
+ start += 1;
12517
+ }
12518
+ while (end > start && isspace(str[end - 1])) {
12519
+ end -= 1;
12520
+ }
12521
+ return str.substr(start, end - start);
12522
+ }
12523
+
12524
+ // Simple version of "llama_chat_apply_template" that only works with strings
12525
+ // This function uses heuristic checks to determine the commonly used template. It is not a jinja parser.
12526
+ static int32_t llama_chat_apply_template_internal(
12527
+ const std::string & tmpl,
12528
+ const std::vector<const llama_chat_message *> & chat,
12529
+ std::string & dest, bool add_ass) {
12530
+ // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
12531
+ std::stringstream ss;
12532
+ if (tmpl.find("<|im_start|>") != std::string::npos) {
12533
+ // chatml template
12534
+ for (auto message : chat) {
12535
+ ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
12536
+ }
12537
+ if (add_ass) {
12538
+ ss << "<|im_start|>assistant\n";
12539
+ }
12540
+ } else if (tmpl.find("[INST]") != std::string::npos) {
12541
+ // llama2 template and its variants
12542
+ // [variant] support system message
12543
+ bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
12544
+ // [variant] space before + after response
12545
+ bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
12546
+ // [variant] add BOS inside history
12547
+ bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
12548
+ // [variant] trim spaces from the input message
12549
+ bool strip_message = tmpl.find("content.strip()") != std::string::npos;
12550
+ // construct the prompt
12551
+ bool is_inside_turn = true; // skip BOS at the beginning
12552
+ ss << "[INST] ";
12553
+ for (auto message : chat) {
12554
+ std::string content = strip_message ? trim(message->content) : message->content;
12555
+ std::string role(message->role);
12556
+ if (!is_inside_turn) {
12557
+ is_inside_turn = true;
12558
+ ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
12559
+ }
12560
+ if (role == "system") {
12561
+ if (support_system_message) {
12562
+ ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
12563
+ } else {
12564
+ // if the model does not support system message, we still include it in the first message, but without <<SYS>>
12565
+ ss << content << "\n";
12566
+ }
12567
+ } else if (role == "user") {
12568
+ ss << content << " [/INST]";
12569
+ } else {
12570
+ ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
12571
+ is_inside_turn = false;
12572
+ }
12573
+ }
12574
+ // llama2 templates seem to not care about "add_generation_prompt"
12575
+ } else if (tmpl.find("<|user|>") != std::string::npos) {
12576
+ // zephyr template
12577
+ for (auto message : chat) {
12578
+ ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
12579
+ }
12580
+ if (add_ass) {
12581
+ ss << "<|assistant|>\n";
12582
+ }
12583
+ } else {
12584
+ // template not supported
12585
+ return -1;
12586
+ }
12587
+ dest = ss.str();
12588
+ return dest.size();
12589
+ }
12590
+
12591
+ LLAMA_API int32_t llama_chat_apply_template(
12592
+ const struct llama_model * model,
12593
+ const char * tmpl,
12594
+ const struct llama_chat_message * chat,
12595
+ size_t n_msg,
12596
+ bool add_ass,
12597
+ char * buf,
12598
+ int32_t length) {
12599
+ std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
12600
+ if (tmpl == nullptr) {
12601
+ GGML_ASSERT(model != nullptr);
12602
+ // load template from model
12603
+ std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
12604
+ std::string template_key = "tokenizer.chat_template";
12605
+ int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
12606
+ if (res < 0) {
12607
+ // worst case: there is no information about the template, so we will use chatml by default
12608
+ curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
12609
+ } else {
12610
+ curr_tmpl = std::string(model_template.data(), model_template.size());
12611
+ }
12612
+ }
12613
+ // format the chat to string
12614
+ std::vector<const llama_chat_message *> chat_vec;
12615
+ chat_vec.resize(n_msg);
12616
+ for (size_t i = 0; i < n_msg; i++) {
12617
+ chat_vec[i] = &chat[i];
12618
+ }
12619
+ std::string formatted_chat;
12620
+ int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
12621
+ if (res < 0) {
12622
+ return res;
12623
+ }
12624
+ strncpy(buf, formatted_chat.c_str(), length);
12625
+ return res;
12626
+ }
12627
+
12628
  struct llama_timings llama_get_timings(struct llama_context * ctx) {
12629
  struct llama_timings result = {
12630
  /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
examples/talk-llama/llama.h CHANGED
@@ -100,6 +100,7 @@ extern "C" {
100
  LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
101
  LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
102
  LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
 
103
 
104
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
105
  };
@@ -112,6 +113,12 @@ extern "C" {
112
  LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
113
  };
114
 
115
  enum llama_split_mode {
116
  LLAMA_SPLIT_NONE = 0, // single GPU
117
  LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
@@ -236,6 +243,7 @@ extern "C" {
236
  bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
237
  bool embedding; // embedding mode only
238
  bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 
239
  };
240
 
241
  // model quantization parameters
@@ -297,6 +305,12 @@ extern "C" {
297
  int32_t n_eval;
298
  };
299
 
 
300
  // Helpers for getting default parameters
301
  LLAMA_API struct llama_model_params llama_model_default_params(void);
302
  LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -305,7 +319,10 @@ extern "C" {
305
  // Initialize the llama + ggml backend
306
  // If numa is true, use NUMA optimizations
307
  // Call once at the start of the program
308
- LLAMA_API void llama_backend_init(bool numa);
309
 
310
  // Call once at the end of the program - currently only used for MPI
311
  LLAMA_API void llama_backend_free(void);
@@ -628,6 +645,10 @@ extern "C" {
628
  // shape: [n_embd] (1-dimensional)
629
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
630
 
631
  //
632
  // Vocab
633
  //
@@ -684,6 +705,25 @@ extern "C" {
684
  char * buf,
685
  int32_t length);
686
 
687
  //
688
  // Grammar
689
  //
 
100
  LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
101
  LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
102
  LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
103
+ LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
104
 
105
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
106
  };
 
113
  LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
114
  };
115
 
116
+ enum llama_pooling_type {
117
+ LLAMA_POOLING_NONE = 0,
118
+ LLAMA_POOLING_MEAN = 1,
119
+ LLAMA_POOLING_CLS = 2,
120
+ };
121
+
122
  enum llama_split_mode {
123
  LLAMA_SPLIT_NONE = 0, // single GPU
124
  LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
 
243
  bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
244
  bool embedding; // embedding mode only
245
  bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
246
+ bool do_pooling; // whether to pool (mean or CLS) embedding results by sequence id (ignored if no pooling layer)
247
  };
248
 
249
  // model quantization parameters
 
305
  int32_t n_eval;
306
  };
307
 
308
+ // used in chat template
309
+ typedef struct llama_chat_message {
310
+ const char * role;
311
+ const char * content;
312
+ } llama_chat_message;
313
+
314
  // Helpers for getting default parameters
315
  LLAMA_API struct llama_model_params llama_model_default_params(void);
316
  LLAMA_API struct llama_context_params llama_context_default_params(void);
 
319
  // Initialize the llama + ggml backend
320
  // If numa is true, use NUMA optimizations
321
  // Call once at the start of the program
322
+ LLAMA_API void llama_backend_init(void);
323
+
324
+ // optional:
325
+ LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
326
 
327
  // Call once at the end of the program - currently only used for MPI
328
  LLAMA_API void llama_backend_free(void);
 
645
  // shape: [n_embd] (1-dimensional)
646
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
647
 
648
+ // Get the embeddings for the ith sequence
649
+ // llama_get_embeddings(ctx) + i*n_embd
650
+ LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
651
+
652
  //
653
  // Vocab
654
  //
 
705
  char * buf,
706
  int32_t length);
707
 
708
+ /// Apply chat template. Inspired by Hugging Face's apply_chat_template() in Python.
709
+ /// Both "model" and "tmpl" (the custom template) are optional, but at least one is required; "tmpl" takes precedence over "model".
710
+ /// NOTE: This function only supports some known jinja templates. It is not a jinja parser.
711
+ /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
712
+ /// @param chat Pointer to a list of multiple llama_chat_message
713
+ /// @param n_msg Number of llama_chat_message in this chat
714
+ /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
715
+ /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
716
+ /// @param length The size of the allocated buffer
717
+ /// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc the buffer and then re-apply the template.
718
+ LLAMA_API int32_t llama_chat_apply_template(
719
+ const struct llama_model * model,
720
+ const char * tmpl,
721
+ const struct llama_chat_message * chat,
722
+ size_t n_msg,
723
+ bool add_ass,
724
+ char * buf,
725
+ int32_t length);
726
+
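// Usage sketch for llama_chat_apply_template(), assuming a loaded model whose GGUF metadata
// carries tokenizer.chat_template (otherwise chatml is used). Requires <cstring>, <string>
// and <vector>; the helper name and the messages are illustrative.
static std::string format_chat_example(const struct llama_model * model) {
    std::vector<llama_chat_message> msgs = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };
    size_t alloc = 0;
    for (const auto & m : msgs) {
        alloc += strlen(m.role) + strlen(m.content);
    }
    std::vector<char> buf(2*alloc + 256); // recommended: ~2x the total message length
    int32_t n = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(), true, buf.data(), (int32_t) buf.size());
    if (n > (int32_t) buf.size()) { // output was truncated: grow the buffer and re-apply
        buf.resize(n);
        n = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(), true, buf.data(), (int32_t) buf.size());
    }
    return n < 0 ? std::string() : std::string(buf.data(), n);
}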
727
  //
728
  // Grammar
729
  //
examples/talk-llama/talk-llama.cpp CHANGED
@@ -288,7 +288,7 @@ int main(int argc, char ** argv) {
288
 
289
  // llama init
290
 
291
- llama_backend_init(true);
292
 
293
  auto lmparams = llama_model_default_params();
294
  if (!params.use_gpu) {
 
288
 
289
  // llama init
290
 
291
+ llama_backend_init();
292
 
293
  auto lmparams = llama_model_default_params();
294
  if (!params.use_gpu) {
examples/talk-llama/unicode.h CHANGED
@@ -264,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
264
  offset += 1;
265
  return result;
266
  }
267
- else if (!(utf8[offset + 0] & 0x40)) {
268
  throw std::invalid_argument("invalid character");
269
  }
270
- else if (!(utf8[offset + 0] & 0x20)) {
271
- if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
272
  throw std::invalid_argument("invalid character");
 
273
  auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
274
  offset += 2;
275
  return result;
276
  }
277
- else if (!(utf8[offset + 0] & 0x10)) {
278
- if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
279
  throw std::invalid_argument("invalid character");
 
280
  auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
281
  offset += 3;
282
  return result;
283
  }
284
- else if (!(utf8[offset + 0] & 0x08)) {
285
- if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
286
  throw std::invalid_argument("invalid character");
 
287
  auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
288
  offset += 4;
289
  return result;
@@ -331,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t
331
  offset += 1;
332
  return result;
333
  }
334
- else {
335
- if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00))
336
- throw std::invalid_argument("invalid character");
337
- auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
338
- offset += 2;
339
- return result;
340
  }
341
- throw std::invalid_argument("invalid string");
342
  }
343
 
344
  static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
345
  std::vector<uint32_t> result;
346
  size_t offset = 0;
347
- while (offset < utf16.size())
348
  result.push_back(codepoint_from_utf16(utf16, offset));
 
349
  return result;
350
  }
351
 
@@ -361,44 +365,52 @@ static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> &
361
  static std::unordered_map<uint32_t, int> codepoint_type_map() {
362
  std::unordered_map<uint32_t, int> codepoint_types;
363
  for (auto p : digit_ranges) {
364
- for(auto i = p.first; i <= p.second; ++ i)
365
  codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
 
366
  }
367
- for(auto p : letter_ranges) {
368
- for(auto i = p.first; i <= p.second; ++ i)
369
  codepoint_types[i] = CODEPOINT_TYPE_LETTER;
 
370
  }
371
- for(auto p : whitespace_ranges) {
372
- for(auto i = p.first; i <= p.second; ++ i)
373
  codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
 
374
  }
375
- for(auto p : accent_mark_ranges) {
376
- for(auto i = p.first; i <= p.second; ++ i)
377
  codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
 
378
  }
379
- for(auto p : punctuation_ranges) {
380
- for(auto i = p.first; i <= p.second; ++ i)
381
  codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
 
382
  }
383
- for (auto p : symbol_ranges) {
384
- for (auto i = p.first; i <= p.second; ++i)
385
  codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
 
386
  }
387
- for(auto p : control_ranges) {
388
- for(auto i = p.first; i <= p.second; ++ i)
389
  codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
 
390
  }
391
  return codepoint_types;
392
  }
393
 
394
  static int codepoint_type(uint32_t cp) {
395
  static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
396
- return codepoint_types[cp];
397
  }
398
 
399
  static int codepoint_type(const std::string & utf8) {
400
- if (utf8.length() == 0)
401
  return CODEPOINT_TYPE_UNIDENTIFIED;
 
402
  size_t offset = 0;
403
  return codepoint_type(codepoint_from_utf8(utf8, offset));
404
  }
 
264
  offset += 1;
265
  return result;
266
  }
267
+ if (!(utf8[offset + 0] & 0x40)) {
268
  throw std::invalid_argument("invalid character");
269
  }
270
+ if (!(utf8[offset + 0] & 0x20)) {
271
+ if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
272
  throw std::invalid_argument("invalid character");
273
+ }
274
  auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
275
  offset += 2;
276
  return result;
277
  }
278
+ if (!(utf8[offset + 0] & 0x10)) {
279
+ if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
280
  throw std::invalid_argument("invalid character");
281
+ }
282
  auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
283
  offset += 3;
284
  return result;
285
  }
286
+ if (!(utf8[offset + 0] & 0x08)) {
287
+ if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
288
  throw std::invalid_argument("invalid character");
289
+ }
290
  auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
291
  offset += 4;
292
  return result;
 
334
  offset += 1;
335
  return result;
336
  }
337
+
338
+ if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
339
+ throw std::invalid_argument("invalid character");
340
  }
341
+
342
+ auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
343
+ offset += 2;
344
+ return result;
345
  }
346
 
347
  static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
348
  std::vector<uint32_t> result;
349
  size_t offset = 0;
350
+ while (offset < utf16.size()) {
351
  result.push_back(codepoint_from_utf16(utf16, offset));
352
+ }
353
  return result;
354
  }
355
 
 
365
  static std::unordered_map<uint32_t, int> codepoint_type_map() {
366
  std::unordered_map<uint32_t, int> codepoint_types;
367
  for (auto p : digit_ranges) {
368
+ for (auto i = p.first; i <= p.second; ++ i) {
369
  codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
370
+ }
371
  }
372
+ for (auto p : letter_ranges) {
373
+ for (auto i = p.first; i <= p.second; ++ i) {
374
  codepoint_types[i] = CODEPOINT_TYPE_LETTER;
375
+ }
376
  }
377
+ for (auto p : whitespace_ranges) {
378
+ for (auto i = p.first; i <= p.second; ++ i) {
379
  codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
380
+ }
381
  }
382
+ for (auto p : accent_mark_ranges) {
383
+ for (auto i = p.first; i <= p.second; ++ i) {
384
  codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
385
+ }
386
  }
387
+ for (auto p : punctuation_ranges) {
388
+ for (auto i = p.first; i <= p.second; ++ i) {
389
  codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
390
+ }
391
  }
392
+ for (auto p : symbol_ranges) {
393
+ for (auto i = p.first; i <= p.second; ++i) {
394
  codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
395
+ }
396
  }
397
+ for (auto p : control_ranges) {
398
+ for (auto i = p.first; i <= p.second; ++ i) {
399
  codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
400
+ }
401
  }
402
  return codepoint_types;
403
  }
404
 
405
  static int codepoint_type(uint32_t cp) {
406
  static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
407
+ return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
408
  }
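// Sketch of why the lookup above switched from operator[] to find(): operator[]
// default-inserts a value for unknown keys, silently growing the static map, while find()
// leaves it untouched and lets unknown codepoints map to CODEPOINT_TYPE_UNIDENTIFIED.
// Generic illustration, not llama.cpp code:
//
//   std::unordered_map<uint32_t, int> m;
//   int a  = m[0x1F600];            // inserts {0x1F600, 0} as a side effect
//   auto it = m.find(0x1F4A9);      // pure lookup, no insertion
//   int b  = (it == m.end()) ? -1 : it->second;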
409
 
410
  static int codepoint_type(const std::string & utf8) {
411
+ if (utf8.length() == 0) {
412
  return CODEPOINT_TYPE_UNIDENTIFIED;
413
+ }
414
  size_t offset = 0;
415
  return codepoint_type(codepoint_from_utf8(utf8, offset));
416
  }