ggerganov committed
Commit 42123fc · unverified · 1 parent: 96799a3

talk-llama : sync latest llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -198,6 +198,7 @@ enum llm_arch {
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
     LLM_ARCH_PHI2,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -216,6 +217,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN, "qwen" },
     { LLM_ARCH_PHI2, "phi2" },
 };
 
 enum llm_kv {
@@ -243,6 +245,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
     LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
     LLM_KV_ATTENTION_CLAMP_KQV,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
 
@@ -295,6 +299,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
     { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
     { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
     { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
 
@@ -352,6 +358,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
     LLM_TENSOR_FFN_DOWN_EXP,
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
@@ -420,6 +427,15 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         LLM_ARCH_GPT2,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
     {
@@ -471,6 +487,7 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
     {
@@ -567,6 +584,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
 
     {
         LLM_ARCH_UNKNOWN,
@@ -778,7 +813,7 @@ struct llama_file {
             throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::runtime_error(std::string("unexpectedly reached end of file"));
         }
     }
 
@@ -931,29 +966,29 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
-        (void) numa;
 
        size = file->size;
 
        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
 
        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-       DWORD error = GetLastError();
 
        if (hMapping == NULL) {
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }
 
        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-       error = GetLastError();
        CloseHandle(hMapping);
 
        if (addr == NULL) {
            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
        }
 
-       if (prefetch) {
            // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
            HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
@@ -965,9 +1000,9 @@ struct llama_mmap {
            // advise the kernel to preload the mapped memory
            WIN32_MEMORY_RANGE_ENTRY range;
            range.VirtualAddress = addr;
-           range.NumberOfBytes = (SIZE_T)size;
            if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-               fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
            }
        }
@@ -982,26 +1017,26 @@ struct llama_mmap {
 
     ~llama_mmap() {
         if (!UnmapViewOfFile(addr)) {
-            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
     }
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
-        (void) file;
-        (void) prefetch;
-        (void) numa;
 
-        throw std::runtime_error(std::string("mmap not supported"));
     }
 
-    void unmap(size_t offset, size_t len) {
-        (void) offset;
-        (void) len;
 
-        throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
 };
@@ -1177,21 +1212,27 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 }
 
 static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
-        return ggml_backend_metal_buffer_type();
     }
 #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
     if (n_gpu_layers > 0) {
-        return ggml_backend_cuda_buffer_type(0);
     }
 #elif defined(GGML_USE_CUBLAS)
-    return ggml_backend_cuda_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
-    return ggml_backend_cpu_hbm_buffer_type();
 #endif
 
-    return ggml_backend_cpu_buffer_type();
 
     GGML_UNUSED(n_gpu_layers);
 }
@@ -1228,6 +1269,10 @@ enum e_model {
     MODEL_40B,
     MODEL_65B,
     MODEL_70B,
 };
 
 static const size_t kiB = 1024;
@@ -1243,6 +1288,8 @@ struct llama_hparams {
     uint32_t n_head_kv;
     uint32_t n_layer;
     uint32_t n_rot;
     uint32_t n_ff;
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
@@ -1259,6 +1306,7 @@ struct llama_hparams {
     float f_clamp_kqv;
     float f_max_alibi_bias;
 
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
         if (this->n_vocab != other.n_vocab) return true;
@@ -1268,6 +1316,8 @@ struct llama_hparams {
         if (this->n_head_kv != other.n_head_kv) return true;
         if (this->n_layer != other.n_layer) return true;
         if (this->n_rot != other.n_rot) return true;
         if (this->n_ff != other.n_ff) return true;
         if (this->n_expert != other.n_expert) return true;
         if (this->n_expert_used != other.n_expert_used) return true;
@@ -1275,7 +1325,7 @@ struct llama_hparams {
         if (this->rope_finetuned != other.rope_finetuned) return true;
         if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
-        const float EPSILON = 1e-9;
 
         if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
         if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
@@ -1289,12 +1339,12 @@ struct llama_hparams {
         return n_head/n_head_kv;
     }
 
-    uint32_t n_embd_head() const {
-        return n_embd/n_head;
     }
 
-    uint32_t n_embd_gqa() const {
-        return n_embd/n_gqa();
     }
 };
 
@@ -1362,6 +1412,7 @@ struct llama_layer {
     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b;   // b3
 };
 
 struct llama_kv_cell {
@@ -1602,8 +1653,9 @@ static bool llama_kv_cache_init(
         uint32_t n_ctx,
         int n_gpu_layers,
         bool offload) {
-    const uint32_t n_embd  = hparams.n_embd_gqa();
-    const uint32_t n_layer = hparams.n_layer;
 
     cache.has_shift = false;
 
@@ -1634,8 +1686,8 @@ static bool llama_kv_cache_init(
     const int i_gpu_start = (int) n_layer - n_gpu_layers;
 
     for (int i = 0; i < (int) n_layer; i++) {
-        ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
-        ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
@@ -2522,18 +2574,22 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case MODEL_1B:  return "1B";
-        case MODEL_3B:  return "3B";
-        case MODEL_7B:  return "7B";
-        case MODEL_8B:  return "8B";
-        case MODEL_13B: return "13B";
-        case MODEL_15B: return "15B";
-        case MODEL_30B: return "30B";
-        case MODEL_34B: return "34B";
-        case MODEL_40B: return "40B";
-        case MODEL_65B: return "65B";
-        case MODEL_70B: return "70B";
-        default:        return "?B";
     }
 }
 
@@ -2625,6 +2681,12 @@ static void llm_load_hparams(
         // gpt-j n_rot = rotary_dim
     }
 
     // arch-specific KVs
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
@@ -2743,6 +2805,26 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
 
         default: (void)0;
     }
@@ -3015,8 +3097,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
     LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
     LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
-    LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
     LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
     LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
     LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
     LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
@@ -3106,10 +3192,11 @@ static bool llm_load_tensors(
 
     // create tensors for the weights
     {
-        const int64_t n_embd     = hparams.n_embd;
-        const int64_t n_embd_gqa = hparams.n_embd_gqa();
-        const int64_t n_layer    = hparams.n_layer;
-        const int64_t n_vocab    = hparams.n_vocab;
 
         const auto tn = LLM_TN(model.arch);
         switch (model.arch) {
@@ -3135,7 +3222,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }
 
-                const uint32_t n_ff = hparams.n_ff;
 
                 const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3203,7 +3293,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }
 
-                const uint32_t n_ff = hparams.n_ff;
 
                 const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3251,7 +3344,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }
 
-                const uint32_t n_ff = hparams.n_ff;
 
                 const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3301,7 +3397,10 @@ static bool llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
                 }
 
-                const uint32_t n_ff = hparams.n_ff;
 
                 const int i_gpu_start = n_layer - n_gpu_layers;
 
@@ -3353,7 +3452,11 @@ static bool llm_load_tensors(
3353
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3354
  }
3355
 
3356
- const uint32_t n_ff = hparams.n_ff;
 
 
 
 
3357
  const int i_gpu_start = n_layer - n_gpu_layers;
3358
  model.layers.resize(n_layer);
3359
  for (uint32_t i = 0; i < n_layer; ++i) {
@@ -3402,7 +3505,10 @@ static bool llm_load_tensors(
3402
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3403
  }
3404
 
3405
- const uint32_t n_ff = hparams.n_ff;
 
 
 
3406
 
3407
  const int i_gpu_start = n_layer - n_gpu_layers;
3408
 
@@ -3436,7 +3542,6 @@ static bool llm_load_tensors(
3436
  case LLM_ARCH_MPT:
3437
  {
3438
  model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3439
-
3440
  // output
3441
  {
3442
  ggml_backend_type backend_norm;
@@ -3454,7 +3559,10 @@ static bool llm_load_tensors(
3454
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3455
  }
3456
 
3457
- const uint32_t n_ff = hparams.n_ff;
 
 
 
3458
 
3459
  const int i_gpu_start = n_layer - n_gpu_layers;
3460
 
@@ -3474,6 +3582,9 @@ static bool llm_load_tensors(
3474
 
3475
  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3476
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 
 
 
3477
  }
3478
  } break;
3479
  case LLM_ARCH_STABLELM:
@@ -3498,7 +3609,10 @@ static bool llm_load_tensors(
3498
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3499
  }
3500
 
3501
- const uint32_t n_ff = hparams.n_ff;
 
 
 
3502
 
3503
  const int i_gpu_start = n_layer - n_gpu_layers;
3504
 
@@ -3596,7 +3710,10 @@ static bool llm_load_tensors(
3596
  model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
3597
  }
3598
 
3599
- const uint32_t n_ff = hparams.n_ff;
 
 
 
3600
 
3601
  const int i_gpu_start = n_layer - n_gpu_layers;
3602
 
@@ -3624,6 +3741,111 @@ static bool llm_load_tensors(
3624
  layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3625
  }
3626
            } break;
3627
  default:
3628
  throw std::runtime_error("unknown architecture");
3629
  }
@@ -3832,8 +4054,8 @@ static struct ggml_tensor * llm_build_inp_embd(
3832
  return inpL;
3833
  }
3834
 
3835
- // Persimmon: n_rot = n_embd_head/2
3836
- // Other: n_rot = n_embd_head
3837
  static void llm_build_k_shift(
3838
  struct ggml_context * ctx,
3839
  const llama_hparams & hparams,
@@ -3846,17 +4068,17 @@ static void llm_build_k_shift(
3846
  float freq_base,
3847
  float freq_scale,
3848
  const llm_build_cb & cb) {
3849
- const int64_t n_layer = hparams.n_layer;
3850
- const int64_t n_head_kv = hparams.n_head_kv;
3851
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
3852
- const int64_t n_embd_head = hparams.n_embd_head();
3853
- const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
3854
- const float ext_factor = cparams.yarn_ext_factor;
3855
- const float attn_factor = cparams.yarn_attn_factor;
3856
- const float beta_fast = cparams.yarn_beta_fast;
3857
- const float beta_slow = cparams.yarn_beta_slow;
3858
-
3859
- GGML_ASSERT(n_embd_head % n_rot == 0);
3860
 
3861
  struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
3862
  cb(K_shift, "K_shift", -1);
@@ -3874,9 +4096,9 @@ static void llm_build_k_shift(
3874
  // we rotate only the first n_rot dimensions
3875
  ggml_rope_custom_inplace(ctx,
3876
  ggml_view_3d(ctx, kv.k_l[il],
3877
- n_embd_head, n_head_kv, n_ctx,
3878
- ggml_row_size(kv.k_l[il]->type, n_embd_head),
3879
- ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
3880
  0),
3881
  K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
3882
  ext_factor, attn_factor, beta_fast, beta_slow);
@@ -3897,18 +4119,19 @@ static void llm_build_kv_store(
3897
  int32_t kv_head,
3898
  const llm_build_cb & cb,
3899
  int64_t il) {
3900
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
 
3901
 
3902
  // compute the transposed [n_tokens, n_embd] V matrix
3903
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens));
3904
  //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
3905
  cb(v_cur_t, "v_cur_t", il);
3906
 
3907
- struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
3908
- (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
3909
  cb(k_cache_view, "k_cache_view", il);
3910
 
3911
- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
3912
  ( n_ctx)*ggml_element_size(kv.v_l[il]),
3913
  (kv_head)*ggml_element_size(kv.v_l[il]));
3914
  cb(v_cache_view, "v_cache_view", il);
@@ -3959,6 +4182,7 @@ static struct ggml_tensor * llm_build_ffn(
3959
  struct ggml_tensor * gate_b,
3960
  struct ggml_tensor * down,
3961
  struct ggml_tensor * down_b,
 
3962
  llm_ffn_op_type type_op,
3963
  llm_ffn_gate_type type_gate,
3964
  const llm_build_cb & cb,
@@ -4003,6 +4227,10 @@ static struct ggml_tensor * llm_build_ffn(
4003
  {
4004
  cur = ggml_gelu(ctx, cur);
4005
  cb(cur, "ffn_gelu", il);
 
 
 
 
4006
  } break;
4007
  case LLM_FFN_RELU:
4008
  {
@@ -4053,20 +4281,20 @@ static struct ggml_tensor * llm_build_kqv(
4053
  float kq_scale,
4054
  const llm_build_cb & cb,
4055
  int il) {
4056
- const int64_t n_embd = hparams.n_embd;
4057
- const int64_t n_head = hparams.n_head;
4058
- const int64_t n_head_kv = hparams.n_head_kv;
4059
- const int64_t n_embd_head = hparams.n_embd_head();
4060
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
4061
 
4062
  struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
4063
  cb(q, "q", il);
4064
 
4065
  struct ggml_tensor * k =
4066
  ggml_view_3d(ctx, kv.k_l[il],
4067
- n_embd_head, n_kv, n_head_kv,
4068
- ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
4069
- ggml_row_size(kv.k_l[il]->type, n_embd_head),
4070
  0);
4071
  cb(k, "k", il);
4072
 
@@ -4105,9 +4333,9 @@ static struct ggml_tensor * llm_build_kqv(
4105
  // split cached v into n_head heads
4106
  struct ggml_tensor * v =
4107
  ggml_view_3d(ctx, kv.v_l[il],
4108
- n_kv, n_embd_head, n_head_kv,
4109
  ggml_element_size(kv.v_l[il])*n_ctx,
4110
- ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
4111
  0);
4112
  cb(v, "v", il);
4113
 
@@ -4117,7 +4345,7 @@ static struct ggml_tensor * llm_build_kqv(
4117
  struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
4118
  cb(kqv_merged, "kqv_merged", il);
4119
 
4120
- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens);
4121
  cb(cur, "kqv_merged_cont", il);
4122
 
4123
  cur = ggml_mul_mat(ctx, wo, cur);
@@ -4144,8 +4372,10 @@ struct llm_build_context {
4144
  const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
4145
  const int64_t n_head;
4146
  const int64_t n_head_kv;
4147
- const int64_t n_embd_head;
4148
- const int64_t n_embd_gqa;
 
 
4149
  const int64_t n_expert;
4150
  const int64_t n_expert_used;
4151
 
@@ -4187,8 +4417,10 @@ struct llm_build_context {
4187
  n_ctx (cparams.n_ctx),
4188
  n_head (hparams.n_head),
4189
  n_head_kv (hparams.n_head_kv),
4190
- n_embd_head (hparams.n_embd_head()),
4191
- n_embd_gqa (hparams.n_embd_gqa()),
 
 
4192
  n_expert (hparams.n_expert),
4193
  n_expert_used (hparams.n_expert_used),
4194
  freq_base (cparams.rope_freq_base),
@@ -4231,6 +4463,8 @@ struct llm_build_context {
4231
  struct ggml_cgraph * build_llama() {
4232
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4233
 
 
 
4234
  GGML_ASSERT(n_embd_head == hparams.n_rot);
4235
 
4236
  struct ggml_tensor * cur;
@@ -4321,6 +4555,7 @@ struct llm_build_context {
4321
  model.layers[il].ffn_up, NULL,
4322
  model.layers[il].ffn_gate, NULL,
4323
  model.layers[il].ffn_down, NULL,
 
4324
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
4325
  cb(cur, "ffn_out", il);
4326
  } else {
@@ -4414,6 +4649,9 @@ struct llm_build_context {
4414
  struct ggml_cgraph * build_baichuan() {
4415
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4416
 
 
 
 
4417
  struct ggml_tensor * cur;
4418
  struct ggml_tensor * inpL;
4419
 
@@ -4500,6 +4738,7 @@ struct llm_build_context {
4500
  model.layers[il].ffn_up, NULL,
4501
  model.layers[il].ffn_gate, NULL,
4502
  model.layers[il].ffn_down, NULL,
 
4503
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
4504
  cb(cur, "ffn_out", il);
4505
  }
@@ -4530,6 +4769,11 @@ struct llm_build_context {
4530
  struct ggml_cgraph * build_falcon() {
4531
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4532
 
 
 
 
 
 
4533
  struct ggml_tensor * cur;
4534
  struct ggml_tensor * inpL;
4535
 
@@ -4614,6 +4858,7 @@ struct llm_build_context {
4614
  model.layers[il].ffn_up, NULL,
4615
  NULL, NULL,
4616
  model.layers[il].ffn_down, NULL,
 
4617
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
4618
  cb(cur, "ffn_out", il);
4619
  }
@@ -4648,6 +4893,11 @@ struct llm_build_context {
4648
  struct ggml_cgraph * build_starcoder() {
4649
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4650
 
 
 
 
 
 
4651
  struct ggml_tensor * cur;
4652
  struct ggml_tensor * pos;
4653
  struct ggml_tensor * inpL;
@@ -4718,6 +4968,7 @@ struct llm_build_context {
4718
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
4719
  NULL, NULL,
4720
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
 
4721
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
4722
  cb(cur, "ffn_out", il);
4723
  }
@@ -4743,7 +4994,12 @@ struct llm_build_context {
4743
  struct ggml_cgraph * build_persimmon() {
4744
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4745
 
4746
- const int64_t n_rot = n_embd_head / 2;
 
 
 
 
 
4747
 
4748
  struct ggml_tensor * cur;
4749
  struct ggml_tensor * inpL;
@@ -4922,6 +5178,7 @@ struct llm_build_context {
4922
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
4923
  NULL, NULL,
4924
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
 
4925
  LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
4926
  cb(cur, "ffn_out", il);
4927
  }
@@ -4951,6 +5208,11 @@ struct llm_build_context {
4951
  struct ggml_cgraph * build_refact() {
4952
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4953
 
 
 
 
 
 
4954
  struct ggml_tensor * cur;
4955
  struct ggml_tensor * inpL;
4956
 
@@ -5008,6 +5270,7 @@ struct llm_build_context {
5008
  model.layers[il].ffn_up, NULL,
5009
  model.layers[il].ffn_gate, NULL,
5010
  model.layers[il].ffn_down, NULL,
 
5011
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5012
  cb(cur, "ffn_out", il);
5013
  }
@@ -5038,6 +5301,11 @@ struct llm_build_context {
5038
  struct ggml_cgraph * build_bloom() {
5039
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5040
 
 
 
 
 
 
5041
  struct ggml_tensor * cur;
5042
  struct ggml_tensor * inpL;
5043
 
@@ -5103,6 +5371,7 @@ struct llm_build_context {
5103
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5104
  NULL, NULL,
5105
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
 
5106
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5107
  cb(cur, "ffn_out", il);
5108
  }
@@ -5128,6 +5397,11 @@ struct llm_build_context {
5128
  struct ggml_cgraph * build_mpt() {
5129
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5130
 
 
 
 
 
 
5131
  struct ggml_tensor * cur;
5132
  struct ggml_tensor * inpL;
5133
 
@@ -5188,11 +5462,11 @@ struct llm_build_context {
5188
  NULL,
5189
  LLM_NORM, cb, il);
5190
  cb(cur, "ffn_norm", il);
5191
-
5192
  cur = llm_build_ffn(ctx0, cur,
5193
  model.layers[il].ffn_up, NULL,
5194
  NULL, NULL,
5195
  model.layers[il].ffn_down, NULL,
 
5196
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5197
  cb(cur, "ffn_out", il);
5198
  }
@@ -5223,6 +5497,9 @@ struct llm_build_context {
5223
  struct ggml_cgraph * build_stablelm() {
5224
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);
5225
 
 
 
 
5226
  struct ggml_tensor * cur;
5227
  struct ggml_tensor * inpL;
5228
 
@@ -5301,6 +5578,7 @@ struct llm_build_context {
5301
  model.layers[il].ffn_up, NULL,
5302
  model.layers[il].ffn_gate, NULL,
5303
  model.layers[il].ffn_down, NULL,
 
5304
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5305
  cb(cur, "ffn_out", il);
5306
  }
@@ -5332,6 +5610,9 @@ struct llm_build_context {
5332
  struct ggml_cgraph * build_qwen() {
5333
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5334
 
 
 
 
5335
  struct ggml_tensor * cur;
5336
  struct ggml_tensor * inpL;
5337
 
@@ -5413,6 +5694,7 @@ struct llm_build_context {
5413
  model.layers[il].ffn_up, NULL,
5414
  model.layers[il].ffn_gate, NULL,
5415
  model.layers[il].ffn_down, NULL,
 
5416
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5417
  cb(cur, "ffn_out", il);
5418
  }
@@ -5442,6 +5724,11 @@ struct llm_build_context {
5442
  struct ggml_cgraph * build_phi2() {
5443
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5444
 
 
 
 
 
 
5445
  struct ggml_tensor * cur;
5446
  struct ggml_tensor * attn_norm_output;
5447
  struct ggml_tensor * ffn_output;
@@ -5520,6 +5807,7 @@ struct llm_build_context {
5520
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5521
  NULL, NULL,
5522
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
 
5523
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5524
  cb(ffn_output, "ffn_out", il);
5525
  }
@@ -5549,6 +5837,214 @@ struct llm_build_context {
5549
 
5550
  return gf;
5551
    }
5552
  };
5553
 
5554
  //
@@ -5704,6 +6200,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5704
  { "ffn_gate", OFFLOAD_FUNC },
5705
  { "ffn_gate_b", OFFLOAD_FUNC },
5706
  { "ffn_gate_par", OFFLOAD_FUNC },
 
5707
  { "ffn_down", OFFLOAD_FUNC },
5708
  { "ffn_down_b", OFFLOAD_FUNC },
5709
  { "ffn_out", OFFLOAD_FUNC },
@@ -6059,6 +6556,14 @@ static struct ggml_cgraph * llama_build_graph(
6059
  {
6060
  result = llm.build_phi2();
6061
  } break;
 
 
 
 
 
 
 
 
6062
  default:
6063
  GGML_ASSERT(false);
6064
  }
@@ -7525,7 +8030,7 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
7525
  }
7526
  }
7527
 
7528
- void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
7529
  const int64_t t_start_sample_us = ggml_time_us();
7530
 
7531
  k = std::max(k, (int) min_keep);
@@ -7885,7 +8390,7 @@ void llama_sample_classifier_free_guidance(
7885
  }
7886
  }
7887
 
7888
- llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
7889
  GGML_ASSERT(ctx);
7890
 
7891
  auto N = float(llama_n_vocab(llama_get_model(ctx)));
@@ -9093,7 +9598,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
9093
  return result;
9094
  }
9095
 
9096
- int llama_max_devices(void) {
9097
  return LLAMA_MAX_DEVICES;
9098
  }
9099
 
@@ -9235,8 +9740,8 @@ struct llama_context * llama_new_context_with_model(
9235
  const ggml_type type_k = params.type_k;
9236
  const ggml_type type_v = params.type_v;
9237
 
9238
- GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
9239
- GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
9240
 
9241
  // reserve memory for context buffers
9242
  if (!hparams.vocab_only) {
@@ -9332,7 +9837,8 @@ struct llama_context * llama_new_context_with_model(
9332
  ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
9333
  #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
9334
  if (model->n_gpu_layers > 0) {
9335
- ggml_cuda_set_scratch_size(alloc_size);
 
9336
  LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
9337
 
9338
  // calculate total VRAM usage
@@ -9403,15 +9909,15 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
9403
  return model->vocab.type;
9404
  }
9405
 
9406
- int llama_n_vocab(const struct llama_model * model) {
9407
  return model->vocab.id_to_token.size();
9408
  }
9409
 
9410
- int llama_n_ctx_train(const struct llama_model * model) {
9411
  return model->hparams.n_ctx_train;
9412
  }
9413
 
9414
- int llama_n_embd(const struct llama_model * model) {
9415
  return model->hparams.n_embd;
9416
  }
9417
 
@@ -9419,7 +9925,7 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
9419
  return model->hparams.rope_freq_scale_train;
9420
  }
9421
 
9422
- int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
9423
  const auto & it = model->gguf_kv.find(key);
9424
  if (it == model->gguf_kv.end()) {
9425
  if (buf_size > 0) {
@@ -9430,11 +9936,11 @@ int llama_model_meta_val_str(const struct llama_model * model, const char * key,
9430
  return snprintf(buf, buf_size, "%s", it->second.c_str());
9431
  }
9432
 
9433
- int llama_model_meta_count(const struct llama_model * model) {
9434
  return (int)model->gguf_kv.size();
9435
  }
9436
 
9437
- int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
9438
  if (i < 0 || i >= (int)model->gguf_kv.size()) {
9439
  if (buf_size > 0) {
9440
  buf[0] = '\0';
@@ -9446,7 +9952,7 @@ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char
9446
  return snprintf(buf, buf_size, "%s", it->first.c_str());
9447
  }
9448
 
9449
- int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
9450
  if (i < 0 || i >= (int)model->gguf_kv.size()) {
9451
  if (buf_size > 0) {
9452
  buf[0] = '\0';
@@ -9458,9 +9964,10 @@ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, c
9458
  return snprintf(buf, buf_size, "%s", it->second.c_str());
9459
  }
9460
 
9461
- int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
9462
- return snprintf(buf, buf_size, "%s %s %s",
9463
  llama_model_arch_name(model->arch).c_str(),
 
9464
  llama_model_type_name(model->type),
9465
  llama_model_ftype_name(model->ftype).c_str());
9466
  }
@@ -9485,7 +9992,7 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch
9485
  return ggml_get_tensor(model->ctx, name);
9486
  }
9487
 
9488
- int llama_model_quantize(
9489
  const char * fname_inp,
9490
  const char * fname_out,
9491
  const llama_model_quantize_params * params) {
@@ -9498,7 +10005,7 @@ int llama_model_quantize(
9498
  }
9499
  }
9500
 
9501
- int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
9502
  try {
9503
  return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
9504
  } catch (const std::exception & err) {
@@ -9507,7 +10014,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
9507
  }
9508
  }
9509
 
9510
- int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
9511
  try {
9512
  return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
9513
  } catch (const std::exception & err) {
@@ -9605,7 +10112,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
9605
  }
9606
  }
9607
 
9608
- int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
9609
  int result = 0;
9610
 
9611
  for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
@@ -9615,7 +10122,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
9615
  return result;
9616
  }
9617
 
9618
- int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
9619
  return ctx->kv_self.used;
9620
  }
9621
 
@@ -9779,9 +10286,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
9779
  const auto & hparams = ctx->model.hparams;
9780
  const auto & cparams = ctx->cparams;
9781
 
9782
- const auto n_layer = hparams.n_layer;
9783
- const auto n_embd = hparams.n_embd_gqa();
9784
- const auto n_ctx = cparams.n_ctx;
 
9785
 
9786
  const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
9787
  const uint32_t kv_head = kv_self.head;
@@ -9803,15 +10311,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
9803
  std::vector<struct ggml_tensor *> vout2d(n_layer);
9804
 
9805
  for (int il = 0; il < (int) n_layer; ++il) {
9806
- kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
9807
- vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
9808
 
9809
  ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
9810
- n_embd, kv_head,
9811
- elt_size*n_embd, 0);
9812
 
9813
  ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
9814
- kv_head, n_embd,
9815
  elt_size*n_ctx, 0);
9816
 
9817
  ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
@@ -9918,9 +10426,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
9918
  const auto & hparams = ctx->model.hparams;
9919
  const auto & cparams = ctx->cparams;
9920
 
9921
- const int n_layer = hparams.n_layer;
9922
- const int n_embd = hparams.n_embd_gqa();
9923
- const int n_ctx = cparams.n_ctx;
 
9924
 
9925
  size_t kv_buf_size;
9926
  uint32_t kv_head;
@@ -9944,15 +10453,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
9944
  std::vector<struct ggml_tensor *> vin2d(n_layer);
9945
 
9946
  for (int il = 0; il < n_layer; ++il) {
9947
- kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
9948
- vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
9949
 
9950
  ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
9951
- n_embd, kv_head,
9952
- elt_size*n_embd, 0);
9953
 
9954
  ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
9955
- kv_head, n_embd,
9956
  elt_size*n_ctx, 0);
9957
 
9958
  ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
@@ -10095,7 +10604,7 @@ int llama_eval(
10095
  struct llama_context * ctx,
10096
  llama_token * tokens,
10097
  int32_t n_tokens,
10098
- int n_past) {
10099
  llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
10100
 
10101
  const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
@@ -10110,7 +10619,7 @@ int llama_eval_embd(
10110
  struct llama_context * ctx,
10111
  float * embd,
10112
  int32_t n_tokens,
10113
- int n_past) {
10114
  llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
10115
 
10116
  llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
@@ -10181,7 +10690,7 @@ void llama_batch_free(struct llama_batch batch) {
10181
  if (batch.logits) free(batch.logits);
10182
  }
10183
 
10184
- int llama_decode(
10185
  struct llama_context * ctx,
10186
  struct llama_batch batch) {
10187
  const int ret = llama_decode_internal(*ctx, batch);
@@ -10229,11 +10738,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
10229
  return model->vocab.linefeed_id;
10230
  }
10231
 
10232
- int llama_add_bos_token(const struct llama_model * model) {
10233
  return model->vocab.special_add_bos;
10234
  }
10235
 
10236
- int llama_add_eos_token(const struct llama_model * model) {
10237
  return model->vocab.special_add_eos;
10238
  }
10239
 
@@ -10253,12 +10762,12 @@ llama_token llama_token_eot(const struct llama_model * model) {
10253
  return model->vocab.special_eot_id;
10254
  }
10255
 
10256
- int llama_tokenize(
10257
  const struct llama_model * model,
10258
  const char * text,
10259
- int text_len,
10260
  llama_token * tokens,
10261
- int n_max_tokens,
10262
  bool add_bos,
10263
  bool special) {
10264
  auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
@@ -10286,7 +10795,7 @@ static std::string llama_decode_text(const std::string & text) {
10286
  }
10287
 
10288
  // does not write null-terminator to buf
10289
- int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
10290
  if (0 <= token && token < llama_n_vocab(model)) {
10291
  switch (llama_vocab_get_type(model->vocab)) {
10292
  case LLAMA_VOCAB_TYPE_SPM: {
@@ -10294,7 +10803,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
10294
  std::string result = model->vocab.id_to_token[token].text;
10295
  llama_unescape_whitespace(result);
10296
  if (length < (int) result.length()) {
10297
- return -result.length();
10298
  }
10299
  memcpy(buf, result.c_str(), result.length());
10300
  return result.length();
@@ -10324,7 +10833,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
10324
  std::string result = model->vocab.id_to_token[token].text;
10325
  result = llama_decode_text(result);
10326
  if (length < (int) result.length()) {
10327
- return -result.length();
10328
  }
10329
  memcpy(buf, result.c_str(), result.length());
10330
  return result.length();
@@ -10387,6 +10896,7 @@ const char * llama_print_system_info(void) {
10387
 
10388
  s = "";
10389
  s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
 
10390
  s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
10391
  s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
10392
  s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
 
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
     LLM_ARCH_PHI2,
+    LLM_ARCH_PLAMO,
     LLM_ARCH_UNKNOWN,
 };
 
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN, "qwen" },
     { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PLAMO, "plamo" },
 };
 
 enum llm_kv {
 
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
     LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
     LLM_KV_ATTENTION_CLAMP_KQV,
+    LLM_KV_ATTENTION_KEY_LENGTH,
+    LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
 
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
     { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
     { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+    { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+    { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
     { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
 
 
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
+    LLM_TENSOR_FFN_ACT,
     LLM_TENSOR_FFN_DOWN_EXP,
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
 
         LLM_ARCH_GPT2,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_POS_EMBD, "position_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
     {
 
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
         },
     },
     {
 
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PLAMO,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
 
     {
         LLM_ARCH_UNKNOWN,
 
             throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
+            throw std::runtime_error("unexpectedly reached end of file");
         }
     }
 
 
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
+        GGML_UNUSED(numa);
 
        size = file->size;
 
        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
 
        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
 
        if (hMapping == NULL) {
+           DWORD error = GetLastError();
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }
 
        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+       DWORD error = GetLastError();
        CloseHandle(hMapping);
 
        if (addr == NULL) {
            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
        }
 
+       if (prefetch > 0) {
            // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
            HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
 
            // advise the kernel to preload the mapped memory
            WIN32_MEMORY_RANGE_ENTRY range;
            range.VirtualAddress = addr;
+           range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
            if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+               LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
            }
        }
 
    ~llama_mmap() {
        if (!UnmapViewOfFile(addr)) {
+           LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
                llama_format_win_err(GetLastError()).c_str());
        }
    }
 #else
    static constexpr bool SUPPORTED = false;
 
+    llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) {
+        GGML_UNUSED(file);
+        GGML_UNUSED(prefetch);
+        GGML_UNUSED(numa);
 
+        throw std::runtime_error("mmap not supported");
     }
 
+    void unmap_fragment(size_t first, size_t last) {
+        GGML_UNUSED(first);
+        GGML_UNUSED(last);
 
+        throw std::runtime_error("mmap not supported");
     }
 #endif
 };
 
 }
 
 static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
+        buft = ggml_backend_metal_buffer_type();
     }
 #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
     if (n_gpu_layers > 0) {
+        buft = ggml_backend_cuda_buffer_type(0);
     }
 #elif defined(GGML_USE_CUBLAS)
+    buft = ggml_backend_cuda_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
+    buft = ggml_backend_cpu_hbm_buffer_type();
 #endif
 
+    if (buft == nullptr) {
+        buft = ggml_backend_cpu_buffer_type();
+    }
+
+    return buft;
 
     GGML_UNUSED(n_gpu_layers);
 }
 
     MODEL_40B,
     MODEL_65B,
     MODEL_70B,
+    MODEL_SMALL,
+    MODEL_MEDIUM,
+    MODEL_LARGE,
+    MODEL_XL,
 };
 
 static const size_t kiB = 1024;
 
     uint32_t n_head_kv;
     uint32_t n_layer;
     uint32_t n_rot;
+    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_ff;
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
 
     float f_clamp_kqv;
     float f_max_alibi_bias;
 
+
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
         if (this->n_vocab != other.n_vocab) return true;
 
         if (this->n_head_kv != other.n_head_kv) return true;
         if (this->n_layer != other.n_layer) return true;
         if (this->n_rot != other.n_rot) return true;
+        if (this->n_embd_head_k != other.n_embd_head_k) return true;
+        if (this->n_embd_head_v != other.n_embd_head_v) return true;
         if (this->n_ff != other.n_ff) return true;
         if (this->n_expert != other.n_expert) return true;
         if (this->n_expert_used != other.n_expert_used) return true;
 
         if (this->rope_finetuned != other.rope_finetuned) return true;
         if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
+        const float EPSILON = 1e-9f;
 
         if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
         if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
 
         return n_head/n_head_kv;
     }
 
+    uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
+        return n_embd_head_k * n_head_kv;
     }
 
+    uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
+        return n_embd_head_v * n_head_kv;
     }
 };
 
 
1412
  // ff bias
1413
  struct ggml_tensor * ffn_down_b; // b2
1414
  struct ggml_tensor * ffn_up_b; // b3
1415
+ struct ggml_tensor * ffn_act;
1416
  };
1417
 
1418
  struct llama_kv_cell {
 
1653
  uint32_t n_ctx,
1654
  int n_gpu_layers,
1655
  bool offload) {
1656
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
1657
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
1658
+ const uint32_t n_layer = hparams.n_layer;
1659
 
1660
  cache.has_shift = false;
1661
 
 
1686
  const int i_gpu_start = (int) n_layer - n_gpu_layers;
1687
 
1688
  for (int i = 0; i < (int) n_layer; i++) {
1689
+ ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
1690
+ ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
1691
  ggml_format_name(k, "cache_k_l%d", i);
1692
  ggml_format_name(v, "cache_v_l%d", i);
1693
  cache.k_l.push_back(k);
 
2574
 
2575
  static const char * llama_model_type_name(e_model type) {
2576
  switch (type) {
2577
+ case MODEL_1B: return "1B";
2578
+ case MODEL_3B: return "3B";
2579
+ case MODEL_7B: return "7B";
2580
+ case MODEL_8B: return "8B";
2581
+ case MODEL_13B: return "13B";
2582
+ case MODEL_15B: return "15B";
2583
+ case MODEL_30B: return "30B";
2584
+ case MODEL_34B: return "34B";
2585
+ case MODEL_40B: return "40B";
2586
+ case MODEL_65B: return "65B";
2587
+ case MODEL_70B: return "70B";
2588
+ case MODEL_SMALL: return "0.1B";
2589
+ case MODEL_MEDIUM: return "0.4B";
2590
+ case MODEL_LARGE: return "0.8B";
2591
+ case MODEL_XL: return "1.5B";
2592
+ default: return "?B";
2593
  }
2594
  }
2595
 
 
2681
  // gpt-j n_rot = rotary_dim
2682
  }
2683
 
2684
+ hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
2685
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
2686
+
2687
+ hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
2688
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
2689
+
2690
  // arch-specific KVs
2691
  switch (model.arch) {
2692
  case LLM_ARCH_LLAMA:
 
2805
  default: model.type = e_model::MODEL_UNKNOWN;
2806
  }
2807
  } break;
2808
+ case LLM_ARCH_PLAMO:
2809
+ {
2810
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2811
+
2812
+ switch (hparams.n_layer) {
2813
+ case 40: model.type = e_model::MODEL_13B; break;
2814
+ default: model.type = e_model::MODEL_UNKNOWN;
2815
+ }
2816
+ } break;
2817
+ case LLM_ARCH_GPT2:
2818
+ {
2819
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2820
+ switch (hparams.n_layer) {
2821
+ case 12: model.type = e_model::MODEL_SMALL; break;
2822
+ case 24: model.type = e_model::MODEL_MEDIUM; break;
2823
+ case 36: model.type = e_model::MODEL_LARGE; break;
2824
+ case 48: model.type = e_model::MODEL_XL; break;
2825
+ default: model.type = e_model::MODEL_UNKNOWN;
2826
+ }
2827
+ } break;
2828
 
2829
  default: (void)0;
2830
  }
 
3097
  LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
3098
  LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
3099
  LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
3100
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
3101
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
3102
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
3103
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
3104
+ LLAMA_LOG_INFO("%s: n_embd_k_gqa = %u\n", __func__, hparams.n_embd_k_gqa());
3105
+ LLAMA_LOG_INFO("%s: n_embd_v_gqa = %u\n", __func__, hparams.n_embd_v_gqa());
3106
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
3107
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
3108
  LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
 
3192
 
3193
  // create tensors for the weights
3194
  {
3195
+ const int64_t n_embd = hparams.n_embd;
3196
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
3197
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
3198
+ const int64_t n_layer = hparams.n_layer;
3199
+ const int64_t n_vocab = hparams.n_vocab;
3200
 
3201
  const auto tn = LLM_TN(model.arch);
3202
  switch (model.arch) {
 
3222
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3223
  }
3224
 
3225
+ const uint32_t n_ff = hparams.n_ff;
3226
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3227
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3228
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3229
 
3230
  const int i_gpu_start = n_layer - n_gpu_layers;
3231
 
 
3293
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3294
  }
3295
 
3296
+ const uint32_t n_ff = hparams.n_ff;
3297
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3298
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3299
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3300
 
3301
  const int i_gpu_start = n_layer - n_gpu_layers;
3302
 
 
3344
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3345
  }
3346
 
3347
+ const uint32_t n_ff = hparams.n_ff;
3348
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3349
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3350
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3351
 
3352
  const int i_gpu_start = n_layer - n_gpu_layers;
3353
 
 
3397
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3398
  }
3399
 
3400
+ const uint32_t n_ff = hparams.n_ff;
3401
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3402
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3403
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3404
 
3405
  const int i_gpu_start = n_layer - n_gpu_layers;
3406
 
 
3452
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3453
  }
3454
 
3455
+ const uint32_t n_ff = hparams.n_ff;
3456
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3457
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3458
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3459
+
3460
  const int i_gpu_start = n_layer - n_gpu_layers;
3461
  model.layers.resize(n_layer);
3462
  for (uint32_t i = 0; i < n_layer; ++i) {
 
3505
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3506
  }
3507
 
3508
+ const uint32_t n_ff = hparams.n_ff;
3509
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3510
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3511
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3512
 
3513
  const int i_gpu_start = n_layer - n_gpu_layers;
3514
 
 
3542
  case LLM_ARCH_MPT:
3543
  {
3544
  model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
3545
  // output
3546
  {
3547
  ggml_backend_type backend_norm;
 
3559
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3560
  }
3561
 
3562
+ const uint32_t n_ff = hparams.n_ff;
3563
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3564
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3565
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3566
 
3567
  const int i_gpu_start = n_layer - n_gpu_layers;
3568
 
 
3582
 
3583
  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3584
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3585
+
3586
+ // AWQ ScaleActivation layer
3587
+ layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false);
3588
  }
3589
  } break;
3590
  case LLM_ARCH_STABLELM:
 
3609
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3610
  }
3611
 
3612
+ const uint32_t n_ff = hparams.n_ff;
3613
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3614
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3615
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3616
 
3617
  const int i_gpu_start = n_layer - n_gpu_layers;
3618
 
 
3710
  model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
3711
  }
3712
 
3713
+ const uint32_t n_ff = hparams.n_ff;
3714
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3715
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3716
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3717
 
3718
  const int i_gpu_start = n_layer - n_gpu_layers;
3719
 
 
3741
  layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3742
  }
3743
  } break;
3744
+ case LLM_ARCH_PLAMO:
3745
+ {
3746
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3747
+
3748
+ // output
3749
+ {
3750
+ ggml_backend_type backend_norm;
3751
+ ggml_backend_type backend_output;
3752
+
3753
+ if (n_gpu_layers > int(n_layer)) {
3754
+ backend_norm = llama_backend_offload;
3755
+ backend_output = llama_backend_offload_split;
3756
+ } else {
3757
+ backend_norm = GGML_BACKEND_CPU;
3758
+ backend_output = GGML_BACKEND_CPU;
3759
+ }
3760
+
3761
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3762
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3763
+ }
3764
+
3765
+ const uint32_t n_ff = hparams.n_ff;
3766
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3767
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3768
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3769
+
3770
+ const int i_gpu_start = n_layer - n_gpu_layers;
3771
+
3772
+ model.layers.resize(n_layer);
3773
+
3774
+ for (uint32_t i = 0; i < n_layer; ++i) {
3775
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3776
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3777
+
3778
+ auto & layer = model.layers[i];
3779
+
3780
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3781
+
3782
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
3783
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3784
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3785
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3786
+
3787
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3788
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3789
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3790
+ }
3791
+ } break;
3792
+ case LLM_ARCH_GPT2:
3793
+ {
3794
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3795
+ model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
3796
+
3797
+ // output
3798
+ {
3799
+ ggml_backend_type backend_norm;
3800
+ ggml_backend_type backend_output;
3801
+
3802
+ if (n_gpu_layers > int(n_layer)) {
3803
+ backend_norm = llama_backend_offload;
3804
+ backend_output = llama_backend_offload_split;
3805
+ } else {
3806
+ backend_norm = GGML_BACKEND_CPU;
3807
+ backend_output = GGML_BACKEND_CPU;
3808
+ }
3809
+
3810
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3811
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3812
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3813
+ }
3814
+
3815
+ const uint32_t n_ff = hparams.n_ff;
3816
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3817
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3818
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3819
+
3820
+ const int i_gpu_start = n_layer - n_gpu_layers;
3821
+
3822
+ model.layers.resize(n_layer);
3823
+
3824
+ for (uint32_t i = 0; i < n_layer; ++i) {
3825
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3826
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3827
+
3828
+ auto & layer = model.layers[i];
3829
+
3830
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3831
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3832
+
3833
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3834
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3835
+
3836
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3837
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3838
+
3839
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3840
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
3841
+
3842
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3843
+ layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3844
+
3845
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3846
+ layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3847
+ }
3848
+ } break;
3849
  default:
3850
  throw std::runtime_error("unknown architecture");
3851
  }
 
4054
  return inpL;
4055
  }
4056
 
4057
+ // Persimmon: n_rot = n_embd_head_k/2
4058
+ // Other: n_rot = n_embd_head_k
4059
  static void llm_build_k_shift(
4060
  struct ggml_context * ctx,
4061
  const llama_hparams & hparams,
 
4068
  float freq_base,
4069
  float freq_scale,
4070
  const llm_build_cb & cb) {
4071
+ const int64_t n_layer = hparams.n_layer;
4072
+ const int64_t n_head_kv = hparams.n_head_kv;
4073
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
4074
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4075
+ const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
4076
+ const float ext_factor = cparams.yarn_ext_factor;
4077
+ const float attn_factor = cparams.yarn_attn_factor;
4078
+ const float beta_fast = cparams.yarn_beta_fast;
4079
+ const float beta_slow = cparams.yarn_beta_slow;
4080
+
4081
+ GGML_ASSERT(n_embd_head_k % n_rot == 0);
4082
 
4083
  struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
4084
  cb(K_shift, "K_shift", -1);
 
4096
  // we rotate only the first n_rot dimensions
4097
  ggml_rope_custom_inplace(ctx,
4098
  ggml_view_3d(ctx, kv.k_l[il],
4099
+ n_embd_head_k, n_head_kv, n_ctx,
4100
+ ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
4101
+ ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
4102
  0),
4103
  K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
4104
  ext_factor, attn_factor, beta_fast, beta_slow);
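For orientation: each cached position occupies one row of n_embd_k_gqa values in kv.k_l[il], subdivided into n_head_kv heads of n_embd_head_k values, and the rope above rotates only the first n_rot dimensions of every head (n_rot equals n_embd_head_k for most architectures and half of it for Persimmon, per the comment on llm_build_k_shift). An illustrative helper, assuming a non-quantized K cache:

#include <cstdint>
// Sketch only: logical element index of (dim 0, head h, position p) in kv.k_l[il],
// matching the ggml_row_size(.., n_embd_head_k) / ggml_row_size(.., n_embd_k_gqa)
// strides of the 3D view above.
static int64_t k_cache_index(int64_t h, int64_t p, int64_t n_embd_head_k, int64_t n_embd_k_gqa) {
    return p*n_embd_k_gqa + h*n_embd_head_k;
}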
 
4119
  int32_t kv_head,
4120
  const llm_build_cb & cb,
4121
  int64_t il) {
4122
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4123
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
4124
 
4125
  // compute the transposed [n_tokens, n_embd] V matrix
4126
+ struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
4127
  //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
4128
  cb(v_cur_t, "v_cur_t", il);
4129
 
4130
+ struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
4131
+ (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
4132
  cb(k_cache_view, "k_cache_view", il);
4133
 
4134
+ struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
4135
  ( n_ctx)*ggml_element_size(kv.v_l[il]),
4136
  (kv_head)*ggml_element_size(kv.v_l[il]));
4137
  cb(v_cache_view, "v_cache_view", il);
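Taken together, the two cache views above fix the layout per layer: K is written position-major, so the n_embd_k_gqa values of batch token t land in row kv_head + t, while V is written transposed, so value channel c of that token lands at column kv_head + t of a row of length n_ctx. A sketch of the resulting V index, again assuming a non-quantized cache:

#include <cstdint>
// Sketch only: element index of value channel c of batch token t in kv.v_l[il],
// matching the n_ctx row stride of v_cache_view above.
static int64_t v_cache_index(int64_t c, int64_t t, int64_t kv_head, int64_t n_ctx) {
    return c*n_ctx + (kv_head + t);
}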
 
4182
  struct ggml_tensor * gate_b,
4183
  struct ggml_tensor * down,
4184
  struct ggml_tensor * down_b,
4185
+ struct ggml_tensor * act_scales,
4186
  llm_ffn_op_type type_op,
4187
  llm_ffn_gate_type type_gate,
4188
  const llm_build_cb & cb,
 
4227
  {
4228
  cur = ggml_gelu(ctx, cur);
4229
  cb(cur, "ffn_gelu", il);
4230
+ if (act_scales != NULL) {
4231
+ cur = ggml_div(ctx, cur, act_scales);
4232
+ cb(cur, "ffn_act", il);
4233
+ }
4234
  } break;
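The act_scales branch above is the AWQ "ScaleActivation" correction: when per-channel scales were loaded for the layer, the GELU output is divided element-wise by them; every other call site in this sync simply passes NULL, so nothing changes for non-AWQ models. A stand-alone sketch of the same step:

#include "ggml.h"
// Sketch only: mirrors the GELU + optional division above; act_scales is a [n_ff] tensor
// or NULL and broadcasts over the token dimension of cur ([n_ff, n_tokens]).
static struct ggml_tensor * ffn_gelu_awq(struct ggml_context * ctx,
                                         struct ggml_tensor * cur,
                                         struct ggml_tensor * act_scales) {
    cur = ggml_gelu(ctx, cur);
    if (act_scales != NULL) {
        cur = ggml_div(ctx, cur, act_scales);
    }
    return cur;
}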
4235
  case LLM_FFN_RELU:
4236
  {
 
4281
  float kq_scale,
4282
  const llm_build_cb & cb,
4283
  int il) {
4284
+ const int64_t n_head = hparams.n_head;
4285
+ const int64_t n_head_kv = hparams.n_head_kv;
4286
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
4287
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
4288
+ const int64_t n_embd_head_v = hparams.n_embd_head_v;
4289
 
4290
  struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
4291
  cb(q, "q", il);
4292
 
4293
  struct ggml_tensor * k =
4294
  ggml_view_3d(ctx, kv.k_l[il],
4295
+ n_embd_head_k, n_kv, n_head_kv,
4296
+ ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
4297
+ ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
4298
  0);
4299
  cb(k, "k", il);
4300
 
 
4333
  // split cached v into n_head heads
4334
  struct ggml_tensor * v =
4335
  ggml_view_3d(ctx, kv.v_l[il],
4336
+ n_kv, n_embd_head_v, n_head_kv,
4337
  ggml_element_size(kv.v_l[il])*n_ctx,
4338
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
4339
  0);
4340
  cb(v, "v", il);
4341
 
 
4345
  struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
4346
  cb(kqv_merged, "kqv_merged", il);
4347
 
4348
+ struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
4349
  cb(cur, "kqv_merged_cont", il);
4350
 
4351
  cur = ggml_mul_mat(ctx, wo, cur);
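Shape bookkeeping for the three changes above: the K view walks the cache with a position stride of one full row (n_embd_k_gqa values) and a head stride of n_embd_head_k values, the V view reads one n_kv x n_embd_head_v slab per KV head from the transposed cache, and after attention the permuted heads are flattened to n_embd_head_k*n_head columns per token, the input width of the wo projection. An illustrative summary:

#include <cstdint>
// Sketch only: the shapes implied by the views above (non-quantized cache assumed).
struct kqv_shapes {
    int64_t k_view[3]; // { n_embd_head_k, n_kv, n_head_kv }
    int64_t v_view[3]; // { n_kv, n_embd_head_v, n_head_kv }
    int64_t out_cols;  // n_embd_head_k * n_head, fed to wo
};
static kqv_shapes make_kqv_shapes(int64_t n_embd_head_k, int64_t n_embd_head_v,
                                  int64_t n_head, int64_t n_head_kv, int64_t n_kv) {
    return { { n_embd_head_k, n_kv, n_head_kv },
             { n_kv, n_embd_head_v, n_head_kv },
             n_embd_head_k*n_head };
}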
 
4372
  const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
4373
  const int64_t n_head;
4374
  const int64_t n_head_kv;
4375
+ const int64_t n_embd_head_k;
4376
+ const int64_t n_embd_k_gqa;
4377
+ const int64_t n_embd_head_v;
4378
+ const int64_t n_embd_v_gqa;
4379
  const int64_t n_expert;
4380
  const int64_t n_expert_used;
4381
 
 
4417
  n_ctx (cparams.n_ctx),
4418
  n_head (hparams.n_head),
4419
  n_head_kv (hparams.n_head_kv),
4420
+ n_embd_head_k (hparams.n_embd_head_k),
4421
+ n_embd_k_gqa (hparams.n_embd_k_gqa()),
4422
+ n_embd_head_v (hparams.n_embd_head_v),
4423
+ n_embd_v_gqa (hparams.n_embd_v_gqa()),
4424
  n_expert (hparams.n_expert),
4425
  n_expert_used (hparams.n_expert_used),
4426
  freq_base (cparams.rope_freq_base),
 
4463
  struct ggml_cgraph * build_llama() {
4464
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4465
 
4466
+ const int64_t n_embd_head = hparams.n_embd_head_v;
4467
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4468
  GGML_ASSERT(n_embd_head == hparams.n_rot);
4469
 
4470
  struct ggml_tensor * cur;
 
4555
  model.layers[il].ffn_up, NULL,
4556
  model.layers[il].ffn_gate, NULL,
4557
  model.layers[il].ffn_down, NULL,
4558
+ NULL,
4559
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
4560
  cb(cur, "ffn_out", il);
4561
  } else {
 
4649
  struct ggml_cgraph * build_baichuan() {
4650
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4651
 
4652
+ const int64_t n_embd_head = hparams.n_embd_head_v;
4653
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4654
+
4655
  struct ggml_tensor * cur;
4656
  struct ggml_tensor * inpL;
4657
 
 
4738
  model.layers[il].ffn_up, NULL,
4739
  model.layers[il].ffn_gate, NULL,
4740
  model.layers[il].ffn_down, NULL,
4741
+ NULL,
4742
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
4743
  cb(cur, "ffn_out", il);
4744
  }
 
4769
  struct ggml_cgraph * build_falcon() {
4770
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4771
 
4772
+ const int64_t n_embd_head = hparams.n_embd_head_v;
4773
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
4774
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4775
+ GGML_ASSERT(n_embd_gqa == n_embd);
4776
+
4777
  struct ggml_tensor * cur;
4778
  struct ggml_tensor * inpL;
4779
 
 
4858
  model.layers[il].ffn_up, NULL,
4859
  NULL, NULL,
4860
  model.layers[il].ffn_down, NULL,
4861
+ NULL,
4862
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
4863
  cb(cur, "ffn_out", il);
4864
  }
 
4893
  struct ggml_cgraph * build_starcoder() {
4894
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4895
 
4896
+ const int64_t n_embd_head = hparams.n_embd_head_v;
4897
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
4898
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4899
+ GGML_ASSERT(n_embd_gqa == n_embd);
4900
+
4901
  struct ggml_tensor * cur;
4902
  struct ggml_tensor * pos;
4903
  struct ggml_tensor * inpL;
 
4968
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
4969
  NULL, NULL,
4970
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
4971
+ NULL,
4972
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
4973
  cb(cur, "ffn_out", il);
4974
  }
 
4994
  struct ggml_cgraph * build_persimmon() {
4995
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4996
 
4997
+ const int64_t n_embd_head = hparams.n_embd_head_v;
4998
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
4999
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5000
+ GGML_ASSERT(n_embd_gqa == n_embd);
5001
+
5002
+ const int64_t n_rot = n_embd_head_k / 2;
5003
 
5004
  struct ggml_tensor * cur;
5005
  struct ggml_tensor * inpL;
 
5178
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5179
  NULL, NULL,
5180
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5181
+ NULL,
5182
  LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
5183
  cb(cur, "ffn_out", il);
5184
  }
 
5208
  struct ggml_cgraph * build_refact() {
5209
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5210
 
5211
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5212
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5213
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5214
+ GGML_ASSERT(n_embd_gqa == n_embd);
5215
+
5216
  struct ggml_tensor * cur;
5217
  struct ggml_tensor * inpL;
5218
 
 
5270
  model.layers[il].ffn_up, NULL,
5271
  model.layers[il].ffn_gate, NULL,
5272
  model.layers[il].ffn_down, NULL,
5273
+ NULL,
5274
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5275
  cb(cur, "ffn_out", il);
5276
  }
 
5301
  struct ggml_cgraph * build_bloom() {
5302
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5303
 
5304
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5305
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5306
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5307
+ GGML_ASSERT(n_embd_gqa == n_embd);
5308
+
5309
  struct ggml_tensor * cur;
5310
  struct ggml_tensor * inpL;
5311
 
 
5371
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5372
  NULL, NULL,
5373
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5374
+ NULL,
5375
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5376
  cb(cur, "ffn_out", il);
5377
  }
 
5397
  struct ggml_cgraph * build_mpt() {
5398
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5399
 
5400
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5401
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5402
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5403
+ GGML_ASSERT(n_embd_gqa == n_embd);
5404
+
5405
  struct ggml_tensor * cur;
5406
  struct ggml_tensor * inpL;
5407
 
 
5462
  NULL,
5463
  LLM_NORM, cb, il);
5464
  cb(cur, "ffn_norm", il);
 
5465
  cur = llm_build_ffn(ctx0, cur,
5466
  model.layers[il].ffn_up, NULL,
5467
  NULL, NULL,
5468
  model.layers[il].ffn_down, NULL,
5469
+ model.layers[il].ffn_act,
5470
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5471
  cb(cur, "ffn_out", il);
5472
  }
 
5497
  struct ggml_cgraph * build_stablelm() {
5498
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);
5499
 
5500
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5501
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5502
+
5503
  struct ggml_tensor * cur;
5504
  struct ggml_tensor * inpL;
5505
 
 
5578
  model.layers[il].ffn_up, NULL,
5579
  model.layers[il].ffn_gate, NULL,
5580
  model.layers[il].ffn_down, NULL,
5581
+ NULL,
5582
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5583
  cb(cur, "ffn_out", il);
5584
  }
 
5610
  struct ggml_cgraph * build_qwen() {
5611
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5612
 
5613
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5614
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5615
+
5616
  struct ggml_tensor * cur;
5617
  struct ggml_tensor * inpL;
5618
 
 
5694
  model.layers[il].ffn_up, NULL,
5695
  model.layers[il].ffn_gate, NULL,
5696
  model.layers[il].ffn_down, NULL,
5697
+ NULL,
5698
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5699
  cb(cur, "ffn_out", il);
5700
  }
 
5724
  struct ggml_cgraph * build_phi2() {
5725
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5726
 
5727
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5728
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5729
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5730
+ GGML_ASSERT(n_embd_gqa == n_embd);
5731
+
5732
  struct ggml_tensor * cur;
5733
  struct ggml_tensor * attn_norm_output;
5734
  struct ggml_tensor * ffn_output;
 
5807
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5808
  NULL, NULL,
5809
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5810
+ NULL,
5811
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5812
  cb(ffn_output, "ffn_out", il);
5813
  }
 
5837
 
5838
  return gf;
5839
  }
5840
+
5841
+ struct ggml_cgraph * build_plamo() {
5842
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
5843
+
5844
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5845
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5846
+
5847
+ struct ggml_tensor * cur;
5848
+ struct ggml_tensor * inpL;
5849
+
5850
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5851
+ cb(inpL, "inp_embd", -1);
5852
+
5853
+ // inp_pos - contains the positions
5854
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5855
+ cb(inp_pos, "inp_pos", -1);
5856
+
5857
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5858
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5859
+ cb(KQ_mask, "KQ_mask", -1);
5860
+
5861
+ // shift the entire K-cache if needed
5862
+ if (do_rope_shift) {
5863
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
5864
+ }
5865
+
5866
+ for (int il = 0; il < n_layer; ++il) {
5867
+
5868
+ // norm
5869
+ cur = llm_build_norm(ctx0, inpL, hparams,
5870
+ model.layers[il].attn_norm, NULL,
5871
+ LLM_NORM_RMS, cb, il);
5872
+ cb(cur, "attn_norm", il);
5873
+
5874
+ struct ggml_tensor * attention_norm = cur;
5875
+
5876
+ // self-attention
5877
+ {
5878
+ // compute Q and K and RoPE them
5879
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
5880
+ cb(Qcur, "Qcur", il);
5881
+
5882
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
5883
+ cb(Kcur, "Kcur", il);
5884
+
5885
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
5886
+ cb(Vcur, "Vcur", il);
5887
+
5888
+ Qcur = ggml_rope_custom(
5889
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5890
+ n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
5891
+ ext_factor, attn_factor, beta_fast, beta_slow);
5892
+ cb(Qcur, "Qcur", il);
5893
+
5894
+ Kcur = ggml_rope_custom(
5895
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5896
+ n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
5897
+ ext_factor, attn_factor, beta_fast, beta_slow);
5898
+ cb(Kcur, "Kcur", il);
5899
+
5900
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5901
+
5902
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
5903
+ model.layers[il].wo, NULL,
5904
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5905
+ cb(cur, "kqv_out", il);
5906
+ }
5907
+ struct ggml_tensor * sa_out = cur;
5908
+
5909
+ cur = attention_norm;
5910
+
5911
+ // feed-forward network
5912
+ {
5913
+ cur = llm_build_ffn(ctx0, cur,
5914
+ model.layers[il].ffn_up, NULL,
5915
+ model.layers[il].ffn_gate, NULL,
5916
+ model.layers[il].ffn_down, NULL,
5917
+ NULL,
5918
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
5919
+ cb(cur, "ffn_out", il);
5920
+ }
5921
+
5922
+ cur = ggml_add(ctx0, cur, sa_out);
5923
+ cb(cur, "l_out", il);
5924
+
5925
+ cur = ggml_add(ctx0, cur, inpL);
5926
+ cb(cur, "l_out", il);
5927
+
5928
+ // input for next layer
5929
+ inpL = cur;
5930
+ }
5931
+
5932
+ cur = inpL;
5933
+
5934
+ cur = llm_build_norm(ctx0, cur, hparams,
5935
+ model.output_norm, NULL,
5936
+ LLM_NORM_RMS, cb, -1);
5937
+ cb(cur, "result_norm", -1);
5938
+
5939
+ // lm_head
5940
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5941
+ cb(cur, "result_output", -1);
5942
+
5943
+ ggml_build_forward_expand(gf, cur);
5944
+
5945
+ return gf;
5946
+ }
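One structural note on build_plamo: unlike build_llama, the block above is a parallel-residual design: attention and the FFN both consume the same RMS-normed input, and both results are added back onto the unnormalized layer input. A scalar toy model of that dataflow (purely illustrative; the function pointers stand in for the real graph operations):

// Sketch only: scalar stand-in for the PLaMo block wiring above.
static float plamo_block(float x,
                         float (*norm)(float), float (*attn)(float), float (*ffn)(float)) {
    const float h = norm(x);     // attn_norm, reused by both branches
    return x + attn(h) + ffn(h); // inpL + sa_out + ffn_out, as in the graph above
}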
5947
+
5948
+ struct ggml_cgraph * build_gpt2() {
5949
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5950
+
5951
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5952
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5953
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5954
+ GGML_ASSERT(n_embd_gqa == n_embd);
5955
+
5956
+ struct ggml_tensor * cur;
5957
+ struct ggml_tensor * pos;
5958
+ struct ggml_tensor * inpL;
5959
+
5960
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5961
+ cb(inpL, "inp_embd", -1);
5962
+
5963
+ // inp_pos - contains the positions
5964
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5965
+ cb(inp_pos, "inp_pos", -1);
5966
+
5967
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5968
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5969
+ cb(KQ_mask, "KQ_mask", -1);
5970
+
5971
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
5972
+ cb(pos, "pos_embd", -1);
5973
+
5974
+ inpL = ggml_add(ctx0, inpL, pos);
5975
+ cb(inpL, "inpL", -1);
5976
+
5977
+ for (int il = 0; il < n_layer; ++il) {
5978
+ cur = llm_build_norm(ctx0, inpL, hparams,
5979
+ model.layers[il].attn_norm,
5980
+ model.layers[il].attn_norm_b,
5981
+ LLM_NORM, cb, il);
5982
+ cb(cur, "attn_norm", il);
5983
+
5984
+ // self-attention
5985
+ {
5986
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5987
+ cb(cur, "wqkv", il);
5988
+
5989
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5990
+ cb(cur, "bqkv", il);
5991
+
5992
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5993
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5994
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5995
+
5996
+ cb(Qcur, "Qcur", il);
5997
+ cb(Kcur, "Kcur", il);
5998
+ cb(Vcur, "Vcur", il);
5999
+
6000
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6001
+
6002
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
6003
+
6004
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
6005
+ model.layers[il].wo, model.layers[il].bo,
6006
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6007
+ cb(cur, "kqv_out", il);
6008
+ }
6009
+
6010
+ // add the input
6011
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6012
+ cb(ffn_inp, "ffn_inp", il);
6013
+
6014
+ // FF
6015
+ {
6016
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6017
+ model.layers[il].ffn_norm,
6018
+ model.layers[il].ffn_norm_b,
6019
+ LLM_NORM, cb, il);
6020
+ cb(cur, "ffn_norm", il);
6021
+
6022
+ cur = llm_build_ffn(ctx0, cur,
6023
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
6024
+ NULL, NULL,
6025
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
6026
+ NULL,
6027
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
6028
+ cb(cur, "ffn_out", il);
6029
+ }
6030
+
6031
+ inpL = ggml_add(ctx0, cur, ffn_inp);
6032
+ cb(inpL, "l_out", il);
6033
+ }
6034
+
6035
+ cur = llm_build_norm(ctx0, inpL, hparams,
6036
+ model.output_norm,
6037
+ model.output_norm_b,
6038
+ LLM_NORM, cb, -1);
6039
+ cb(cur, "result_norm", -1);
6040
+
6041
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6042
+ cb(cur, "result_output", -1);
6043
+
6044
+ ggml_build_forward_expand(gf, cur);
6045
+
6046
+ return gf;
6047
+ }
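In build_gpt2 the token and learned position embeddings are summed up front, and a single fused wqkv matmul produces n_embd + 2*n_embd_gqa values per token, which are then sliced into Q, K and V by byte offset (for GPT-2, n_embd_gqa == n_embd). A sketch of those offsets, matching the three ggml_view_2d calls above:

#include <cstddef>
#include <cstdint>
// Sketch only: byte offset of each slice inside one row of the fused QKV output.
static size_t qkv_byte_offset(int which /*0=Q, 1=K, 2=V*/, int64_t n_embd, int64_t n_embd_gqa) {
    switch (which) {
        case 0:  return 0;                                            // Q starts the row
        case 1:  return sizeof(float)*(size_t) n_embd;                // K follows Q
        default: return sizeof(float)*(size_t)(n_embd + n_embd_gqa);  // V follows Q and K
    }
}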
6048
  };
6049
 
6050
  //
 
6200
  { "ffn_gate", OFFLOAD_FUNC },
6201
  { "ffn_gate_b", OFFLOAD_FUNC },
6202
  { "ffn_gate_par", OFFLOAD_FUNC },
6203
+ { "ffn_act", OFFLOAD_FUNC },
6204
  { "ffn_down", OFFLOAD_FUNC },
6205
  { "ffn_down_b", OFFLOAD_FUNC },
6206
  { "ffn_out", OFFLOAD_FUNC },
 
6556
  {
6557
  result = llm.build_phi2();
6558
  } break;
6559
+ case LLM_ARCH_PLAMO:
6560
+ {
6561
+ result = llm.build_plamo();
6562
+ } break;
6563
+ case LLM_ARCH_GPT2:
6564
+ {
6565
+ result = llm.build_gpt2();
6566
+ } break;
6567
  default:
6568
  GGML_ASSERT(false);
6569
  }
 
8030
  }
8031
  }
8032
 
8033
+ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
8034
  const int64_t t_start_sample_us = ggml_time_us();
8035
 
8036
  k = std::max(k, (int) min_keep);
 
8390
  }
8391
  }
8392
 
8393
+ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
8394
  GGML_ASSERT(ctx);
8395
 
8396
  auto N = float(llama_n_vocab(llama_get_model(ctx)));
 
9598
  return result;
9599
  }
9600
 
9601
+ int32_t llama_max_devices(void) {
9602
  return LLAMA_MAX_DEVICES;
9603
  }
9604
 
 
9740
  const ggml_type type_k = params.type_k;
9741
  const ggml_type type_v = params.type_v;
9742
 
9743
+ GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
9744
+ GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
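These two asserts gate quantized KV-cache types: a quantized ggml type packs ggml_blck_size(type) values per block, so a per-head cache row of n_embd_head_k (or n_embd_head_v) values must be a whole number of blocks. A small sketch of the same check:

#include <cstdint>
#include "ggml.h"
// Sketch only: whether a candidate K/V cache type can represent one head row exactly;
// e.g. n_embd_head = 128 fits Q8_0/Q4_0 (block size 32) as well as F16/F32 (block size 1).
static bool kv_cache_type_fits(enum ggml_type type, int64_t n_embd_head) {
    return n_embd_head % ggml_blck_size(type) == 0;
}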
9745
 
9746
  // reserve memory for context buffers
9747
  if (!hparams.vocab_only) {
 
9837
  ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
9838
  #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
9839
  if (model->n_gpu_layers > 0) {
9840
+ // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
9841
+ ggml_cuda_set_scratch_size(alloc_size + 64);
9842
  LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
9843
 
9844
  // calculate total VRAM usage
 
9909
  return model->vocab.type;
9910
  }
9911
 
9912
+ int32_t llama_n_vocab(const struct llama_model * model) {
9913
  return model->vocab.id_to_token.size();
9914
  }
9915
 
9916
+ int32_t llama_n_ctx_train(const struct llama_model * model) {
9917
  return model->hparams.n_ctx_train;
9918
  }
9919
 
9920
+ int32_t llama_n_embd(const struct llama_model * model) {
9921
  return model->hparams.n_embd;
9922
  }
9923
 
 
9925
  return model->hparams.rope_freq_scale_train;
9926
  }
9927
 
9928
+ int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
9929
  const auto & it = model->gguf_kv.find(key);
9930
  if (it == model->gguf_kv.end()) {
9931
  if (buf_size > 0) {
 
9936
  return snprintf(buf, buf_size, "%s", it->second.c_str());
9937
  }
9938
 
9939
+ int32_t llama_model_meta_count(const struct llama_model * model) {
9940
  return (int)model->gguf_kv.size();
9941
  }
9942
 
9943
+ int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
9944
  if (i < 0 || i >= (int)model->gguf_kv.size()) {
9945
  if (buf_size > 0) {
9946
  buf[0] = '\0';
 
9952
  return snprintf(buf, buf_size, "%s", it->first.c_str());
9953
  }
9954
 
9955
+ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
9956
  if (i < 0 || i >= (int)model->gguf_kv.size()) {
9957
  if (buf_size > 0) {
9958
  buf[0] = '\0';
 
9964
  return snprintf(buf, buf_size, "%s", it->second.c_str());
9965
  }
9966
 
9967
+ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
9968
+ return snprintf(buf, buf_size, "%s %s%s %s",
9969
  llama_model_arch_name(model->arch).c_str(),
9970
+ model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
9971
  llama_model_type_name(model->type),
9972
  llama_model_ftype_name(model->ftype).c_str());
9973
  }
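A hedged usage sketch of the description string built above: for a mixture-of-experts model with n_expert == 8, the "%s %s%s %s" format yields something like "llama 8x7B mostly Q4_K - Medium" (the exact wording of the last two parts comes from llama_model_type_name and llama_model_ftype_name), and dense models get an empty expert prefix.

#include <cstdio>
#include "llama.h"
// Sketch only: printing the model description; `model` is any loaded llama_model pointer.
static void print_model_desc(const struct llama_model * model) {
    char desc[128];
    llama_model_desc(model, desc, sizeof(desc));
    printf("%s\n", desc);
}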
 
9992
  return ggml_get_tensor(model->ctx, name);
9993
  }
9994
 
9995
+ uint32_t llama_model_quantize(
9996
  const char * fname_inp,
9997
  const char * fname_out,
9998
  const llama_model_quantize_params * params) {
 
10005
  }
10006
  }
10007
 
10008
+ int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
10009
  try {
10010
  return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
10011
  } catch (const std::exception & err) {
 
10014
  }
10015
  }
10016
 
10017
+ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
10018
  try {
10019
  return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
10020
  } catch (const std::exception & err) {
 
10112
  }
10113
  }
10114
 
10115
+ int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
10116
  int result = 0;
10117
 
10118
  for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
 
10122
  return result;
10123
  }
10124
 
10125
+ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
10126
  return ctx->kv_self.used;
10127
  }
10128
 
 
10286
  const auto & hparams = ctx->model.hparams;
10287
  const auto & cparams = ctx->cparams;
10288
 
10289
+ const auto n_layer = hparams.n_layer;
10290
+ const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
10291
+ const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
10292
+ const auto n_ctx = cparams.n_ctx;
10293
 
10294
  const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
10295
  const uint32_t kv_head = kv_self.head;
 
10311
  std::vector<struct ggml_tensor *> vout2d(n_layer);
10312
 
10313
  for (int il = 0; il < (int) n_layer; ++il) {
10314
+ kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
10315
+ vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
10316
 
10317
  ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
10318
+ n_embd_k_gqa, kv_head,
10319
+ elt_size*n_embd_k_gqa, 0);
10320
 
10321
  ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
10322
+ kv_head, n_embd_v_gqa,
10323
  elt_size*n_ctx, 0);
10324
 
10325
  ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
 
10426
  const auto & hparams = ctx->model.hparams;
10427
  const auto & cparams = ctx->cparams;
10428
 
10429
+ const int n_layer = hparams.n_layer;
10430
+ const int n_embd_k_gqa = hparams.n_embd_k_gqa();
10431
+ const int n_embd_v_gqa = hparams.n_embd_v_gqa();
10432
+ const int n_ctx = cparams.n_ctx;
10433
 
10434
  size_t kv_buf_size;
10435
  uint32_t kv_head;
 
10453
  std::vector<struct ggml_tensor *> vin2d(n_layer);
10454
 
10455
  for (int il = 0; il < n_layer; ++il) {
10456
+ kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
10457
+ vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
10458
 
10459
  ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
10460
+ n_embd_k_gqa, kv_head,
10461
+ elt_size*n_embd_k_gqa, 0);
10462
 
10463
  ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
10464
+ kv_head, n_embd_v_gqa,
10465
  elt_size*n_ctx, 0);
10466
 
10467
  ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
 
10604
  struct llama_context * ctx,
10605
  llama_token * tokens,
10606
  int32_t n_tokens,
10607
+ int32_t n_past) {
10608
  llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
10609
 
10610
  const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
 
10619
  struct llama_context * ctx,
10620
  float * embd,
10621
  int32_t n_tokens,
10622
+ int32_t n_past) {
10623
  llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
10624
 
10625
  llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
 
10690
  if (batch.logits) free(batch.logits);
10691
  }
10692
 
10693
+ int32_t llama_decode(
10694
  struct llama_context * ctx,
10695
  struct llama_batch batch) {
10696
  const int ret = llama_decode_internal(*ctx, batch);
 
10738
  return model->vocab.linefeed_id;
10739
  }
10740
 
10741
+ int32_t llama_add_bos_token(const struct llama_model * model) {
10742
  return model->vocab.special_add_bos;
10743
  }
10744
 
10745
+ int32_t llama_add_eos_token(const struct llama_model * model) {
10746
  return model->vocab.special_add_eos;
10747
  }
10748
 
 
10762
  return model->vocab.special_eot_id;
10763
  }
10764
 
10765
+ int32_t llama_tokenize(
10766
  const struct llama_model * model,
10767
  const char * text,
10768
+ int32_t text_len,
10769
  llama_token * tokens,
10770
+ int32_t n_max_tokens,
10771
  bool add_bos,
10772
  bool special) {
10773
  auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
 
10795
  }
10796
 
10797
  // does not write null-terminator to buf
10798
+ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
10799
  if (0 <= token && token < llama_n_vocab(model)) {
10800
  switch (llama_vocab_get_type(model->vocab)) {
10801
  case LLAMA_VOCAB_TYPE_SPM: {
 
10803
  std::string result = model->vocab.id_to_token[token].text;
10804
  llama_unescape_whitespace(result);
10805
  if (length < (int) result.length()) {
10806
+ return -(int) result.length();
10807
  }
10808
  memcpy(buf, result.c_str(), result.length());
10809
  return result.length();
 
10833
  std::string result = model->vocab.id_to_token[token].text;
10834
  result = llama_decode_text(result);
10835
  if (length < (int) result.length()) {
10836
+ return -(int) result.length();
10837
  }
10838
  memcpy(buf, result.c_str(), result.length());
10839
  return result.length();
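Both vocab branches above now return the negated required length when the caller's buffer is too small, so callers can resize and retry. A hedged usage sketch against the public API:

#include <string>
#include "llama.h"
// Sketch only: convert one token to text, growing the buffer on a negative return.
static std::string token_to_piece(const struct llama_model * model, llama_token token) {
    std::string piece(8, '\0');
    int32_t n = llama_token_to_piece(model, token, &piece[0], (int32_t) piece.size());
    if (n < 0) {
        piece.resize((size_t) -n);
        n = llama_token_to_piece(model, token, &piece[0], (int32_t) piece.size());
    }
    piece.resize(n < 0 ? 0 : (size_t) n);
    return piece;
}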
 
10896
 
10897
  s = "";
10898
  s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
10899
+ s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
10900
  s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
10901
  s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
10902
  s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
examples/talk-llama/llama.h CHANGED
@@ -226,7 +226,7 @@ extern "C" {
226
 
227
  // model quantization parameters
228
  typedef struct llama_model_quantize_params {
229
- int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
230
  enum llama_ftype ftype; // quantize to this llama_ftype
231
  bool allow_requantize; // allow quantizing non-f32/f16 tensors
232
  bool quantize_output_tensor; // quantize output.weight
@@ -310,21 +310,20 @@ extern "C" {
310
 
311
  LLAMA_API int64_t llama_time_us(void);
312
 
313
- LLAMA_API int llama_max_devices (void);
314
  LLAMA_API bool llama_mmap_supported (void);
315
  LLAMA_API bool llama_mlock_supported(void);
316
 
317
  LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
318
 
319
- // TODO: become more consistent with returned int types across the API
320
  LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
321
  LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
322
 
323
  LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
324
 
325
- LLAMA_API int llama_n_vocab (const struct llama_model * model);
326
- LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
327
- LLAMA_API int llama_n_embd (const struct llama_model * model);
328
 
329
  // Get the model's RoPE frequency scaling factor
330
  LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -335,19 +334,19 @@ extern "C" {
335
  // - GGUF array values are not supported by these functions
336
 
337
  // Get metadata value as a string by key name
338
- LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
339
 
340
  // Get the number of metadata key/value pairs
341
- LLAMA_API int llama_model_meta_count(const struct llama_model * model);
342
 
343
  // Get metadata key name by index
344
- LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
345
 
346
  // Get metadata value as a string by index
347
- LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
348
 
349
  // Get a string describing the model type
350
- LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
351
 
352
  // Returns the total size of all the tensors in the model in bytes
353
  LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
@@ -359,7 +358,7 @@ extern "C" {
359
  LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
360
 
361
  // Returns 0 on success
362
- LLAMA_API int llama_model_quantize(
363
  const char * fname_inp,
364
  const char * fname_out,
365
  const llama_model_quantize_params * params);
@@ -370,20 +369,20 @@ extern "C" {
370
  // The model needs to be reloaded before applying a new adapter, otherwise the adapter
371
  // will be applied on top of the previous one
372
  // Returns 0 on success
373
- LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
374
  struct llama_context * ctx,
375
  const char * path_lora,
376
  float scale,
377
  const char * path_base_model,
378
- int n_threads),
379
  "use llama_model_apply_lora_from_file instead");
380
 
381
- LLAMA_API int llama_model_apply_lora_from_file(
382
  const struct llama_model * model,
383
  const char * path_lora,
384
  float scale,
385
  const char * path_base_model,
386
- int n_threads);
387
 
388
  //
389
  // KV cache
@@ -439,10 +438,10 @@ extern "C" {
439
 
440
  // Returns the number of tokens in the KV cache (slow, use only for debug)
441
  // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
442
- LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
443
 
444
  // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
445
- LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
446
 
447
  // Clear the KV cache
448
  LLAMA_API void llama_kv_cache_clear(
@@ -533,7 +532,7 @@ extern "C" {
533
  struct llama_context * ctx,
534
  llama_token * tokens,
535
  int32_t n_tokens,
536
- int n_past),
537
  "use llama_decode() instead");
538
 
539
  // Same as llama_eval, but use float matrix input directly.
@@ -542,7 +541,7 @@ extern "C" {
542
  struct llama_context * ctx,
543
  float * embd,
544
  int32_t n_tokens,
545
- int n_past),
546
  "use llama_decode() instead");
547
 
548
  // Return batch for single sequence of tokens starting at pos_0
@@ -574,7 +573,7 @@ extern "C" {
574
  // 0 - success
575
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
576
  // < 0 - error
577
- LLAMA_API int llama_decode(
578
  struct llama_context * ctx,
579
  struct llama_batch batch);
580
 
@@ -614,10 +613,10 @@ extern "C" {
614
  LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
615
 
616
  // Returns -1 if unknown, 1 for true or 0 for false.
617
- LLAMA_API int llama_add_bos_token(const struct llama_model * model);
618
 
619
  // Returns -1 if unknown, 1 for true or 0 for false.
620
- LLAMA_API int llama_add_eos_token(const struct llama_model * model);
621
 
622
  // codellama infill tokens
623
  LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
@@ -635,12 +634,12 @@ extern "C" {
635
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
636
  /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
637
  /// Does not insert a leading space.
638
- LLAMA_API int llama_tokenize(
639
  const struct llama_model * model,
640
  const char * text,
641
- int text_len,
642
  llama_token * tokens,
643
- int n_max_tokens,
644
  bool add_bos,
645
  bool special);
646
 
@@ -648,11 +647,11 @@ extern "C" {
648
  // Uses the vocabulary in the provided context.
649
  // Does not write null terminator to the buffer.
650
  // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
651
- LLAMA_API int llama_token_to_piece(
652
  const struct llama_model * model,
653
  llama_token token,
654
  char * buf,
655
- int length);
656
 
657
  //
658
  // Grammar
@@ -704,7 +703,7 @@ extern "C" {
704
  LLAMA_API void llama_sample_top_k(
705
  struct llama_context * ctx,
706
  llama_token_data_array * candidates,
707
- int k,
708
  size_t min_keep);
709
 
710
  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@@ -763,7 +762,7 @@ extern "C" {
763
  llama_token_data_array * candidates,
764
  float tau,
765
  float eta,
766
- int m,
767
  float * mu);
768
 
769
  /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
@@ -836,8 +835,8 @@ extern "C" {
836
  llama_beam_search_callback_fn_t callback,
837
  void * callback_data,
838
  size_t n_beams,
839
- int n_past,
840
- int n_predict);
841
 
842
  // Performance information
843
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
 
226
 
227
  // model quantization parameters
228
  typedef struct llama_model_quantize_params {
229
+ int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
230
  enum llama_ftype ftype; // quantize to this llama_ftype
231
  bool allow_requantize; // allow quantizing non-f32/f16 tensors
232
  bool quantize_output_tensor; // quantize output.weight
 
310
 
311
  LLAMA_API int64_t llama_time_us(void);
312
 
313
+ LLAMA_API int32_t llama_max_devices(void);
314
  LLAMA_API bool llama_mmap_supported (void);
315
  LLAMA_API bool llama_mlock_supported(void);
316
 
317
  LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
318
 
 
319
  LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
320
  LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
321
 
322
  LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
323
 
324
+ LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
325
+ LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
326
+ LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
327
 
328
  // Get the model's RoPE frequency scaling factor
329
  LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
334
  // - GGUF array values are not supported by these functions
335
 
336
  // Get metadata value as a string by key name
337
+ LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
338
 
339
  // Get the number of metadata key/value pairs
340
+ LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
341
 
342
  // Get metadata key name by index
343
+ LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
344
 
345
  // Get metadata value as a string by index
346
+ LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
347
 
348
  // Get a string describing the model type
349
+ LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
350
 
351
  // Returns the total size of all the tensors in the model in bytes
352
  LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
 
358
  LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
359
 
360
  // Returns 0 on success
361
+ LLAMA_API uint32_t llama_model_quantize(
362
  const char * fname_inp,
363
  const char * fname_out,
364
  const llama_model_quantize_params * params);
 
369
  // The model needs to be reloaded before applying a new adapter, otherwise the adapter
370
  // will be applied on top of the previous one
371
  // Returns 0 on success
372
+ LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
373
  struct llama_context * ctx,
374
  const char * path_lora,
375
  float scale,
376
  const char * path_base_model,
377
+ int32_t n_threads),
378
  "use llama_model_apply_lora_from_file instead");
379
 
380
+ LLAMA_API int32_t llama_model_apply_lora_from_file(
381
  const struct llama_model * model,
382
  const char * path_lora,
383
  float scale,
384
  const char * path_base_model,
385
+ int32_t n_threads);
386
 
387
  //
388
  // KV cache
 
438
 
439
  // Returns the number of tokens in the KV cache (slow, use only for debug)
440
  // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
441
+ LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
442
 
443
  // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
444
+ LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
445
 
446
  // Clear the KV cache
447
  LLAMA_API void llama_kv_cache_clear(
 
532
  struct llama_context * ctx,
533
  llama_token * tokens,
534
  int32_t n_tokens,
535
+ int32_t n_past),
536
  "use llama_decode() instead");
537
 
538
  // Same as llama_eval, but use float matrix input directly.
 
541
  struct llama_context * ctx,
542
  float * embd,
543
  int32_t n_tokens,
544
+ int32_t n_past),
545
  "use llama_decode() instead");
546
 
547
  // Return batch for single sequence of tokens starting at pos_0
 
573
  // 0 - success
574
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
575
  // < 0 - error
576
+ LLAMA_API int32_t llama_decode(
577
  struct llama_context * ctx,
578
  struct llama_batch batch);
579
 
 
613
  LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
614
 
615
  // Returns -1 if unknown, 1 for true or 0 for false.
616
+ LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
617
 
618
  // Returns -1 if unknown, 1 for true or 0 for false.
619
+ LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
620
 
621
  // codellama infill tokens
622
  LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
 
634
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
635
  /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
636
  /// Does not insert a leading space.
637
+ LLAMA_API int32_t llama_tokenize(
638
  const struct llama_model * model,
639
  const char * text,
640
+ int32_t text_len,
641
  llama_token * tokens,
642
+ int32_t n_max_tokens,
643
  bool add_bos,
644
  bool special);
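The "negative number on failure" convention documented above makes two-pass tokenization straightforward: call once with a guessed capacity, and if the return is negative, resize to -n and call again. A hedged usage sketch:

#include <string>
#include <vector>
#include "llama.h"
// Sketch only: tokenize a string with the int32_t signature above; the initial capacity
// is a heuristic, the retry handles any shortfall.
static std::vector<llama_token> tokenize(const struct llama_model * model,
                                         const std::string & text, bool add_bos) {
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));
    int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(), add_bos, /*special=*/false);
    if (n < 0) {
        tokens.resize((size_t) -n);
        n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                           tokens.data(), (int32_t) tokens.size(), add_bos, false);
    }
    tokens.resize(n < 0 ? 0 : (size_t) n);
    return tokens;
}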
645
 
 
647
  // Uses the vocabulary in the provided context.
648
  // Does not write null terminator to the buffer.
649
  // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
650
+ LLAMA_API int32_t llama_token_to_piece(
651
  const struct llama_model * model,
652
  llama_token token,
653
  char * buf,
654
+ int32_t length);
655
 
656
  //
657
  // Grammar
 
703
  LLAMA_API void llama_sample_top_k(
704
  struct llama_context * ctx,
705
  llama_token_data_array * candidates,
706
+ int32_t k,
707
  size_t min_keep);
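For reference, a hedged sampling sketch using the int32_t signature above together with other public calls from this header (llama_get_logits, llama_token_data, llama_sample_token_greedy), which are assumed unchanged by this sync:

#include <vector>
#include "llama.h"
// Sketch only: keep the k most likely candidates of the last evaluated token, then pick one.
static llama_token sample_top_k_greedy(struct llama_context * ctx, int32_t k) {
    const int32_t n_vocab = llama_n_vocab(llama_get_model(ctx));
    float * logits = llama_get_logits(ctx);

    std::vector<llama_token_data> data(n_vocab);
    for (int32_t id = 0; id < n_vocab; ++id) {
        data[id] = llama_token_data{ id, logits[id], 0.0f };
    }
    llama_token_data_array candidates = { data.data(), data.size(), false };

    llama_sample_top_k(ctx, &candidates, k, /*min_keep=*/1);
    return llama_sample_token_greedy(ctx, &candidates); // best of the surviving k
}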
708
 
709
  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
 
762
  llama_token_data_array * candidates,
763
  float tau,
764
  float eta,
765
+ int32_t m,
766
  float * mu);
767
 
768
  /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 
835
  llama_beam_search_callback_fn_t callback,
836
  void * callback_data,
837
  size_t n_beams,
838
+ int32_t n_past,
839
+ int32_t n_predict);
840
 
841
  // Performance information
842
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);