ggerganov committed
Commit e6d6e1d · unverified
1 Parent(s): 94800c5

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -15,6 +15,8 @@
15
  # include "ggml-vulkan.h"
16
  #elif defined(GGML_USE_SYCL)
17
  # include "ggml-sycl.h"
 
 
18
  #endif
19
 
20
  #ifdef GGML_USE_METAL
@@ -202,10 +204,12 @@ enum llm_arch {
202
  LLM_ARCH_PLAMO,
203
  LLM_ARCH_CODESHELL,
204
  LLM_ARCH_ORION,
 
 
205
  LLM_ARCH_UNKNOWN,
206
  };
207
 
208
- static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
209
  { LLM_ARCH_LLAMA, "llama" },
210
  { LLM_ARCH_FALCON, "falcon" },
211
  { LLM_ARCH_GPT2, "gpt2" },
@@ -224,6 +228,8 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
224
  { LLM_ARCH_PLAMO, "plamo" },
225
  { LLM_ARCH_CODESHELL, "codeshell" },
226
  { LLM_ARCH_ORION, "orion" },
 
 
227
  };
228
 
229
  enum llm_kv {
@@ -276,11 +282,12 @@ enum llm_kv {
276
  LLM_KV_TOKENIZER_PAD_ID,
277
  LLM_KV_TOKENIZER_ADD_BOS,
278
  LLM_KV_TOKENIZER_ADD_EOS,
 
279
  LLM_KV_TOKENIZER_HF_JSON,
280
  LLM_KV_TOKENIZER_RWKV,
281
  };
282
 
283
- static std::map<llm_kv, std::string> LLM_KV_NAMES = {
284
  { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
285
  { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
286
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -330,6 +337,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
330
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
331
  { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
332
  { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
 
333
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
334
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
335
  };
@@ -340,7 +348,7 @@ struct LLM_KV {
340
  llm_arch arch;
341
 
342
  std::string operator()(llm_kv kv) const {
343
- return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
344
  }
345
  };
346
 
@@ -667,7 +675,46 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
667
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
668
  },
669
  },
670
-
671
  {
672
  LLM_ARCH_UNKNOWN,
673
  {
@@ -725,13 +772,13 @@ struct LLM_TN {
725
  // gguf helpers
726
  //
727
 
728
- static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
729
  { LLAMA_ROPE_SCALING_NONE, "none" },
730
  { LLAMA_ROPE_SCALING_LINEAR, "linear" },
731
  { LLAMA_ROPE_SCALING_YARN, "yarn" },
732
  };
733
 
734
- static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
735
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
736
  if (kv.second == name) {
737
  return kv.first;
@@ -1158,10 +1205,10 @@ struct llama_mlock {
1158
  #ifdef __APPLE__
1159
  #define MLOCK_SUGGESTION \
1160
  "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
1161
- "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
1162
  #else
1163
  #define MLOCK_SUGGESTION \
1164
- "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
1165
  #endif
1166
 
1167
  bool raw_lock(const void * addr, size_t size) const {
@@ -1308,11 +1355,16 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
1308
  #elif defined(GGML_USE_CUBLAS)
1309
  buft = ggml_backend_cuda_buffer_type(gpu);
1310
  #elif defined(GGML_USE_VULKAN)
1311
- buft = ggml_backend_vk_buffer_type();
1312
  #elif defined(GGML_USE_SYCL)
1313
  buft = ggml_backend_sycl_buffer_type(gpu);
1314
  #elif defined(GGML_USE_CLBLAST)
1315
  buft = ggml_backend_opencl_buffer_type();
1316
  #endif
1317
 
1318
  if (buft == nullptr) {
@@ -1340,6 +1392,33 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1340
  GGML_UNUSED(tensor_split);
1341
  }
1342
 
1343
  //
1344
  // globals
1345
  //
@@ -1363,6 +1442,7 @@ enum e_model {
1363
  MODEL_UNKNOWN,
1364
  MODEL_0_5B,
1365
  MODEL_1B,
 
1366
  MODEL_3B,
1367
  MODEL_4B,
1368
  MODEL_7B,
@@ -1370,6 +1450,7 @@ enum e_model {
1370
  MODEL_13B,
1371
  MODEL_14B,
1372
  MODEL_15B,
 
1373
  MODEL_30B,
1374
  MODEL_34B,
1375
  MODEL_40B,
@@ -1387,6 +1468,7 @@ static const size_t GiB = 1024*MiB;
1387
 
1388
  struct llama_hparams {
1389
  bool vocab_only;
 
1390
  uint32_t n_vocab;
1391
  uint32_t n_ctx_train; // context size the model was trained on
1392
  uint32_t n_embd;
@@ -1406,8 +1488,7 @@ struct llama_hparams {
1406
  float rope_freq_base_train;
1407
  float rope_freq_scale_train;
1408
  uint32_t n_yarn_orig_ctx;
1409
- int8_t rope_scaling_type_train : 3;
1410
- bool rope_finetuned : 1;
1411
 
1412
  float f_clamp_kqv;
1413
  float f_max_alibi_bias;
@@ -1611,6 +1692,8 @@ struct llama_vocab {
1611
  id special_suffix_id = 32008;
1612
  id special_eot_id = 32010;
1613
 
 
 
1614
  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
1615
  GGML_ASSERT(token_left.find(' ') == std::string::npos);
1616
  GGML_ASSERT(token_left.find('\n') == std::string::npos);
@@ -1707,6 +1790,10 @@ struct llama_context {
1707
  ggml_backend_free(backend);
1708
  }
1709
 
1710
  ggml_backend_buffer_free(buf_input);
1711
  ggml_free(ctx_input);
1712
  }
@@ -2360,6 +2447,7 @@ struct llama_model_loader {
2360
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
2361
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2362
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
 
2363
  default:
2364
  {
2365
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2670,7 +2758,7 @@ struct llama_model_loader {
2670
  // load LLaMA models
2671
  //
2672
 
2673
- static std::string llama_model_arch_name(llm_arch arch) {
2674
  auto it = LLM_ARCH_NAMES.find(arch);
2675
  if (it == LLM_ARCH_NAMES.end()) {
2676
  return "unknown";
@@ -2705,9 +2793,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2705
  case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
2706
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
2707
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
2708
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
2709
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2710
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
 
2711
 
2712
  default: return "unknown, may not work";
2713
  }
@@ -2716,12 +2805,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2716
  static const char * llama_model_type_name(e_model type) {
2717
  switch (type) {
2718
  case MODEL_1B: return "1B";
 
2719
  case MODEL_3B: return "3B";
2720
  case MODEL_7B: return "7B";
2721
  case MODEL_8B: return "8B";
2722
  case MODEL_13B: return "13B";
2723
  case MODEL_14B: return "14B";
2724
  case MODEL_15B: return "15B";
 
2725
  case MODEL_30B: return "30B";
2726
  case MODEL_34B: return "34B";
2727
  case MODEL_40B: return "40B";
@@ -2734,6 +2825,14 @@ static const char * llama_model_type_name(e_model type) {
2734
  default: return "?B";
2735
  }
2736
  }
2737
 
2738
  static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
2739
  model.arch = ml.get_arch();
@@ -2846,6 +2945,15 @@ static void llm_load_hparams(
2846
  default: model.type = e_model::MODEL_UNKNOWN;
2847
  }
2848
  } break;
2849
  case LLM_ARCH_FALCON:
2850
  {
2851
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2997,6 +3105,15 @@ static void llm_load_hparams(
2997
  default: model.type = e_model::MODEL_UNKNOWN;
2998
  }
2999
  } break;
3000
  default: (void)0;
3001
  }
3002
 
@@ -3048,6 +3165,11 @@ static void llm_load_vocab(
3048
  vocab.special_unk_id = 0;
3049
  vocab.special_sep_id = -1;
3050
  vocab.special_pad_id = -1;
3051
  } else if (tokenizer_name == "gpt2") {
3052
  vocab.type = LLAMA_VOCAB_TYPE_BPE;
3053
 
@@ -3255,12 +3377,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3255
  const auto & hparams = model.hparams;
3256
  const auto & vocab = model.vocab;
3257
 
3258
- const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
3259
 
3260
  // hparams
3261
  LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
3262
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
3263
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
3264
  LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
3265
  LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
3266
  LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -3281,7 +3403,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3281
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
3282
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
3283
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
3284
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
3285
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
3286
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
3287
  LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
@@ -3347,22 +3469,18 @@ static bool llm_load_tensors(
3347
  model.buft_layer[i] = llama_default_buffer_type_cpu(true);
3348
  }
3349
 
3350
- #ifdef GGML_USE_CUBLAS
3351
  if (split_mode == LLAMA_SPLIT_LAYER) {
3352
  // calculate the split points
3353
- int device_count = ggml_backend_cuda_get_device_count();
3354
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
3355
- float splits[GGML_CUDA_MAX_DEVICES];
3356
  if (all_zero) {
3357
  // default split, by free memory
3358
  for (int i = 0; i < device_count; ++i) {
3359
- size_t total;
3360
- size_t free;
3361
- ggml_backend_cuda_get_device_memory(i, &total, &free);
3362
- splits[i] = free;
3363
  }
3364
  } else {
3365
- std::copy(tensor_split, tensor_split + device_count, splits);
3366
  }
3367
 
3368
  // sum and normalize the splits to get the split points
@@ -3378,19 +3496,17 @@ static bool llm_load_tensors(
3378
  // assign the repeating layers to the devices according to the splits
3379
  int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
3380
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
3381
- int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
3382
  model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
3383
  }
3384
  // assign the output layer
3385
  if (n_gpu_layers > n_layer) {
3386
- int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
3387
  model.buft_output = llama_default_buffer_type_offload(layer_gpu);
3388
  } else {
3389
  model.buft_output = llama_default_buffer_type_cpu(true);
3390
  }
3391
- } else
3392
- #endif
3393
- {
3394
  ggml_backend_buffer_type_t split_buft;
3395
  if (split_mode == LLAMA_SPLIT_ROW) {
3396
  split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
@@ -3469,13 +3585,16 @@ static bool llm_load_tensors(
3469
  switch (model.arch) {
3470
  case LLM_ARCH_LLAMA:
3471
  case LLM_ARCH_REFACT:
 
3472
  {
3473
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3474
 
3475
  // output
3476
  {
3477
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3478
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
 
 
3479
  }
3480
 
3481
  for (int i = 0; i < n_layer; ++i) {
@@ -4009,8 +4128,35 @@ static bool llm_load_tensors(
4009
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4010
  }
4011
  } break;
4012
 
4013
 
4014
  default:
4015
  throw std::runtime_error("unknown architecture");
4016
  }
@@ -4063,8 +4209,7 @@ static bool llm_load_tensors(
4063
  ctx_bufs.emplace_back(ctx, buf);
4064
  }
4065
 
4066
- // print memory requirements
4067
- {
4068
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
4069
 
4070
  LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -4076,10 +4221,11 @@ static bool llm_load_tensors(
4076
  const int max_offloadable_layers = hparams.n_layer + 1;
4077
 
4078
  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
 
4079
 
4080
- for (ggml_backend_buffer_t buf : model.bufs) {
4081
- LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
4082
- }
4083
  }
4084
 
4085
  // populate tensors_by_name
@@ -4107,7 +4253,7 @@ static bool llm_load_tensors(
4107
  }
4108
 
4109
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
4110
- static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
4111
  try {
4112
  llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
4113
 
@@ -4128,6 +4274,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
4128
  return 0;
4129
  }
4130
 
4131
  if (!llm_load_tensors(
4132
  ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
4133
  params.progress_callback, params.progress_callback_user_data
@@ -4641,126 +4803,6 @@ struct llm_build_context {
4641
  ctx0 = nullptr;
4642
  }
4643
  }
4644
- struct ggml_cgraph * build_orion() {
4645
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4646
-
4647
- const int64_t n_embd_head = hparams.n_embd_head_v;
4648
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4649
- GGML_ASSERT(n_embd_head == hparams.n_rot);
4650
-
4651
- struct ggml_tensor * cur;
4652
- struct ggml_tensor * inpL;
4653
-
4654
- inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
4655
- cb(inpL, "inp_embd", -1);
4656
-
4657
- // inp_pos - contains the positions
4658
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
4659
- cb(inp_pos, "inp_pos", -1);
4660
-
4661
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4662
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
4663
- cb(KQ_mask, "KQ_mask", -1);
4664
-
4665
- // shift the entire K-cache if needed
4666
- if (do_rope_shift) {
4667
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4668
- }
4669
-
4670
- for (int il = 0; il < n_layer; ++il) {
4671
- struct ggml_tensor * inpSA = inpL;
4672
-
4673
- // norm
4674
- cur = llm_build_norm(ctx0, inpL, hparams,
4675
- model.layers[il].attn_norm, model.layers[il].attn_norm_b,
4676
- LLM_NORM, cb, il);
4677
- cb(cur, "attn_norm", il);
4678
-
4679
- // self-attention
4680
- {
4681
- // compute Q and K and RoPE them
4682
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
4683
- cb(Qcur, "Qcur", il);
4684
- // if (model.layers[il].bq) {
4685
- // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
4686
- // cb(Qcur, "Qcur", il);
4687
- // }
4688
-
4689
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
4690
- cb(Kcur, "Kcur", il);
4691
- // if (model.layers[il].bk) {
4692
- // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
4693
- // cb(Kcur, "Kcur", il);
4694
- // }
4695
-
4696
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
4697
- cb(Vcur, "Vcur", il);
4698
- // if (model.layers[il].bv) {
4699
- // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
4700
- // cb(Vcur, "Vcur", il);
4701
- // }
4702
-
4703
- Qcur = ggml_rope_custom(
4704
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
4705
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
4706
- ext_factor, attn_factor, beta_fast, beta_slow
4707
- );
4708
- cb(Qcur, "Qcur", il);
4709
-
4710
- Kcur = ggml_rope_custom(
4711
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
4712
- hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
4713
- ext_factor, attn_factor, beta_fast, beta_slow
4714
- );
4715
- cb(Kcur, "Kcur", il);
4716
-
4717
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
4718
- model.layers[il].wo, NULL,
4719
- Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
4720
- cb(cur, "kqv_out", il);
4721
- }
4722
-
4723
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
4724
- cb(ffn_inp, "ffn_inp", il);
4725
-
4726
- // feed-forward network
4727
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
4728
- model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
4729
- LLM_NORM, cb, il);
4730
- cb(cur, "ffn_norm", il);
4731
-
4732
- cur = llm_build_ffn(ctx0, cur,
4733
- model.layers[il].ffn_up, NULL,
4734
- model.layers[il].ffn_gate, NULL,
4735
- model.layers[il].ffn_down, NULL,
4736
- NULL,
4737
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
4738
- cb(cur, "ffn_out", il);
4739
-
4740
- cur = ggml_add(ctx0, cur, ffn_inp);
4741
- cb(cur, "l_out", il);
4742
-
4743
- // input for next layer
4744
- inpL = cur;
4745
- }
4746
-
4747
- cur = inpL;
4748
-
4749
- cur = llm_build_norm(ctx0, cur, hparams,
4750
- model.output_norm, model.output_norm_b,
4751
- LLM_NORM, cb, -1);
4752
- cb(cur, "result_norm", -1);
4753
-
4754
- // lm_head
4755
- cur = ggml_mul_mat(ctx0, model.output, cur);
4756
- cb(cur, "result_output", -1);
4757
-
4758
- ggml_build_forward_expand(gf, cur);
4759
-
4760
- return gf;
4761
- }
4762
-
4763
-
4764
 
4765
  struct ggml_cgraph * build_llama() {
4766
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -6564,23 +6606,409 @@ struct llm_build_context {
6564
 
6565
  return gf;
6566
  }
6567
- };
6568
 
6569
- static struct ggml_cgraph * llama_build_graph(
6570
- llama_context & lctx,
6571
- const llama_batch & batch) {
6572
- const auto & model = lctx.model;
6573
 
6574
- // check if we should build the worst-case graph (for memory measurement)
6575
- const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
 
6576
 
6577
- // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
6578
- llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
6579
- if (il >= 0) {
6580
- ggml_format_name(cur, "%s-%d", name, il);
6581
- } else {
6582
- ggml_set_name(cur, name);
6583
- }
6584
 
6585
  if (!lctx.cparams.offload_kqv) {
6586
  if (strcmp(name, "kqv_merged_cont") == 0) {
@@ -6722,6 +7150,14 @@ static struct ggml_cgraph * llama_build_graph(
6722
  {
6723
  result = llm.build_orion();
6724
  } break;
 
6725
  default:
6726
  GGML_ASSERT(false);
6727
  }
@@ -6849,15 +7285,12 @@ static int llama_decode_internal(
6849
  // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
6850
  // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
6851
  // with the BLAS calls. need a better solution
6852
- if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
 
 
6853
  n_threads = std::min(4, n_threads);
6854
  }
6855
 
6856
- const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
6857
- if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
6858
- n_threads = 1;
6859
- }
6860
-
6861
  #ifdef GGML_USE_MPI
6862
  const int64_t n_layer = hparams.n_layer;
6863
  ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -7669,7 +8102,9 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
7669
  //
7670
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
7671
  if (&fragment == &fragment_buffer.front()) {
7672
- raw_text = " " + raw_text; // prefix with space if the first token is not special
 
 
7673
  }
7674
 
7675
  #ifdef PRETOKENIZERDEBUG
@@ -8155,6 +8590,10 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
8155
 
8156
  const int64_t t_start_sample_us = ggml_time_us();
8157
 
8158
  k = std::max(k, (int) min_keep);
8159
  k = std::min(k, (int) candidates->size);
8160
 
@@ -9214,6 +9653,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9214
  else if (new_type != GGML_TYPE_Q8_0) {
9215
  new_type = GGML_TYPE_Q6_K;
9216
  }
9217
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
9218
  if (name.find("attn_v.weight") != std::string::npos) {
9219
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
@@ -9224,7 +9670,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9224
  if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
9225
  ++qs.i_ffn_down;
9226
  }
9227
- else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
9228
  } else if (name.find("attn_v.weight") != std::string::npos) {
9229
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
9230
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -9232,6 +9677,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9232
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
9233
  new_type = GGML_TYPE_Q4_K;
9234
  }
 
 
 
9235
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
9236
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
9237
  }
@@ -9269,6 +9717,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9269
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
9270
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
9271
  }
 
 
 
9272
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
9273
  new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
9274
  : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
@@ -9300,13 +9751,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9300
  } else if (name.find("attn_output.weight") != std::string::npos) {
9301
  if (arch != LLM_ARCH_FALCON) {
9302
  if (qs.model.hparams.n_expert == 8) {
9303
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
9304
  ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
9305
  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
9306
  new_type = GGML_TYPE_Q5_K;
9307
  }
9308
  } else {
9309
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
 
9310
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
9311
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
9312
  }
@@ -9349,7 +9801,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9349
  bool convert_incompatible_tensor = false;
9350
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
9351
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
9352
- new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
 
9353
  int nx = tensor->ne[0];
9354
  int ny = tensor->ne[1];
9355
  if (nx % QK_K != 0) {
@@ -9363,6 +9816,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9363
  switch (new_type) {
9364
  case GGML_TYPE_IQ2_XXS:
9365
  case GGML_TYPE_IQ2_XS:
 
9366
  case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
9367
  case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
9368
  case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
@@ -9404,6 +9858,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9404
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
9405
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
9406
  case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
 
9407
 
9408
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
9409
  }
@@ -10054,18 +10509,47 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
10054
  return result;
10055
  }
10056
 
10057
- int32_t llama_max_devices(void) {
10058
- return LLAMA_MAX_DEVICES;
  }
10060
 
10061
- bool llama_mmap_supported(void) {
10062
  return llama_mmap::SUPPORTED;
10063
  }
10064
 
10065
- bool llama_mlock_supported(void) {
10066
  return llama_mlock::SUPPORTED;
10067
  }
10068
 
10069
  void llama_backend_init(bool numa) {
10070
  ggml_time_init();
10071
 
@@ -10097,8 +10581,8 @@ int64_t llama_time_us(void) {
10097
  }
10098
 
10099
  struct llama_model * llama_load_model_from_file(
10100
- const char * path_model,
10101
- struct llama_model_params params) {
10102
  ggml_time_init();
10103
 
10104
  llama_model * model = new llama_model;
@@ -10241,19 +10725,31 @@ struct llama_context * llama_new_context_with_model(
10241
  }
10242
  #elif defined(GGML_USE_VULKAN)
10243
  if (model->n_gpu_layers > 0) {
10244
- ggml_backend_t backend = ggml_backend_vk_init();
  if (backend == nullptr) {
10246
- LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
10247
  llama_free(ctx);
10248
  return nullptr;
10249
  }
10250
  ctx->backends.push_back(backend);
10251
  }
10252
- #elif defined(GGML_USE_SYCL)
10253
  if (model->n_gpu_layers > 0) {
10254
- ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
10255
  if (backend == nullptr) {
10256
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
10257
  llama_free(ctx);
10258
  return nullptr;
10259
  }
@@ -10464,7 +10960,7 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
10464
 
10465
  int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
10466
  return snprintf(buf, buf_size, "%s %s %s",
10467
- llama_model_arch_name(model->arch).c_str(),
10468
  llama_model_type_name(model->type),
10469
  llama_model_ftype_name(model->ftype).c_str());
10470
  }
@@ -11106,22 +11602,24 @@ struct llama_batch llama_batch_get_one(
11106
  };
11107
  }
11108
 
11109
- struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
11110
  llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
11111
 
11112
  if (embd) {
11113
- batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
11114
  } else {
11115
- batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
11116
  }
11117
 
11118
- batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
11119
- batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
11120
- batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
11121
- for (int i = 0; i < n_tokens; ++i) {
11122
  batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
11123
  }
11124
- batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
 
 
11125
 
11126
  return batch;
11127
  }
@@ -11132,7 +11630,7 @@ void llama_batch_free(struct llama_batch batch) {
11132
  if (batch.pos) free(batch.pos);
11133
  if (batch.n_seq_id) free(batch.n_seq_id);
11134
  if (batch.seq_id) {
11135
- for (int i = 0; i < batch.n_tokens; ++i) {
11136
  free(batch.seq_id[i]);
11137
  }
11138
  free(batch.seq_id);
 
15
  # include "ggml-vulkan.h"
16
  #elif defined(GGML_USE_SYCL)
17
  # include "ggml-sycl.h"
18
+ #elif defined(GGML_USE_KOMPUTE)
19
+ # include "ggml-kompute.h"
20
  #endif
21
 
22
  #ifdef GGML_USE_METAL
 
204
  LLM_ARCH_PLAMO,
205
  LLM_ARCH_CODESHELL,
206
  LLM_ARCH_ORION,
207
+ LLM_ARCH_INTERNLM2,
208
+ LLM_ARCH_MINICPM,
209
  LLM_ARCH_UNKNOWN,
210
  };
211
 
212
+ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
213
  { LLM_ARCH_LLAMA, "llama" },
214
  { LLM_ARCH_FALCON, "falcon" },
215
  { LLM_ARCH_GPT2, "gpt2" },
 
228
  { LLM_ARCH_PLAMO, "plamo" },
229
  { LLM_ARCH_CODESHELL, "codeshell" },
230
  { LLM_ARCH_ORION, "orion" },
231
+ { LLM_ARCH_INTERNLM2, "internlm2" },
232
+ { LLM_ARCH_MINICPM, "minicpm" },
233
  };
234
 
235
  enum llm_kv {
 
282
  LLM_KV_TOKENIZER_PAD_ID,
283
  LLM_KV_TOKENIZER_ADD_BOS,
284
  LLM_KV_TOKENIZER_ADD_EOS,
285
+ LLM_KV_TOKENIZER_ADD_PREFIX,
286
  LLM_KV_TOKENIZER_HF_JSON,
287
  LLM_KV_TOKENIZER_RWKV,
288
  };
289
 
290
+ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
291
  { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
292
  { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
293
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
 
337
  { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
338
  { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
339
  { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
340
+ { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
341
  { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
342
  { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
343
  };
 
348
  llm_arch arch;
349
 
350
  std::string operator()(llm_kv kv) const {
351
+ return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
352
  }
353
  };
354
 
 
675
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
676
  },
677
  },
678
+ {
679
+ LLM_ARCH_INTERNLM2,
680
+ {
681
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
682
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
683
+ { LLM_TENSOR_OUTPUT, "output" },
684
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
685
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
686
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
687
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
688
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
689
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
690
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
691
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
692
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
693
+ },
694
+ },
695
+ {
696
+ LLM_ARCH_MINICPM,
697
+ {
698
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
699
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
700
+ { LLM_TENSOR_OUTPUT, "output" },
701
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
702
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
703
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
704
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
705
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
706
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
707
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
708
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
709
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
710
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
711
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
712
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
713
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
714
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
715
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
716
+ },
717
+ },
718
  {
719
  LLM_ARCH_UNKNOWN,
720
  {
 
772
  // gguf helpers
773
  //
774
 
775
+ static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
776
  { LLAMA_ROPE_SCALING_NONE, "none" },
777
  { LLAMA_ROPE_SCALING_LINEAR, "linear" },
778
  { LLAMA_ROPE_SCALING_YARN, "yarn" },
779
  };
780
 
781
+ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
782
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
783
  if (kv.second == name) {
784
  return kv.first;
 
1205
  #ifdef __APPLE__
1206
  #define MLOCK_SUGGESTION \
1207
  "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
1208
+ "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
1209
  #else
1210
  #define MLOCK_SUGGESTION \
1211
+ "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
1212
  #endif
1213
 
1214
  bool raw_lock(const void * addr, size_t size) const {
 
1355
  #elif defined(GGML_USE_CUBLAS)
1356
  buft = ggml_backend_cuda_buffer_type(gpu);
1357
  #elif defined(GGML_USE_VULKAN)
1358
+ buft = ggml_backend_vk_buffer_type(gpu);
1359
  #elif defined(GGML_USE_SYCL)
1360
  buft = ggml_backend_sycl_buffer_type(gpu);
1361
  #elif defined(GGML_USE_CLBLAST)
1362
  buft = ggml_backend_opencl_buffer_type();
1363
+ #elif defined(GGML_USE_KOMPUTE)
1364
+ buft = ggml_backend_kompute_buffer_type(gpu);
1365
+ if (buft == nullptr) {
1366
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
1367
+ }
1368
  #endif
1369
 
1370
  if (buft == nullptr) {
 
1392
  GGML_UNUSED(tensor_split);
1393
  }
1394
 
1395
+ static size_t llama_get_device_count() {
1396
+ #if defined(GGML_USE_CUBLAS)
1397
+ return ggml_backend_cuda_get_device_count();
1398
+ #elif defined(GGML_USE_VULKAN)
1399
+ return ggml_backend_vk_get_device_count();
1400
+ #else
1401
+ return 1;
1402
+ #endif
1403
+ }
1404
+
1405
+ static size_t llama_get_device_memory(int device) {
1406
+ #if defined(GGML_USE_CUBLAS)
1407
+ size_t total;
1408
+ size_t free;
1409
+ ggml_backend_cuda_get_device_memory(device, &total, &free);
1410
+ return free;
1411
+ #elif defined(GGML_USE_VULKAN)
1412
+ size_t total;
1413
+ size_t free;
1414
+ ggml_backend_vk_get_device_memory(device, &total, &free);
1415
+ return free;
1416
+ #else
1417
+ return 1;
1418
+ GGML_UNUSED(device);
1419
+ #endif
1420
+ }
1421
+
1422
  //
1423
  // globals
1424
  //
 
1442
  MODEL_UNKNOWN,
1443
  MODEL_0_5B,
1444
  MODEL_1B,
1445
+ MODEL_2B,
1446
  MODEL_3B,
1447
  MODEL_4B,
1448
  MODEL_7B,
 
1450
  MODEL_13B,
1451
  MODEL_14B,
1452
  MODEL_15B,
1453
+ MODEL_20B,
1454
  MODEL_30B,
1455
  MODEL_34B,
1456
  MODEL_40B,
 
1468
 
1469
  struct llama_hparams {
1470
  bool vocab_only;
1471
+ bool rope_finetuned;
1472
  uint32_t n_vocab;
1473
  uint32_t n_ctx_train; // context size the model was trained on
1474
  uint32_t n_embd;
 
1488
  float rope_freq_base_train;
1489
  float rope_freq_scale_train;
1490
  uint32_t n_yarn_orig_ctx;
1491
+ int32_t rope_scaling_type_train;
 
1492
 
1493
  float f_clamp_kqv;
1494
  float f_max_alibi_bias;
 
1692
  id special_suffix_id = 32008;
1693
  id special_eot_id = 32010;
1694
 
1695
+ bool add_space_prefix = true;
1696
+
1697
  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
1698
  GGML_ASSERT(token_left.find(' ') == std::string::npos);
1699
  GGML_ASSERT(token_left.find('\n') == std::string::npos);
 
1790
  ggml_backend_free(backend);
1791
  }
1792
 
1793
+ #ifdef GGML_USE_VULKAN
1794
+ ggml_vk_free_cpu_assist();
1795
+ #endif
1796
+
1797
  ggml_backend_buffer_free(buf_input);
1798
  ggml_free(ctx_input);
1799
  }
 
2447
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
2448
  case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2449
  case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2450
+ case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
2451
  default:
2452
  {
2453
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
 
2758
  // load LLaMA models
2759
  //
2760
 
2761
+ static const char * llama_model_arch_name(llm_arch arch) {
2762
  auto it = LLM_ARCH_NAMES.find(arch);
2763
  if (it == LLM_ARCH_NAMES.end()) {
2764
  return "unknown";
 
2793
  case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
2794
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
2795
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
2796
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
2797
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2798
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
2799
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
2800
 
2801
  default: return "unknown, may not work";
2802
  }
 
2805
  static const char * llama_model_type_name(e_model type) {
2806
  switch (type) {
2807
  case MODEL_1B: return "1B";
2808
+ case MODEL_2B: return "2B";
2809
  case MODEL_3B: return "3B";
2810
  case MODEL_7B: return "7B";
2811
  case MODEL_8B: return "8B";
2812
  case MODEL_13B: return "13B";
2813
  case MODEL_14B: return "14B";
2814
  case MODEL_15B: return "15B";
2815
+ case MODEL_20B: return "20B";
2816
  case MODEL_30B: return "30B";
2817
  case MODEL_34B: return "34B";
2818
  case MODEL_40B: return "40B";
 
2825
  default: return "?B";
2826
  }
2827
  }
2828
+ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
2829
+ switch (type) {
2830
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2831
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2832
+ default: return "unknown";
2833
+ }
2834
+ }
2835
+
2836
 
2837
  static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
2838
  model.arch = ml.get_arch();
 
2945
  default: model.type = e_model::MODEL_UNKNOWN;
2946
  }
2947
  } break;
2948
+ case LLM_ARCH_MINICPM:
2949
+ {
2950
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2951
+
2952
+ switch (hparams.n_layer) {
2953
+ case 40: model.type = e_model::MODEL_2B; break;
2954
+ default: model.type = e_model::MODEL_UNKNOWN;
2955
+ }
2956
+ } break;
2957
  case LLM_ARCH_FALCON:
2958
  {
2959
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
3105
  default: model.type = e_model::MODEL_UNKNOWN;
3106
  }
3107
  } break;
3108
+ case LLM_ARCH_INTERNLM2:
3109
+ {
3110
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3111
+ switch (hparams.n_layer) {
3112
+ case 32: model.type = e_model::MODEL_7B; break;
3113
+ case 48: model.type = e_model::MODEL_20B; break;
3114
+ default: model.type = e_model::MODEL_UNKNOWN;
3115
+ }
3116
+ } break;
3117
  default: (void)0;
3118
  }
3119
 
 
3165
  vocab.special_unk_id = 0;
3166
  vocab.special_sep_id = -1;
3167
  vocab.special_pad_id = -1;
3168
+
3169
+ const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
3170
+ if (add_space_prefix_keyidx != -1) {
3171
+ vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
3172
+ } // The default value of add_space_prefix is true.
3173
  } else if (tokenizer_name == "gpt2") {
3174
  vocab.type = LLAMA_VOCAB_TYPE_BPE;
3175
 
 
3377
  const auto & hparams = model.hparams;
3378
  const auto & vocab = model.vocab;
3379
 
3380
+ const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
3381
 
3382
  // hparams
3383
  LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
3384
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
3385
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
3386
  LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
3387
  LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
3388
  LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
 
3403
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
3404
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
3405
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
3406
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
3407
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
3408
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
3409
  LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
 
3469
  model.buft_layer[i] = llama_default_buffer_type_cpu(true);
3470
  }
3471
 
 
3472
  if (split_mode == LLAMA_SPLIT_LAYER) {
3473
  // calculate the split points
3474
+ int device_count = llama_get_device_count();
3475
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
3476
+ std::vector<float> splits(device_count);
3477
  if (all_zero) {
3478
  // default split, by free memory
3479
  for (int i = 0; i < device_count; ++i) {
3480
+ splits[i] = llama_get_device_memory(i);
 
 
 
3481
  }
3482
  } else {
3483
+ std::copy(tensor_split, tensor_split + device_count, splits.begin());
3484
  }
3485
 
3486
  // sum and normalize the splits to get the split points
 
3496
  // assign the repeating layers to the devices according to the splits
3497
  int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
3498
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
3499
+ int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
3500
  model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
3501
  }
3502
  // assign the output layer
3503
  if (n_gpu_layers > n_layer) {
3504
+ int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
3505
  model.buft_output = llama_default_buffer_type_offload(layer_gpu);
3506
  } else {
3507
  model.buft_output = llama_default_buffer_type_cpu(true);
3508
  }
3509
+ } else {
 
 
3510
  ggml_backend_buffer_type_t split_buft;
3511
  if (split_mode == LLAMA_SPLIT_ROW) {
3512
  split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
 
3585
  switch (model.arch) {
3586
  case LLM_ARCH_LLAMA:
3587
  case LLM_ARCH_REFACT:
3588
+ case LLM_ARCH_MINICPM:
3589
  {
3590
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3591
 
3592
  // output
3593
  {
3594
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3595
+ if (model.arch != LLM_ARCH_MINICPM){
3596
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3597
+ }
3598
  }
3599
 
3600
  for (int i = 0; i < n_layer; ++i) {
 
4128
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4129
  }
4130
  } break;
4131
+ case LLM_ARCH_INTERNLM2:
4132
+ {
4133
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4134
+
4135
+ // output
4136
+ {
4137
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4138
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
4139
+ }
4140
+
4141
+ for (int i = 0; i < n_layer; ++i) {
4142
+ ggml_context * ctx_layer = ctx_for_layer(i);
4143
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4144
+
4145
+ auto & layer = model.layers[i];
4146
 
4147
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4148
+ // layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
4149
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
4150
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
4151
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
4152
 
4153
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4154
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4155
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4156
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4157
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4158
+ }
4159
+ } break;
4160
  default:
4161
  throw std::runtime_error("unknown architecture");
4162
  }
 
4209
  ctx_bufs.emplace_back(ctx, buf);
4210
  }
4211
 
4212
+ if (llama_supports_gpu_offload()) {
 
4213
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
4214
 
4215
  LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
 
4221
  const int max_offloadable_layers = hparams.n_layer + 1;
4222
 
4223
  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
4224
+ }
4225
 
4226
+ // print memory requirements
4227
+ for (ggml_backend_buffer_t buf : model.bufs) {
4228
+ LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
4229
  }
4230
 
4231
  // populate tensors_by_name
 
4253
  }
4254
 
4255
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
4256
+ static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
4257
  try {
4258
  llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
4259
 
 
4274
  return 0;
4275
  }
4276
 
4277
+ #ifdef GGML_USE_KOMPUTE
4278
+ if (params.n_gpu_layers > 0 && (
4279
+ !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
4280
+ || !(
4281
+ model.ftype == LLAMA_FTYPE_ALL_F32 ||
4282
+ model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
4283
+ model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
4284
+ model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
4285
+ )
4286
+ )) {
4287
+ // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
4288
+ LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
4289
+ params.n_gpu_layers = 0;
4290
+ }
4291
+ #endif
4292
+
4293
  if (!llm_load_tensors(
4294
  ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
4295
  params.progress_callback, params.progress_callback_user_data
 
4803
  ctx0 = nullptr;
4804
  }
4805
  }
4806
 
4807
  struct ggml_cgraph * build_llama() {
4808
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
6606
 
6607
  return gf;
6608
  }
 
6609
 
6610
+ struct ggml_cgraph * build_orion() {
6611
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
 
6612
 
6613
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6614
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6615
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6616
 
6617
+ struct ggml_tensor * cur;
6618
+ struct ggml_tensor * inpL;
6619
+
6620
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6621
+ cb(inpL, "inp_embd", -1);
6622
+
6623
+ // inp_pos - contains the positions
6624
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6625
+ cb(inp_pos, "inp_pos", -1);
6626
+
6627
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6628
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6629
+ cb(KQ_mask, "KQ_mask", -1);
6630
+
6631
+ // shift the entire K-cache if needed
6632
+ if (do_rope_shift) {
6633
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6634
+ }
6635
+
6636
+ for (int il = 0; il < n_layer; ++il) {
6637
+ struct ggml_tensor * inpSA = inpL;
6638
+
6639
+ // norm
6640
+ cur = llm_build_norm(ctx0, inpL, hparams,
6641
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
6642
+ LLM_NORM, cb, il);
6643
+ cb(cur, "attn_norm", il);
6644
+
6645
+ // self-attention
6646
+ {
6647
+ // compute Q and K and RoPE them
6648
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6649
+ cb(Qcur, "Qcur", il);
6650
+ // if (model.layers[il].bq) {
6651
+ // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6652
+ // cb(Qcur, "Qcur", il);
6653
+ // }
6654
+
6655
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6656
+ cb(Kcur, "Kcur", il);
6657
+ // if (model.layers[il].bk) {
6658
+ // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6659
+ // cb(Kcur, "Kcur", il);
6660
+ // }
6661
+
6662
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6663
+ cb(Vcur, "Vcur", il);
6664
+ // if (model.layers[il].bv) {
6665
+ // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6666
+ // cb(Vcur, "Vcur", il);
6667
+ // }
6668
+
6669
+ Qcur = ggml_rope_custom(
6670
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6671
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6672
+ ext_factor, attn_factor, beta_fast, beta_slow
6673
+ );
6674
+ cb(Qcur, "Qcur", il);
6675
+
6676
+ Kcur = ggml_rope_custom(
6677
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6678
+ hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
6679
+ ext_factor, attn_factor, beta_fast, beta_slow
6680
+ );
6681
+ cb(Kcur, "Kcur", il);
6682
+
6683
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6684
+ model.layers[il].wo, NULL,
6685
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6686
+ cb(cur, "kqv_out", il);
6687
+ }
6688
+
6689
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6690
+ cb(ffn_inp, "ffn_inp", il);
6691
+
6692
+ // feed-forward network
6693
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6694
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
6695
+ LLM_NORM, cb, il);
6696
+ cb(cur, "ffn_norm", il);
6697
+
6698
+ cur = llm_build_ffn(ctx0, cur,
6699
+ model.layers[il].ffn_up, NULL,
6700
+ model.layers[il].ffn_gate, NULL,
6701
+ model.layers[il].ffn_down, NULL,
6702
+ NULL,
6703
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6704
+ cb(cur, "ffn_out", il);
6705
+
6706
+ cur = ggml_add(ctx0, cur, ffn_inp);
6707
+ cb(cur, "l_out", il);
6708
+
6709
+ // input for next layer
6710
+ inpL = cur;
6711
+ }
6712
+
6713
+ cur = inpL;
6714
+
6715
+ cur = llm_build_norm(ctx0, cur, hparams,
6716
+ model.output_norm, model.output_norm_b,
6717
+ LLM_NORM, cb, -1);
6718
+ cb(cur, "result_norm", -1);
6719
+
6720
+ // lm_head
6721
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6722
+ cb(cur, "result_output", -1);
6723
+
6724
+ ggml_build_forward_expand(gf, cur);
6725
+
6726
+ return gf;
6727
+ }
6728
+
6729
+ struct ggml_cgraph * build_internlm2() {
6730
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6731
+
6732
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6733
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6734
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6735
+
6736
+ struct ggml_tensor * cur;
6737
+ struct ggml_tensor * inpL;
6738
+
6739
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6740
+ cb(inpL, "inp_embd", -1);
6741
+
6742
+ // inp_pos - contains the positions
6743
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6744
+ cb(inp_pos, "inp_pos", -1);
6745
+
6746
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6747
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6748
+ cb(KQ_mask, "KQ_mask", -1);
6749
+
6750
+ // shift the entire K-cache if needed
6751
+ if (do_rope_shift) {
6752
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6753
+ }
6754
+
6755
+ for (int il = 0; il < n_layer; ++il) {
6756
+ struct ggml_tensor * inpSA = inpL;
6757
+
6758
+ // norm
6759
+ cur = llm_build_norm(ctx0, inpL, hparams,
6760
+ model.layers[il].attn_norm, NULL,
6761
+ LLM_NORM_RMS, cb, il);
6762
+ cb(cur, "attn_norm", il);
6763
+
6764
+ // self-attention
6765
+ {
6766
+ // compute Q and K and RoPE them
6767
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6768
+ cb(Qcur, "Qcur", il);
6769
+ if (model.layers[il].bq) {
6770
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6771
+ cb(Qcur, "Qcur", il);
6772
+ }
6773
+
6774
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6775
+ cb(Kcur, "Kcur", il);
6776
+ if (model.layers[il].bk) {
6777
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6778
+ cb(Kcur, "Kcur", il);
6779
+ }
6780
+
6781
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6782
+ cb(Vcur, "Vcur", il);
6783
+ if (model.layers[il].bv) {
6784
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6785
+ cb(Vcur, "Vcur", il);
6786
+ }
6787
+
6788
+ Qcur = ggml_rope_custom(
6789
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6790
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
6791
+ ext_factor, attn_factor, beta_fast, beta_slow
6792
+ );
6793
+ cb(Qcur, "Qcur", il);
6794
+
6795
+ Kcur = ggml_rope_custom(
6796
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6797
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
6798
+ ext_factor, attn_factor, beta_fast, beta_slow
6799
+ );
6800
+ cb(Kcur, "Kcur", il);
6801
+
6802
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6803
+ model.layers[il].wo, model.layers[il].bo,
6804
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6805
+ cb(cur, "kqv_out", il);
6806
+ }
6807
+
6808
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6809
+ cb(ffn_inp, "ffn_inp", il);
6810
+
6811
+ // feed-forward network
6812
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6813
+ model.layers[il].ffn_norm, NULL,
6814
+ LLM_NORM_RMS, cb, il);
6815
+ cb(cur, "ffn_norm", il);
6816
+
6817
+ cur = llm_build_ffn(ctx0, cur,
6818
+ model.layers[il].ffn_up, NULL,
6819
+ model.layers[il].ffn_gate, NULL,
6820
+ model.layers[il].ffn_down, NULL,
6821
+ NULL,
6822
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6823
+ cb(cur, "ffn_out", il);
6824
+
6825
+ cur = ggml_add(ctx0, cur, ffn_inp);
6826
+ cb(cur, "l_out", il);
6827
+
6828
+ // input for next layer
6829
+ inpL = cur;
6830
+ }
6831
+
6832
+ cur = inpL;
6833
+
6834
+ cur = llm_build_norm(ctx0, cur, hparams,
6835
+ model.output_norm, NULL,
6836
+ LLM_NORM_RMS, cb, -1);
6837
+ cb(cur, "result_norm", -1);
6838
+
6839
+ // lm_head
6840
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6841
+ cb(cur, "result_output", -1);
6842
+
6843
+ ggml_build_forward_expand(gf, cur);
6844
+
6845
+ return gf;
6846
+ }
6847
+
6848
+ // ref: https://arxiv.org/abs/2203.03466
6849
+ // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
6850
+ // based on the original build_llama() function
6851
+ struct ggml_cgraph * build_minicpm() {
6852
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6853
+
6854
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6855
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6856
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6857
+
6858
+ const int64_t n_embd = hparams.n_embd;
6859
+ //TODO: if the model varies, these parameters need to be read from the model
6860
+ const int64_t n_embd_base = 256;
6861
+ const float scale_embd = 12.0f;
6862
+ const float scale_depth = 1.4f;
6863
+
6864
+ struct ggml_tensor * cur;
6865
+ struct ggml_tensor * inpL;
6866
+
6867
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
6868
+ cb(inpL, "inp_embd", -1);
6869
+
6870
+ // scale the input embeddings
6871
+ inpL = ggml_scale(ctx0, inpL, scale_embd);
6872
+ cb(inpL, "inp_scaled", -1);
6873
+
6874
+ // inp_pos - contains the positions
6875
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6876
+ cb(inp_pos, "inp_pos", -1);
6877
+
6878
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6879
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6880
+ cb(KQ_mask, "KQ_mask", -1);
6881
+
6882
+ // shift the entire K-cache if needed
6883
+ if (do_rope_shift) {
6884
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6885
+ }
6886
+
6887
+ for (int il = 0; il < n_layer; ++il) {
6888
+ struct ggml_tensor * inpSA = inpL;
6889
+
6890
+ // norm
6891
+ cur = llm_build_norm(ctx0, inpL, hparams,
6892
+ model.layers[il].attn_norm, NULL,
6893
+ LLM_NORM_RMS, cb, il);
6894
+ cb(cur, "attn_norm", il);
6895
+
6896
+ // self-attention
6897
+ {
6898
+ // compute Q and K and RoPE them
6899
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6900
+ cb(Qcur, "Qcur", il);
6901
+ if (model.layers[il].bq) {
6902
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6903
+ cb(Qcur, "Qcur", il);
6904
+ }
6905
+
6906
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6907
+ cb(Kcur, "Kcur", il);
6908
+ if (model.layers[il].bk) {
6909
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6910
+ cb(Kcur, "Kcur", il);
6911
+ }
6912
+
6913
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6914
+ cb(Vcur, "Vcur", il);
6915
+ if (model.layers[il].bv) {
6916
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6917
+ cb(Vcur, "Vcur", il);
6918
+ }
6919
+
6920
+ Qcur = ggml_rope_custom(
6921
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6922
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
6923
+ ext_factor, attn_factor, beta_fast, beta_slow
6924
+ );
6925
+ cb(Qcur, "Qcur", il);
6926
+
6927
+ Kcur = ggml_rope_custom(
6928
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6929
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
6930
+ ext_factor, attn_factor, beta_fast, beta_slow
6931
+ );
6932
+ cb(Kcur, "Kcur", il);
6933
+
6934
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6935
+ model.layers[il].wo, model.layers[il].bo,
6936
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6937
+ cb(cur, "kqv_out", il);
6938
+ }
6939
+
6940
+ // scale_res - scale the hidden states for residual connection
6941
+ const float scale_res = scale_depth/sqrtf(float(n_layer));
6942
+ cur = ggml_scale(ctx0, cur, scale_res);
6943
+ cb(cur, "hidden_scaled", -1);
6944
+
6945
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6946
+ cb(ffn_inp, "ffn_inp", il);
6947
+
6948
+ // feed-forward network
6949
+ {
6950
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6951
+ model.layers[il].ffn_norm, NULL,
6952
+ LLM_NORM_RMS, cb, il);
6953
+ cb(cur, "ffn_norm", il);
6954
+
6955
+ cur = llm_build_ffn(ctx0, cur,
6956
+ model.layers[il].ffn_up, NULL,
6957
+ model.layers[il].ffn_gate, NULL,
6958
+ model.layers[il].ffn_down, NULL,
6959
+ NULL,
6960
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6961
+ cb(cur, "ffn_out", il);
6962
+ }
6963
+
6964
+ // scale the hidden states for residual connection
6965
+ cur = ggml_scale(ctx0, cur, scale_res);
6966
+ cb(cur, "hidden_scaled_ffn", -1);
6967
+
6968
+ cur = ggml_add(ctx0, cur, ffn_inp);
6969
+ cb(cur, "l_out", il);
6970
+
6971
+ // input for next layer
6972
+ inpL = cur;
6973
+ }
6974
+
6975
+ cur = inpL;
6976
+
6977
+ cur = llm_build_norm(ctx0, cur, hparams,
6978
+ model.output_norm, NULL,
6979
+ LLM_NORM_RMS, cb, -1);
6980
+ cb(cur, "result_norm", -1);
6981
+
6982
+ // lm_head scaling
6983
+ const float scale_lmhead = float(n_embd_base)/float(n_embd);
6984
+ cur = ggml_scale(ctx0, cur, scale_lmhead);
6985
+ cb(cur, "lmhead_scaling", -1);
6986
+
6987
+ // lm_head
6988
+ cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
6989
+ cb(cur, "result_output", -1);
6990
+
6991
+ ggml_build_forward_expand(gf, cur);
6992
+
6993
+ return gf;
6994
+ }
6995
+ };
6996
+
6997
+ static struct ggml_cgraph * llama_build_graph(
6998
+ llama_context & lctx,
6999
+ const llama_batch & batch) {
7000
+ const auto & model = lctx.model;
7001
+
7002
+ // check if we should build the worst-case graph (for memory measurement)
7003
+ const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
7004
+
7005
+ // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
7006
+ llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
7007
+ if (il >= 0) {
7008
+ ggml_format_name(cur, "%s-%d", name, il);
7009
+ } else {
7010
+ ggml_set_name(cur, name);
7011
+ }
7012
 
7013
  if (!lctx.cparams.offload_kqv) {
7014
  if (strcmp(name, "kqv_merged_cont") == 0) {
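The build_minicpm() graph above is the stock LLaMA graph plus three muP-style scaling factors: one on the input embeddings, one on every residual branch, and one before the tied lm_head. A minimal standalone sketch of how those constants combine, using assumed MiniCPM-2B-like shapes (n_layer = 40, n_embd = 2304 are illustrative values, not taken from the code above):

#include <cmath>
#include <cstdio>

int main() {
    const int   n_layer     = 40;    // assumed, MiniCPM-2B-like
    const int   n_embd      = 2304;  // assumed, MiniCPM-2B-like
    const int   n_embd_base = 256;   // hard-coded in build_minicpm()
    const float scale_embd  = 12.0f; // multiplies the input embeddings
    const float scale_depth = 1.4f;

    const float scale_res    = scale_depth / sqrtf((float) n_layer); // per-layer residual scale
    const float scale_lmhead = (float) n_embd_base / (float) n_embd; // applied before the lm_head matmul

    printf("scale_embd   = %.4f\n", scale_embd);   // 12.0000
    printf("scale_res    = %.4f\n", scale_res);    // ~0.2214 for 40 layers
    printf("scale_lmhead = %.4f\n", scale_lmhead); // ~0.1111 for n_embd = 2304
    return 0;
}

Note that the output projection reuses model.tok_embd (tied embeddings) instead of a separate model.output tensor, which is why the lm_head matmul above multiplies by tok_embd.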
 
7150
  {
7151
  result = llm.build_orion();
7152
  } break;
7153
+ case LLM_ARCH_INTERNLM2:
7154
+ {
7155
+ result = llm.build_internlm2();
7156
+ } break;
7157
+ case LLM_ARCH_MINICPM:
7158
+ {
7159
+ result = llm.build_minicpm();
7160
+ } break;
7161
  default:
7162
  GGML_ASSERT(false);
7163
  }
 
7285
  // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
7286
  // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
7287
  // with the BLAS calls. need a better solution
7288
+ // MoE special case: this cap only applies when hparams.n_expert == 0, i.e. when the model is NOT an MoE model.
7289
+ // When an MoE model is being processed, Accelerate/BLAS is not involved, so capping the thread count would only limit performance.
7290
+ if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
7291
  n_threads = std::min(4, n_threads);
7292
  }
7293
 
 
 
 
 
 
7294
  #ifdef GGML_USE_MPI
7295
  const int64_t n_layer = hparams.n_layer;
7296
  ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
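The thread-capping rule above can be read as a small free function; the sketch below is only an illustrative restatement (the helper name and parameters are not part of llama.cpp):

#include <algorithm>
#include <cstdint>

// cap CPU threads only for non-MoE prompt batches that will be routed through BLAS
static int cap_threads_for_blas(int n_threads, int n_tokens, uint32_t n_expert,
                                bool has_blas, bool has_gpublas) {
    if (n_tokens >= 32 && n_expert == 0 && has_blas && !has_gpublas) {
        n_threads = std::min(4, n_threads);
    }
    return n_threads;
}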
 
8102
  //
8103
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
8104
  if (&fragment == &fragment_buffer.front()) {
8105
+ if (vocab.add_space_prefix) {
8106
+ raw_text = " " + raw_text; // prefix with space if the first token is not special
8107
+ }
8108
  }
8109
 
8110
  #ifdef PRETOKENIZERDEBUG
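A minimal sketch of the prefixing rule above, with the flag treated as a plain bool (the metadata key that populates vocab.add_space_prefix is loaded elsewhere and not shown in this hunk):

#include <string>

static std::string maybe_prefix(const std::string & raw_text, bool add_space_prefix) {
    // SentencePiece-style tokenizers expect a leading space on the first text fragment;
    // models can now opt out of this via their tokenizer metadata.
    return add_space_prefix ? " " + raw_text : raw_text;
}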
 
8590
 
8591
  const int64_t t_start_sample_us = ggml_time_us();
8592
 
8593
+ if (k <= 0) {
8594
+ k = candidates->size;
8595
+ }
8596
+
8597
  k = std::max(k, (int) min_keep);
8598
  k = std::min(k, (int) candidates->size);
8599
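With the change above, llama_sample_top_k() treats k <= 0 as "keep every candidate" rather than clamping straight to min_keep. A small usage sketch (the candidate setup is assumed to exist already):

#include "llama.h"
#include <vector>

static void sample_without_topk(llama_context * ctx, std::vector<llama_token_data> & cands) {
    llama_token_data_array arr = { cands.data(), cands.size(), false };
    llama_sample_top_k(ctx, &arr, /*k =*/ 0, /*min_keep =*/ 1); // k <= 0 now means "all candidates"
}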
 
 
9653
  else if (new_type != GGML_TYPE_Q8_0) {
9654
  new_type = GGML_TYPE_Q6_K;
9655
  }
9656
+ } else if (name == "token_embd.weight") {
9657
+ if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
9658
+ new_type = GGML_TYPE_Q2_K;
9659
+ }
9660
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
9661
+ new_type = GGML_TYPE_Q4_K;
9662
+ }
9663
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
9664
  if (name.find("attn_v.weight") != std::string::npos) {
9665
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
 
9670
  if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
9671
  ++qs.i_ffn_down;
9672
  }
 
9673
  } else if (name.find("attn_v.weight") != std::string::npos) {
9674
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
9675
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
 
9677
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
9678
  new_type = GGML_TYPE_Q4_K;
9679
  }
9680
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
9681
+ new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
9682
+ }
9683
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
9684
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
9685
  }
 
9717
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
9718
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
9719
  }
9720
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
9721
+ new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
9722
+ }
9723
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
9724
  new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
9725
  : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
 
9751
  } else if (name.find("attn_output.weight") != std::string::npos) {
9752
  if (arch != LLM_ARCH_FALCON) {
9753
  if (qs.model.hparams.n_expert == 8) {
9754
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
9755
  ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
9756
  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
9757
  new_type = GGML_TYPE_Q5_K;
9758
  }
9759
  } else {
9760
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
9761
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
9762
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
9763
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
9764
  }
 
9801
  bool convert_incompatible_tensor = false;
9802
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
9803
  new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
9804
+ new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
9805
+ new_type == GGML_TYPE_IQ3_XXS) {
9806
  int nx = tensor->ne[0];
9807
  int ny = tensor->ne[1];
9808
  if (nx % QK_K != 0) {
 
9816
  switch (new_type) {
9817
  case GGML_TYPE_IQ2_XXS:
9818
  case GGML_TYPE_IQ2_XS:
9819
+ case GGML_TYPE_IQ3_XXS:
9820
  case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
9821
  case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
9822
  case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
 
9858
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
9859
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
9860
  case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
9861
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
9862
 
9863
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
9864
  }
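The new LLAMA_FTYPE_MOSTLY_IQ3_XXS file type plugs into the existing quantization entry point; a usage sketch with placeholder paths:

#include "llama.h"

static uint32_t quantize_to_iq3_xxs(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; // new in this change
    return llama_model_quantize(fname_inp, fname_out, &params);
}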
 
10509
  return result;
10510
  }
10511
 
10512
+ size_t llama_max_devices(void) {
10513
+ #if defined(GGML_USE_METAL)
10514
+ return 1;
10515
+ #elif defined(GGML_USE_CUBLAS)
10516
+ return GGML_CUDA_MAX_DEVICES;
10517
+ #elif defined(GGML_USE_SYCL)
10518
+ return GGML_SYCL_MAX_DEVICES;
10519
+ #elif defined(GGML_USE_VULKAN)
10520
+ return GGML_VK_MAX_DEVICES;
10521
+ #else
10522
+ return 1;
10523
+ #endif
10524
  }
10525
 
10526
+ bool llama_supports_mmap(void) {
10527
  return llama_mmap::SUPPORTED;
10528
  }
10529
 
10530
+ bool llama_supports_mlock(void) {
10531
  return llama_mlock::SUPPORTED;
10532
  }
10533
 
10534
+ bool llama_supports_gpu_offload(void) {
10535
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
10536
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
10537
+ // compiled with a backend that supports offloading model layers to the GPU
10538
+ return true;
10539
+ #else
10540
+ return false;
10541
+ #endif
10542
+ }
10543
+
10544
+ // deprecated:
10545
+ bool llama_mmap_supported(void) {
10546
+ return llama_supports_mmap();
10547
+ }
10548
+
10549
+ bool llama_mlock_supported(void) {
10550
+ return llama_supports_mlock();
10551
+ }
10552
+
10553
  void llama_backend_init(bool numa) {
10554
  ggml_time_init();
10555
 
 
10581
  }
10582
 
10583
  struct llama_model * llama_load_model_from_file(
10584
+ const char * path_model,
10585
+ struct llama_model_params params) {
10586
  ggml_time_init();
10587
 
10588
  llama_model * model = new llama_model;
 
10725
  }
10726
  #elif defined(GGML_USE_VULKAN)
10727
  if (model->n_gpu_layers > 0) {
10728
+ for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
10729
+ ggml_backend_t backend = ggml_backend_vk_init(device);
10730
+ if (backend == nullptr) {
10731
+ LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
10732
+ llama_free(ctx);
10733
+ return nullptr;
10734
+ }
10735
+ ctx->backends.push_back(backend);
10736
+ }
10737
+ }
10738
+ #elif defined(GGML_USE_SYCL)
10739
+ if (model->n_gpu_layers > 0) {
10740
+ ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
10741
  if (backend == nullptr) {
10742
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
10743
  llama_free(ctx);
10744
  return nullptr;
10745
  }
10746
  ctx->backends.push_back(backend);
10747
  }
10748
+ #elif defined(GGML_USE_KOMPUTE)
10749
  if (model->n_gpu_layers > 0) {
10750
+ auto * backend = ggml_backend_kompute_init(model->main_gpu);
10751
  if (backend == nullptr) {
10752
+ LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
10753
  llama_free(ctx);
10754
  return nullptr;
10755
  }
 
10960
 
10961
  int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
10962
  return snprintf(buf, buf_size, "%s %s %s",
10963
+ llama_model_arch_name(model->arch),
10964
  llama_model_type_name(model->type),
10965
  llama_model_ftype_name(model->ftype).c_str());
10966
  }
 
11602
  };
11603
  }
11604
 
11605
+ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
11606
  llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
11607
 
11608
  if (embd) {
11609
+ batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
11610
  } else {
11611
+ batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
11612
  }
11613
 
11614
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
11615
+ batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
11616
+ batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
11617
+ for (int i = 0; i < n_tokens_alloc; ++i) {
11618
  batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
11619
  }
11620
+ batch.seq_id[n_tokens_alloc] = nullptr;
11621
+
11622
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);
11623
 
11624
  return batch;
11625
  }
 
11630
  if (batch.pos) free(batch.pos);
11631
  if (batch.n_seq_id) free(batch.n_seq_id);
11632
  if (batch.seq_id) {
11633
+ for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
11634
  free(batch.seq_id[i]);
11635
  }
11636
  free(batch.seq_id);
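A usage sketch for the batch allocation above (the sizes are illustrative): n_tokens_alloc is the capacity of the batch, embd = 0 selects token IDs over embeddings, and n_seq_max bounds the sequence ids per token; the trailing nullptr written to seq_id[n_tokens_alloc] is what lets llama_batch_free() stop at the right place.

#include "llama.h"

static void batch_roundtrip(void) {
    // room for up to 512 tokens, token IDs rather than embeddings, at most 4 seq ids per token
    llama_batch batch = llama_batch_init(/*n_tokens_alloc =*/ 512, /*embd =*/ 0, /*n_seq_max =*/ 4);
    // ... fill batch.token / batch.pos / batch.n_seq_id / batch.seq_id / batch.logits ...
    llama_batch_free(batch);
}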
examples/talk-llama/llama.h CHANGED
@@ -3,15 +3,7 @@
3
 
4
  #include "ggml.h"
5
  #include "ggml-backend.h"
6
- #ifdef GGML_USE_CUBLAS
7
- #include "ggml-cuda.h"
8
- #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
9
- #elif defined(GGML_USE_SYCL)
10
- #include "ggml-sycl.h"
11
- #define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
12
- #else
13
- #define LLAMA_MAX_DEVICES 1
14
- #endif // GGML_USE_CUBLAS
15
  #include <stddef.h>
16
  #include <stdint.h>
17
  #include <stdio.h>
@@ -49,11 +41,6 @@
49
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
50
  #define LLAMA_SESSION_VERSION 4
51
 
52
- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
53
- // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
54
- #define LLAMA_SUPPORTS_GPU_OFFLOAD
55
- #endif
56
-
57
  #ifdef __cplusplus
58
  extern "C" {
59
  #endif
@@ -111,6 +98,7 @@ extern "C" {
111
  LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
112
  LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
113
  LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
 
114
 
115
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
116
  };
@@ -199,7 +187,7 @@ extern "C" {
199
  // LLAMA_SPLIT_LAYER: ignored
200
  int32_t main_gpu;
201
 
202
- // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
203
  const float * tensor_split;
204
 
205
  // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
@@ -225,7 +213,7 @@ extern "C" {
225
  uint32_t n_batch; // prompt processing maximum batch size
226
  uint32_t n_threads; // number of threads to use for generation
227
  uint32_t n_threads_batch; // number of threads to use for batch processing
228
- int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
229
 
230
  // ref: https://github.com/ggerganov/llama.cpp/pull/2054
231
  float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -336,9 +324,14 @@ extern "C" {
336
 
337
  LLAMA_API int64_t llama_time_us(void);
338
 
339
- LLAMA_API int32_t llama_max_devices(void);
340
- LLAMA_API bool llama_mmap_supported (void);
341
- LLAMA_API bool llama_mlock_supported(void);
 
 
 
 
 
342
 
343
  LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
344
 
 
3
 
4
  #include "ggml.h"
5
  #include "ggml-backend.h"
6
+
 
 
 
 
 
 
 
 
7
  #include <stddef.h>
8
  #include <stdint.h>
9
  #include <stdio.h>
 
41
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
42
  #define LLAMA_SESSION_VERSION 4
43
 
 
 
 
 
 
44
  #ifdef __cplusplus
45
  extern "C" {
46
  #endif
 
98
  LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
99
  LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
100
  LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
101
+ LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
102
 
103
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
104
  };
 
187
  // LLAMA_SPLIT_LAYER: ignored
188
  int32_t main_gpu;
189
 
190
+ // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
191
  const float * tensor_split;
192
 
193
  // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
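Since LLAMA_MAX_DEVICES is no longer exported by the header, callers now size tensor_split with llama_max_devices() at runtime; a sketch (the uniform 1.0f split is just an example):

#include "llama.h"
#include <vector>

static llama_model * load_split_evenly(const char * path_model) {
    std::vector<float> tensor_split(llama_max_devices(), 1.0f); // equal proportion per device
    llama_model_params mparams = llama_model_default_params();
    mparams.tensor_split = tensor_split.data(); // must stay valid for the duration of the load call
    return llama_load_model_from_file(path_model, mparams);
}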
 
213
  uint32_t n_batch; // prompt processing maximum batch size
214
  uint32_t n_threads; // number of threads to use for generation
215
  uint32_t n_threads_batch; // number of threads to use for batch processing
216
+ int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
217
 
218
  // ref: https://github.com/ggerganov/llama.cpp/pull/2054
219
  float rope_freq_base; // RoPE base frequency, 0 = from model
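rope_scaling_type above is widened from int8_t to int32_t but still carries a llama_rope_scaling_type value; a minimal sketch of setting it:

#include "llama.h"

static llama_context_params make_yarn_ctx_params(void) {
    llama_context_params cparams = llama_context_default_params();
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; // previously an int8_t field
    return cparams;
}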
 
324
 
325
  LLAMA_API int64_t llama_time_us(void);
326
 
327
+ LLAMA_API size_t llama_max_devices(void);
328
+
329
+ LLAMA_API bool llama_supports_mmap (void);
330
+ LLAMA_API bool llama_supports_mlock (void);
331
+ LLAMA_API bool llama_supports_gpu_offload(void);
332
+
333
+ LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
334
+ LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
335
 
336
  LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
337