ggerganov committed (unverified)
Commit d3484ef · 1 Parent(s): 924f2de

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -218,6 +218,7 @@ enum llm_arch {
218
  LLM_ARCH_GEMMA,
219
  LLM_ARCH_STARCODER2,
220
  LLM_ARCH_MAMBA,
 
221
  LLM_ARCH_COMMAND_R,
222
  LLM_ARCH_UNKNOWN,
223
  };
@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
249
  { LLM_ARCH_GEMMA, "gemma" },
250
  { LLM_ARCH_STARCODER2, "starcoder2" },
251
  { LLM_ARCH_MAMBA, "mamba" },
 
252
  { LLM_ARCH_COMMAND_R, "command-r" },
253
  { LLM_ARCH_UNKNOWN, "(unknown)" },
254
  };
@@ -259,6 +261,7 @@ enum llm_kv {
259
  LLM_KV_GENERAL_ALIGNMENT,
260
  LLM_KV_GENERAL_NAME,
261
  LLM_KV_GENERAL_AUTHOR,
 
262
  LLM_KV_GENERAL_URL,
263
  LLM_KV_GENERAL_DESCRIPTION,
264
  LLM_KV_GENERAL_LICENSE,
@@ -328,6 +331,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
328
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
329
  { LLM_KV_GENERAL_NAME, "general.name" },
330
  { LLM_KV_GENERAL_AUTHOR, "general.author" },
 
331
  { LLM_KV_GENERAL_URL, "general.url" },
332
  { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
333
  { LLM_KV_GENERAL_LICENSE, "general.license" },
@@ -424,9 +428,12 @@ enum llm_tensor {
424
  LLM_TENSOR_FFN_DOWN,
425
  LLM_TENSOR_FFN_UP,
426
  LLM_TENSOR_FFN_ACT,
427
- LLM_TENSOR_FFN_DOWN_EXP,
428
  LLM_TENSOR_FFN_GATE_EXP,
429
  LLM_TENSOR_FFN_UP_EXP,
 
 
 
430
  LLM_TENSOR_ATTN_Q_NORM,
431
  LLM_TENSOR_ATTN_K_NORM,
432
  LLM_TENSOR_LAYER_OUT_NORM,
@@ -461,6 +468,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
461
  { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
462
  { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
463
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
 
 
 
464
  },
465
  },
466
  {
@@ -514,6 +524,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
514
  { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
515
  { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
516
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
 
 
 
517
  { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
518
  { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
519
  },
@@ -583,6 +596,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
583
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
584
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
585
  { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
 
 
 
586
  },
587
  },
588
  {
@@ -878,6 +894,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
878
  { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
879
  },
880
  },
 
 
881
  {
882
  LLM_ARCH_COMMAND_R,
883
  {
@@ -1843,9 +1878,9 @@ struct llama_layer {
1843
 
1844
  // ff MoE
1845
  struct ggml_tensor * ffn_gate_inp;
1846
- struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
1847
- struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
1848
- struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
1849
 
1850
  // ff bias
1851
  struct ggml_tensor * ffn_down_b; // b2
@@ -2100,10 +2135,6 @@ struct llama_context {
2100
  ggml_backend_free(backend);
2101
  }
2102
 
2103
- #ifdef GGML_USE_VULKAN
2104
- ggml_vk_free_cpu_assist();
2105
- #endif
2106
-
2107
  ggml_backend_buffer_free(buf_output);
2108
  }
2109
 
@@ -2851,19 +2882,19 @@ struct llama_model_loader {
2851
 
2852
  llama_mmaps mappings;
2853
 
2854
- // Holds information on a model weights
2855
- struct llama_tensor_weights {
2856
  uint16_t idx; // source file index
2857
  size_t offs; // tensor data offset in the original file
2858
 
2859
  ggml_tensor * tensor;
2860
 
2861
- llama_tensor_weights(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
2862
  const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
2863
  offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
2864
  }
2865
  };
2866
- std::vector<llama_tensor_weights> weights;
2867
 
2868
  std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
2869
 
@@ -2903,7 +2934,7 @@ struct llama_model_loader {
2903
  // For subsidiary files, `meta` tensor data offset must not be used,
2904
  // so we build a unified tensors index for weights.
2905
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
2906
- weights.emplace_back(llama_tensor_weights(0, cur->name, meta, cur));
2907
  }
2908
  files.emplace_back(new llama_file(fname.c_str(), "rb"));
2909
  contexts.emplace_back(ctx);
@@ -2943,7 +2974,7 @@ struct llama_model_loader {
2943
 
2944
  // Save tensors data offset info of the shard.
2945
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
2946
- weights.emplace_back(llama_tensor_weights(idx, cur->name, ctx_gguf, cur));
2947
  }
2948
  files.emplace_back(new llama_file(split_path, "rb"));
2949
  contexts.emplace_back(ctx);
@@ -3147,21 +3178,37 @@ struct llama_model_loader {
3147
  return weights.at(i).tensor->name;
3148
  }
3149
 
3150
- const llama_tensor_weights & get_weights(const char * name) const {
3151
  for (const auto & weight : weights) {
3152
  if (strcmp(name, weight.tensor->name) == 0) {
3153
- return weight;
3154
  }
3155
  }
3156
- throw std::runtime_error(format("tensor %s not found", name));
 
 
3157
  }
3158
 
3159
  struct ggml_tensor * get_tensor_meta(const char * name) const {
3160
- try {
3161
- return get_weights(name).tensor;
3162
- } catch (const std::runtime_error & e) {
3163
- return NULL;
3164
  }
 
 
3165
  }
3166
 
3167
  struct ggml_tensor * get_tensor_meta(int i) const {
@@ -3177,7 +3224,7 @@ struct llama_model_loader {
3177
  return tensor;
3178
  }
3179
 
3180
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
3181
  const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
3182
 
3183
  if (cur == NULL) {
@@ -3189,8 +3236,8 @@ struct llama_model_loader {
3189
 
3190
  {
3191
  bool is_ok = true;
3192
- for (size_t i = 0; i < ne.size(); ++i) {
3193
- if (ne[i] != cur->ne[i]) {
3194
  is_ok = false;
3195
  break;
3196
  }
@@ -3204,9 +3251,47 @@ struct llama_model_loader {
3204
  }
3205
  }
3206
 
 
 
3207
  return create_tensor_for(ctx, cur);
3208
  }
3209
 
 
 
3210
  void done_getting_tensors() const {
3211
  if (n_created != n_tensors) {
3212
  throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
@@ -3219,7 +3304,7 @@ struct llama_model_loader {
3219
  mmaps_used.reserve(files.size());
3220
  for (const auto & file : files) {
3221
  std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
3222
- mmaps_used.emplace_back(std::make_pair(mapping->size, 0));
3223
  if (mlock_mmaps) {
3224
  std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
3225
  mlock_mmap->init(mapping->addr);
@@ -3243,18 +3328,25 @@ struct llama_model_loader {
3243
  *last = 0;
3244
  *addr = mapping->addr;
3245
  for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
3246
- const auto & w = get_weights(ggml_get_name(tensor));
3247
- if (w.idx != idx) {
3248
- continue;
 
 
3249
  }
3250
- *first = std::min(*first, w.offs);
3251
- *last = std::max(*last, w.offs + ggml_nbytes(tensor));
3252
  }
3253
  }
3254
 
3255
  // for backwards compatibility, does not support ggml-backend
3256
  void load_data_for(struct ggml_tensor * cur) const {
3257
- const auto & w = get_weights(ggml_get_name(cur));
3258
 
3259
  if (use_mmap) {
3260
  const auto & mapping = mappings.at(w.idx);
@@ -3287,44 +3379,49 @@ struct llama_model_loader {
3287
 
3288
  std::vector<no_init<uint8_t>> read_buf;
3289
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
 
 
3290
  if (progress_callback) {
3291
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
3292
  return false;
3293
  }
3294
  }
3295
 
3296
- const auto & w = get_weights(ggml_get_name(cur));
3297
  size_t n_size = ggml_nbytes(cur);
3298
 
3299
  if (use_mmap) {
3300
- const auto & mapping = mappings.at(w.idx);
3301
  ggml_backend_buffer_t buf_mmap = nullptr;
3302
- if (bufs_mmap.count(w.idx)) {
3303
- buf_mmap = bufs_mmap.at(w.idx);
3304
  }
3305
  GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
3306
  if (buf_mmap && cur->data == nullptr) {
3307
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + w.offs);
3308
  if (lmlocks) {
3309
- const auto & lmlock = lmlocks->at(w.idx);
3310
- lmlock->grow_to(w.offs + ggml_nbytes(cur));
3311
  }
3312
 
3313
- auto & mmap_used = mmaps_used[w.idx];
3314
- mmap_used.first = std::min(mmap_used.first, w.offs);
3315
- mmap_used.second = std::max(mmap_used.second, w.offs + n_size);
3316
  } else {
3317
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + w.offs, 0, n_size);
3318
  }
3319
  } else {
3320
- GGML_ASSERT(w.idx < files.size());
3321
- const auto & file = files.at(w.idx);
3322
  if (ggml_backend_buffer_is_host(cur->buffer)) {
3323
- file->seek(w.offs, SEEK_SET);
3324
  file->read_raw(cur->data, ggml_nbytes(cur));
3325
  } else {
3326
  read_buf.resize(ggml_nbytes(cur));
3327
- file->seek(w.offs, SEEK_SET);
3328
  file->read_raw(read_buf.data(), ggml_nbytes(cur));
3329
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3330
  }
@@ -3847,6 +3944,16 @@ static void llm_load_hparams(
3847
  default: model.type = e_model::MODEL_UNKNOWN;
3848
  }
3849
  } break;
 
 
3850
  case LLM_ARCH_COMMAND_R:
3851
  {
3852
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -4243,6 +4350,7 @@ static bool llm_load_tensors(
4243
 
4244
  const int64_t n_layer = hparams.n_layer;
4245
  const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
 
4246
 
4247
  // there is very little benefit to offloading the input layer, so always keep it on the CPU
4248
  model.buft_input = llama_default_buffer_type_cpu(true);
@@ -4331,6 +4439,10 @@ static bool llm_load_tensors(
4331
 
4332
  // create one context per buffer type
4333
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
 
 
4334
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
4335
  for (auto & it : buft_layer_count) {
4336
  struct ggml_init_params params = {
@@ -4357,6 +4469,11 @@ static bool llm_load_tensors(
4357
  const int64_t n_vocab = hparams.n_vocab;
4358
  const int64_t n_vocab_type = hparams.n_vocab_type;
4359
  const int64_t n_ff = hparams.n_ff;
 
 
4360
 
4361
  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
4362
 
@@ -4411,30 +4528,50 @@ static bool llm_load_tensors(
4411
 
4412
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4413
 
4414
- layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
4415
-
4416
- if (layer.ffn_gate_inp == nullptr) {
4417
- GGML_ASSERT(hparams.n_expert == 0);
4418
- GGML_ASSERT(hparams.n_expert_used == 0);
4419
-
4420
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4421
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4422
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4423
  } else {
4424
- GGML_ASSERT(hparams.n_expert > 0);
4425
- GGML_ASSERT(hparams.n_expert_used > 0);
4426
-
4427
- // MoE branch
4428
- for (uint32_t x = 0; x < hparams.n_expert; ++x) {
4429
- layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
4430
- layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
4431
- layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
 
 
 
4432
  }
4433
  }
4434
  }
4435
  } break;
4436
  case LLM_ARCH_GROK:
4437
  {
 
 
4438
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4439
 
4440
  // output
@@ -4466,16 +4603,35 @@ static bool llm_load_tensors(
4466
 
4467
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4468
 
4469
- layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd});
4470
 
4471
- GGML_ASSERT(hparams.n_expert > 0);
4472
- GGML_ASSERT(hparams.n_expert_used > 0);
4473
-
4474
- // MoE branch
4475
- for (uint32_t x = 0; x < hparams.n_expert; ++x) {
4476
- layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
4477
- layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
4478
- layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
 
 
4479
  }
4480
 
4481
  layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -4716,6 +4872,7 @@ static bool llm_load_tensors(
4716
  case LLM_ARCH_MPT:
4717
  {
4718
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
4719
 
4720
  // output
4721
  {
@@ -4754,6 +4911,12 @@ static bool llm_load_tensors(
4754
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4755
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
4756
 
 
 
4757
  // AWQ ScaleActivation layer
4758
  layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
4759
  }
@@ -5200,6 +5363,28 @@ static bool llm_load_tensors(
5200
  layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
5201
  }
5202
  } break;
 
 
5203
  case LLM_ARCH_COMMAND_R:
5204
  {
5205
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5238,7 +5423,7 @@ static bool llm_load_tensors(
5238
 
5239
  ml.done_getting_tensors();
5240
 
5241
- ml.init_mappings(true, &model.mlock_mmaps);
5242
  model.mappings.reserve(ml.mappings.size());
5243
 
5244
  // create the backend buffers
@@ -5259,7 +5444,7 @@ static bool llm_load_tensors(
5259
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
5260
  // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
5261
  // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
5262
- if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
5263
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5264
  void * addr = nullptr;
5265
  size_t first, last;
@@ -5283,7 +5468,7 @@ static bool llm_load_tensors(
5283
  }
5284
  }
5285
  #ifdef GGML_USE_METAL
5286
- else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
5287
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5288
  const size_t max_size = ggml_get_max_tensor_size(ctx);
5289
  void * addr = nullptr;
@@ -5366,8 +5551,10 @@ static bool llm_load_tensors(
5366
  }
5367
  }
5368
 
5369
- for (auto & mapping : ml.mappings) {
5370
- model.mappings.emplace_back(std::move(mapping));
 
 
5371
  }
5372
 
5373
  // loading time will be recalculate after the first eval, so
@@ -5523,8 +5710,8 @@ static void llm_build_kv_store(
5523
  GGML_ASSERT(kv.size == n_ctx);
5524
 
5525
  // compute the transposed [n_tokens, n_embd] V matrix
5526
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
5527
- //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
5528
  cb(v_cur_t, "v_cur_t", il);
5529
 
5530
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
@@ -6235,19 +6422,19 @@ struct llm_build_context {
6235
  for (int i = 0; i < n_expert_used; ++i) {
6236
  ggml_tensor * cur_expert;
6237
 
6238
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
6239
  cb(cur_up, "ffn_moe_up", il);
6240
 
6241
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
6242
  cb(cur_gate, "ffn_moe_gate", il);
6243
 
6244
  cur_gate = ggml_silu(ctx0, cur_gate);
6245
  cb(cur_gate, "ffn_moe_silu", il);
6246
 
6247
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
6248
  cb(cur_expert, "ffn_moe_gate_par", il);
6249
 
6250
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6251
  cb(cur_expert, "ffn_moe_down", il);
6252
 
6253
  cur_expert = ggml_mul(ctx0, cur_expert,
@@ -6411,6 +6598,111 @@ struct llm_build_context {
6411
  return gf;
6412
  }
6413
 
 
 
6414
  struct ggml_cgraph * build_falcon() {
6415
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6416
 
@@ -6664,20 +6956,20 @@ struct llm_build_context {
6664
  for (int i = 0; i < n_expert_used; ++i) {
6665
  ggml_tensor * cur_expert;
6666
 
6667
- ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
6668
  cb(cur_up, "ffn_moe_up", il);
6669
 
6670
- ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
6671
  cb(cur_gate, "ffn_moe_gate", il);
6672
 
6673
  //GeLU
6674
  cur_gate = ggml_gelu(ctx0, cur_gate);
6675
  cb(cur_gate, "ffn_moe_gelu", il);
6676
 
6677
- cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
6678
  cb(cur_expert, "ffn_moe_gate_par", il);
6679
 
6680
- cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6681
  cb(cur_expert, "ffn_moe_down", il);
6682
 
6683
  cur_expert = ggml_mul(ctx0, cur_expert,
@@ -7441,6 +7733,7 @@ struct llm_build_context {
7441
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7442
 
7443
  struct ggml_tensor * cur;
 
7444
  struct ggml_tensor * inpL;
7445
 
7446
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -7451,6 +7744,16 @@ struct llm_build_context {
7451
  // positions of the tokens in the KV cache
7452
  struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
7453
 
 
 
7454
  for (int il = 0; il < n_layer; ++il) {
7455
  struct ggml_tensor * attn_norm;
7456
 
@@ -7485,11 +7788,32 @@ struct llm_build_context {
7485
  cb(Kcur, "Kcur", il);
7486
  cb(Vcur, "Vcur", il);
7487
 
7488
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
 
7489
 
7490
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
 
 
7491
  model.layers[il].wo, model.layers[il].bo,
7492
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 
 
7493
  }
7494
 
7495
  if (il == n_layer - 1) {
@@ -9152,8 +9476,9 @@ struct llm_build_context {
9152
  if (il == n_layer - 1) {
9153
  // skip computing output for unused tokens
9154
  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9155
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9156
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
 
9157
  }
9158
 
9159
  struct ggml_tensor * attn_out = cur;
@@ -9388,6 +9713,10 @@ static struct ggml_cgraph * llama_build_graph(
9388
  {
9389
  result = llm.build_mamba();
9390
  } break;
 
 
9391
  case LLM_ARCH_COMMAND_R:
9392
  {
9393
  result = llm.build_command_r();
@@ -11294,28 +11623,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
11294
  // grammar - internal
11295
  //
11296
 
11297
- struct llama_partial_utf8 {
11298
- uint32_t value; // bit value so far (unshifted)
11299
- int n_remain; // num bytes remaining; -1 indicates invalid sequence
11300
- };
11301
-
11302
- struct llama_grammar {
11303
- const std::vector<std::vector<llama_grammar_element>> rules;
11304
- std::vector<std::vector<const llama_grammar_element *>> stacks;
11305
-
11306
- // buffer for partially generated UTF-8 sequence from accepted tokens
11307
- llama_partial_utf8 partial_utf8;
11308
- };
11309
-
11310
- struct llama_grammar_candidate {
11311
- size_t index;
11312
- const uint32_t * code_points;
11313
- llama_partial_utf8 partial_utf8;
11314
- };
11315
 
11316
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
11317
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
11318
- static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
11319
  const std::string & src,
11320
  llama_partial_utf8 partial_start) {
11321
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -11517,7 +11828,7 @@ static void llama_grammar_advance_stack(
11517
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
11518
  // produces the N possible stacks if the given char is accepted at those
11519
  // positions
11520
- static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
11521
  const std::vector<std::vector<llama_grammar_element>> & rules,
11522
  const std::vector<std::vector<const llama_grammar_element *>> & stacks,
11523
  const uint32_t chr) {
@@ -12743,7 +13054,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
12743
  // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
12744
  // for getting the current layer as I initially thought, and we need to resort to parsing the
12745
  // tensor name.
12746
- n_layer /= n_expert;
12747
  if (sscanf(name, "blk.%d.", &i_layer) != 1) {
12748
  throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
12749
  }
@@ -13105,7 +13415,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13105
  kv_overrides = v->data();
13106
  }
13107
  llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
13108
- ml.init_mappings(false); // no prefetching?
13109
 
13110
  llama_model model;
13111
  llm_load_arch(ml, model);
@@ -13157,20 +13467,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13157
  // TODO: avoid hardcoded tensor names - use the TN_* constants
13158
  if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
13159
  ++qs.n_attention_wv;
13160
- } else if (name.find("ffn_down") != std::string::npos) {
13161
- ++qs.n_ffn_down;
13162
- } else if (name.find("ffn_gate") != std::string::npos) {
13163
- ++qs.n_ffn_gate;
13164
- } else if (name.find("ffn_up") != std::string::npos) {
13165
- ++qs.n_ffn_up;
13166
  } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
13167
  qs.has_output = true;
13168
  }
13169
  }
13170
- if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t) qs.n_attention_wv != model.hparams.n_layer) {
13171
- LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
13172
- __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
13173
- }
 
13174
 
13175
  size_t total_size_org = 0;
13176
  size_t total_size_new = 0;
@@ -13200,6 +13505,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13200
  // placeholder for the meta data
13201
  ::zeros(fout, meta_size);
13202
 
 
 
13203
  for (int i = 0; i < ml.n_tensors; ++i) {
13204
  struct ggml_tensor * tensor = ml.get_tensor_meta(i);
13205
 
@@ -13222,8 +13529,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13222
  // This used to be a regex, but <regex> has an extreme cost to compile times.
13223
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
13224
 
13225
- // quantize only 2D tensors
13226
- quantize &= (ggml_n_dims(tensor) == 2);
13227
  quantize &= params->quantize_output_tensor || name != "output.weight";
13228
  quantize &= !params->only_copy;
13229
 
@@ -13278,11 +13585,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13278
  if (it == imatrix_data->end()) {
13279
  LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
13280
  } else {
13281
- if (it->second.size() == (size_t)tensor->ne[0]) {
13282
  imatrix = it->second.data();
13283
  } else {
13284
  LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
13285
- int(it->second.size()), int(tensor->ne[0]), tensor->name);
 
 
13286
  }
13287
  }
13288
  }
@@ -13319,15 +13635,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
13319
  new_data = work.data();
13320
 
13321
  const int n_per_row = tensor->ne[0];
13322
- const int nrows = nelements / n_per_row;
13323
 
13324
  static const int min_chunk_size = 32 * 512;
13325
  const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
13326
 
13327
- const int nchunk = (nelements + chunk_size - 1)/chunk_size;
 
13328
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
13329
- new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
13330
 
 
 
 
13331
  LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
13332
  }
13333
  total_size_org += ggml_nbytes(tensor);
@@ -13968,7 +14293,20 @@ struct llama_context * llama_new_context_with_model(
13968
  }
13969
  }
13970
  #elif defined(GGML_USE_VULKAN)
13971
- if (model->n_gpu_layers > 0) {
 
 
13972
  for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
13973
  ggml_backend_t backend = ggml_backend_vk_init(device);
13974
  if (backend == nullptr) {
@@ -14187,6 +14525,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
14187
  case LLM_ARCH_ORION:
14188
  case LLM_ARCH_INTERNLM2:
14189
  case LLM_ARCH_MINICPM:
 
14190
  case LLM_ARCH_COMMAND_R:
14191
  return LLAMA_ROPE_TYPE_NORM;
14192
 
@@ -15524,6 +15863,55 @@ static int32_t llama_chat_apply_template_internal(
15524
  ss << message->content << "</s>";
15525
  }
15526
  }
 
 
15527
  } else {
15528
  // template not supported
15529
  return -1;
 
218
  LLM_ARCH_GEMMA,
219
  LLM_ARCH_STARCODER2,
220
  LLM_ARCH_MAMBA,
221
+ LLM_ARCH_XVERSE,
222
  LLM_ARCH_COMMAND_R,
223
  LLM_ARCH_UNKNOWN,
224
  };
 
250
  { LLM_ARCH_GEMMA, "gemma" },
251
  { LLM_ARCH_STARCODER2, "starcoder2" },
252
  { LLM_ARCH_MAMBA, "mamba" },
253
+ { LLM_ARCH_XVERSE, "xverse" },
254
  { LLM_ARCH_COMMAND_R, "command-r" },
255
  { LLM_ARCH_UNKNOWN, "(unknown)" },
256
  };
 
261
  LLM_KV_GENERAL_ALIGNMENT,
262
  LLM_KV_GENERAL_NAME,
263
  LLM_KV_GENERAL_AUTHOR,
264
+ LLM_KV_GENERAL_VERSION,
265
  LLM_KV_GENERAL_URL,
266
  LLM_KV_GENERAL_DESCRIPTION,
267
  LLM_KV_GENERAL_LICENSE,
 
331
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
332
  { LLM_KV_GENERAL_NAME, "general.name" },
333
  { LLM_KV_GENERAL_AUTHOR, "general.author" },
334
+ { LLM_KV_GENERAL_VERSION, "general.version" },
335
  { LLM_KV_GENERAL_URL, "general.url" },
336
  { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
337
  { LLM_KV_GENERAL_LICENSE, "general.license" },
 
428
  LLM_TENSOR_FFN_DOWN,
429
  LLM_TENSOR_FFN_UP,
430
  LLM_TENSOR_FFN_ACT,
431
+ LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
432
  LLM_TENSOR_FFN_GATE_EXP,
433
  LLM_TENSOR_FFN_UP_EXP,
434
+ LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
435
+ LLM_TENSOR_FFN_GATE_EXPS,
436
+ LLM_TENSOR_FFN_UP_EXPS,
437
  LLM_TENSOR_ATTN_Q_NORM,
438
  LLM_TENSOR_ATTN_K_NORM,
439
  LLM_TENSOR_LAYER_OUT_NORM,
 
468
  { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
469
  { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
470
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
471
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
472
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
473
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
474
  },
475
  },
476
  {
 
524
  { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
525
  { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
526
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
527
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
528
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
529
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
530
  { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
531
  { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
532
  },
 
596
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
597
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
598
  { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
599
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
600
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
601
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
602
  },
603
  },
604
  {
 
894
  { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
895
  },
896
  },
897
+ {
898
+ LLM_ARCH_XVERSE,
899
+ {
900
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
901
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
902
+ { LLM_TENSOR_OUTPUT, "output" },
903
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
904
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
905
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
906
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
907
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
908
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
909
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
910
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
911
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
912
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
913
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
914
+ },
915
+ },
916
  {
917
  LLM_ARCH_COMMAND_R,
918
  {
 
1878
 
1879
  // ff MoE
1880
  struct ggml_tensor * ffn_gate_inp;
1881
+ struct ggml_tensor * ffn_gate_exps;
1882
+ struct ggml_tensor * ffn_down_exps;
1883
+ struct ggml_tensor * ffn_up_exps ;
1884
 
1885
  // ff bias
1886
  struct ggml_tensor * ffn_down_b; // b2
 
2135
  ggml_backend_free(backend);
2136
  }
2137
 
 
 
2138
  ggml_backend_buffer_free(buf_output);
2139
  }
2140
 
 
2882
 
2883
  llama_mmaps mappings;
2884
 
2885
+ // Holds information on a model weight
2886
+ struct llama_tensor_weight {
2887
  uint16_t idx; // source file index
2888
  size_t offs; // tensor data offset in the original file
2889
 
2890
  ggml_tensor * tensor;
2891
 
2892
+ llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
2893
  const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
2894
  offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
2895
  }
2896
  };
2897
+ std::vector<llama_tensor_weight> weights;
2898
 
2899
  std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
2900
 
 
2934
  // For subsidiary files, `meta` tensor data offset must not be used,
2935
  // so we build a unified tensors index for weights.
2936
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
2937
+ weights.emplace_back(0, cur->name, meta, cur);
2938
  }
2939
  files.emplace_back(new llama_file(fname.c_str(), "rb"));
2940
  contexts.emplace_back(ctx);
 
2974
 
2975
  // Save tensors data offset info of the shard.
2976
  for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
2977
+ weights.emplace_back(idx, cur->name, ctx_gguf, cur);
2978
  }
2979
  files.emplace_back(new llama_file(split_path, "rb"));
2980
  contexts.emplace_back(ctx);
 
3178
  return weights.at(i).tensor->name;
3179
  }
3180
 
3181
+ const llama_tensor_weight * get_weight(const char * name) const {
3182
  for (const auto & weight : weights) {
3183
  if (strcmp(name, weight.tensor->name) == 0) {
3184
+ return &weight;
3185
  }
3186
  }
3187
+ return nullptr;
3188
+ }
3189
+
3190
+ const llama_tensor_weight & require_weight(const char * name) const {
3191
+ const llama_tensor_weight * weight = get_weight(name);
3192
+ if (!weight) {
3193
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
3194
+ }
3195
+ return *weight;
3196
  }
3197
 
3198
  struct ggml_tensor * get_tensor_meta(const char * name) const {
3199
+ const auto * weight = get_weight(name);
3200
+ if (!weight) {
3201
+ return nullptr;
 
3202
  }
3203
+ return weight->tensor;
3204
+ }
3205
+
3206
+ struct ggml_tensor * require_tensor_meta(const char * name) const {
3207
+ struct ggml_tensor * tensor = get_tensor_meta(name);
3208
+ if (!tensor) {
3209
+ throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
3210
+ }
3211
+ return tensor;
3212
  }
3213
 
3214
  struct ggml_tensor * get_tensor_meta(int i) const {
 
3224
  return tensor;
3225
  }
3226
 
3227
+ const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
3228
  const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
3229
 
3230
  if (cur == NULL) {
 
3236
 
3237
  {
3238
  bool is_ok = true;
3239
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
3240
+ if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
3241
  is_ok = false;
3242
  break;
3243
  }
 
3251
  }
3252
  }
3253
 
3254
+ return cur;
3255
+ }
3256
+
3257
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
3258
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
3259
+
3260
+ if (cur == NULL) {
3261
+ return NULL;
3262
+ }
3263
+
3264
  return create_tensor_for(ctx, cur);
3265
  }
3266
 
3267
+ struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
3268
+ const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
3269
+
3270
+ if (cur == NULL) {
3271
+ return NULL;
3272
+ }
3273
+
3274
+ if (cur->type != base->type) {
3275
+ throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
3276
+ }
3277
+
3278
+ std::array<int64_t, GGML_MAX_DIMS> dims;
3279
+ for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
3280
+ dims[i] = i < ne.size() ? ne[i] : 1;
3281
+ }
3282
+
3283
+ struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
3284
+ dims[0], dims[1], dims[2], dims[3],
3285
+ cur->nb[1], cur->nb[2], cur->nb[3],
3286
+ offset);
3287
+
3288
+ ggml_set_name(tensor, name.c_str());
3289
+
3290
+ n_created++;
3291
+
3292
+ return tensor;
3293
+ }
3294
+
3295
  void done_getting_tensors() const {
3296
  if (n_created != n_tensors) {
3297
  throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
 
3304
  mmaps_used.reserve(files.size());
3305
  for (const auto & file : files) {
3306
  std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
3307
+ mmaps_used.emplace_back(mapping->size, 0);
3308
  if (mlock_mmaps) {
3309
  std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
3310
  mlock_mmap->init(mapping->addr);
 
3328
  *last = 0;
3329
  *addr = mapping->addr;
3330
  for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
3331
+ try {
3332
+ const auto * weight = get_weight(ggml_get_name(tensor));
3333
+ if (!weight) {
3334
+ continue;
3335
+ }
3336
+ if (weight->idx != idx) {
3337
+ continue;
3338
+ }
3339
+ *first = std::min(*first, weight->offs);
3340
+ *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
3341
+ } catch(...) {
3342
+ // the tensor is not in the model
3343
  }
 
 
3344
  }
3345
  }
3346
 
3347
  // for backwards compatibility, does not support ggml-backend
3348
  void load_data_for(struct ggml_tensor * cur) const {
3349
+ const auto & w = require_weight(ggml_get_name(cur));
3350
 
3351
  if (use_mmap) {
3352
  const auto & mapping = mappings.at(w.idx);
 
3379
 
3380
  std::vector<no_init<uint8_t>> read_buf;
3381
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3382
+ const auto * weight = get_weight(ggml_get_name(cur));
3383
+ if (weight == nullptr) {
3384
+ // this can happen with split experts models
3385
+ continue;
3386
+ }
3387
+
3388
  if (progress_callback) {
3389
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
3390
  return false;
3391
  }
3392
  }
3393
 
 
3394
  size_t n_size = ggml_nbytes(cur);
3395
 
3396
  if (use_mmap) {
3397
+ const auto & mapping = mappings.at(weight->idx);
3398
  ggml_backend_buffer_t buf_mmap = nullptr;
3399
+ if (bufs_mmap.count(weight->idx)) {
3400
+ buf_mmap = bufs_mmap.at(weight->idx);
3401
  }
3402
  GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
3403
  if (buf_mmap && cur->data == nullptr) {
3404
+ ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
3405
  if (lmlocks) {
3406
+ const auto & lmlock = lmlocks->at(weight->idx);
3407
+ lmlock->grow_to(weight->offs + ggml_nbytes(cur));
3408
  }
3409
 
3410
+ auto & mmap_used = mmaps_used[weight->idx];
3411
+ mmap_used.first = std::min(mmap_used.first, weight->offs);
3412
+ mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
3413
  } else {
3414
+ ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
3415
  }
3416
  } else {
3417
+ GGML_ASSERT(weight->idx < files.size());
3418
+ const auto & file = files.at(weight->idx);
3419
  if (ggml_backend_buffer_is_host(cur->buffer)) {
3420
+ file->seek(weight->offs, SEEK_SET);
3421
  file->read_raw(cur->data, ggml_nbytes(cur));
3422
  } else {
3423
  read_buf.resize(ggml_nbytes(cur));
3424
+ file->seek(weight->offs, SEEK_SET);
3425
  file->read_raw(read_buf.data(), ggml_nbytes(cur));
3426
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3427
  }
 
3944
  default: model.type = e_model::MODEL_UNKNOWN;
3945
  }
3946
  } break;
3947
+ case LLM_ARCH_XVERSE:
3948
+ {
3949
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3950
+ switch (hparams.n_layer) {
3951
+ case 32: model.type = e_model::MODEL_7B; break;
3952
+ case 40: model.type = e_model::MODEL_13B; break;
3953
+ case 80: model.type = e_model::MODEL_65B; break;
3954
+ default: model.type = e_model::MODEL_UNKNOWN;
3955
+ }
3956
+ } break;
3957
  case LLM_ARCH_COMMAND_R:
3958
  {
3959
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
 
4350
 
4351
  const int64_t n_layer = hparams.n_layer;
4352
  const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
4353
+ bool use_mmap_buffer = true;
4354
 
4355
  // there is very little benefit to offloading the input layer, so always keep it on the CPU
4356
  model.buft_input = llama_default_buffer_type_cpu(true);
 
4439
 
4440
  // create one context per buffer type
4441
  size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
4442
+
4443
+ // for moe merged tensors
4444
+ ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
4445
+
4446
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
4447
  for (auto & it : buft_layer_count) {
4448
  struct ggml_init_params params = {
 
4469
  const int64_t n_vocab = hparams.n_vocab;
4470
  const int64_t n_vocab_type = hparams.n_vocab_type;
4471
  const int64_t n_ff = hparams.n_ff;
4472
+ const int64_t n_expert = hparams.n_expert;
4473
+
4474
+ if (n_expert > 0 && hparams.n_expert_used == 0) {
4475
+ throw std::runtime_error("model has expert layers but no expert layers are used");
4476
+ }
4477
 
4478
  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
4479
 
 
4528
 
4529
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4530
 
4531
+ if (n_expert == 0) {
 
 
4532
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
4533
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4534
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4535
  } else {
4536
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4537
+
4538
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
4539
+ if (layer.ffn_gate_exps) {
4540
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4541
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4542
+ } else {
4543
+ // merge split expert into a single tensor for compatibility with older models
4544
+ // requires disabling mmap
4545
+ use_mmap_buffer = false;
4546
+
4547
+ ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
4548
+ ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
4549
+ ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
4550
+
4551
+ layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
4552
+ layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
4553
+ layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
4554
+
4555
+ ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
4556
+ ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
4557
+ ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
4558
+
4559
+ for (uint32_t x = 0; x < n_expert; ++x) {
4560
+ // the individual experts are loaded into a view of the merged tensor
4561
+ ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
4562
+ ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
4563
+ ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
4564
+ }
4565
  }
4566
  }
4567
  }
4568
  } break;
4569
  case LLM_ARCH_GROK:
4570
  {
4571
+ if (n_expert == 0) {
4572
+ throw std::runtime_error("Grok model cannot have zero experts");
4573
+ }
4574
+
4575
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4576
 
4577
  // output
 
4603
 
4604
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4605
 
4606
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
4607
 
4608
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
4609
+ if (layer.ffn_gate_exps) {
4610
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
4611
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
4612
+ } else {
4613
+ // merge split expert into a single tensor for compatibility with older models
4614
+ // requires disabling mmap
4615
+ use_mmap_buffer = false;
4616
+
4617
+ ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
4618
+ ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
4619
+ ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
4620
+
4621
+ layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
4622
+ layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
4623
+ layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
4624
+
4625
+ ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
4626
+ ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
4627
+ ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
4628
+
4629
+ for (uint32_t x = 0; x < n_expert; ++x) {
4630
+ // the individual experts are loaded into a view of the merged tensor
4631
+ ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
4632
+ ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
4633
+ ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
4634
+ }
4635
  }
4636
 
4637
  layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
 
4872
  case LLM_ARCH_MPT:
4873
  {
4874
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4875
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
4876
 
4877
  // output
4878
  {
 
4911
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4912
  layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
4913
 
4914
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
4915
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
4916
+
4917
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
4918
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
4919
+
4920
  // AWQ ScaleActivation layer
4921
  layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
4922
  }
 
5363
  layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
5364
  }
5365
  } break;
5366
+ case LLM_ARCH_XVERSE:
5367
+ {
5368
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
5369
+ {
5370
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
5371
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
5372
+ }
5373
+ for (int i = 0; i < n_layer; ++i) {
5374
+ ggml_context * ctx_layer = ctx_for_layer(i);
5375
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5376
+ auto & layer = model.layers[i];
5377
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
5378
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5379
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5380
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5381
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
5382
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
5383
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5384
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
5385
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5386
+ }
5387
+ } break;
5388
  case LLM_ARCH_COMMAND_R:
5389
  {
5390
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
5423
 
5424
  ml.done_getting_tensors();
5425
 
5426
+ ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
5427
  model.mappings.reserve(ml.mappings.size());
5428
 
5429
  // create the backend buffers
 
5444
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
5445
  // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
5446
  // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
5447
+ if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
5448
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5449
  void * addr = nullptr;
5450
  size_t first, last;
 
5468
  }
5469
  }
5470
  #ifdef GGML_USE_METAL
5471
+ else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
5472
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
5473
  const size_t max_size = ggml_get_max_tensor_size(ctx);
5474
  void * addr = nullptr;
 
5551
  }
5552
  }
5553
 
5554
+ if (use_mmap_buffer) {
5555
+ for (auto & mapping : ml.mappings) {
5556
+ model.mappings.emplace_back(std::move(mapping));
5557
+ }
5558
  }
5559
 
5560
  // loading time will be recalculate after the first eval, so
 
5710
  GGML_ASSERT(kv.size == n_ctx);
5711
 
5712
  // compute the transposed [n_tokens, n_embd] V matrix
5713
+ assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
5714
+ struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
5715
  cb(v_cur_t, "v_cur_t", il);
5716
 
5717
  struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
 
6422
  for (int i = 0; i < n_expert_used; ++i) {
6423
  ggml_tensor * cur_expert;
6424
 
6425
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6426
  cb(cur_up, "ffn_moe_up", il);
6427
 
6428
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
6429
  cb(cur_gate, "ffn_moe_gate", il);
6430
 
6431
  cur_gate = ggml_silu(ctx0, cur_gate);
6432
  cb(cur_gate, "ffn_moe_silu", il);
6433
 
6434
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
6435
  cb(cur_expert, "ffn_moe_gate_par", il);
6436
 
6437
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6438
  cb(cur_expert, "ffn_moe_down", il);
6439
 
6440
  cur_expert = ggml_mul(ctx0, cur_expert,
 
6598
  return gf;
6599
  }
6600
 
6601
+ struct ggml_cgraph * build_xverse() {
6602
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6603
+
6604
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6605
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6606
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6607
+
6608
+ struct ggml_tensor * cur;
6609
+ struct ggml_tensor * inpL;
6610
+
6611
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
6612
+
6613
+ // inp_pos - contains the positions
6614
+ struct ggml_tensor * inp_pos = build_inp_pos();
6615
+
6616
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6617
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
6618
+
6619
+ // positions of the tokens in the KV cache
6620
+ struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
6621
+
6622
+ for (int il = 0; il < n_layer; ++il) {
6623
+ struct ggml_tensor * inpSA = inpL;
6624
+
6625
+ cur = llm_build_norm(ctx0, inpL, hparams,
6626
+ model.layers[il].attn_norm, NULL,
6627
+ LLM_NORM_RMS, cb, il);
6628
+ cb(cur, "attn_norm", il);
6629
+
6630
+ // self-attention
6631
+ {
6632
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
6633
+ cb(Qcur, "Qcur", il);
6634
+
6635
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
6636
+ cb(Kcur, "Kcur", il);
6637
+
6638
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
6639
+ cb(Vcur, "Vcur", il);
6640
+
6641
+ Qcur = ggml_rope_custom(
6642
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
6643
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6644
+ ext_factor, attn_factor, beta_fast, beta_slow
6645
+ );
6646
+ cb(Qcur, "Qcur", il);
6647
+
6648
+ Kcur = ggml_rope_custom(
6649
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
6650
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
6651
+ ext_factor, attn_factor, beta_fast, beta_slow
6652
+ );
6653
+ cb(Kcur, "Kcur", il);
6654
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6655
+ model.layers[il].wo, NULL,
6656
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6657
+ }
6658
+
6659
+ if (il == n_layer - 1) {
6660
+ // skip computing output for unused tokens
6661
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
6662
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6663
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6664
+ }
6665
+
6666
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6667
+ cb(ffn_inp, "ffn_inp", il);
6668
+
6669
+ // feed-forward network
6670
+ {
6671
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
6672
+ model.layers[il].ffn_norm, NULL,
6673
+ LLM_NORM_RMS, cb, il);
6674
+ cb(cur, "ffn_norm", il);
6675
+
6676
+ cur = llm_build_ffn(ctx0, cur,
6677
+ model.layers[il].ffn_up, NULL,
6678
+ model.layers[il].ffn_gate, NULL,
6679
+ model.layers[il].ffn_down, NULL,
6680
+ NULL,
6681
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
6682
+ cb(cur, "ffn_out", il);
6683
+ }
6684
+
6685
+ cur = ggml_add(ctx0, cur, ffn_inp);
6686
+ cb(cur, "l_out", il);
6687
+
6688
+ // input for next layer
6689
+ inpL = cur;
6690
+ }
6691
+
6692
+ cur = inpL;
6693
+
6694
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
6695
+ cb(cur, "result_norm", -1);
6696
+
6697
+ // lm_head
6698
+ cur = ggml_mul_mat(ctx0, model.output, cur);
6699
+ cb(cur, "result_output", -1);
6700
+
6701
+ ggml_build_forward_expand(gf, cur);
6702
+
6703
+ return gf;
6704
+ }
6705
+
6706
  struct ggml_cgraph * build_falcon() {
6707
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6708
 
 
6956
  for (int i = 0; i < n_expert_used; ++i) {
6957
  ggml_tensor * cur_expert;
6958
 
6959
+ ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
6960
  cb(cur_up, "ffn_moe_up", il);
6961
 
6962
+ ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
6963
  cb(cur_gate, "ffn_moe_gate", il);
6964
 
6965
  //GeLU
6966
  cur_gate = ggml_gelu(ctx0, cur_gate);
6967
  cb(cur_gate, "ffn_moe_gelu", il);
6968
 
6969
+ cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
6970
  cb(cur_expert, "ffn_moe_gate_par", il);
6971
 
6972
+ cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
6973
  cb(cur_expert, "ffn_moe_down", il);
6974
 
6975
  cur_expert = ggml_mul(ctx0, cur_expert,
 
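Note: with the stacked ffn_*_exps tensors, each ggml_mul_mat_id call above selects one expert's matrix per token, and the loop accumulates a router-weighted sum over the n_expert_used selected experts. A toy sketch of that mix on plain vectors, where the Proj callable is a hypothetical stand-in for a ggml_mul_mat_id call:

    #include <cmath>
    #include <functional>
    #include <vector>

    using Vec  = std::vector<float>;
    using Proj = std::function<Vec(const Vec &)>;   // stand-in for one ggml_mul_mat_id call

    struct Expert { Proj up, gate, down; };

    // y = sum_i w_i * down_i( gelu(gate_i(x)) * up_i(x) )
    Vec moe_mix(const Vec & x, const std::vector<Expert> & experts,
                const std::vector<int> & selected, const Vec & weights) {
        Vec out;
        for (size_t i = 0; i < selected.size(); ++i) {
            const Expert & e = experts[selected[i]];
            Vec up   = e.up(x);
            Vec gate = e.gate(x);
            for (size_t j = 0; j < gate.size(); ++j) {
                const float g = 0.5f * gate[j] * (1.0f + std::erf(gate[j] / std::sqrt(2.0f)));  // GELU
                up[j] *= g;                                  // parallel gating: gelu(gate) * up
            }
            Vec y = e.down(up);
            if (out.empty()) out.assign(y.size(), 0.0f);
            for (size_t j = 0; j < y.size(); ++j) out[j] += weights[i] * y[j];  // router weight
        }
        return out;
    }
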
7733
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7734
 
7735
  struct ggml_tensor * cur;
7736
+ struct ggml_tensor * pos;
7737
  struct ggml_tensor * inpL;
7738
 
7739
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
7744
  // positions of the tokens in the KV cache
7745
  struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
7746
 
7747
+ if (model.pos_embd) {
7748
+ // inp_pos - contains the positions
7749
+ struct ggml_tensor * inp_pos = build_inp_pos();
7750
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7751
+ cb(pos, "pos_embd", -1);
7752
+
7753
+ inpL = ggml_add(ctx0, inpL, pos);
7754
+ cb(inpL, "inpL", -1);
7755
+ }
7756
+
7757
  for (int il = 0; il < n_layer; ++il) {
7758
  struct ggml_tensor * attn_norm;
7759
 
 
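Note: the pos_embd branch above only runs for models that ship a learned absolute position table; it gathers one table row per token position and adds it to the token embeddings. A toy equivalent of the ggml_get_rows + ggml_add pair:

    #include <vector>

    // toy equivalent of: pos = ggml_get_rows(pos_embd, inp_pos); inpL = ggml_add(inpL, pos);
    void add_pos_embd(std::vector<float> & inp,            // [n_tokens * n_embd] token embeddings
                      const std::vector<float> & pos_embd, // [n_ctx_train * n_embd] learned table
                      const std::vector<int> & pos,        // [n_tokens] absolute positions
                      int n_embd) {
        for (size_t t = 0; t < pos.size(); ++t) {
            for (int j = 0; j < n_embd; ++j) {
                inp[t * n_embd + j] += pos_embd[(size_t) pos[t] * n_embd + j];
            }
        }
    }
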
7788
  cb(Kcur, "Kcur", il);
7789
  cb(Vcur, "Vcur", il);
7790
 
7791
+ // Q/K Layernorm
7792
+ if (model.layers[il].attn_q_norm) {
7793
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
7794
+ model.layers[il].attn_q_norm,
7795
+ model.layers[il].attn_q_norm_b,
7796
+ LLM_NORM, cb, il);
7797
+ cb(Qcur, "Qcur", il);
7798
 
7799
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
7800
+ model.layers[il].attn_k_norm,
7801
+ model.layers[il].attn_k_norm_b,
7802
+ LLM_NORM, cb, il);
7803
+ cb(Kcur, "Kcur", il);
7804
+
7805
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7806
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7807
+
7808
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7809
  model.layers[il].wo, model.layers[il].bo,
7810
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7811
+ } else {
7812
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7813
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7814
+ model.layers[il].wo, model.layers[il].bo,
7815
+ Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7816
+ }
7817
  }
7818
 
7819
  if (il == n_layer - 1) {
 
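Note: when attn_q_norm/attn_k_norm are present, Q and K are run through a full LayerNorm (LLM_NORM, with weight and bias) before being reshaped and handed to llm_build_kv. A minimal per-row LayerNorm sketch for reference; the eps value here is an arbitrary assumption, not taken from the model hparams:

    #include <cmath>
    #include <vector>

    // per-row LayerNorm with learned weight and bias
    std::vector<float> layer_norm(const std::vector<float> & x,
                                  const std::vector<float> & w,
                                  const std::vector<float> & b,
                                  float eps = 1e-5f) {
        float mean = 0.0f;
        for (float v : x) mean += v;
        mean /= x.size();
        float var = 0.0f;
        for (float v : x) var += (v - mean) * (v - mean);
        var /= x.size();
        const float inv = 1.0f / std::sqrt(var + eps);
        std::vector<float> y(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            y[i] = (x[i] - mean) * inv * w[i] + b[i];
        }
        return y;
    }
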
9476
  if (il == n_layer - 1) {
9477
  // skip computing output for unused tokens
9478
  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
9479
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9480
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9481
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
9482
  }
9483
 
9484
  struct ggml_tensor * attn_out = cur;
 
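Note: as in the other graphs in this sync, the final layer now gathers only the rows named in inp_out_ids, so logits are not computed for tokens whose output nobody requested. A toy version of that row gather:

    #include <vector>

    // toy equivalent of ggml_get_rows(cur, inp_out_ids): keep only the requested rows
    std::vector<float> get_rows(const std::vector<float> & src, int n_cols,
                                const std::vector<int> & ids) {
        std::vector<float> dst(ids.size() * (size_t) n_cols);
        for (size_t r = 0; r < ids.size(); ++r) {
            for (int c = 0; c < n_cols; ++c) {
                dst[r * n_cols + c] = src[(size_t) ids[r] * n_cols + c];
            }
        }
        return dst;
    }
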
9713
  {
9714
  result = llm.build_mamba();
9715
  } break;
9716
+ case LLM_ARCH_XVERSE:
9717
+ {
9718
+ result = llm.build_xverse();
9719
+ } break;
9720
  case LLM_ARCH_COMMAND_R:
9721
  {
9722
  result = llm.build_command_r();
 
11623
  // grammar - internal
11624
  //
11625
 
11626
 
11627
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
11628
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
11629
+ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
11630
  const std::string & src,
11631
  llama_partial_utf8 partial_start) {
11632
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
 
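Note: the lookup table above maps the high nibble of a leading byte to the total length of its UTF-8 sequence, with 0 marking a continuation byte that cannot start a sequence. An equivalent standalone helper, for illustration only:

    #include <cstdint>

    // total length of the UTF-8 sequence starting with first_byte (0 = stray continuation byte)
    int utf8_len(uint8_t first_byte) {
        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
        return lookup[first_byte >> 4];
    }
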
11828
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
11829
  // produces the N possible stacks if the given char is accepted at those
11830
  // positions
11831
+ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
11832
  const std::vector<std::vector<llama_grammar_element>> & rules,
11833
  const std::vector<std::vector<const llama_grammar_element *>> & stacks,
11834
  const uint32_t chr) {
 
13054
  // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
13055
  // for getting the current layer as I initially thought, and we need to resort to parsing the
13056
  // tensor name.
 
13057
  if (sscanf(name, "blk.%d.", &i_layer) != 1) {
13058
  throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
13059
  }
 
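Note: because expert tensors are interleaved across layers, the quantizer derives the layer index from the tensor name rather than from a running counter. A standalone version of that parse:

    #include <cstdio>

    // extract the layer index from a tensor name such as "blk.17.ffn_down.weight"
    int layer_from_name(const char * name) {
        int i_layer = -1;
        if (std::sscanf(name, "blk.%d.", &i_layer) != 1) {
            return -1;   // not a per-layer tensor (e.g. "output.weight")
        }
        return i_layer;  // 17 for the example above
    }
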
13415
  kv_overrides = v->data();
13416
  }
13417
  llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
13418
+ ml.init_mappings(false); // no prefetching
13419
 
13420
  llama_model model;
13421
  llm_load_arch(ml, model);
 
13467
  // TODO: avoid hardcoded tensor names - use the TN_* constants
13468
  if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
13469
  ++qs.n_attention_wv;
13470
  } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
13471
  qs.has_output = true;
13472
  }
13473
  }
13474
+
13475
+ qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
13476
+
13477
+ // sanity checks
13478
+ GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
13479
 
13480
  size_t total_size_org = 0;
13481
  size_t total_size_new = 0;
 
13505
  // placeholder for the meta data
13506
  ::zeros(fout, meta_size);
13507
 
13508
+ const auto tn = LLM_TN(model.arch);
13509
+
13510
  for (int i = 0; i < ml.n_tensors; ++i) {
13511
  struct ggml_tensor * tensor = ml.get_tensor_meta(i);
13512
 
 
13529
  // This used to be a regex, but <regex> has an extreme cost to compile times.
13530
  bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
13531
 
13532
+ // quantize only 2D and 3D tensors (experts)
13533
+ quantize &= (ggml_n_dims(tensor) >= 2);
13534
  quantize &= params->quantize_output_tensor || name != "output.weight";
13535
  quantize &= !params->only_copy;
13536
 
 
13585
  if (it == imatrix_data->end()) {
13586
  LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
13587
  } else {
13588
+ if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
13589
  imatrix = it->second.data();
13590
  } else {
13591
  LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
13592
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
13593
+
13594
+ // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
13595
+ // this is a significant error and it may be a good idea to abort the process if this happens,
13596
+ // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
13597
+ // tok_embd should be ignored in this case, since it always causes this warning
13598
+ if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
13599
+ throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
13600
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
13601
+ }
13602
  }
13603
  }
13604
  }
 
13635
  new_data = work.data();
13636
 
13637
  const int n_per_row = tensor->ne[0];
13638
+ const int nrows = tensor->ne[1];
13639
 
13640
  static const int min_chunk_size = 32 * 512;
13641
  const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
13642
 
13643
+ const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
13644
+ const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
13645
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
 
13646
 
13647
+ // quantize each expert separately since they have different importance matrices
13648
+ new_size = 0;
13649
+ for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
13650
+ const float * f32_data_03 = f32_data + i03 * nelements_matrix;
13651
+ void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
13652
+ const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
13653
+
13654
+ new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
13655
+ }
13656
  LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
13657
  }
13658
  total_size_org += ggml_nbytes(tensor);
 
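Note: each expert slice of a 3D [n_per_row, nrows, n_expert] tensor is quantized on its own because each expert has its own slice of the importance matrix. The offsets used by the loop above, spelled out as a sketch (the output offset is expressed in rows here; the real code multiplies by ggml_row_size(new_type, n_per_row) to get bytes):

    #include <cstddef>

    // offsets for expert e of a 3D tensor with ne = [n_per_row, nrows, n_expert]
    struct ExpertSlice {
        size_t f32_offset;      // into the dequantized float data, in floats
        size_t imatrix_offset;  // into the importance matrix, in floats
        size_t out_row_offset;  // number of quantized rows already written
    };

    ExpertSlice expert_slice(int e, int n_per_row, int nrows) {
        return {
            (size_t) e * n_per_row * nrows,  // f32_data_03  = f32_data + e * nelements_matrix
            (size_t) e * n_per_row,          // imatrix_03   = imatrix  + e * n_per_row
            (size_t) e * nrows,              // new_data_03 starts e * nrows rows into the output
        };
    }
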
14293
  }
14294
  }
14295
  #elif defined(GGML_USE_VULKAN)
14296
+ if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
14297
+ LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
14298
+ llama_free(ctx);
14299
+ return nullptr;
14300
+ }
14301
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
14302
+ ggml_backend_t backend = ggml_backend_vk_init(0);
14303
+ if (backend == nullptr) {
14304
+ LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
14305
+ llama_free(ctx);
14306
+ return nullptr;
14307
+ }
14308
+ ctx->backends.push_back(backend);
14309
+ } else {
14310
  for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
14311
  ggml_backend_t backend = ggml_backend_vk_init(device);
14312
  if (backend == nullptr) {
 
14525
  case LLM_ARCH_ORION:
14526
  case LLM_ARCH_INTERNLM2:
14527
  case LLM_ARCH_MINICPM:
14528
+ case LLM_ARCH_XVERSE:
14529
  case LLM_ARCH_COMMAND_R:
14530
  return LLAMA_ROPE_TYPE_NORM;
14531
 
 
15863
  ss << message->content << "</s>";
15864
  }
15865
  }
15866
+ } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
15867
+ // openchat/openchat-3.5-0106,
15868
+ for (auto message : chat) {
15869
+ std::string role(message->role);
15870
+ if (role == "system") {
15871
+ ss << message->content << "<|end_of_turn|>";
15872
+ } else {
15873
+ role[0] = toupper(role[0]);
15874
+ ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
15875
+ }
15876
+ }
15877
+ if (add_ass) {
15878
+ ss << "GPT4 Correct Assistant:";
15879
+ }
15880
+ } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
15881
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
15882
+ for (auto message : chat) {
15883
+ std::string role(message->role);
15884
+ if (role == "system") {
15885
+ // Orca-Vicuna variant uses a system prefix
15886
+ if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
15887
+ ss << "SYSTEM: " << message->content << "\n";
15888
+ } else {
15889
+ ss << message->content << "\n\n";
15890
+ }
15891
+ } else if (role == "user") {
15892
+ ss << "USER: " << message->content << "\n";
15893
+ } else if (role == "assistant") {
15894
+ ss << "ASSISTANT: " << message->content << "</s>\n";
15895
+ }
15896
+ }
15897
+ if (add_ass) {
15898
+ ss << "ASSISTANT:";
15899
+ }
15900
+ } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
15901
+ // deepseek-ai/deepseek-coder-33b-instruct
15902
+ for (auto message : chat) {
15903
+ std::string role(message->role);
15904
+ if (role == "system") {
15905
+ ss << message->content;
15906
+ } else if (role == "user") {
15907
+ ss << "### Instruction:\n" << message->content << "\n";
15908
+ } else if (role == "assistant") {
15909
+ ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
15910
+ }
15911
+ }
15912
+ if (add_ass) {
15913
+ ss << "### Response:\n";
15914
+ }
15915
  } else {
15916
  // template not supported
15917
  return -1;
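
Note: this hunk adds three more chat templates (openchat, vicuna / vicuna-orca, deepseek-coder), detected either by template name or by characteristic markers in the template string. A standalone illustration of what the openchat branch produces for a one-message chat, using a toy struct rather than the llama API:

    #include <cctype>
    #include <iostream>
    #include <string>
    #include <vector>

    struct Msg { std::string role, content; };

    std::string format_openchat(const std::vector<Msg> & chat, bool add_ass) {
        std::string out;
        for (const auto & m : chat) {
            if (m.role == "system") {
                out += m.content + "<|end_of_turn|>";
            } else {
                std::string role = m.role;
                role[0] = (char) std::toupper((unsigned char) role[0]);
                out += "GPT4 Correct " + role + ": " + m.content + "<|end_of_turn|>";
            }
        }
        if (add_ass) {
            out += "GPT4 Correct Assistant:";
        }
        return out;
    }

    int main() {
        // prints: GPT4 Correct User: Hi<|end_of_turn|>GPT4 Correct Assistant:
        std::cout << format_openchat({{"user", "Hi"}}, true) << "\n";
    }

Passing add_ass = true appends the assistant prefix so that generation continues as the assistant turn, matching the other template branches.
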
examples/talk-llama/llama.h CHANGED
@@ -60,9 +60,9 @@ extern "C" {
60
 
61
  enum llama_vocab_type {
62
  LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
63
- LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece
64
- LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding
65
- LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
66
  };
67
 
68
  // note: these values should be synchronized with ggml_rope
@@ -1007,10 +1007,38 @@ extern "C" {
1007
 
1008
  struct ggml_tensor;
1009
 
1010
  const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
1011
  struct llama_context * ctx
1012
  );
1013
 
1014
  #endif // LLAMA_API_INTERNAL
1015
 
1016
  #endif // LLAMA_H
 
60
 
61
  enum llama_vocab_type {
62
  LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
63
+ LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
64
+ LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
65
+ LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
66
  };
67
 
68
  // note: these values should be synchronized with ggml_rope
 
1007
 
1008
  struct ggml_tensor;
1009
 
1010
+ struct llama_partial_utf8 {
1011
+ uint32_t value; // bit value so far (unshifted)
1012
+ int n_remain; // num bytes remaining; -1 indicates invalid sequence
1013
+ };
1014
+
1015
+ struct llama_grammar {
1016
+ const std::vector<std::vector<llama_grammar_element>> rules;
1017
+ std::vector<std::vector<const llama_grammar_element *>> stacks;
1018
+
1019
+ // buffer for partially generated UTF-8 sequence from accepted tokens
1020
+ llama_partial_utf8 partial_utf8;
1021
+ };
1022
+
1023
+ struct llama_grammar_candidate {
1024
+ size_t index;
1025
+ const uint32_t * code_points;
1026
+ llama_partial_utf8 partial_utf8;
1027
+ };
1028
+
1029
  const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
1030
  struct llama_context * ctx
1031
  );
1032
 
1033
+ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
1034
+ const std::vector<std::vector<llama_grammar_element>> & rules,
1035
+ const std::vector<std::vector<const llama_grammar_element *>> & stacks,
1036
+ const uint32_t chr);
1037
+
1038
+ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
1039
+ const std::string & src,
1040
+ llama_partial_utf8 partial_start);
1041
+
1042
  #endif // LLAMA_API_INTERNAL
1043
 
1044
  #endif // LLAMA_H
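
Note: the declarations above are only visible when LLAMA_API_INTERNAL is defined before including llama.h; they are intended for tests and internal tooling. A hedged usage sketch of decode_utf8 under that assumption (the build setup itself is not part of this diff):

    #define LLAMA_API_INTERNAL
    #include "llama.h"

    #include <cstdio>

    int main() {
        llama_partial_utf8 start = { 0, 0 };
        // "h\xC3\xA9llo" is 6 bytes / 5 code points; the second element carries any trailing partial state
        auto decoded = decode_utf8("h\xC3\xA9llo", start);
        std::printf("code points: %zu, bytes still pending: %d\n",
                    decoded.first.size() - 1,   // the vector is 0-terminated, per the comment in llama.cpp
                    decoded.second.n_remain);
        return 0;
    }
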