talk-llama : sync llama.cpp

Files changed:
- examples/talk-llama/llama.cpp (+517 -129)
- examples/talk-llama/llama.h (+31 -3)

examples/talk-llama/llama.cpp (CHANGED)
Hunks in examples/talk-llama/llama.cpp. Removed lines are prefixed with "-"; a trailing "…" marks a fragment that was cut off in the page capture. Unchanged context is omitted here; the post-commit text of every hunk appears in the line-by-line listing that follows.

@@ -218,6 +218,7 @@ enum llm_arch {
@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -259,6 +261,7 @@ enum llm_kv {
@@ -328,6 +331,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -424,9 +428,12 @@ enum llm_tensor {
-    LLM_TENSOR_FFN_DOWN_EXP,
@@ -461,6 +468,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -514,6 +524,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -583,6 +596,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -878,6 +894,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1843,9 +1878,9 @@ struct llama_layer {
-    struct ggml_tensor * …
-    struct ggml_tensor * …
-    struct ggml_tensor * …
@@ -2100,10 +2135,6 @@ struct llama_context {
-#ifdef GGML_USE_VULKAN
-    ggml_vk_free_cpu_assist();
-#endif
-
@@ -2851,19 +2882,19 @@ struct llama_model_loader {
-    // Holds information on a model …
-    struct …
-    …
-    std::vector< …
@@ -2903,7 +2934,7 @@ struct llama_model_loader {
-    weights.emplace_back( …
@@ -2943,7 +2974,7 @@ struct llama_model_loader {
-    weights.emplace_back( …
@@ -3147,21 +3178,37 @@ struct llama_model_loader {
-    const …
-    return weight;
-    …
-    return NULL;
@@ -3177,7 +3224,7 @@ struct llama_model_loader {
-    struct ggml_tensor * …
@@ -3189,8 +3236,8 @@ struct llama_model_loader {
-    for (size_t i = 0; i < …
-    if (ne[i] != cur->ne[i]) {
@@ -3204,9 +3251,47 @@ struct llama_model_loader {
@@ -3219,7 +3304,7 @@ struct llama_model_loader {
-    mmaps_used.emplace_back( …
@@ -3243,18 +3328,25 @@ struct llama_model_loader {
-    …
-    *first = std::min(*first, w.offs);
-    *last = std::max(*last, w.offs + ggml_nbytes(tensor));
-    const auto & w = …
@@ -3287,44 +3379,49 @@ struct llama_model_loader {
-    const auto & w = get_weights(ggml_get_name(cur));
-    const auto & mapping = mappings.at( …
-    if (bufs_mmap.count( …
-    buf_mmap = bufs_mmap.at( …
-    ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + …
-    const auto & lmlock = lmlocks->at( …
-    lmlock->grow_to( …
-    auto & mmap_used = mmaps_used[ …
-    mmap_used.first = std::min(mmap_used.first, …
-    mmap_used.second = std::max(mmap_used.second, …
-    ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + …
-    GGML_ASSERT( …
-    const auto & file = files.at( …
-    file->seek( …
-    file->seek( …
@@ -3847,6 +3944,16 @@ static void llm_load_hparams(
@@ -4243,6 +4350,7 @@ static bool llm_load_tensors(
@@ -4331,6 +4439,10 @@ static bool llm_load_tensors(
@@ -4357,6 +4469,11 @@ static bool llm_load_tensors(
@@ -4411,30 +4528,50 @@ static bool llm_load_tensors(
-    …
-    if (layer.ffn_gate_inp == nullptr) {
-    GGML_ASSERT(hparams.n_expert == 0);
-    GGML_ASSERT(hparams.n_expert_used == 0);
-    …
-    layer. …
-    …
@@ -4466,16 +4603,35 @@ static bool llm_load_tensors(
-    layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd});
-    …
@@ -4716,6 +4872,7 @@ static bool llm_load_tensors(
@@ -4754,6 +4911,12 @@ static bool llm_load_tensors(
@@ -5200,6 +5363,28 @@ static bool llm_load_tensors(
@@ -5238,7 +5423,7 @@ static bool llm_load_tensors(
-    ml.init_mappings(true, &model.mlock_mmaps);
@@ -5259,7 +5444,7 @@ static bool llm_load_tensors(
-    if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
@@ -5283,7 +5468,7 @@ static bool llm_load_tensors(
-    else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
@@ -5366,8 +5551,10 @@ static bool llm_load_tensors(
-    …
@@ -5523,8 +5710,8 @@ static void llm_build_kv_store(
-    …
@@ -6235,19 +6422,19 @@ struct llm_build_context {
-    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il]. …
-    ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il]. …
-    cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
-    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il]. …
@@ -6411,6 +6598,111 @@ struct llm_build_context {
@@ -6664,20 +6956,20 @@ struct llm_build_context {
-    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il]. …
-    ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il]. …
-    cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
-    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il]. …
@@ -7441,6 +7733,7 @@ struct llm_build_context {
@@ -7451,6 +7744,16 @@ struct llm_build_context {
@@ -7485,11 +7788,32 @@ struct llm_build_context {
-    …
-    Kcur, Vcur, Qcur, KQ_mask, …
@@ -9152,8 +9476,9 @@ struct llm_build_context {
-    cur …
-    inpL …
@@ -9388,6 +9713,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -11294,28 +11623,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
-struct llama_partial_utf8 {
-    uint32_t value;    // bit value so far (unshifted)
-    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
-};
-
-struct llama_grammar {
-    const std::vector<std::vector<llama_grammar_element>> rules;
-    std::vector<std::vector<const llama_grammar_element *>> stacks;
-
-    // buffer for partially generated UTF-8 sequence from accepted tokens
-    llama_partial_utf8 partial_utf8;
-};
-
-struct llama_grammar_candidate {
-    size_t index;
-    const uint32_t * code_points;
-    llama_partial_utf8 partial_utf8;
-};
-
-…
@@ -11517,7 +11828,7 @@ static void llama_grammar_advance_stack(
-…
@@ -12743,7 +13054,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
-    n_layer /= n_expert;
@@ -13105,7 +13415,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
-    ml.init_mappings(false); // no prefetching
@@ -13157,20 +13467,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
-    } else if (name.find("ffn_down") != std::string::npos) {
-        ++qs.n_ffn_down;
-    } else if (name.find("ffn_gate") != std::string::npos) {
-        ++qs.n_ffn_gate;
-    } else if (name.find("ffn_up") != std::string::npos) {
-        ++qs.n_ffn_up;
-    …
@@ -13200,6 +13505,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
@@ -13222,8 +13529,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
-    // quantize only 2D tensors
-    quantize &= (ggml_n_dims(tensor) …
@@ -13278,11 +13585,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
-    if (it->second.size() == (size_t)tensor->ne[0]) {
-        int(it->second.size()), int(tensor->ne[0]), tensor->name);
@@ -13319,15 +13635,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
-    const int nrows = …
-    const int …
-    new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use);
@@ -13968,7 +14293,20 @@ struct llama_context * llama_new_context_with_model(
-    if (model-> …
@@ -14187,6 +14525,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -15524,6 +15863,55 @@ static int32_t llama_chat_apply_template_internal(

New side of the diff (file content after the commit; added lines are marked with "+"):
| 218 |
LLM_ARCH_GEMMA,
|
| 219 |
LLM_ARCH_STARCODER2,
|
| 220 |
LLM_ARCH_MAMBA,
|
| 221 |
+
LLM_ARCH_XVERSE,
|
| 222 |
LLM_ARCH_COMMAND_R,
|
| 223 |
LLM_ARCH_UNKNOWN,
|
| 224 |
};
|
|
|
|
| 250 |
{ LLM_ARCH_GEMMA, "gemma" },
|
| 251 |
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
| 252 |
{ LLM_ARCH_MAMBA, "mamba" },
|
| 253 |
+
{ LLM_ARCH_XVERSE, "xverse" },
|
| 254 |
{ LLM_ARCH_COMMAND_R, "command-r" },
|
| 255 |
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
| 256 |
};
|
|
|
|
| 261 |
LLM_KV_GENERAL_ALIGNMENT,
|
| 262 |
LLM_KV_GENERAL_NAME,
|
| 263 |
LLM_KV_GENERAL_AUTHOR,
|
| 264 |
+
LLM_KV_GENERAL_VERSION,
|
| 265 |
LLM_KV_GENERAL_URL,
|
| 266 |
LLM_KV_GENERAL_DESCRIPTION,
|
| 267 |
LLM_KV_GENERAL_LICENSE,
|
|
|
|
| 331 |
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
| 332 |
{ LLM_KV_GENERAL_NAME, "general.name" },
|
| 333 |
{ LLM_KV_GENERAL_AUTHOR, "general.author" },
|
| 334 |
+
{ LLM_KV_GENERAL_VERSION, "general.version" },
|
| 335 |
{ LLM_KV_GENERAL_URL, "general.url" },
|
| 336 |
{ LLM_KV_GENERAL_DESCRIPTION, "general.description" },
|
| 337 |
{ LLM_KV_GENERAL_LICENSE, "general.license" },
|
|
|
|
| 428 |
LLM_TENSOR_FFN_DOWN,
|
| 429 |
LLM_TENSOR_FFN_UP,
|
| 430 |
LLM_TENSOR_FFN_ACT,
|
| 431 |
+
LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
|
| 432 |
LLM_TENSOR_FFN_GATE_EXP,
|
| 433 |
LLM_TENSOR_FFN_UP_EXP,
|
| 434 |
+
LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
|
| 435 |
+
LLM_TENSOR_FFN_GATE_EXPS,
|
| 436 |
+
LLM_TENSOR_FFN_UP_EXPS,
|
| 437 |
LLM_TENSOR_ATTN_Q_NORM,
|
| 438 |
LLM_TENSOR_ATTN_K_NORM,
|
| 439 |
LLM_TENSOR_LAYER_OUT_NORM,
|
|
|
|
| 468 |
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
| 469 |
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
| 470 |
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
| 471 |
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
| 472 |
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
| 473 |
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
| 474 |
},
|
| 475 |
},
|
| 476 |
{
|
|
|
|
| 524 |
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
| 525 |
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
| 526 |
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
| 527 |
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
| 528 |
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
| 529 |
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
| 530 |
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
| 531 |
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
| 532 |
},
|
|
|
|
| 596 |
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 597 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 598 |
{ LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
|
| 599 |
+
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
| 600 |
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
|
| 601 |
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
|
| 602 |
},
|
| 603 |
},
|
| 604 |
{
|
|
|
|
| 894 |
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
|
| 895 |
},
|
| 896 |
},
|
| 897 |
+
{
|
| 898 |
+
LLM_ARCH_XVERSE,
|
| 899 |
+
{
|
| 900 |
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 901 |
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
| 902 |
+
{ LLM_TENSOR_OUTPUT, "output" },
|
| 903 |
+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
| 904 |
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 905 |
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 906 |
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
| 907 |
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
| 908 |
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
| 909 |
+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
| 910 |
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
| 911 |
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
| 912 |
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 913 |
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 914 |
+
},
|
| 915 |
+
},
|
| 916 |
{
|
| 917 |
LLM_ARCH_COMMAND_R,
|
| 918 |
{
|
|
|
|
| 1878 |
|
| 1879 |
// ff MoE
|
| 1880 |
struct ggml_tensor * ffn_gate_inp;
|
| 1881 |
+
struct ggml_tensor * ffn_gate_exps;
|
| 1882 |
+
struct ggml_tensor * ffn_down_exps;
|
| 1883 |
+
struct ggml_tensor * ffn_up_exps ;
|
| 1884 |
|
| 1885 |
// ff bias
|
| 1886 |
struct ggml_tensor * ffn_down_b; // b2
|
|
|
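Note on the ffn_*_exps members added above: the per-layer expert weights are now held as single 3-D tensors, e.g. ffn_up_exps has shape {n_embd, n_ff, n_expert}, so all experts of a layer share one allocation and expert e begins at a fixed stride from the base. A minimal standalone sketch of that layout (toy sizes, plain C++, not code from this commit):

    #include <cstddef>
    #include <vector>

    int main() {
        const size_t n_embd = 8, n_ff = 16, n_expert = 4;      // illustrative sizes
        std::vector<float> up_exps(n_embd * n_ff * n_expert);  // one buffer for all experts

        const size_t expert_stride = n_embd * n_ff;            // elements per expert (nb[2] in elements)
        for (size_t e = 0; e < n_expert; ++e) {
            float * expert_e = up_exps.data() + e * expert_stride; // the {n_embd, n_ff} block of expert e
            expert_e[0] = (float) e;                               // touch the slice to show the addressing
        }
        return 0;
    }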
|
| 2135 |
ggml_backend_free(backend);
|
| 2136 |
}
|
| 2137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2138 |
ggml_backend_buffer_free(buf_output);
|
| 2139 |
}
|
| 2140 |
|
|
|
|
| 2882 |
|
| 2883 |
llama_mmaps mappings;
|
| 2884 |
|
| 2885 |
+
// Holds information on a model weight
|
| 2886 |
+
struct llama_tensor_weight {
|
| 2887 |
uint16_t idx; // source file index
|
| 2888 |
size_t offs; // tensor data offset in the original file
|
| 2889 |
|
| 2890 |
ggml_tensor * tensor;
|
| 2891 |
|
| 2892 |
+
llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
|
| 2893 |
const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
|
| 2894 |
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
|
| 2895 |
}
|
| 2896 |
};
|
| 2897 |
+
std::vector<llama_tensor_weight> weights;
|
| 2898 |
|
| 2899 |
std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
|
| 2900 |
|
|
|
|
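The llama_tensor_weight records above give the loader, for every tensor, the index of the GGUF file it comes from and the absolute byte offset of its data (the gguf data offset plus the per-tensor offset). A self-contained sketch of the same bookkeeping, with hypothetical names and plain file I/O instead of gguf/ggml:

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    struct tensor_record {
        uint16_t    file_idx; // which split file the tensor lives in
        size_t      offs;     // absolute byte offset of the tensor data in that file
        size_t      nbytes;   // size of the tensor data
        std::string name;
    };

    static bool read_tensor_data(const std::vector<std::string> & files, const tensor_record & w, std::vector<uint8_t> & out) {
        out.resize(w.nbytes);
        FILE * f = std::fopen(files.at(w.file_idx).c_str(), "rb");
        if (!f) {
            return false;
        }
        std::fseek(f, (long) w.offs, SEEK_SET); // jump straight to the recorded offset
        const bool ok = std::fread(out.data(), 1, w.nbytes, f) == w.nbytes;
        std::fclose(f);
        return ok;
    }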
| 2934 |
// For subsidiary files, `meta` tensor data offset must not be used,
|
| 2935 |
// so we build a unified tensors index for weights.
|
| 2936 |
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
| 2937 |
+
weights.emplace_back(0, cur->name, meta, cur);
|
| 2938 |
}
|
| 2939 |
files.emplace_back(new llama_file(fname.c_str(), "rb"));
|
| 2940 |
contexts.emplace_back(ctx);
|
|
|
|
| 2974 |
|
| 2975 |
// Save tensors data offset info of the shard.
|
| 2976 |
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
| 2977 |
+
weights.emplace_back(idx, cur->name, ctx_gguf, cur);
|
| 2978 |
}
|
| 2979 |
files.emplace_back(new llama_file(split_path, "rb"));
|
| 2980 |
contexts.emplace_back(ctx);
|
|
|
|
| 3178 |
return weights.at(i).tensor->name;
|
| 3179 |
}
|
| 3180 |
|
| 3181 |
+
const llama_tensor_weight * get_weight(const char * name) const {
|
| 3182 |
for (const auto & weight : weights) {
|
| 3183 |
if (strcmp(name, weight.tensor->name) == 0) {
|
| 3184 |
+
return &weight;
|
| 3185 |
}
|
| 3186 |
}
|
| 3187 |
+
return nullptr;
|
| 3188 |
+
}
|
| 3189 |
+
|
| 3190 |
+
const llama_tensor_weight & require_weight(const char * name) const {
|
| 3191 |
+
const llama_tensor_weight * weight = get_weight(name);
|
| 3192 |
+
if (!weight) {
|
| 3193 |
+
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
|
| 3194 |
+
}
|
| 3195 |
+
return *weight;
|
| 3196 |
}
|
| 3197 |
|
| 3198 |
struct ggml_tensor * get_tensor_meta(const char * name) const {
|
| 3199 |
+
const auto * weight = get_weight(name);
|
| 3200 |
+
if (!weight) {
|
| 3201 |
+
return nullptr;
|
|
|
|
| 3202 |
}
|
| 3203 |
+
return weight->tensor;
|
| 3204 |
+
}
|
| 3205 |
+
|
| 3206 |
+
struct ggml_tensor * require_tensor_meta(const char * name) const {
|
| 3207 |
+
struct ggml_tensor * tensor = get_tensor_meta(name);
|
| 3208 |
+
if (!tensor) {
|
| 3209 |
+
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
|
| 3210 |
+
}
|
| 3211 |
+
return tensor;
|
| 3212 |
}
|
| 3213 |
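get_weight and get_tensor_meta above return nullptr when a tensor is absent (used for optional tensors), while the require_* variants throw. A tiny sketch of that accessor pair, with hypothetical names:

    #include <map>
    #include <stdexcept>
    #include <string>

    struct registry {
        std::map<std::string, int> entries;

        // optional lookup: nullptr when absent
        const int * get(const std::string & name) const {
            auto it = entries.find(name);
            return it == entries.end() ? nullptr : &it->second;
        }

        // mandatory lookup: throws when absent
        const int & require(const std::string & name) const {
            const int * v = get(name);
            if (!v) {
                throw std::runtime_error("entry '" + name + "' not found");
            }
            return *v;
        }
    };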
|
| 3214 |
struct ggml_tensor * get_tensor_meta(int i) const {
|
|
|
|
| 3224 |
return tensor;
|
| 3225 |
}
|
| 3226 |
|
| 3227 |
+
const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
|
| 3228 |
const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
|
| 3229 |
|
| 3230 |
if (cur == NULL) {
|
|
|
|
| 3236 |
|
| 3237 |
{
|
| 3238 |
bool is_ok = true;
|
| 3239 |
+
for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
|
| 3240 |
+
if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
|
| 3241 |
is_ok = false;
|
| 3242 |
break;
|
| 3243 |
}
|
|
|
|
| 3251 |
}
|
| 3252 |
}
|
| 3253 |
|
| 3254 |
+
return cur;
|
| 3255 |
+
}
|
| 3256 |
+
|
| 3257 |
+
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
|
| 3258 |
+
const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
|
| 3259 |
+
|
| 3260 |
+
if (cur == NULL) {
|
| 3261 |
+
return NULL;
|
| 3262 |
+
}
|
| 3263 |
+
|
| 3264 |
return create_tensor_for(ctx, cur);
|
| 3265 |
}
|
| 3266 |
|
| 3267 |
+
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
|
| 3268 |
+
const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
|
| 3269 |
+
|
| 3270 |
+
if (cur == NULL) {
|
| 3271 |
+
return NULL;
|
| 3272 |
+
}
|
| 3273 |
+
|
| 3274 |
+
if (cur->type != base->type) {
|
| 3275 |
+
throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
|
| 3276 |
+
}
|
| 3277 |
+
|
| 3278 |
+
std::array<int64_t, GGML_MAX_DIMS> dims;
|
| 3279 |
+
for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
|
| 3280 |
+
dims[i] = i < ne.size() ? ne[i] : 1;
|
| 3281 |
+
}
|
| 3282 |
+
|
| 3283 |
+
struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
|
| 3284 |
+
dims[0], dims[1], dims[2], dims[3],
|
| 3285 |
+
cur->nb[1], cur->nb[2], cur->nb[3],
|
| 3286 |
+
offset);
|
| 3287 |
+
|
| 3288 |
+
ggml_set_name(tensor, name.c_str());
|
| 3289 |
+
|
| 3290 |
+
n_created++;
|
| 3291 |
+
|
| 3292 |
+
return tensor;
|
| 3293 |
+
}
|
| 3294 |
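check_tensor_dims above accepts a requested shape with fewer entries than GGML_MAX_DIMS; the explicit entries must match the tensor's ne[] and every remaining dimension must be 1. A standalone sketch of that check (assuming GGML_MAX_DIMS is 4, as in this tree):

    #include <cstdint>
    #include <vector>

    constexpr size_t MAX_DIMS = 4;

    static bool dims_match(const int64_t (&ne_tensor)[MAX_DIMS], const std::vector<int64_t> & ne_requested) {
        for (size_t i = 0; i < MAX_DIMS; ++i) {
            if (i < ne_requested.size()) {
                if (ne_tensor[i] != ne_requested[i]) {
                    return false; // explicitly requested dimensions must match
                }
            } else if (ne_tensor[i] != 1) {
                return false;     // unspecified trailing dimensions must be 1
            }
        }
        return true;
    }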
+
|
| 3295 |
void done_getting_tensors() const {
|
| 3296 |
if (n_created != n_tensors) {
|
| 3297 |
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
|
|
|
|
| 3304 |
mmaps_used.reserve(files.size());
|
| 3305 |
for (const auto & file : files) {
|
| 3306 |
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
|
| 3307 |
+
mmaps_used.emplace_back(mapping->size, 0);
|
| 3308 |
if (mlock_mmaps) {
|
| 3309 |
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
|
| 3310 |
mlock_mmap->init(mapping->addr);
|
|
|
|
| 3328 |
*last = 0;
|
| 3329 |
*addr = mapping->addr;
|
| 3330 |
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
|
| 3331 |
+
try {
|
| 3332 |
+
const auto * weight = get_weight(ggml_get_name(tensor));
|
| 3333 |
+
if (!weight) {
|
| 3334 |
+
continue;
|
| 3335 |
+
}
|
| 3336 |
+
if (weight->idx != idx) {
|
| 3337 |
+
continue;
|
| 3338 |
+
}
|
| 3339 |
+
*first = std::min(*first, weight->offs);
|
| 3340 |
+
*last = std::max(*last, weight->offs + ggml_nbytes(tensor));
|
| 3341 |
+
} catch(...) {
|
| 3342 |
+
// the tensor is not in the model
|
| 3343 |
}
|
|
|
|
|
|
|
| 3344 |
}
|
| 3345 |
}
|
| 3346 |
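get_mapping_range now derives, per file index, the smallest byte window [first, last) covering every tensor of this context, so only that part of the mapping has to be exposed to the backend buffer. A plain sketch of the accumulation, with hypothetical names:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct span { uint16_t file_idx; size_t offs; size_t nbytes; };

    static void mapping_range(const std::vector<span> & spans, uint16_t idx, size_t & first, size_t & last) {
        first = SIZE_MAX;
        last  = 0;
        for (const auto & s : spans) {
            if (s.file_idx != idx) {
                continue; // tensor lives in another split file
            }
            first = std::min(first, s.offs);
            last  = std::max(last,  s.offs + s.nbytes);
        }
        if (first == SIZE_MAX) {
            first = last = 0; // no tensors from this file
        }
    }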
|
| 3347 |
// for backwards compatibility, does not support ggml-backend
|
| 3348 |
void load_data_for(struct ggml_tensor * cur) const {
|
| 3349 |
+
const auto & w = require_weight(ggml_get_name(cur));
|
| 3350 |
|
| 3351 |
if (use_mmap) {
|
| 3352 |
const auto & mapping = mappings.at(w.idx);
|
|
|
|
| 3379 |
|
| 3380 |
std::vector<no_init<uint8_t>> read_buf;
|
| 3381 |
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
| 3382 |
+
const auto * weight = get_weight(ggml_get_name(cur));
|
| 3383 |
+
if (weight == nullptr) {
|
| 3384 |
+
// this can happen with split experts models
|
| 3385 |
+
continue;
|
| 3386 |
+
}
|
| 3387 |
+
|
| 3388 |
if (progress_callback) {
|
| 3389 |
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
|
| 3390 |
return false;
|
| 3391 |
}
|
| 3392 |
}
|
| 3393 |
|
|
|
|
| 3394 |
size_t n_size = ggml_nbytes(cur);
|
| 3395 |
|
| 3396 |
if (use_mmap) {
|
| 3397 |
+
const auto & mapping = mappings.at(weight->idx);
|
| 3398 |
ggml_backend_buffer_t buf_mmap = nullptr;
|
| 3399 |
+
if (bufs_mmap.count(weight->idx)) {
|
| 3400 |
+
buf_mmap = bufs_mmap.at(weight->idx);
|
| 3401 |
}
|
| 3402 |
GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
|
| 3403 |
if (buf_mmap && cur->data == nullptr) {
|
| 3404 |
+
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
|
| 3405 |
if (lmlocks) {
|
| 3406 |
+
const auto & lmlock = lmlocks->at(weight->idx);
|
| 3407 |
+
lmlock->grow_to(weight->offs + ggml_nbytes(cur));
|
| 3408 |
}
|
| 3409 |
|
| 3410 |
+
auto & mmap_used = mmaps_used[weight->idx];
|
| 3411 |
+
mmap_used.first = std::min(mmap_used.first, weight->offs);
|
| 3412 |
+
mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
|
| 3413 |
} else {
|
| 3414 |
+
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
|
| 3415 |
}
|
| 3416 |
} else {
|
| 3417 |
+
GGML_ASSERT(weight->idx < files.size());
|
| 3418 |
+
const auto & file = files.at(weight->idx);
|
| 3419 |
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
| 3420 |
+
file->seek(weight->offs, SEEK_SET);
|
| 3421 |
file->read_raw(cur->data, ggml_nbytes(cur));
|
| 3422 |
} else {
|
| 3423 |
read_buf.resize(ggml_nbytes(cur));
|
| 3424 |
+
file->seek(weight->offs, SEEK_SET);
|
| 3425 |
file->read_raw(read_buf.data(), ggml_nbytes(cur));
|
| 3426 |
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
| 3427 |
}
|
|
|
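In the mmap path of load_all_data, tensor data is never copied: each tensor is allocated (or set) at mapping->addr + weight->offs inside the memory-mapped model file. A POSIX sketch of that idea, with hypothetical names and only basic error handling:

    #include <cstdint>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    static const void * map_tensor(const char * path, size_t offs, size_t & file_size) {
        int fd = open(path, O_RDONLY);
        if (fd < 0) {
            return nullptr;
        }
        struct stat st;
        if (fstat(fd, &st) != 0) {
            close(fd);
            return nullptr;
        }
        file_size = (size_t) st.st_size;
        void * base = mmap(nullptr, file_size, PROT_READ, MAP_SHARED, fd, 0);
        close(fd); // the mapping stays valid after the descriptor is closed
        if (base == MAP_FAILED) {
            return nullptr;
        }
        return (const uint8_t *) base + offs; // tensor data starts at this address
    }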
|
| 3944 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 3945 |
}
|
| 3946 |
} break;
|
| 3947 |
+
case LLM_ARCH_XVERSE:
|
| 3948 |
+
{
|
| 3949 |
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
| 3950 |
+
switch (hparams.n_layer) {
|
| 3951 |
+
case 32: model.type = e_model::MODEL_7B; break;
|
| 3952 |
+
case 40: model.type = e_model::MODEL_13B; break;
|
| 3953 |
+
case 80: model.type = e_model::MODEL_65B; break;
|
| 3954 |
+
default: model.type = e_model::MODEL_UNKNOWN;
|
| 3955 |
+
}
|
| 3956 |
+
} break;
|
| 3957 |
case LLM_ARCH_COMMAND_R:
|
| 3958 |
{
|
| 3959 |
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
|
|
|
| 4350 |
|
| 4351 |
const int64_t n_layer = hparams.n_layer;
|
| 4352 |
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
|
| 4353 |
+
bool use_mmap_buffer = true;
|
| 4354 |
|
| 4355 |
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
| 4356 |
model.buft_input = llama_default_buffer_type_cpu(true);
|
|
|
|
| 4439 |
|
| 4440 |
// create one context per buffer type
|
| 4441 |
size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
| 4442 |
+
|
| 4443 |
+
// for moe merged tensors
|
| 4444 |
+
ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
|
| 4445 |
+
|
| 4446 |
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
| 4447 |
for (auto & it : buft_layer_count) {
|
| 4448 |
struct ggml_init_params params = {
|
|
|
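The extra ctx_size term above reserves metadata space for the merged per-layer expert tensors. These loader contexts are typically created with no_alloc set, so they hold only tensor and view metadata, roughly one ggml_tensor_overhead() per tensor created in them. A minimal sketch, assuming the ggml headers from this tree:

    #include "ggml.h"

    static ggml_context * make_meta_ctx(size_t n_tensors) {
        ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * n_tensors,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true, // metadata only; tensor data lives in backend buffers
        };
        return ggml_init(params);
    }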
|
| 4469 |
const int64_t n_vocab = hparams.n_vocab;
|
| 4470 |
const int64_t n_vocab_type = hparams.n_vocab_type;
|
| 4471 |
const int64_t n_ff = hparams.n_ff;
|
| 4472 |
+
const int64_t n_expert = hparams.n_expert;
|
| 4473 |
+
|
| 4474 |
+
if (n_expert > 0 && hparams.n_expert_used == 0) {
|
| 4475 |
+
throw std::runtime_error("model has expert layers but no expert layers are used");
|
| 4476 |
+
}
|
| 4477 |
|
| 4478 |
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
| 4479 |
|
|
|
|
| 4528 |
|
| 4529 |
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
| 4530 |
|
| 4531 |
+
if (n_expert == 0) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4532 |
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
| 4533 |
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
| 4534 |
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
| 4535 |
} else {
|
| 4536 |
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
| 4537 |
+
|
| 4538 |
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
| 4539 |
+
if (layer.ffn_gate_exps) {
|
| 4540 |
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
| 4541 |
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
| 4542 |
+
} else {
|
| 4543 |
+
// merge split expert into a single tensor for compatibility with older models
|
| 4544 |
+
// requires disabling mmap
|
| 4545 |
+
use_mmap_buffer = false;
|
| 4546 |
+
|
| 4547 |
+
ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
|
| 4548 |
+
ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
|
| 4549 |
+
ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
|
| 4550 |
+
|
| 4551 |
+
layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
|
| 4552 |
+
layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
|
| 4553 |
+
layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
|
| 4554 |
+
|
| 4555 |
+
ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
|
| 4556 |
+
ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
|
| 4557 |
+
ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
|
| 4558 |
+
|
| 4559 |
+
for (uint32_t x = 0; x < n_expert; ++x) {
|
| 4560 |
+
// the individual experts are loaded into a view of the merged tensor
|
| 4561 |
+
ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
|
| 4562 |
+
ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
|
| 4563 |
+
ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
|
| 4564 |
+
}
|
| 4565 |
}
|
| 4566 |
}
|
| 4567 |
}
|
| 4568 |
} break;
|
| 4569 |
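The fallback branch above keeps older GGUFs with split per-expert tensors loadable: it allocates one merged 3-D tensor per projection, gives it the new *_exps name, then registers each legacy per-expert name as a 2-D view at byte offset nb[2]*x so the split tensors are read directly into their slice (mmap is disabled on this path, per the comment, because the merged tensor needs its own allocation). A small sketch of that view setup, assuming the ggml API in this tree; sizes and the single projection shown are illustrative:

    #include "ggml.h"
    #include <cstdio>

    static void build_merged_experts(ggml_context * ctx, int64_t n_embd, int64_t n_ff, int64_t n_expert, int il) {
        ggml_tensor * up_exps = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);

        char name[64];
        snprintf(name, sizeof(name), "blk.%d.ffn_up_exps.weight", il);
        ggml_set_name(up_exps, name);

        for (int64_t x = 0; x < n_expert; ++x) {
            // expert x is a 2-D slice of the merged tensor, at byte offset nb[2]*x
            ggml_tensor * view = ggml_view_2d(ctx, up_exps, n_embd, n_ff, up_exps->nb[1], up_exps->nb[2]*x);
            snprintf(name, sizeof(name), "blk.%d.ffn_up.%d.weight", il, (int) x);
            ggml_set_name(view, name);
        }
    }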
case LLM_ARCH_GROK:
|
| 4570 |
{
|
| 4571 |
+
if (n_expert == 0) {
|
| 4572 |
+
throw std::runtime_error("Grok model cannot have zero experts");
|
| 4573 |
+
}
|
| 4574 |
+
|
| 4575 |
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
| 4576 |
|
| 4577 |
// output
|
|
|
|
| 4603 |
|
| 4604 |
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
| 4605 |
|
| 4606 |
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
| 4607 |
|
| 4608 |
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
| 4609 |
+
if (layer.ffn_gate_exps) {
|
| 4610 |
+
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
| 4611 |
+
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
| 4612 |
+
} else {
|
| 4613 |
+
// merge split expert into a single tensor for compatibility with older models
|
| 4614 |
+
// requires disabling mmap
|
| 4615 |
+
use_mmap_buffer = false;
|
| 4616 |
+
|
| 4617 |
+
ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
|
| 4618 |
+
ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
|
| 4619 |
+
ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
|
| 4620 |
+
|
| 4621 |
+
layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
|
| 4622 |
+
layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
|
| 4623 |
+
layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
|
| 4624 |
+
|
| 4625 |
+
ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
|
| 4626 |
+
ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
|
| 4627 |
+
ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
|
| 4628 |
+
|
| 4629 |
+
for (uint32_t x = 0; x < n_expert; ++x) {
|
| 4630 |
+
// the individual experts are loaded into a view of the merged tensor
|
| 4631 |
+
ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
|
| 4632 |
+
ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
|
| 4633 |
+
ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
|
| 4634 |
+
}
|
| 4635 |
}
|
| 4636 |
|
| 4637 |
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
|
|
|
| 4872 |
case LLM_ARCH_MPT:
|
| 4873 |
{
|
| 4874 |
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
| 4875 |
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
|
| 4876 |
|
| 4877 |
// output
|
| 4878 |
{
|
|
|
|
| 4911 |
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
| 4912 |
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
|
| 4913 |
|
| 4914 |
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
|
| 4915 |
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
|
| 4916 |
+
|
| 4917 |
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
|
| 4918 |
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
|
| 4919 |
+
|
| 4920 |
// AWQ ScaleActivation layer
|
| 4921 |
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
|
| 4922 |
}
|
|
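The attn_q_norm / attn_k_norm tensors above are created with required = false, so MPT checkpoints that normalize the query and key activations before attention load cleanly and others simply leave them null. A standalone sketch of what such a normalization step computes, assuming standard LayerNorm semantics with optional learned scale and bias:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    static void layer_norm(std::vector<float> & x, const float * w, const float * b, float eps = 1e-5f) {
        float mean = 0.0f;
        for (float v : x) mean += v;
        mean /= x.size();

        float var = 0.0f;
        for (float v : x) var += (v - mean) * (v - mean);
        var /= x.size();

        const float inv = 1.0f / std::sqrt(var + eps);
        for (size_t i = 0; i < x.size(); ++i) {
            x[i] = (x[i] - mean) * inv;
            if (w) x[i] *= w[i]; // learned scale (e.g. attn_q_norm.weight)
            if (b) x[i] += b[i]; // learned bias  (e.g. attn_q_norm.bias)
        }
    }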
|
|
| 5363 |
layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
|
| 5364 |
}
|
| 5365 |
} break;
|
| 5366 |
+
case LLM_ARCH_XVERSE:
|
| 5367 |
+
{
|
| 5368 |
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
| 5369 |
+
{
|
| 5370 |
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
| 5371 |
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
| 5372 |
+
}
|
| 5373 |
+
for (int i = 0; i < n_layer; ++i) {
|
| 5374 |
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
| 5375 |
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
| 5376 |
+
auto & layer = model.layers[i];
|
| 5377 |
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
| 5378 |
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
| 5379 |
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
| 5380 |
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
| 5381 |
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
| 5382 |
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
| 5383 |
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
| 5384 |
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
| 5385 |
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
| 5386 |
+
}
|
| 5387 |
+
} break;
|
| 5388 |
case LLM_ARCH_COMMAND_R:
|
| 5389 |
{
|
| 5390 |
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
|
|
| 5423 |
|
| 5424 |
ml.done_getting_tensors();
|
| 5425 |
|
| 5426 |
+
ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
|
| 5427 |
model.mappings.reserve(ml.mappings.size());
|
| 5428 |
|
| 5429 |
// create the backend buffers
|
|
|
|
| 5444 |
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
| 5445 |
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
| 5446 |
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
| 5447 |
+
if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
|
| 5448 |
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
| 5449 |
void * addr = nullptr;
|
| 5450 |
size_t first, last;
|
|
|
|
| 5468 |
}
|
| 5469 |
}
|
| 5470 |
#ifdef GGML_USE_METAL
|
| 5471 |
+
else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
|
| 5472 |
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
| 5473 |
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
| 5474 |
void * addr = nullptr;
|
|
|
|
| 5551 |
}
|
| 5552 |
}
|
| 5553 |
|
| 5554 |
+
if (use_mmap_buffer) {
|
| 5555 |
+
for (auto & mapping : ml.mappings) {
|
| 5556 |
+
model.mappings.emplace_back(std::move(mapping));
|
| 5557 |
+
}
|
| 5558 |
}
|
| 5559 |
|
| 5560 |
// loading time will be recalculate after the first eval, so
|
|
|
|
| 5710 |
GGML_ASSERT(kv.size == n_ctx);
|
| 5711 |
|
| 5712 |
// compute the transposed [n_tokens, n_embd] V matrix
|
| 5713 |
+
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
|
| 5714 |
+
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
|
| 5715 |
cb(v_cur_t, "v_cur_t", il);
|
| 5716 |
|
| 5717 |
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
|
|
|
|
| 6422 |
for (int i = 0; i < n_expert_used; ++i) {
|
| 6423 |
ggml_tensor * cur_expert;
|
| 6424 |
|
| 6425 |
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
| 6426 |
cb(cur_up, "ffn_moe_up", il);
|
| 6427 |
|
| 6428 |
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
| 6429 |
cb(cur_gate, "ffn_moe_gate", il);
|
| 6430 |
|
| 6431 |
cur_gate = ggml_silu(ctx0, cur_gate);
|
| 6432 |
cb(cur_gate, "ffn_moe_silu", il);
|
| 6433 |
|
| 6434 |
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
| 6435 |
cb(cur_expert, "ffn_moe_gate_par", il);
|
| 6436 |
|
| 6437 |
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
| 6438 |
cb(cur_expert, "ffn_moe_down", il);
|
| 6439 |
|
| 6440 |
cur_expert = ggml_mul(ctx0, cur_expert,
|
|
|
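The ggml_mul_mat_id calls above now index the merged *_exps tensors directly: for selected-expert slot i, each token uses the expert chosen by the router (selected_experts) and multiplies that expert's 2-D slice with the token's activation. A naive reference of that computation (a sketch with hypothetical names, not the ggml kernel):

    #include <cstdint>
    #include <vector>

    // w_exps: [n_expert][n_in][n_out], x: [n_tokens][n_in], ids: [n_tokens][n_used]
    static std::vector<std::vector<float>> mul_mat_id_ref(
            const std::vector<std::vector<std::vector<float>>> & w_exps,
            const std::vector<std::vector<float>>              & x,
            const std::vector<std::vector<int32_t>>            & ids,
            size_t i) {
        const size_t n_tokens = x.size();
        const size_t n_in     = x[0].size();
        const size_t n_out    = w_exps[0][0].size();

        std::vector<std::vector<float>> out(n_tokens, std::vector<float>(n_out, 0.0f));
        for (size_t t = 0; t < n_tokens; ++t) {
            const auto & w = w_exps[ids[t][i]]; // expert picked by the router for this token
            for (size_t o = 0; o < n_out; ++o) {
                for (size_t k = 0; k < n_in; ++k) {
                    out[t][o] += w[k][o] * x[t][k];
                }
            }
        }
        return out;
    }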
|
| 6598 |
return gf;
|
| 6599 |
}
|
| 6600 |
|
| 6601 |
+
struct ggml_cgraph * build_xverse() {
|
| 6602 |
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
| 6603 |
+
|
| 6604 |
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
| 6605 |
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
| 6606 |
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
| 6607 |
+
|
| 6608 |
+
struct ggml_tensor * cur;
|
| 6609 |
+
struct ggml_tensor * inpL;
|
| 6610 |
+
|
| 6611 |
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
| 6612 |
+
|
| 6613 |
+
// inp_pos - contains the positions
|
| 6614 |
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
| 6615 |
+
|
| 6616 |
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 6617 |
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
| 6618 |
+
|
| 6619 |
+
// positions of the tokens in the KV cache
|
| 6620 |
+
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
| 6621 |
+
|
| 6622 |
+
for (int il = 0; il < n_layer; ++il) {
|
| 6623 |
+
struct ggml_tensor * inpSA = inpL;
|
| 6624 |
+
|
| 6625 |
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
| 6626 |
+
model.layers[il].attn_norm, NULL,
|
| 6627 |
+
LLM_NORM_RMS, cb, il);
|
| 6628 |
+
cb(cur, "attn_norm", il);
|
| 6629 |
+
|
| 6630 |
+
// self-attention
|
| 6631 |
+
{
|
| 6632 |
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
| 6633 |
+
cb(Qcur, "Qcur", il);
|
| 6634 |
+
|
| 6635 |
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
| 6636 |
+
cb(Kcur, "Kcur", il);
|
| 6637 |
+
|
| 6638 |
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
| 6639 |
+
cb(Vcur, "Vcur", il);
|
| 6640 |
+
|
| 6641 |
+
Qcur = ggml_rope_custom(
|
| 6642 |
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
| 6643 |
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
| 6644 |
+
ext_factor, attn_factor, beta_fast, beta_slow
|
| 6645 |
+
);
|
| 6646 |
+
cb(Qcur, "Qcur", il);
|
| 6647 |
+
|
| 6648 |
+
Kcur = ggml_rope_custom(
|
| 6649 |
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
| 6650 |
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
| 6651 |
+
ext_factor, attn_factor, beta_fast, beta_slow
|
| 6652 |
+
);
|
| 6653 |
+
cb(Kcur, "Kcur", il);
|
| 6654 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 6655 |
+
model.layers[il].wo, NULL,
|
| 6656 |
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 6657 |
+
}
|
| 6658 |
+
|
| 6659 |
+
if (il == n_layer - 1) {
|
| 6660 |
+
// skip computing output for unused tokens
|
| 6661 |
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
| 6662 |
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
| 6663 |
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
| 6664 |
+
}
|
| 6665 |
+
|
| 6666 |
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
| 6667 |
+
cb(ffn_inp, "ffn_inp", il);
|
| 6668 |
+
|
| 6669 |
+
// feed-forward network
|
| 6670 |
+
{
|
| 6671 |
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
| 6672 |
+
model.layers[il].ffn_norm, NULL,
|
| 6673 |
+
LLM_NORM_RMS, cb, il);
|
| 6674 |
+
cb(cur, "ffn_norm", il);
|
| 6675 |
+
|
| 6676 |
+
cur = llm_build_ffn(ctx0, cur,
|
| 6677 |
+
model.layers[il].ffn_up, NULL,
|
| 6678 |
+
model.layers[il].ffn_gate, NULL,
|
| 6679 |
+
model.layers[il].ffn_down, NULL,
|
| 6680 |
+
NULL,
|
| 6681 |
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
| 6682 |
+
cb(cur, "ffn_out", il);
|
| 6683 |
+
}
|
| 6684 |
+
|
| 6685 |
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
| 6686 |
+
cb(cur, "l_out", il);
|
| 6687 |
+
|
| 6688 |
+
// input for next layer
|
| 6689 |
+
inpL = cur;
|
| 6690 |
+
}
|
| 6691 |
+
|
| 6692 |
+
cur = inpL;
|
| 6693 |
+
|
| 6694 |
+
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
|
| 6695 |
+
cb(cur, "result_norm", -1);
|
| 6696 |
+
|
| 6697 |
+
// lm_head
|
| 6698 |
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
| 6699 |
+
cb(cur, "result_output", -1);
|
| 6700 |
+
|
| 6701 |
+
ggml_build_forward_expand(gf, cur);
|
| 6702 |
+
|
| 6703 |
+
return gf;
|
| 6704 |
+
}
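build_xverse rotates Qcur and Kcur with ggml_rope_custom before the KV step, and the rope-type switch later in this diff assigns LLAMA_ROPE_TYPE_NORM to LLM_ARCH_XVERSE. A stand-alone sketch of what that "NORM" rotation does to the first n_rot dimensions of one head, ignoring the YaRN ext_factor/attn_factor corrections (the function name and the simplification are mine):

```cpp
// Sketch: standard rotary position embedding over consecutive dimension pairs.
//   theta_k = freq_scale * pos * freq_base^(-2k / n_rot); pair (2k, 2k+1) is rotated by theta_k
#include <cmath>
#include <vector>

void rope_norm(std::vector<float> & head, int pos, int n_rot, float freq_base, float freq_scale) {
    for (int i = 0; i < n_rot; i += 2) {
        const float theta = freq_scale * pos * std::pow(freq_base, -(float) i / n_rot);
        const float c = std::cos(theta);
        const float s = std::sin(theta);
        const float x0 = head[i];
        const float x1 = head[i + 1];
        head[i]     = x0 * c - x1 * s;
        head[i + 1] = x0 * s + x1 * c;
    }
}
```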
|
| 6705 |
+
|
| 6706 |
struct ggml_cgraph * build_falcon() {
|
| 6707 |
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
| 6708 |
|
|
|
|
| 6956 |
for (int i = 0; i < n_expert_used; ++i) {
|
| 6957 |
ggml_tensor * cur_expert;
|
| 6958 |
|
| 6959 |
+
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
| 6960 |
cb(cur_up, "ffn_moe_up", il);
|
| 6961 |
|
| 6962 |
+
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
| 6963 |
cb(cur_gate, "ffn_moe_gate", il);
|
| 6964 |
|
| 6965 |
//GeLU
|
| 6966 |
cur_gate = ggml_gelu(ctx0, cur_gate);
|
| 6967 |
cb(cur_gate, "ffn_moe_gelu", il);
|
| 6968 |
|
| 6969 |
+
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
| 6970 |
cb(cur_expert, "ffn_moe_gate_par", il);
|
| 6971 |
|
| 6972 |
+
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
| 6973 |
cb(cur_expert, "ffn_moe_down", il);
|
| 6974 |
|
| 6975 |
cur_expert = ggml_mul(ctx0, cur_expert,
|
|
|
|
| 7733 |
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
| 7734 |
|
| 7735 |
struct ggml_tensor * cur;
|
| 7736 |
+
struct ggml_tensor * pos;
|
| 7737 |
struct ggml_tensor * inpL;
|
| 7738 |
|
| 7739 |
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
|
|
|
| 7744 |
// positions of the tokens in the KV cache
|
| 7745 |
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
| 7746 |
|
| 7747 |
+
if (model.pos_embd) {
|
| 7748 |
+
// inp_pos - contains the positions
|
| 7749 |
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
| 7750 |
+
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
| 7751 |
+
cb(pos, "pos_embd", -1);
|
| 7752 |
+
|
| 7753 |
+
inpL = ggml_add(ctx0, inpL, pos);
|
| 7754 |
+
cb(inpL, "inpL", -1);
|
| 7755 |
+
}
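The block above adds learned absolute position embeddings when the model provides a pos_embd tensor: ggml_get_rows selects one embedding row per position and ggml_add folds it into the token embeddings. A flat-array sketch of the same operation (dimensions and names are illustrative):

```cpp
// Sketch: learned absolute position embeddings, i.e. get_rows + add.
#include <cstddef>
#include <vector>

void add_pos_embd(std::vector<float>       & inpL,        // [n_tokens * n_embd], token embeddings
                  const std::vector<float> & pos_embd,    // [n_ctx_train * n_embd], learned table
                  const std::vector<int>   & positions,   // absolute position of each token
                  int n_embd) {
    for (size_t t = 0; t < positions.size(); ++t) {
        const float * row = &pos_embd[(size_t) positions[t] * n_embd]; // ggml_get_rows
        for (int e = 0; e < n_embd; ++e) {
            inpL[t * n_embd + e] += row[e];                            // ggml_add
        }
    }
}
```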
|
| 7756 |
+
|
| 7757 |
for (int il = 0; il < n_layer; ++il) {
|
| 7758 |
struct ggml_tensor * attn_norm;
|
| 7759 |
|
|
|
|
| 7788 |
cb(Kcur, "Kcur", il);
|
| 7789 |
cb(Vcur, "Vcur", il);
|
| 7790 |
|
| 7791 |
+
// Q/K Layernorm
|
| 7792 |
+
if (model.layers[il].attn_q_norm) {
|
| 7793 |
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
| 7794 |
+
model.layers[il].attn_q_norm,
|
| 7795 |
+
model.layers[il].attn_q_norm_b,
|
| 7796 |
+
LLM_NORM, cb, il);
|
| 7797 |
+
cb(Qcur, "Qcur", il);
|
| 7798 |
|
| 7799 |
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
| 7800 |
+
model.layers[il].attn_k_norm,
|
| 7801 |
+
model.layers[il].attn_k_norm_b,
|
| 7802 |
+
LLM_NORM, cb, il);
|
| 7803 |
+
cb(Kcur, "Kcur", il);
|
| 7804 |
+
|
| 7805 |
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
| 7806 |
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
| 7807 |
+
|
| 7808 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 7809 |
model.layers[il].wo, model.layers[il].bo,
|
| 7810 |
+
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 7811 |
+
} else {
|
| 7812 |
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
| 7813 |
+
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 7814 |
+
model.layers[il].wo, model.layers[il].bo,
|
| 7815 |
+
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 7816 |
+
}
|
| 7817 |
}
|
| 7818 |
|
| 7819 |
if (il == n_layer - 1) {
|
|
|
|
| 9476 |
if (il == n_layer - 1) {
|
| 9477 |
// skip computing output for unused tokens
|
| 9478 |
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
| 9479 |
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
| 9480 |
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
| 9481 |
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
| 9482 |
}
|
| 9483 |
|
| 9484 |
struct ggml_tensor * attn_out = cur;
|
|
|
|
| 9713 |
{
|
| 9714 |
result = llm.build_mamba();
|
| 9715 |
} break;
|
| 9716 |
+
case LLM_ARCH_XVERSE:
|
| 9717 |
+
{
|
| 9718 |
+
result = llm.build_xverse();
|
| 9719 |
+
} break;
|
| 9720 |
case LLM_ARCH_COMMAND_R:
|
| 9721 |
{
|
| 9722 |
result = llm.build_command_r();
|
|
|
|
| 11623 |
// grammar - internal
|
| 11624 |
//
|
| 11625 |
|
| 11626 |
|
| 11627 |
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
|
| 11628 |
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
|
| 11629 |
+
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
| 11630 |
const std::string & src,
|
| 11631 |
llama_partial_utf8 partial_start) {
|
| 11632 |
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
|
|
|
|
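The lookup table in decode_utf8 above classifies a UTF-8 lead byte by its high nibble: 0x0-0x7 are single-byte ASCII, 0x8-0xB are continuation bytes (length 0, invalid as a sequence start), 0xC-0xD start 2-byte sequences, 0xE a 3-byte sequence and 0xF a 4-byte sequence. A tiny self-contained check of that classification (separate from the grammar code):

```cpp
// Sketch: UTF-8 sequence length from the lead byte's high nibble.
#include <cstdint>
#include <cstdio>

static int utf8_seq_len(uint8_t lead) {
    static const int lookup[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
    return lookup[lead >> 4];
}

int main() {
    std::printf("%d %d %d %d\n",
        utf8_seq_len('a'),    // 1: ASCII
        utf8_seq_len(0xC3),   // 2: lead byte of "é" (0xC3 0xA9)
        utf8_seq_len(0xE2),   // 3: lead byte of "€" (0xE2 0x82 0xAC)
        utf8_seq_len(0xF0));  // 4: lead byte of most emoji
    return 0;
}
```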
| 11828 |
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
| 11829 |
// produces the N possible stacks if the given char is accepted at those
|
| 11830 |
// positions
|
| 11831 |
+
std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
| 11832 |
const std::vector<std::vector<llama_grammar_element>> & rules,
|
| 11833 |
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
| 11834 |
const uint32_t chr) {
|
|
|
|
| 13054 |
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
| 13055 |
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
| 13056 |
// tensor name.
|
|
|
|
| 13057 |
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
|
| 13058 |
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
|
| 13059 |
}
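Because the expert tensors are interleaved through the file, the quantizer parses the layer index straight out of the tensor name, as the comment above explains. A stand-alone example of that sscanf pattern:

```cpp
// Sketch: recovering the layer index from a tensor name like "blk.13.ffn_down.weight".
#include <cstdio>

int main() {
    const char * name = "blk.13.ffn_down.weight";
    int i_layer = -1;
    if (std::sscanf(name, "blk.%d.", &i_layer) != 1) {
        std::fprintf(stderr, "failed to determine layer for tensor %s\n", name);
        return 1;
    }
    std::printf("layer = %d\n", i_layer); // prints: layer = 13
    return 0;
}
```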
|
|
|
|
| 13415 |
kv_overrides = v->data();
|
| 13416 |
}
|
| 13417 |
llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
|
| 13418 |
+
ml.init_mappings(false); // no prefetching
|
| 13419 |
|
| 13420 |
llama_model model;
|
| 13421 |
llm_load_arch(ml, model);
|
|
|
|
| 13467 |
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
| 13468 |
if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
|
| 13469 |
++qs.n_attention_wv;
|
| 13470 |
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
| 13471 |
qs.has_output = true;
|
| 13472 |
}
|
| 13473 |
}
|
| 13474 |
+
|
| 13475 |
+
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
| 13476 |
+
|
| 13477 |
+
// sanity checks
|
| 13478 |
+
GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
|
| 13479 |
|
| 13480 |
size_t total_size_org = 0;
|
| 13481 |
size_t total_size_new = 0;
|
|
|
|
| 13505 |
// placeholder for the meta data
|
| 13506 |
::zeros(fout, meta_size);
|
| 13507 |
|
| 13508 |
+
const auto tn = LLM_TN(model.arch);
|
| 13509 |
+
|
| 13510 |
for (int i = 0; i < ml.n_tensors; ++i) {
|
| 13511 |
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
|
| 13512 |
|
|
|
|
| 13529 |
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
| 13530 |
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
|
| 13531 |
|
| 13532 |
+
// quantize only 2D and 3D tensors (experts)
|
| 13533 |
+
quantize &= (ggml_n_dims(tensor) >= 2);
|
| 13534 |
quantize &= params->quantize_output_tensor || name != "output.weight";
|
| 13535 |
quantize &= !params->only_copy;
|
| 13536 |
|
|
|
|
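The quantize flag above starts from a cheap "ends with 'weight'" test that replaced a std::regex for compile-time reasons; it works because "weight" is six characters long, so an occurrence found at size() - 6 must sit exactly at the end of the name. The idiom in isolation:

```cpp
// Sketch: suffix test without <regex>.
#include <cassert>
#include <string>

static bool ends_with_weight(const std::string & name) {
    return name.size() >= 6 && name.rfind("weight") == name.size() - 6;
}

int main() {
    assert( ends_with_weight("blk.0.attn_q.weight"));
    assert(!ends_with_weight("blk.0.attn_q.bias"));
    assert(!ends_with_weight("weights")); // "weight" occurs, but not as the final six characters
    return 0;
}
```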
| 13585 |
if (it == imatrix_data->end()) {
|
| 13586 |
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
| 13587 |
} else {
|
| 13588 |
+
if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
|
| 13589 |
imatrix = it->second.data();
|
| 13590 |
} else {
|
| 13591 |
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
|
| 13592 |
+
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
|
| 13593 |
+
|
| 13594 |
+
// this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
|
| 13595 |
+
// this is a significant error and it may be a good idea to abort the process if this happens,
|
| 13596 |
+
// since many people will miss the error and not realize that most of the model is being quantized without an imatrix
|
| 13597 |
+
// tok_embd should be ignored in this case, since it always causes this warning
|
| 13598 |
+
if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
|
| 13599 |
+
throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
|
| 13600 |
+
int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
|
| 13601 |
+
}
|
| 13602 |
}
|
| 13603 |
}
|
| 13604 |
}
|
|
|
|
| 13635 |
new_data = work.data();
|
| 13636 |
|
| 13637 |
const int n_per_row = tensor->ne[0];
|
| 13638 |
+
const int nrows = tensor->ne[1];
|
| 13639 |
|
| 13640 |
static const int min_chunk_size = 32 * 512;
|
| 13641 |
const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
|
| 13642 |
|
| 13643 |
+
const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
| 13644 |
+
const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
| 13645 |
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
|
|
|
| 13646 |
|
| 13647 |
+
// quantize each expert separately since they have different importance matrices
|
| 13648 |
+
new_size = 0;
|
| 13649 |
+
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
| 13650 |
+
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
| 13651 |
+
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
| 13652 |
+
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
| 13653 |
+
|
| 13654 |
+
new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
| 13655 |
+
}
|
| 13656 |
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
| 13657 |
}
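The per-expert loop above quantizes each expert slice of a 3D tensor separately, and splits every slice into row chunks of at least min_chunk_size elements so the worker threads get reasonably sized units. The arithmetic in isolation, with made-up tensor dimensions:

```cpp
// Sketch: chunk and thread-count computation used when quantizing one expert slice.
#include <algorithm>
#include <cstdio>

int main() {
    const int n_per_row = 4096, nrows = 14336, nthread = 16; // illustrative sizes
    static const int min_chunk_size = 32 * 512;

    // a chunk is a whole number of rows covering at least min_chunk_size values
    const int chunk_size = n_per_row >= min_chunk_size
        ? n_per_row
        : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row);

    const int nelements_matrix = n_per_row * nrows;
    const int nchunk      = (nelements_matrix + chunk_size - 1) / chunk_size;
    const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;

    std::printf("chunk_size=%d nchunk=%d threads=%d\n", chunk_size, nchunk, nthread_use);
    return 0;
}
```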
|
| 13658 |
total_size_org += ggml_nbytes(tensor);
|
|
|
|
| 14293 |
}
|
| 14294 |
}
|
| 14295 |
#elif defined(GGML_USE_VULKAN)
|
| 14296 |
+
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
| 14297 |
+
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
|
| 14298 |
+
llama_free(ctx);
|
| 14299 |
+
return nullptr;
|
| 14300 |
+
}
|
| 14301 |
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
| 14302 |
+
ggml_backend_t backend = ggml_backend_vk_init(0);
|
| 14303 |
+
if (backend == nullptr) {
|
| 14304 |
+
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
| 14305 |
+
llama_free(ctx);
|
| 14306 |
+
return nullptr;
|
| 14307 |
+
}
|
| 14308 |
+
ctx->backends.push_back(backend);
|
| 14309 |
+
} else {
|
| 14310 |
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
| 14311 |
ggml_backend_t backend = ggml_backend_vk_init(device);
|
| 14312 |
if (backend == nullptr) {
|
|
|
|
| 14525 |
case LLM_ARCH_ORION:
|
| 14526 |
case LLM_ARCH_INTERNLM2:
|
| 14527 |
case LLM_ARCH_MINICPM:
|
| 14528 |
+
case LLM_ARCH_XVERSE:
|
| 14529 |
case LLM_ARCH_COMMAND_R:
|
| 14530 |
return LLAMA_ROPE_TYPE_NORM;
|
| 14531 |
|
|
|
|
| 15863 |
ss << message->content << "</s>";
|
| 15864 |
}
|
| 15865 |
}
|
| 15866 |
+
} else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
|
| 15867 |
+
// openchat/openchat-3.5-0106,
|
| 15868 |
+
for (auto message : chat) {
|
| 15869 |
+
std::string role(message->role);
|
| 15870 |
+
if (role == "system") {
|
| 15871 |
+
ss << message->content << "<|end_of_turn|>";
|
| 15872 |
+
} else {
|
| 15873 |
+
role[0] = toupper(role[0]);
|
| 15874 |
+
ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
|
| 15875 |
+
}
|
| 15876 |
+
}
|
| 15877 |
+
if (add_ass) {
|
| 15878 |
+
ss << "GPT4 Correct Assistant:";
|
| 15879 |
+
}
|
| 15880 |
+
} else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
|
| 15881 |
+
// eachadea/vicuna-13b-1.1 (and Orca variant)
|
| 15882 |
+
for (auto message : chat) {
|
| 15883 |
+
std::string role(message->role);
|
| 15884 |
+
if (role == "system") {
|
| 15885 |
+
// Orca-Vicuna variant uses a system prefix
|
| 15886 |
+
if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
|
| 15887 |
+
ss << "SYSTEM: " << message->content << "\n";
|
| 15888 |
+
} else {
|
| 15889 |
+
ss << message->content << "\n\n";
|
| 15890 |
+
}
|
| 15891 |
+
} else if (role == "user") {
|
| 15892 |
+
ss << "USER: " << message->content << "\n";
|
| 15893 |
+
} else if (role == "assistant") {
|
| 15894 |
+
ss << "ASSISTANT: " << message->content << "</s>\n";
|
| 15895 |
+
}
|
| 15896 |
+
}
|
| 15897 |
+
if (add_ass) {
|
| 15898 |
+
ss << "ASSISTANT:";
|
| 15899 |
+
}
|
| 15900 |
+
} else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
|
| 15901 |
+
// deepseek-ai/deepseek-coder-33b-instruct
|
| 15902 |
+
for (auto message : chat) {
|
| 15903 |
+
std::string role(message->role);
|
| 15904 |
+
if (role == "system") {
|
| 15905 |
+
ss << message->content;
|
| 15906 |
+
} else if (role == "user") {
|
| 15907 |
+
ss << "### Instruction:\n" << message->content << "\n";
|
| 15908 |
+
} else if (role == "assistant") {
|
| 15909 |
+
ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
|
| 15910 |
+
}
|
| 15911 |
+
}
|
| 15912 |
+
if (add_ass) {
|
| 15913 |
+
ss << "### Response:\n";
|
| 15914 |
+
}
|
| 15915 |
} else {
|
| 15916 |
// template not supported
|
| 15917 |
return -1;
|
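The template handlers above (openchat, vicuna, deepseek, ...) are normally reached through the public llama_chat_apply_template API. A hedged usage sketch, assuming the signature declared in this version of llama.h; passing a template name instead of a model template string picks the matching built-in handler, as the comparisons above show:

```cpp
// Sketch: formatting a conversation with the built-in "vicuna" handler.
#include "llama.h"
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const std::vector<llama_chat_message> chat = {
        { "system",    "You are a helpful assistant." },
        { "user",      "Hello!"                       },
        { "assistant", "Hi, how can I help?"          },
        { "user",      "Summarize llama.cpp."         },
    };

    std::string buf(4096, '\0');
    int32_t n = llama_chat_apply_template(nullptr, "vicuna", chat.data(), chat.size(),
                                          /*add_ass=*/true, &buf[0], (int32_t) buf.size());
    if (n < 0) {
        std::fprintf(stderr, "template not supported\n"); // the final else branch above
        return 1;
    }
    if ((size_t) n > buf.size()) {   // buffer was too small: grow and format again
        buf.resize(n);
        n = llama_chat_apply_template(nullptr, "vicuna", chat.data(), chat.size(),
                                      /*add_ass=*/true, &buf[0], (int32_t) buf.size());
    }
    std::printf("%.*s\n", n, buf.c_str());
    return 0;
}
```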
examples/talk-llama/llama.h
CHANGED
|
@@ -60,9 +60,9 @@ extern "C" {
|
|
| 60 |
|
| 61 |
enum llama_vocab_type {
|
| 62 |
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
| 63 |
-
LLAMA_VOCAB_TYPE_SPM = 1, //
|
| 64 |
-
LLAMA_VOCAB_TYPE_BPE = 2, //
|
| 65 |
-
LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
|
| 66 |
};
|
| 67 |
|
| 68 |
// note: these values should be synchronized with ggml_rope
|
|
@@ -1007,10 +1007,38 @@ extern "C" {
|
|
| 1007 |
|
| 1008 |
struct ggml_tensor;
|
| 1009 |
|
|
|
| 1010 |
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
| 1011 |
struct llama_context * ctx
|
| 1012 |
);
|
| 1013 |
|
|
| 1014 |
#endif // LLAMA_API_INTERNAL
|
| 1015 |
|
| 1016 |
#endif // LLAMA_H
|
|
|
|
| 60 |
|
| 61 |
enum llama_vocab_type {
|
| 62 |
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
| 63 |
+
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
| 64 |
+
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
| 65 |
+
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
| 66 |
};
|
| 67 |
|
| 68 |
// note: these values should be synchronized with ggml_rope
|
|
|
|
| 1007 |
|
| 1008 |
struct ggml_tensor;
|
| 1009 |
|
| 1010 |
+
struct llama_partial_utf8 {
|
| 1011 |
+
uint32_t value; // bit value so far (unshifted)
|
| 1012 |
+
int n_remain; // num bytes remaining; -1 indicates invalid sequence
|
| 1013 |
+
};
|
| 1014 |
+
|
| 1015 |
+
struct llama_grammar {
|
| 1016 |
+
const std::vector<std::vector<llama_grammar_element>> rules;
|
| 1017 |
+
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
| 1018 |
+
|
| 1019 |
+
// buffer for partially generated UTF-8 sequence from accepted tokens
|
| 1020 |
+
llama_partial_utf8 partial_utf8;
|
| 1021 |
+
};
|
| 1022 |
+
|
| 1023 |
+
struct llama_grammar_candidate {
|
| 1024 |
+
size_t index;
|
| 1025 |
+
const uint32_t * code_points;
|
| 1026 |
+
llama_partial_utf8 partial_utf8;
|
| 1027 |
+
};
|
| 1028 |
+
|
| 1029 |
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
| 1030 |
struct llama_context * ctx
|
| 1031 |
);
|
| 1032 |
|
| 1033 |
+
std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
| 1034 |
+
const std::vector<std::vector<llama_grammar_element>> & rules,
|
| 1035 |
+
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
| 1036 |
+
const uint32_t chr);
|
| 1037 |
+
|
| 1038 |
+
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
| 1039 |
+
const std::string & src,
|
| 1040 |
+
llama_partial_utf8 partial_start);
|
| 1041 |
+
|
| 1042 |
#endif // LLAMA_API_INTERNAL
|
| 1043 |
|
| 1044 |
#endif // LLAMA_H
|
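Since the grammar internals are now exposed under LLAMA_API_INTERNAL, here is a hedged sketch of how decode_utf8 and llama_partial_utf8 fit together, assuming the behavior described by the comment in llama.cpp (a multi-byte character split across two chunks is carried over via the returned partial state):

```cpp
// Sketch: carrying a partial UTF-8 sequence across two decode_utf8 calls.
#define LLAMA_API_INTERNAL
#include "llama.h"
#include <cstdio>
#include <string>

int main() {
    // "é" is 0xC3 0xA9 in UTF-8; split it across two chunks, as tokenization might.
    const std::string chunk1 = "caf\xC3";
    const std::string chunk2 = "\xA9!";

    llama_partial_utf8 state = { 0, 0 };

    auto r1 = decode_utf8(chunk1, state);    // 'c','a','f' decoded, 0xC3 left pending
    state = r1.second;
    std::printf("bytes still needed after chunk1: %d\n", state.n_remain);  // expected: 1

    auto r2 = decode_utf8(chunk2, state);    // pending sequence completed, then '!'
    if (!r2.first.empty()) {
        std::printf("first code point of chunk2: U+%04X\n", (unsigned) r2.first[0]); // expected: U+00E9
    }
    return 0;
}
```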