talk-llama : sync llama.cpp
- examples/talk-llama/llama.cpp (+700, -202)
- examples/talk-llama/llama.h (+12, -19)
examples/talk-llama/llama.cpp
CHANGED
@@ -15,6 +15,8 @@
 # include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
 # include "ggml-sycl.h"
+#elif defined(GGML_USE_KOMPUTE)
+# include "ggml-kompute.h"
 #endif
 
 #ifdef GGML_USE_METAL
@@ -202,10 +204,12 @@ enum llm_arch {
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
+    LLM_ARCH_INTERNLM2,
+    LLM_ARCH_MINICPM,
     LLM_ARCH_UNKNOWN,
 };
 
-static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
+static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA,           "llama"     },
     { LLM_ARCH_FALCON,          "falcon"    },
     { LLM_ARCH_GPT2,            "gpt2"      },

@@ -224,6 +228,8 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_PLAMO,           "plamo"     },
     { LLM_ARCH_CODESHELL,       "codeshell" },
     { LLM_ARCH_ORION,           "orion"     },
+    { LLM_ARCH_INTERNLM2,       "internlm2" },
+    { LLM_ARCH_MINICPM,         "minicpm"   },
 };
 
 enum llm_kv {
@@ -276,11 +282,12 @@ enum llm_kv {
     LLM_KV_TOKENIZER_PAD_ID,
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
+    LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
 };
 
-static std::map<llm_kv, std::string> LLM_KV_NAMES = {
+static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_ARCHITECTURE,          "general.architecture"         },
     { LLM_KV_GENERAL_QUANTIZATION_VERSION,  "general.quantization_version" },
     { LLM_KV_GENERAL_ALIGNMENT,             "general.alignment"            },

@@ -330,6 +337,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_PAD_ID,      "tokenizer.ggml.padding_token_id" },
     { LLM_KV_TOKENIZER_ADD_BOS,     "tokenizer.ggml.add_bos_token"    },
     { LLM_KV_TOKENIZER_ADD_EOS,     "tokenizer.ggml.add_eos_token"    },
+    { LLM_KV_TOKENIZER_ADD_PREFIX,  "tokenizer.ggml.add_space_prefix" },
     { LLM_KV_TOKENIZER_HF_JSON,     "tokenizer.huggingface.json"      },
     { LLM_KV_TOKENIZER_RWKV,        "tokenizer.rwkv.world"            },
 };
@@ -340,7 +348,7 @@ struct LLM_KV {
     llm_arch arch;
 
     std::string operator()(llm_kv kv) const {
-        return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
+        return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
     }
 };
 
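For context on the hunk above: each llm_kv entry is a printf-style template (for example "%s.rope.freq_base"), and LLM_KV::operator() fills in the architecture name, so one key table serves every model family. A minimal standalone sketch of that pattern, using plain std::map and snprintf rather than the llama.cpp helpers; the enum and names below are hypothetical stand-ins:

#include <cstdio>
#include <map>
#include <string>

enum hyp_arch { HYP_ARCH_LLAMA, HYP_ARCH_FALCON };          // hypothetical stand-ins
static std::map<hyp_arch, const char *> HYP_ARCH_NAMES = {
    { HYP_ARCH_LLAMA,  "llama"  },
    { HYP_ARCH_FALCON, "falcon" },
};

// key template with a %s placeholder for the architecture name
static const char * HYP_KV_ROPE_FREQ_BASE = "%s.rope.freq_base";

static std::string make_key(hyp_arch arch, const char * tmpl) {
    char buf[256];
    snprintf(buf, sizeof(buf), tmpl, HYP_ARCH_NAMES[arch]);  // e.g. "llama.rope.freq_base"
    return buf;
}

int main() {
    printf("%s\n", make_key(HYP_ARCH_LLAMA,  HYP_KV_ROPE_FREQ_BASE)); // llama.rope.freq_base
    printf("%s\n", make_key(HYP_ARCH_FALCON, HYP_KV_ROPE_FREQ_BASE)); // falcon.rope.freq_base
    return 0;
}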
@@ -667,7 +675,46 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
     },
 },
-
+{
+    LLM_ARCH_INTERNLM2,
+    {
+        { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+        { LLM_TENSOR_OUTPUT,          "output" },
+        { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+        { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+    },
+},
+{
+    LLM_ARCH_MINICPM,
+    {
+        { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+        { LLM_TENSOR_OUTPUT,          "output" },
+        { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+        { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+        { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+        { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+        { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+        { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+    },
+},
 {
     LLM_ARCH_UNKNOWN,
     {
@@ -725,13 +772,13 @@ struct LLM_TN {
 // gguf helpers
 //
 
-static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
+static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
     { LLAMA_ROPE_SCALING_NONE,   "none"   },
     { LLAMA_ROPE_SCALING_LINEAR, "linear" },
     { LLAMA_ROPE_SCALING_YARN,   "yarn"   },
 };
 
-static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
+static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
     for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
         if (kv.second == name) {
             return kv.first;
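The string-to-enum lookup above is simply a linear scan over the name table, which keeps a single source of truth for the names. A self-contained sketch of the same idea; the enum values and the UNSPECIFIED fallback are illustrative, not the llama.cpp definitions:

#include <cstdio>
#include <map>
#include <string>

enum rope_scaling : int32_t { ROPE_NONE = 0, ROPE_LINEAR = 1, ROPE_YARN = 2, ROPE_UNSPECIFIED = -1 };

static const std::map<int32_t, const char *> ROPE_SCALING_NAMES = {
    { ROPE_NONE,   "none"   },
    { ROPE_LINEAR, "linear" },
    { ROPE_YARN,   "yarn"   },
};

// reverse lookup: name -> enum value, with a fallback when the name is unknown
static int32_t rope_scaling_from_string(const std::string & name) {
    for (const auto & kv : ROPE_SCALING_NAMES) {
        if (kv.second == name) {
            return kv.first;
        }
    }
    return ROPE_UNSPECIFIED;
}

int main() {
    printf("%d\n", rope_scaling_from_string("yarn"));   // 2
    printf("%d\n", rope_scaling_from_string("bogus"));  // -1
    return 0;
}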
@@ -1158,10 +1205,10 @@ struct llama_mlock {
 #ifdef __APPLE__
 #define MLOCK_SUGGESTION \
     "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
 #else
 #define MLOCK_SUGGESTION \
-    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+    "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
 #endif
 
     bool raw_lock(const void * addr, size_t size) const {
@@ -1308,11 +1355,16 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 #elif defined(GGML_USE_CUBLAS)
     buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type();
+    buft = ggml_backend_vk_buffer_type(gpu);
 #elif defined(GGML_USE_SYCL)
     buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
     buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type(gpu);
+    if (buft == nullptr) {
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+    }
 #endif
 
     if (buft == nullptr) {
@@ -1340,6 +1392,33 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
     GGML_UNUSED(tensor_split);
 }
 
+static size_t llama_get_device_count() {
+#if defined(GGML_USE_CUBLAS)
+    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    return ggml_backend_vk_get_device_count();
+#else
+    return 1;
+#endif
+}
+
+static size_t llama_get_device_memory(int device) {
+#if defined(GGML_USE_CUBLAS)
+    size_t total;
+    size_t free;
+    ggml_backend_cuda_get_device_memory(device, &total, &free);
+    return free;
+#elif defined(GGML_USE_VULKAN)
+    size_t total;
+    size_t free;
+    ggml_backend_vk_get_device_memory(device, &total, &free);
+    return free;
+#else
+    return 1;
+    GGML_UNUSED(device);
+#endif
+}
+
 //
 // globals
 //
@@ -1363,6 +1442,7 @@ enum e_model {
     MODEL_UNKNOWN,
     MODEL_0_5B,
     MODEL_1B,
+    MODEL_2B,
     MODEL_3B,
     MODEL_4B,
     MODEL_7B,

@@ -1370,6 +1450,7 @@ enum e_model {
     MODEL_13B,
     MODEL_14B,
     MODEL_15B,
+    MODEL_20B,
     MODEL_30B,
     MODEL_34B,
     MODEL_40B,

@@ -1387,6 +1468,7 @@ static const size_t GiB = 1024*MiB;
 
 struct llama_hparams {
     bool vocab_only;
+    bool rope_finetuned;
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
@@ -1406,8 +1488,7 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
-    int8_t rope_scaling_type_train : 3;
-    bool rope_finetuned : 1;
+    int32_t rope_scaling_type_train;
 
     float f_clamp_kqv;
     float f_max_alibi_bias;
@@ -1611,6 +1692,8 @@ struct llama_vocab {
     id special_suffix_id = 32008;
     id special_eot_id    = 32010;
 
+    bool add_space_prefix = true;
+
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
         GGML_ASSERT(token_left.find(' ') == std::string::npos);
         GGML_ASSERT(token_left.find('\n') == std::string::npos);
@@ -1707,6 +1790,10 @@ struct llama_context {
         ggml_backend_free(backend);
     }
 
+#ifdef GGML_USE_VULKAN
+    ggml_vk_free_cpu_assist();
+#endif
+
     ggml_backend_buffer_free(buf_input);
     ggml_free(ctx_input);
 }
@@ -2360,6 +2447,7 @@ struct llama_model_loader {
                 case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
                 case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
                 case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
+                case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
                 default:
                     {
                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));

@@ -2670,7 +2758,7 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-static std::string llama_model_arch_name(llm_arch arch) {
+static const char * llama_model_arch_name(llm_arch arch) {
     auto it = LLM_ARCH_NAMES.find(arch);
     if (it == LLM_ARCH_NAMES.end()) {
         return "unknown";
@@ -2705,9 +2793,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
         case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
 
         default: return "unknown, may not work";
     }

@@ -2716,12 +2805,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_1B:  return "1B";
+        case MODEL_2B:  return "2B";
         case MODEL_3B:  return "3B";
         case MODEL_7B:  return "7B";
         case MODEL_8B:  return "8B";
         case MODEL_13B: return "13B";
         case MODEL_14B: return "14B";
         case MODEL_15B: return "15B";
+        case MODEL_20B: return "20B";
         case MODEL_30B: return "30B";
         case MODEL_34B: return "34B";
         case MODEL_40B: return "40B";

@@ -2734,6 +2825,14 @@ static const char * llama_model_type_name(e_model type) {
         default: return "?B";
     }
 }
+static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
+    switch (type) {
+        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+        default:                   return "unknown";
+    }
+}
+
 
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
     model.arch = ml.get_arch();
@@ -2846,6 +2945,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MINICPM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_2B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_FALCON:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

@@ -2997,6 +3105,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_INTERNLM2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B;  break;
+                    case 48: model.type = e_model::MODEL_20B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -3048,6 +3165,11 @@ static void llm_load_vocab(
             vocab.special_unk_id = 0;
             vocab.special_sep_id = -1;
             vocab.special_pad_id = -1;
+
+            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+            if (add_space_prefix_keyidx != -1) {
+                vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+            } // The default value of add_space_prefix is true.
         } else if (tokenizer_name == "gpt2") {
            vocab.type = LLAMA_VOCAB_TYPE_BPE;
 
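The block above is the usual pattern for an optional GGUF key: look the key up, and only override the built-in default when it is present. A small sketch of the same pattern as a helper; gguf_find_key and gguf_get_val_bool are the real gguf accessors used above, while the helper name itself is illustrative (llama.cpp inlines this logic):

#include "ggml.h"   // gguf_context, gguf_find_key, gguf_get_val_bool

// Read an optional boolean key, falling back to `def` when the key is absent.
static bool gguf_get_bool_or(const struct gguf_context * ctx, const char * key, bool def) {
    const int keyidx = gguf_find_key(ctx, key);
    if (keyidx == -1) {
        return def;   // key not present: keep the default
    }
    return gguf_get_val_bool(ctx, keyidx);
}

// usage, e.g. inside llm_load_vocab:
//   vocab.add_space_prefix = gguf_get_bool_or(ctx, "tokenizer.ggml.add_space_prefix", true);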
@@ -3255,12 +3377,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     const auto & hparams = model.hparams;
     const auto & vocab   = model.vocab;
 
-    const auto & rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
 
     // hparams
     LLAMA_LOG_INFO("%s: format           = %s\n",     __func__, llama_file_version_name(ml.fver));
-    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
-    LLAMA_LOG_INFO("%s: vocab type       = %s\n",     __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE");
+    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch));
+    LLAMA_LOG_INFO("%s: vocab type       = %s\n",     __func__, llama_model_vocab_type_name(vocab.type));
     LLAMA_LOG_INFO("%s: n_vocab          = %u\n",     __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: n_merges         = %u\n",     __func__, (int) vocab.bpe_ranks.size());
     LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n",     __func__, hparams.n_ctx_train);

@@ -3281,7 +3403,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
-    LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type.c_str());
+    LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
     LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
     LLAMA_LOG_INFO("%s: n_yarn_orig_ctx  = %u\n",     __func__, hparams.n_yarn_orig_ctx);
@@ -3347,22 +3469,18 @@ static bool llm_load_tensors(
             model.buft_layer[i] = llama_default_buffer_type_cpu(true);
         }
 
-#ifdef GGML_USE_CUBLAS
     if (split_mode == LLAMA_SPLIT_LAYER) {
         // calculate the split points
-        int device_count = ggml_backend_cuda_get_device_count();
+        int device_count = llama_get_device_count();
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
-        float splits[GGML_CUDA_MAX_DEVICES];
+        std::vector<float> splits(device_count);
         if (all_zero) {
             // default split, by free memory
             for (int i = 0; i < device_count; ++i) {
-                size_t total;
-                size_t free;
-                ggml_backend_cuda_get_device_memory(i, &total, &free);
-                splits[i] = free;
+                splits[i] = llama_get_device_memory(i);
             }
         } else {
-            std::copy(tensor_split, tensor_split + device_count, splits);
+            std::copy(tensor_split, tensor_split + device_count, splits.begin());
         }
 
         // sum and normalize the splits to get the split points

@@ -3378,19 +3496,17 @@ static bool llm_load_tensors(
         // assign the repeating layers to the devices according to the splits
         int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
-            int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
+            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
             model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
-            int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
+            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
             model.buft_output = llama_default_buffer_type_offload(layer_gpu);
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
-    } else
-#endif
-    {
+    } else {
         ggml_backend_buffer_type_t split_buft;
         if (split_mode == LLAMA_SPLIT_ROW) {
             split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
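After the two hunks above, splits[] holds one weight per device (free memory or the user's tensor_split); it is then prefix-summed and normalized into cumulative split points in [0,1], and a layer's fractional position is mapped to a device with std::upper_bound. A self-contained sketch of that assignment logic with made-up numbers, not the llama.cpp code itself:

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    // per-device weights, e.g. free VRAM in GiB on 3 hypothetical GPUs
    std::vector<float> splits = { 8.0f, 16.0f, 8.0f };

    // turn them into cumulative, normalized split points: {0.25, 0.75, 1.0}
    std::partial_sum(splits.begin(), splits.end(), splits.begin());
    const float total = splits.back();
    for (float & s : splits) {
        s /= total;
    }

    const int n_layers = 32;
    for (int i = 0; i < n_layers; ++i) {
        // fractional position of layer i in [0,1), mapped to the first split point above it
        const float frac  = float(i) / n_layers;
        const int device  = std::upper_bound(splits.begin(), splits.end(), frac) - splits.begin();
        if (i % 8 == 0) {
            printf("layer %2d -> device %d\n", i, device);
        }
    }
    return 0;
}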
@@ -3469,13 +3585,16 @@ static bool llm_load_tensors(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
+        case LLM_ARCH_MINICPM:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    if (model.arch != LLM_ARCH_MINICPM){
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    }
                 }
 
                 for (int i = 0; i < n_layer; ++i) {
@@ -4009,8 +4128,35 @@ static bool llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                 }
             } break;
-
+        case LLM_ARCH_INTERNLM2:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    // layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -4063,8 +4209,7 @@ static bool llm_load_tensors(
         ctx_bufs.emplace_back(ctx, buf);
     }
 
-    // print memory requirements
-    {
+    if (llama_supports_gpu_offload()) {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
         LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);

@@ -4076,10 +4221,11 @@ static bool llm_load_tensors(
         const int max_offloadable_layers = hparams.n_layer + 1;
 
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+    }
 
-        for (ggml_backend_buffer_t buf : model.bufs) {
-            LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
-        }
+    // print memory requirements
+    for (ggml_backend_buffer_t buf : model.bufs) {
+        LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
     }
 
     // populate tensors_by_name
@@ -4107,7 +4253,7 @@ static bool llm_load_tensors(
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
     try {
         llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
@@ -4128,6 +4274,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
         return 0;
     }
 
+#ifdef GGML_USE_KOMPUTE
+    if (params.n_gpu_layers > 0 && (
+        !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
+        || !(
+            model.ftype == LLAMA_FTYPE_ALL_F32 ||
+            model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+            model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+        )
+    )) {
+        // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
+        LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
+        params.n_gpu_layers = 0;
+    }
+#endif
+
     if (!llm_load_tensors(
         ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
         params.progress_callback, params.progress_callback_user_data
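The guard above is a capability whitelist: when the Kompute backend cannot handle the model's architecture or quantization, offloading is downgraded to CPU-only instead of failing the load. A small self-contained sketch of that pattern with hypothetical enums (not the llama.cpp types):

#include <cstdio>

enum hyp_arch  { ARCH_LLAMA, ARCH_FALCON, ARCH_OTHER };                       // hypothetical
enum hyp_ftype { FTYPE_F32, FTYPE_F16, FTYPE_Q4_0, FTYPE_Q4_1, FTYPE_Q5_K };  // hypothetical

// Return the number of GPU layers actually usable: keep the request only when
// both the architecture and the file type are on the backend's whitelist.
static int clamp_gpu_layers(int requested, hyp_arch arch, hyp_ftype ftype) {
    const bool arch_ok  = (arch == ARCH_LLAMA || arch == ARCH_FALCON);
    const bool ftype_ok = (ftype == FTYPE_F32 || ftype == FTYPE_F16 ||
                           ftype == FTYPE_Q4_0 || ftype == FTYPE_Q4_1);
    if (requested > 0 && !(arch_ok && ftype_ok)) {
        fprintf(stderr, "warning: unsupported arch/quantization, falling back to CPU\n");
        return 0;
    }
    return requested;
}

int main() {
    printf("%d\n", clamp_gpu_layers(32, ARCH_LLAMA, FTYPE_Q4_0)); // 32: supported
    printf("%d\n", clamp_gpu_layers(32, ARCH_OTHER, FTYPE_Q5_K)); //  0: falls back to CPU
    return 0;
}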
@@ -4641,126 +4803,6 @@ struct llm_build_context {
             ctx0 = nullptr;
         }
     }
-    struct ggml_cgraph * build_orion() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
-        cb(inpL, "inp_embd", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        cb(inp_pos, "inp_pos", -1);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
-        cb(KQ_mask, "KQ_mask", -1);
-
-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-                // if (model.layers[il].bq) {
-                //     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                //     cb(Qcur, "Qcur", il);
-                // }
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                // if (model.layers[il].bk) {
-                //     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                //     cb(Kcur, "Kcur", il);
-                // }
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-                // if (model.layers[il].bv) {
-                //     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                //     cb(Vcur, "Vcur", il);
-                // }
-
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
-                    LLM_NORM, cb, il);
-            cb(cur, "ffn_norm", il);
-
-            cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-            cb(cur, "ffn_out", il);
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-
 
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -6564,23 +6606,409 @@ struct llm_build_context {
 
         return gf;
     }
-};
 
-    const llama_batch & batch) {
-    const auto & model = lctx.model;
+    struct ggml_cgraph * build_orion() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                // if (model.layers[il].bq) {
+                //     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                //     cb(Qcur, "Qcur", il);
+                // }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                // if (model.layers[il].bk) {
+                //     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                //     cb(Kcur, "Kcur", il);
+                // }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                // if (model.layers[il].bv) {
+                //     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                //     cb(Vcur, "Vcur", il);
+                // }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_internlm2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
 
     if (!lctx.cparams.offload_kqv) {
         if (strcmp(name, "kqv_merged_cont") == 0) {
@@ -6722,6 +7150,14 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_orion();
             } break;
+        case LLM_ARCH_INTERNLM2:
+            {
+                result = llm.build_internlm2();
+            } break;
+        case LLM_ARCH_MINICPM:
+            {
+                result = llm.build_minicpm();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -6849,15 +7285,12 @@ static int llama_decode_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     // with the BLAS calls. need a better solution
         n_threads = std::min(4, n_threads);
     }
 
-    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
-        n_threads = 1;
-    }
-
 #ifdef GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -7669,7 +8102,9 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 //
                 auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
                 if (&fragment == &fragment_buffer.front()) {
-                    raw_text = " " + raw_text; // prefix with space if the first token is not special
+                    if (vocab.add_space_prefix) {
+                        raw_text = " " + raw_text; // prefix with space if the first token is not special
+                    }
                 }
 
 #ifdef PRETOKENIZERDEBUG
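This is the knob the new tokenizer.ggml.add_space_prefix key controls: SentencePiece-style models usually expect a leading space before the first fragment, and models exported without that convention can now turn it off. A small self-contained sketch of the observable difference, using plain strings rather than the llama.cpp tokenizer:

#include <cstdio>
#include <string>

// Mimics the pre-tokenization step above: optionally prefix the first fragment with a space.
static std::string prepare_first_fragment(std::string raw_text, bool add_space_prefix) {
    if (add_space_prefix) {
        raw_text = " " + raw_text; // SPM models typically see "_Hello" rather than "Hello"
    }
    return raw_text;
}

int main() {
    printf("[%s]\n", prepare_first_fragment("Hello world", true).c_str());  // [ Hello world]
    printf("[%s]\n", prepare_first_fragment("Hello world", false).c_str()); // [Hello world]
    return 0;
}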
@@ -8155,6 +8590,10 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
 
     const int64_t t_start_sample_us = ggml_time_us();
 
+    if (k <= 0) {
+        k = candidates->size;
+    }
+
     k = std::max(k, (int) min_keep);
     k = std::min(k, (int) candidates->size);
 
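For context, top-k sampling keeps only the k highest-probability candidates; the clamps above keep k within [min_keep, n_candidates]. A self-contained sketch of that selection step with std::partial_sort over made-up logits (not the llama.cpp sampler):

#include <algorithm>
#include <cstdio>
#include <vector>

struct candidate { int token; float logit; };

// Keep only the k candidates with the highest logits.
static void top_k(std::vector<candidate> & cands, int k, int min_keep) {
    k = std::max(k, min_keep);
    k = std::min(k, (int) cands.size());
    std::partial_sort(cands.begin(), cands.begin() + k, cands.end(),
                      [](const candidate & a, const candidate & b) { return a.logit > b.logit; });
    cands.resize(k);
}

int main() {
    std::vector<candidate> cands = { {0, 0.1f}, {1, 2.5f}, {2, -1.0f}, {3, 1.7f} };
    top_k(cands, 2, 1);
    for (const auto & c : cands) {
        printf("token %d logit %.1f\n", c.token, c.logit); // token 1, then token 3
    }
    return 0;
}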
@@ -9214,6 +9653,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     else if (new_type != GGML_TYPE_Q8_0) {
         new_type = GGML_TYPE_Q6_K;
     }
 } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
     if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;

@@ -9224,7 +9670,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
         ++qs.i_ffn_down;
     }
-    else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
 } else if (name.find("attn_v.weight") != std::string::npos) {
     if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
         new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;

@@ -9232,6 +9677,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
         new_type = GGML_TYPE_Q4_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
         new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
     }

@@ -9269,6 +9717,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
         if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
         new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
                  : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K

@@ -9300,13 +9751,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
 } else if (name.find("attn_output.weight") != std::string::npos) {
     if (arch != LLM_ARCH_FALCON) {
         if (qs.model.hparams.n_expert == 8) {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M  ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                 new_type = GGML_TYPE_Q5_K;
             }
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_XXS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         }

@@ -9349,7 +9801,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     bool convert_incompatible_tensor = false;
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
         new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
-        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
+        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
+        new_type == GGML_TYPE_IQ3_XXS) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {

@@ -9363,6 +9816,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         switch (new_type) {
             case GGML_TYPE_IQ2_XXS:
             case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ3_XXS:
             case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
             case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
             case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
@@ -9404,6 +9858,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K;    break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -10054,18 +10509,47 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
 
-int32_t llama_max_devices(void) {
-    return LLAMA_MAX_DEVICES;
+size_t llama_max_devices(void) {
+#if defined(GGML_USE_METAL)
+    return 1;
+#elif defined(GGML_USE_CUBLAS)
+    return GGML_CUDA_MAX_DEVICES;
+#elif defined(GGML_USE_SYCL)
+    return GGML_SYCL_MAX_DEVICES;
+#elif defined(GGML_USE_VULKAN)
+    return GGML_VK_MAX_DEVICES;
+#else
+    return 1;
+#endif
 }
 
-bool llama_mmap_supported(void) {
+bool llama_supports_mmap(void) {
     return llama_mmap::SUPPORTED;
 }
 
-bool llama_mlock_supported(void) {
+bool llama_supports_mlock(void) {
     return llama_mlock::SUPPORTED;
 }
 
+bool llama_supports_gpu_offload(void) {
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+    return true;
+#else
+    return false;
+#endif
+}
+
 void llama_backend_init(bool numa) {
     ggml_time_init();
 
@@ -10097,8 +10581,8 @@ int64_t llama_time_us(void) {
 }
 
 struct llama_model * llama_load_model_from_file(
         const char * path_model,
         struct llama_model_params params) {
     ggml_time_init();
 
     llama_model * model = new llama_model;
@@ -10241,19 +10725,31 @@ struct llama_context * llama_new_context_with_model(
     }
 #elif defined(GGML_USE_VULKAN)
     if (model->n_gpu_layers > 0) {
-        ggml_backend_t backend = ggml_backend_vk_init();
         if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
             llama_free(ctx);
             return nullptr;
         }
         ctx->backends.push_back(backend);
     }
-#elif defined(GGML_USE_SYCL)
     if (model->n_gpu_layers > 0) {
-        ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
         if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize SYCL backend\n", __func__);
             llama_free(ctx);
             return nullptr;
         }
@@ -10464,7 +10960,7 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
 
 int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
-            llama_model_arch_name(model->arch).c_str(),
+            llama_model_arch_name(model->arch),
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }
@@ -11106,22 +11602,24 @@ struct llama_batch llama_batch_get_one(
     };
 }
 
-struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
+struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
     llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
 
     if (embd) {
-        batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
+        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
     } else {
-        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
     }
 
-    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens);
-    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens);
-    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
-    for (int i = 0; i < n_tokens; ++i) {
+    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens_alloc);
+    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens_alloc);
+    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
+    for (int i = 0; i < n_tokens_alloc; ++i) {
         batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
     }
-    batch.logits   = (int8_t *)        malloc(sizeof(int8_t)         * n_tokens);
+    batch.seq_id[n_tokens_alloc] = nullptr;
+
+    batch.logits   = (int8_t *)        malloc(sizeof(int8_t)         * n_tokens_alloc);
 
     return batch;
 }
@@ -11132,7 +11630,7 @@ void llama_batch_free(struct llama_batch batch) {
     if (batch.pos)      free(batch.pos);
     if (batch.n_seq_id) free(batch.n_seq_id);
     if (batch.seq_id) {
-        for (int i = 0; i < batch.n_tokens; ++i) {
+        for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
             free(batch.seq_id[i]);
         }
         free(batch.seq_id);
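The null terminator added to batch.seq_id in the previous hunk is what lets llama_batch_free walk the array without knowing how many tokens were allocated. A short usage sketch of the pair; this uses the real llama.h API, with error handling and the decode call omitted:

#include "llama.h"

int main() {
    // allocate room for up to 512 tokens, token ids only (embd == 0), 1 sequence id per token
    llama_batch batch = llama_batch_init(512, 0, 1);

    // fill one token by hand (llama_batch_add in common.h does this more conveniently)
    batch.token[0]     = 1;      // BOS for many llama-family vocabularies
    batch.pos[0]       = 0;
    batch.n_seq_id[0]  = 1;
    batch.seq_id[0][0] = 0;
    batch.logits[0]    = true;
    batch.n_tokens     = 1;

    // ... llama_decode(ctx, batch) would go here ...

    llama_batch_free(batch);     // frees the token/pos/seq_id/logits arrays
    return 0;
}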
|
|
| 15 |
# include "ggml-vulkan.h"
|
| 16 |
#elif defined(GGML_USE_SYCL)
|
| 17 |
# include "ggml-sycl.h"
|
| 18 |
+
#elif defined(GGML_USE_KOMPUTE)
|
| 19 |
+
# include "ggml-kompute.h"
|
| 20 |
#endif
|
| 21 |
|
| 22 |
#ifdef GGML_USE_METAL
|
|
|
|
| 204 |
LLM_ARCH_PLAMO,
|
| 205 |
LLM_ARCH_CODESHELL,
|
| 206 |
LLM_ARCH_ORION,
|
| 207 |
+
LLM_ARCH_INTERNLM2,
|
| 208 |
+
LLM_ARCH_MINICPM,
|
| 209 |
LLM_ARCH_UNKNOWN,
|
| 210 |
};
|
| 211 |
|
| 212 |
+
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
| 213 |
{ LLM_ARCH_LLAMA, "llama" },
|
| 214 |
{ LLM_ARCH_FALCON, "falcon" },
|
| 215 |
{ LLM_ARCH_GPT2, "gpt2" },
|
|
|
|
| 228 |
{ LLM_ARCH_PLAMO, "plamo" },
|
| 229 |
{ LLM_ARCH_CODESHELL, "codeshell" },
|
| 230 |
{ LLM_ARCH_ORION, "orion" },
|
| 231 |
+
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
| 232 |
+
{ LLM_ARCH_MINICPM, "minicpm" },
|
| 233 |
};
|
| 234 |
|
| 235 |
enum llm_kv {
|
|
|
|
| 282 |
LLM_KV_TOKENIZER_PAD_ID,
|
| 283 |
LLM_KV_TOKENIZER_ADD_BOS,
|
| 284 |
LLM_KV_TOKENIZER_ADD_EOS,
|
| 285 |
+
LLM_KV_TOKENIZER_ADD_PREFIX,
|
| 286 |
LLM_KV_TOKENIZER_HF_JSON,
|
| 287 |
LLM_KV_TOKENIZER_RWKV,
|
| 288 |
};
|
| 289 |
|
| 290 |
+
static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
| 291 |
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
| 292 |
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
| 293 |
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
|
|
|
| 337 |
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
| 338 |
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
| 339 |
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
| 340 |
+
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
| 341 |
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
| 342 |
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
| 343 |
};
|
|
|
|
| 348 |
llm_arch arch;
|
| 349 |
|
| 350 |
std::string operator()(llm_kv kv) const {
|
| 351 |
+
return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
|
| 352 |
}
|
| 353 |
};
|
| 354 |
|
|
|
|
| 675 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 676 |
},
|
| 677 |
},
|
| 678 |
+
{
|
| 679 |
+
LLM_ARCH_INTERNLM2,
|
| 680 |
+
{
|
| 681 |
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 682 |
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
| 683 |
+
{ LLM_TENSOR_OUTPUT, "output" },
|
| 684 |
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 685 |
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 686 |
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
| 687 |
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
| 688 |
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
| 689 |
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
| 690 |
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
| 691 |
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 692 |
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 693 |
+
},
|
| 694 |
+
},
|
| 695 |
+
{
|
| 696 |
+
LLM_ARCH_MINICPM,
|
| 697 |
+
{
|
| 698 |
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 699 |
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
| 700 |
+
{ LLM_TENSOR_OUTPUT, "output" },
|
| 701 |
+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
| 702 |
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 703 |
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 704 |
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
| 705 |
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
| 706 |
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
| 707 |
+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
| 708 |
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
| 709 |
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
| 710 |
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
| 711 |
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 712 |
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 713 |
+
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
| 714 |
+
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
| 715 |
+
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
| 716 |
+
},
|
| 717 |
+
},
|
| 718 |
{
|
| 719 |
LLM_ARCH_UNKNOWN,
|
| 720 |
{
|
|
|
|
| 772 |
// gguf helpers
|
| 773 |
//
|
| 774 |
|
| 775 |
+
static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
| 776 |
{ LLAMA_ROPE_SCALING_NONE, "none" },
|
| 777 |
{ LLAMA_ROPE_SCALING_LINEAR, "linear" },
|
| 778 |
{ LLAMA_ROPE_SCALING_YARN, "yarn" },
|
| 779 |
};
|
| 780 |
|
| 781 |
+
static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
|
| 782 |
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
|
| 783 |
if (kv.second == name) {
|
| 784 |
return kv.first;
|
|
|
|
| 1205 |
#ifdef __APPLE__
|
| 1206 |
#define MLOCK_SUGGESTION \
|
| 1207 |
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
| 1208 |
+
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
|
| 1209 |
#else
|
| 1210 |
#define MLOCK_SUGGESTION \
|
| 1211 |
+
"Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
|
| 1212 |
#endif
|
| 1213 |
|
| 1214 |
bool raw_lock(const void * addr, size_t size) const {
|
|
|
|
| 1355 |
#elif defined(GGML_USE_CUBLAS)
|
| 1356 |
buft = ggml_backend_cuda_buffer_type(gpu);
|
| 1357 |
#elif defined(GGML_USE_VULKAN)
|
| 1358 |
+
buft = ggml_backend_vk_buffer_type(gpu);
|
| 1359 |
#elif defined(GGML_USE_SYCL)
|
| 1360 |
buft = ggml_backend_sycl_buffer_type(gpu);
|
| 1361 |
#elif defined(GGML_USE_CLBLAST)
|
| 1362 |
buft = ggml_backend_opencl_buffer_type();
|
| 1363 |
+
#elif defined(GGML_USE_KOMPUTE)
|
| 1364 |
+
buft = ggml_backend_kompute_buffer_type(gpu);
|
| 1365 |
+
if (buft == nullptr) {
|
| 1366 |
+
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
|
| 1367 |
+
}
|
| 1368 |
#endif
|
| 1369 |
|
| 1370 |
if (buft == nullptr) {
|
|
|
|
| 1392 |
GGML_UNUSED(tensor_split);
|
| 1393 |
}
|
| 1394 |
|
| 1395 |
+
static size_t llama_get_device_count() {
|
| 1396 |
+
#if defined(GGML_USE_CUBLAS)
|
| 1397 |
+
return ggml_backend_cuda_get_device_count();
|
| 1398 |
+
#elif defined(GGML_USE_VULKAN)
|
| 1399 |
+
return ggml_backend_vk_get_device_count();
|
| 1400 |
+
#else
|
| 1401 |
+
return 1;
|
| 1402 |
+
#endif
|
| 1403 |
+
}
|
| 1404 |
+
|
| 1405 |
+
static size_t llama_get_device_memory(int device) {
|
| 1406 |
+
#if defined(GGML_USE_CUBLAS)
|
| 1407 |
+
size_t total;
|
| 1408 |
+
size_t free;
|
| 1409 |
+
ggml_backend_cuda_get_device_memory(device, &total, &free);
|
| 1410 |
+
return free;
|
| 1411 |
+
#elif defined(GGML_USE_VULKAN)
|
| 1412 |
+
size_t total;
|
| 1413 |
+
size_t free;
|
| 1414 |
+
ggml_backend_vk_get_device_memory(device, &total, &free);
|
| 1415 |
+
return free;
|
| 1416 |
+
#else
|
| 1417 |
+
return 1;
|
| 1418 |
+
GGML_UNUSED(device);
|
| 1419 |
+
#endif
|
| 1420 |
+
}
|
| 1421 |
+
|
| 1422 |
//
|
| 1423 |
// globals
|
| 1424 |
//
|
|
|
|
| 1442 |
MODEL_UNKNOWN,
|
| 1443 |
MODEL_0_5B,
|
| 1444 |
MODEL_1B,
|
| 1445 |
+
MODEL_2B,
|
| 1446 |
MODEL_3B,
|
| 1447 |
MODEL_4B,
|
| 1448 |
MODEL_7B,
|
|
|
|
| 1450 |
MODEL_13B,
|
| 1451 |
MODEL_14B,
|
| 1452 |
MODEL_15B,
|
| 1453 |
+
MODEL_20B,
|
| 1454 |
MODEL_30B,
|
| 1455 |
MODEL_34B,
|
| 1456 |
MODEL_40B,
|
|
|
|
| 1468 |
|
| 1469 |
struct llama_hparams {
|
| 1470 |
bool vocab_only;
|
| 1471 |
+
bool rope_finetuned;
|
| 1472 |
uint32_t n_vocab;
|
| 1473 |
uint32_t n_ctx_train; // context size the model was trained on
|
| 1474 |
uint32_t n_embd;
|
|
|
|
| 1488 |
float rope_freq_base_train;
|
| 1489 |
float rope_freq_scale_train;
|
| 1490 |
uint32_t n_yarn_orig_ctx;
|
| 1491 |
+
int32_t rope_scaling_type_train;
|
|
|
|
| 1492 |
|
| 1493 |
float f_clamp_kqv;
|
| 1494 |
float f_max_alibi_bias;
|
|
|
|
| 1692 |
id special_suffix_id = 32008;
|
| 1693 |
id special_eot_id = 32010;
|
| 1694 |
|
| 1695 |
+
bool add_space_prefix = true;
|
| 1696 |
+
|
| 1697 |
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
|
| 1698 |
GGML_ASSERT(token_left.find(' ') == std::string::npos);
|
| 1699 |
GGML_ASSERT(token_left.find('\n') == std::string::npos);
|
|
|
|
| 1790 |
ggml_backend_free(backend);
|
| 1791 |
}
|
| 1792 |
|
| 1793 |
+
#ifdef GGML_USE_VULKAN
|
| 1794 |
+
ggml_vk_free_cpu_assist();
|
| 1795 |
+
#endif
|
| 1796 |
+
|
| 1797 |
ggml_backend_buffer_free(buf_input);
|
| 1798 |
ggml_free(ctx_input);
|
| 1799 |
}
|
|
|
|
| 2447 |
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
| 2448 |
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
|
| 2449 |
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
|
| 2450 |
+
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
| 2451 |
default:
|
| 2452 |
{
|
| 2453 |
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
|
|
|
| 2758 |
// load LLaMA models
|
| 2759 |
//
|
| 2760 |
|
| 2761 |
+
static const char * llama_model_arch_name(llm_arch arch) {
|
| 2762 |
auto it = LLM_ARCH_NAMES.find(arch);
|
| 2763 |
if (it == LLM_ARCH_NAMES.end()) {
|
| 2764 |
return "unknown";
|
|
|
|
| 2793 |
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
|
| 2794 |
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
|
| 2795 |
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
|
| 2796 |
+
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
|
| 2797 |
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
| 2798 |
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
|
| 2799 |
+
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
|
| 2800 |
|
| 2801 |
default: return "unknown, may not work";
|
| 2802 |
}
|
|
|
|
| 2805 |
static const char * llama_model_type_name(e_model type) {
|
| 2806 |
switch (type) {
|
| 2807 |
case MODEL_1B: return "1B";
|
| 2808 |
+
case MODEL_2B: return "2B";
|
| 2809 |
case MODEL_3B: return "3B";
|
| 2810 |
case MODEL_7B: return "7B";
|
| 2811 |
case MODEL_8B: return "8B";
|
| 2812 |
case MODEL_13B: return "13B";
|
| 2813 |
case MODEL_14B: return "14B";
|
| 2814 |
case MODEL_15B: return "15B";
|
| 2815 |
+
case MODEL_20B: return "20B";
|
| 2816 |
case MODEL_30B: return "30B";
|
| 2817 |
case MODEL_34B: return "34B";
|
| 2818 |
case MODEL_40B: return "40B";
|
|
|
|
| 2825 |
default: return "?B";
|
| 2826 |
}
|
| 2827 |
}
|
| 2828 |
+
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
| 2829 |
+
switch (type) {
|
| 2830 |
+
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
| 2831 |
+
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
| 2832 |
+
default: return "unknown";
|
| 2833 |
+
}
|
| 2834 |
+
}
|
| 2835 |
+
|
| 2836 |
|
| 2837 |
static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
|
| 2838 |
model.arch = ml.get_arch();
|
|
|
|
| 2945 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 2946 |
}
|
| 2947 |
} break;
|
| 2948 |
+
case LLM_ARCH_MINICPM:
|
| 2949 |
+
{
|
| 2950 |
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
| 2951 |
+
|
| 2952 |
+
switch (hparams.n_layer) {
|
| 2953 |
+
case 40: model.type = e_model::MODEL_2B; break;
|
| 2954 |
+
default: model.type = e_model::MODEL_UNKNOWN;
|
| 2955 |
+
}
|
| 2956 |
+
} break;
|
| 2957 |
case LLM_ARCH_FALCON:
|
| 2958 |
{
|
| 2959 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
|
|
| 3105 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 3106 |
}
|
| 3107 |
} break;
|
| 3108 |
+
case LLM_ARCH_INTERNLM2:
|
| 3109 |
+
{
|
| 3110 |
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
| 3111 |
+
switch (hparams.n_layer) {
|
| 3112 |
+
case 32: model.type = e_model::MODEL_7B; break;
|
| 3113 |
+
case 48: model.type = e_model::MODEL_20B; break;
|
| 3114 |
+
default: model.type = e_model::MODEL_UNKNOWN;
|
| 3115 |
+
}
|
| 3116 |
+
} break;
|
| 3117 |
default: (void)0;
|
| 3118 |
}
|
| 3119 |
|
|
|
|
| 3165 |
vocab.special_unk_id = 0;
|
| 3166 |
vocab.special_sep_id = -1;
|
| 3167 |
vocab.special_pad_id = -1;
|
| 3168 |
+
|
| 3169 |
+
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
| 3170 |
+
if (add_space_prefix_keyidx != -1) {
|
| 3171 |
+
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
| 3172 |
+
} // The default value of add_space_prefix is true.
|
| 3173 |
} else if (tokenizer_name == "gpt2") {
|
| 3174 |
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
| 3175 |
|
|
|
|
| 3377 |
const auto & hparams = model.hparams;
|
| 3378 |
const auto & vocab = model.vocab;
|
| 3379 |
|
| 3380 |
+
const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
|
| 3381 |
|
| 3382 |
// hparams
|
| 3383 |
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
|
| 3384 |
+
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
|
| 3385 |
+
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
|
| 3386 |
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
| 3387 |
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
|
| 3388 |
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
|
|
|
|
| 3403 |
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
| 3404 |
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
| 3405 |
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
| 3406 |
+
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
| 3407 |
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
| 3408 |
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
| 3409 |
LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
|
|
|
|
| 3469 |
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
|
| 3470 |
}
|
| 3471 |
|
|
|
|
| 3472 |
if (split_mode == LLAMA_SPLIT_LAYER) {
|
| 3473 |
// calculate the split points
|
| 3474 |
+
int device_count = llama_get_device_count();
|
| 3475 |
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
| 3476 |
+
std::vector<float> splits(device_count);
|
| 3477 |
if (all_zero) {
|
| 3478 |
// default split, by free memory
|
| 3479 |
for (int i = 0; i < device_count; ++i) {
|
| 3480 |
+
splits[i] = llama_get_device_memory(i);
|
|
|
|
|
|
|
|
|
|
| 3481 |
}
|
| 3482 |
} else {
|
| 3483 |
+
std::copy(tensor_split, tensor_split + device_count, splits.begin());
|
| 3484 |
}
|
| 3485 |
|
| 3486 |
// sum and normalize the splits to get the split points
|
|
|
|
| 3496 |
// assign the repeating layers to the devices according to the splits
|
| 3497 |
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
|
| 3498 |
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
| 3499 |
+
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
|
| 3500 |
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
|
| 3501 |
}
|
| 3502 |
// assign the output layer
|
| 3503 |
if (n_gpu_layers > n_layer) {
|
| 3504 |
+
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
|
| 3505 |
model.buft_output = llama_default_buffer_type_offload(layer_gpu);
|
| 3506 |
} else {
|
| 3507 |
model.buft_output = llama_default_buffer_type_cpu(true);
|
| 3508 |
}
|
| 3509 |
+
} else {
|
|
|
|
|
|
|
| 3510 |
ggml_backend_buffer_type_t split_buft;
|
| 3511 |
if (split_mode == LLAMA_SPLIT_ROW) {
|
| 3512 |
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
|
|
|
|
| 3585 |
switch (model.arch) {
|
| 3586 |
case LLM_ARCH_LLAMA:
|
| 3587 |
case LLM_ARCH_REFACT:
|
| 3588 |
+
case LLM_ARCH_MINICPM:
|
| 3589 |
{
|
| 3590 |
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
| 3591 |
|
| 3592 |
// output
|
| 3593 |
{
|
| 3594 |
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
| 3595 |
+
if (model.arch != LLM_ARCH_MINICPM){
|
| 3596 |
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
| 3597 |
+
}
|
| 3598 |
}
|
| 3599 |
|
| 3600 |
for (int i = 0; i < n_layer; ++i) {
|
|
|
|
| 4128 |
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
| 4129 |
}
|
| 4130 |
} break;
|
| 4131 |
+
case LLM_ARCH_INTERNLM2:
|
| 4132 |
+
{
|
| 4133 |
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
| 4134 |
+
|
| 4135 |
+
// output
|
| 4136 |
+
{
|
| 4137 |
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
| 4138 |
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
| 4139 |
+
}
|
| 4140 |
+
|
| 4141 |
+
for (int i = 0; i < n_layer; ++i) {
|
| 4142 |
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
| 4143 |
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
| 4144 |
+
|
| 4145 |
+
auto & layer = model.layers[i];
|
| 4146 |
|
| 4147 |
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
| 4148 |
+
// layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
| 4149 |
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
| 4150 |
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
| 4151 |
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
| 4152 |
|
| 4153 |
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
| 4154 |
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
| 4155 |
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
| 4156 |
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
| 4157 |
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
| 4158 |
+
}
|
| 4159 |
+
} break;
|
| 4160 |
default:
|
| 4161 |
throw std::runtime_error("unknown architecture");
|
| 4162 |
}
|
|
|
|
| 4209 |
ctx_bufs.emplace_back(ctx, buf);
|
| 4210 |
}
|
| 4211 |
|
| 4212 |
+
if (llama_supports_gpu_offload()) {
|
|
|
|
| 4213 |
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
| 4214 |
|
| 4215 |
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
|
|
|
| 4221 |
const int max_offloadable_layers = hparams.n_layer + 1;
|
| 4222 |
|
| 4223 |
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
| 4224 |
+
}
|
| 4225 |
|
| 4226 |
+
// print memory requirements
|
| 4227 |
+
for (ggml_backend_buffer_t buf : model.bufs) {
|
| 4228 |
+
LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
| 4229 |
}
|
| 4230 |
|
| 4231 |
// populate tensors_by_name
|
|
|
|
| 4253 |
}
|
| 4254 |
|
| 4255 |
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
| 4256 |
+
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
|
| 4257 |
try {
|
| 4258 |
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
| 4259 |
|
|
|
|
| 4274 |
return 0;
|
| 4275 |
}
|
| 4276 |
|
| 4277 |
+
#ifdef GGML_USE_KOMPUTE
|
| 4278 |
+
if (params.n_gpu_layers > 0 && (
|
| 4279 |
+
!(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
|
| 4280 |
+
|| !(
|
| 4281 |
+
model.ftype == LLAMA_FTYPE_ALL_F32 ||
|
| 4282 |
+
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
|
| 4283 |
+
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
| 4284 |
+
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
|
| 4285 |
+
)
|
| 4286 |
+
)) {
|
| 4287 |
+
// TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
|
| 4288 |
+
LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
|
| 4289 |
+
params.n_gpu_layers = 0;
|
| 4290 |
+
}
|
| 4291 |
+
#endif
|
| 4292 |
+
|
| 4293 |
if (!llm_load_tensors(
|
| 4294 |
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
|
| 4295 |
params.progress_callback, params.progress_callback_user_data
|
|
|
|
| 4803 |
ctx0 = nullptr;
|
| 4804 |
}
|
| 4805 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4806 |
|
| 4807 |
struct ggml_cgraph * build_llama() {
|
| 4808 |
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
|
|
| 6606 |
|
| 6607 |
return gf;
|
| 6608 |
}
|
|
|
|

+    struct ggml_cgraph * build_orion() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                // if (model.layers[il].bq) {
+                //     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                //     cb(Qcur, "Qcur", il);
+                // }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                // if (model.layers[il].bk) {
+                //     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                //     cb(Kcur, "Kcur", il);
+                // }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                // if (model.layers[il].bv) {
+                //     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                //     cb(Vcur, "Vcur", il);
+                // }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_internlm2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    // ref: https://arxiv.org/abs/2203.03466
+    // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
+    // based on the original build_llama() function
+    struct ggml_cgraph * build_minicpm() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        const int64_t n_embd = hparams.n_embd;
+        //TODO: if the model varies, these parameters need to be read from the model
+        const int64_t n_embd_base = 256;
+        const float scale_embd  = 12.0f;
+        const float scale_depth = 1.4f;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // scale the input embeddings
+        inpL = ggml_scale(ctx0, inpL, scale_embd);
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            // scale_res - scale the hidden states for residual connection
+            const float scale_res = scale_depth/sqrtf(float(n_layer));
+            cur = ggml_scale(ctx0, cur, scale_res);
+            cb(cur, "hidden_scaled", -1);
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // scale the hidden states for residual connection
+            cur = ggml_scale(ctx0, cur, scale_res);
+            cb(cur, "hidden_scaled_ffn", -1);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head scaling
+        const float scale_lmhead = float(n_embd_base)/float(n_embd);
+        cur = ggml_scale(ctx0, cur, scale_lmhead);
+        cb(cur, "lmhead_scaling", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+};
+
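For orientation, the hard-coded MiniCPM constants above (n_embd_base = 256, scale_embd = 12.0f, scale_depth = 1.4f, flagged by the TODO as values that should eventually be read from the model) implement, roughly, the µP-style scaling from the reference given in the comments:

\[
h_0 = s_{\text{embd}}\,E(x), \qquad
h_{l+1} = h_l + \frac{s_{\text{depth}}}{\sqrt{n_{\text{layer}}}}\,f_l(h_l), \qquad
\text{logits} = \frac{n_{\text{embd\_base}}}{n_{\text{embd}}}\,E^{\top}\,\operatorname{RMSNorm}(h_L),
\]

where \(f_l\) stands for either the attention or the feed-forward branch of layer \(l\) and \(E\) is the token embedding matrix (reused as the output head via model.tok_embd). This is only a summary of the graph built above, not a statement about the model's training setup.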
+static struct ggml_cgraph * llama_build_graph(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model = lctx.model;
+
+    // check if we should build the worst-case graph (for memory measurement)
+    const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
+
+    // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
+    llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
+        if (il >= 0) {
+            ggml_format_name(cur, "%s-%d", name, il);
+        } else {
+            ggml_set_name(cur, name);
+        }

        if (!lctx.cparams.offload_kqv) {
            if (strcmp(name, "kqv_merged_cont") == 0) {
...
            {
                result = llm.build_orion();
            } break;
+        case LLM_ARCH_INTERNLM2:
+            {
+                result = llm.build_internlm2();
+            } break;
+        case LLM_ARCH_MINICPM:
+            {
+                result = llm.build_minicpm();
+            } break;
        default:
            GGML_ASSERT(false);
    }
...
    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
    // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
    // with the BLAS calls. need a better solution
+    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+    // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
        n_threads = std::min(4, n_threads);
    }

#ifdef GGML_USE_MPI
    const int64_t n_layer = hparams.n_layer;
    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
...
                //
                auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
                if (&fragment == &fragment_buffer.front()) {
+                    if (vocab.add_space_prefix) {
+                        raw_text = " " + raw_text; // prefix with space if the first token is not special
+                    }
                }

#ifdef PRETOKENIZERDEBUG
...
    const int64_t t_start_sample_us = ggml_time_us();

+    if (k <= 0) {
+        k = candidates->size;
+    }
+
    k = std::max(k, (int) min_keep);
    k = std::min(k, (int) candidates->size);
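A minimal usage sketch of the top-K change above: a non-positive k now means "keep every candidate" instead of being clamped straight down to min_keep. The snippet assumes the public llama_sample_top_k() entry point wraps this code path and that ctx, logits and n_vocab come from an already evaluated context; it is illustrative only.

    // build a candidate list from the current logits (ctx, logits, n_vocab assumed available)
    std::vector<llama_token_data> cur;
    cur.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        cur.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array candidates = { cur.data(), cur.size(), false };

    llama_sample_top_k(ctx, &candidates, /*k =*/ -1, /*min_keep =*/ 1); // k <= 0: keeps all n_vocab candidates
    llama_sample_top_k(ctx, &candidates, /*k =*/ 40, /*min_keep =*/ 1); // keeps only the 40 most likely tokens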
...
        else if (new_type != GGML_TYPE_Q8_0) {
            new_type = GGML_TYPE_Q6_K;
        }
+    } else if (name == "token_embd.weight") {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_Q4_K;
+        }
    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
        if (name.find("attn_v.weight") != std::string::npos) {
            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
        ...
            if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
            ++qs.i_ffn_down;
        }
    } else if (name.find("attn_v.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        ...
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
+        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
        ...
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
+            new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
        ...
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (arch != LLM_ARCH_FALCON) {
            if (qs.model.hparams.n_expert == 8) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                    new_type = GGML_TYPE_Q5_K;
                }
            } else {
                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
            }
        ...
    bool convert_incompatible_tensor = false;
    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
+        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
+        new_type == GGML_TYPE_IQ3_XXS) {
        int nx = tensor->ne[0];
        int ny = tensor->ne[1];
        if (nx % QK_K != 0) {
        ...
        switch (new_type) {
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ3_XXS:
            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
        ...
        case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K;    break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;

        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
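The new LLAMA_FTYPE_MOSTLY_IQ3_XXS file type wired through the overrides above is requested like any other ftype; a rough sketch using the existing quantization entry points (the file names and thread count are placeholders):

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_IQ3_XXS; // new i-quant type added in this sync
    qparams.nthread = 8;

    // placeholder paths; llama_model_quantize() returns 0 on success
    if (llama_model_quantize("model-f16.gguf", "model-iq3_xxs.gguf", &qparams) != 0) {
        fprintf(stderr, "quantization failed\n");
    }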
...
    return result;
}

+size_t llama_max_devices(void) {
+#if defined(GGML_USE_METAL)
+    return 1;
+#elif defined(GGML_USE_CUBLAS)
+    return GGML_CUDA_MAX_DEVICES;
+#elif defined(GGML_USE_SYCL)
+    return GGML_SYCL_MAX_DEVICES;
+#elif defined(GGML_USE_VULKAN)
+    return GGML_VK_MAX_DEVICES;
+#else
+    return 1;
+#endif
}

+bool llama_supports_mmap(void) {
    return llama_mmap::SUPPORTED;
}

+bool llama_supports_mlock(void) {
    return llama_mlock::SUPPORTED;
}

+bool llama_supports_gpu_offload(void) {
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+    return true;
+#else
+    return false;
+#endif
+}
+
+// deprecated:
+bool llama_mmap_supported(void) {
+    return llama_supports_mmap();
+}
+
+bool llama_mlock_supported(void) {
+    return llama_supports_mlock();
+}
+
void llama_backend_init(bool numa) {
    ggml_time_init();
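The run-time capability helpers defined above replace the old compile-time LLAMA_MAX_DEVICES / LLAMA_SUPPORTS_GPU_OFFLOAD macros (see the llama.h changes further down). A small, self-contained sketch of how a client might query them:

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    int main() {
        llama_backend_init(/*numa =*/ false);

        printf("max devices : %zu\n", llama_max_devices());
        printf("mmap        : %s\n", llama_supports_mmap()        ? "yes" : "no");
        printf("mlock       : %s\n", llama_supports_mlock()       ? "yes" : "no");
        printf("GPU offload : %s\n", llama_supports_gpu_offload() ? "yes" : "no");

        // tensor_split buffers are now sized with llama_max_devices() instead of LLAMA_MAX_DEVICES
        std::vector<float> tensor_split(llama_max_devices(), 0.0f);

        llama_backend_free();
        return 0;
    }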
| 10581 |
}
|
| 10582 |
|
| 10583 |
struct llama_model * llama_load_model_from_file(
|
| 10584 |
+
const char * path_model,
|
| 10585 |
+
struct llama_model_params params) {
|
| 10586 |
ggml_time_init();
|
| 10587 |
|
| 10588 |
llama_model * model = new llama_model;
|
|
|
|
| 10725 |
}
|
| 10726 |
#elif defined(GGML_USE_VULKAN)
|
| 10727 |
if (model->n_gpu_layers > 0) {
|
| 10728 |
+
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
| 10729 |
+
ggml_backend_t backend = ggml_backend_vk_init(device);
|
| 10730 |
+
if (backend == nullptr) {
|
| 10731 |
+
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
|
| 10732 |
+
llama_free(ctx);
|
| 10733 |
+
return nullptr;
|
| 10734 |
+
}
|
| 10735 |
+
ctx->backends.push_back(backend);
|
| 10736 |
+
}
|
| 10737 |
+
}
|
| 10738 |
+
#elif defined(GGML_USE_SYCL)
|
| 10739 |
+
if (model->n_gpu_layers > 0) {
|
| 10740 |
+
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
|
| 10741 |
if (backend == nullptr) {
|
| 10742 |
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
|
| 10743 |
llama_free(ctx);
|
| 10744 |
return nullptr;
|
| 10745 |
}
|
| 10746 |
ctx->backends.push_back(backend);
|
| 10747 |
}
|
| 10748 |
+
#elif defined(GGML_USE_KOMPUTE)
|
| 10749 |
if (model->n_gpu_layers > 0) {
|
| 10750 |
+
auto * backend = ggml_backend_kompute_init(model->main_gpu);
|
| 10751 |
if (backend == nullptr) {
|
| 10752 |
+
LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
|
| 10753 |
llama_free(ctx);
|
| 10754 |
return nullptr;
|
| 10755 |
}
|
|
|
|
...

int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s %s %s",
+            llama_model_arch_name(model->arch),
            llama_model_type_name(model->type),
            llama_model_ftype_name(model->ftype).c_str());
}
...
    };
}

+struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

    if (embd) {
+        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
    } else {
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
    }

+    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens_alloc);
+    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens_alloc);
+    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
+    for (int i = 0; i < n_tokens_alloc; ++i) {
        batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
    }
+    batch.seq_id[n_tokens_alloc] = nullptr;
+
+    batch.logits   = (int8_t *)        malloc(sizeof(int8_t)         * n_tokens_alloc);

    return batch;
}

    ...
    if (batch.pos)      free(batch.pos);
    if (batch.n_seq_id) free(batch.n_seq_id);
    if (batch.seq_id) {
+        for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
            free(batch.seq_id[i]);
        }
        free(batch.seq_id);
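A short usage sketch for the batch helpers above: llama_batch_init() now also allocates the pos/n_seq_id/seq_id/logits arrays and null-terminates seq_id so that the free path can walk it without knowing the original allocation size. The token values below are placeholders:

    // room for up to 512 tokens, token IDs only (embd == 0), at most 1 sequence id per token
    llama_batch batch = llama_batch_init(/*n_tokens_alloc =*/ 512, /*embd =*/ 0, /*n_seq_max =*/ 1);

    batch.n_tokens = 2;
    for (int i = 0; i < batch.n_tokens; ++i) {
        batch.token[i]     = 1;                         // placeholder token id
        batch.pos[i]       = i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = (i == batch.n_tokens - 1); // request logits only for the last token
    }

    // ... llama_decode(ctx, batch); ...

    llama_batch_free(batch); // frees token/embd, pos, n_seq_id, each seq_id[i] and logits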
examples/talk-llama/llama.h
CHANGED

@@ -3,15 +3,7 @@
 
 #include "ggml.h"
 #include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
-#elif defined(GGML_USE_SYCL)
-#include "ggml-sycl.h"
-#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
-#else
-#define LLAMA_MAX_DEVICES 1
-#endif // GGML_USE_CUBLAS
+
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>

@@ -49,11 +41,6 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 4
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
-// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
-#define LLAMA_SUPPORTS_GPU_OFFLOAD
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif

@@ -111,6 +98,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_XS  = 20, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q2_K_S  = 21, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };

@@ -199,7 +187,7 @@ extern "C" {
         // LLAMA_SPLIT_LAYER: ignored
         int32_t main_gpu;
 
-        // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.

@@ -225,7 +213,7 @@ extern "C" {
         uint32_t n_batch;         // prompt processing maximum batch size
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
-
+        int32_t  rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float    rope_freq_base;   // RoPE base frequency, 0 = from model

@@ -336,9 +324,14 @@ extern "C" {
 
     LLAMA_API int64_t llama_time_us(void);
 
-    LLAMA_API
-
-    LLAMA_API bool
+    LLAMA_API size_t llama_max_devices(void);
+
+    LLAMA_API bool llama_supports_mmap       (void);
+    LLAMA_API bool llama_supports_mlock      (void);
+    LLAMA_API bool llama_supports_gpu_offload(void);
+
+    LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
+    LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
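Finally, a sketch of the new rope_scaling_type context parameter declared above; llama_context_default_params(), llama_model_default_params(), llama_new_context_with_model() and the LLAMA_ROPE_SCALING_* enum values are assumed from the rest of the public header, and the model path is a placeholder:

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx             = 8192;
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; // from enum llama_rope_scaling_type
    cparams.rope_freq_base    = 0.0f;                    // 0 = take the value from the model
    cparams.rope_freq_scale   = 0.0f;                    // 0 = take the value from the model

    struct llama_model * model = llama_load_model_from_file("model.gguf", llama_model_default_params());
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);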