talk-llama : sync llama.cpp
Changed files:
- examples/talk-llama/llama.cpp  +520 -153
- examples/talk-llama/llama.h  +41 -1
- examples/talk-llama/talk-llama.cpp  +1 -1
- examples/talk-llama/unicode.h  +42 -30
examples/talk-llama/llama.cpp  CHANGED
|
@@ -197,6 +197,7 @@ enum llm_arch {
|
|
| 197 |
LLM_ARCH_PERSIMMON,
|
| 198 |
LLM_ARCH_REFACT,
|
| 199 |
LLM_ARCH_BERT,
|
|
|
|
| 200 |
LLM_ARCH_BLOOM,
|
| 201 |
LLM_ARCH_STABLELM,
|
| 202 |
LLM_ARCH_QWEN,
|
|
@@ -211,27 +212,28 @@ enum llm_arch {
|
|
| 211 |
};
|
| 212 |
|
| 213 |
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
| 214 |
-
{ LLM_ARCH_LLAMA, "llama"
|
| 215 |
-
{ LLM_ARCH_FALCON, "falcon"
|
| 216 |
-
{ LLM_ARCH_GPT2, "gpt2"
|
| 217 |
-
{ LLM_ARCH_GPTJ, "gptj"
|
| 218 |
-
{ LLM_ARCH_GPTNEOX, "gptneox"
|
| 219 |
-
{ LLM_ARCH_MPT, "mpt"
|
| 220 |
-
{ LLM_ARCH_BAICHUAN, "baichuan"
|
| 221 |
-
{ LLM_ARCH_STARCODER, "starcoder"
|
| 222 |
-
{ LLM_ARCH_PERSIMMON, "persimmon"
|
| 223 |
-
{ LLM_ARCH_REFACT, "refact"
|
| 224 |
-
{ LLM_ARCH_BERT, "bert"
|
| 225 |
-
{
|
| 226 |
-
{
|
| 227 |
-
{
|
| 228 |
-
{
|
| 229 |
-
{
|
| 230 |
-
{
|
| 231 |
-
{
|
| 232 |
-
{
|
| 233 |
-
{
|
| 234 |
-
{
|
|
|
|
| 235 |
};
|
| 236 |
|
| 237 |
enum llm_kv {
|
|
@@ -254,6 +256,7 @@ enum llm_kv {
|
|
| 254 |
LLM_KV_TENSOR_DATA_LAYOUT,
|
| 255 |
LLM_KV_EXPERT_COUNT,
|
| 256 |
LLM_KV_EXPERT_USED_COUNT,
|
|
|
|
| 257 |
|
| 258 |
LLM_KV_ATTENTION_HEAD_COUNT,
|
| 259 |
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
|
@@ -311,6 +314,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|
| 311 |
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
| 312 |
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
| 313 |
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
|
|
|
| 314 |
|
| 315 |
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
| 316 |
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
|
@@ -373,6 +377,7 @@ enum llm_tensor {
|
|
| 373 |
LLM_TENSOR_ATTN_OUT,
|
| 374 |
LLM_TENSOR_ATTN_NORM,
|
| 375 |
LLM_TENSOR_ATTN_NORM_2,
|
|
|
|
| 376 |
LLM_TENSOR_ATTN_ROT_EMBD,
|
| 377 |
LLM_TENSOR_FFN_GATE_INP,
|
| 378 |
LLM_TENSOR_FFN_NORM,
|
|
@@ -385,6 +390,7 @@ enum llm_tensor {
|
|
| 385 |
LLM_TENSOR_FFN_UP_EXP,
|
| 386 |
LLM_TENSOR_ATTN_Q_NORM,
|
| 387 |
LLM_TENSOR_ATTN_K_NORM,
|
|
|
|
| 388 |
};
|
| 389 |
|
| 390 |
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
|
@@ -550,12 +556,27 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|
| 550 |
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
| 551 |
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
| 552 |
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
| 553 |
-
{
|
| 554 |
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 555 |
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
| 556 |
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
| 557 |
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
| 558 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 560 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 561 |
},
|
|
@@ -772,22 +793,37 @@ struct LLM_TN {
|
|
| 772 |
llm_arch arch;
|
| 773 |
|
| 774 |
std::string operator()(llm_tensor tensor) const {
|
|
|
|
|
|
|
|
|
|
| 775 |
return LLM_TENSOR_NAMES[arch].at(tensor);
|
| 776 |
}
|
| 777 |
|
| 778 |
std::string operator()(llm_tensor tensor, const std::string & suffix) const {
|
|
|
|
|
|
|
|
|
|
| 779 |
return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
|
| 780 |
}
|
| 781 |
|
| 782 |
std::string operator()(llm_tensor tensor, int bid) const {
|
|
|
|
|
|
|
|
|
|
| 783 |
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
|
| 784 |
}
|
| 785 |
|
| 786 |
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
|
|
|
|
|
|
|
|
|
|
| 787 |
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
|
| 788 |
}
|
| 789 |
|
| 790 |
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
|
|
|
|
|
|
|
|
|
|
| 791 |
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
|
| 792 |
}
|
| 793 |
};
|
|
@@ -998,7 +1034,7 @@ struct llama_mmap {
|
|
| 998 |
int fd = fileno(file->fp);
|
| 999 |
int flags = MAP_SHARED;
|
| 1000 |
// prefetch/readahead impairs performance on NUMA systems
|
| 1001 |
-
if (numa)
|
| 1002 |
#ifdef __linux__
|
| 1003 |
// advise the kernel to read the file sequentially (increases readahead)
|
| 1004 |
if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
|
|
@@ -1468,6 +1504,7 @@ enum e_model {
|
|
| 1468 |
MODEL_22M,
|
| 1469 |
MODEL_33M,
|
| 1470 |
MODEL_109M,
|
|
|
|
| 1471 |
MODEL_335M,
|
| 1472 |
MODEL_0_5B,
|
| 1473 |
MODEL_1B,
|
|
@@ -1520,11 +1557,13 @@ struct llama_hparams {
|
|
| 1520 |
uint32_t n_yarn_orig_ctx;
|
| 1521 |
int32_t rope_scaling_type_train;
|
| 1522 |
|
| 1523 |
-
float f_clamp_kqv;
|
| 1524 |
-
float f_max_alibi_bias;
|
| 1525 |
|
| 1526 |
bool causal_attn = true;
|
|
|
|
| 1527 |
|
|
|
|
| 1528 |
|
| 1529 |
bool operator!=(const llama_hparams & other) const {
|
| 1530 |
if (this->vocab_only != other.vocab_only) return true;
|
|
@@ -1586,6 +1625,7 @@ struct llama_cparams {
|
|
| 1586 |
|
| 1587 |
bool mul_mat_q;
|
| 1588 |
bool offload_kqv;
|
|
|
|
| 1589 |
|
| 1590 |
ggml_backend_sched_eval_callback cb_eval;
|
| 1591 |
void * cb_eval_user_data;
|
|
@@ -1601,6 +1641,8 @@ struct llama_layer {
|
|
| 1601 |
struct ggml_tensor * attn_q_norm_b;
|
| 1602 |
struct ggml_tensor * attn_k_norm;
|
| 1603 |
struct ggml_tensor * attn_k_norm_b;
|
|
|
|
|
|
|
| 1604 |
|
| 1605 |
// attention
|
| 1606 |
struct ggml_tensor * wq;
|
|
@@ -1619,6 +1661,8 @@ struct llama_layer {
|
|
| 1619 |
// normalization
|
| 1620 |
struct ggml_tensor * ffn_norm;
|
| 1621 |
struct ggml_tensor * ffn_norm_b;
|
|
|
|
|
|
|
| 1622 |
|
| 1623 |
// ff
|
| 1624 |
struct ggml_tensor * ffn_gate; // w1
|
|
@@ -1880,8 +1924,10 @@ struct llama_context {
|
|
| 1880 |
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
|
| 1881 |
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
| 1882 |
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
|
|
|
| 1883 |
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
| 1884 |
-
struct ggml_tensor *
|
|
|
|
| 1885 |
|
| 1886 |
#ifdef GGML_USE_MPI
|
| 1887 |
ggml_mpi_context * ctx_mpi = NULL;
|
|
@@ -2480,6 +2526,7 @@ struct llama_model_loader {
|
|
| 2480 |
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
|
| 2481 |
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
|
| 2482 |
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
|
|
|
| 2483 |
default:
|
| 2484 |
{
|
| 2485 |
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
|
@@ -2829,6 +2876,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
| 2829 |
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
| 2830 |
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
|
| 2831 |
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
|
|
|
|
| 2832 |
|
| 2833 |
default: return "unknown, may not work";
|
| 2834 |
}
|
|
@@ -2836,6 +2884,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
| 2836 |
|
| 2837 |
static const char * llama_model_type_name(e_model type) {
|
| 2838 |
switch (type) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2839 |
case MODEL_1B: return "1B";
|
| 2840 |
case MODEL_2B: return "2B";
|
| 2841 |
case MODEL_3B: return "3B";
|
|
@@ -3005,6 +3058,11 @@ static void llm_load_hparams(
|
|
| 3005 |
case 40: model.type = e_model::MODEL_13B; break;
|
| 3006 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 3007 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3008 |
} break;
|
| 3009 |
case LLM_ARCH_STARCODER:
|
| 3010 |
{
|
|
@@ -3032,12 +3090,16 @@ static void llm_load_hparams(
|
|
| 3032 |
case 32: model.type = e_model::MODEL_1B; break;
|
| 3033 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 3034 |
}
|
|
|
|
|
|
|
|
|
|
| 3035 |
} break;
|
| 3036 |
case LLM_ARCH_BERT:
|
| 3037 |
{
|
| 3038 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
| 3039 |
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
| 3040 |
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
|
|
|
| 3041 |
|
| 3042 |
switch (hparams.n_layer) {
|
| 3043 |
case 3:
|
|
@@ -3053,6 +3115,17 @@ static void llm_load_hparams(
|
|
| 3053 |
model.type = e_model::MODEL_335M; break; // bge-large
|
| 3054 |
}
|
| 3055 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3056 |
case LLM_ARCH_BLOOM:
|
| 3057 |
{
|
| 3058 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -3065,11 +3138,12 @@ static void llm_load_hparams(
|
|
| 3065 |
case 4096: model.type = e_model::MODEL_7B; break;
|
| 3066 |
} break;
|
| 3067 |
}
|
|
|
|
|
|
|
|
|
|
| 3068 |
} break;
|
| 3069 |
case LLM_ARCH_MPT:
|
| 3070 |
{
|
| 3071 |
-
hparams.f_clamp_kqv = 0.0f;
|
| 3072 |
-
|
| 3073 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
| 3074 |
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
| 3075 |
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
|
@@ -3171,6 +3245,10 @@ static void llm_load_hparams(
|
|
| 3171 |
}
|
| 3172 |
|
| 3173 |
model.ftype = ml.ftype;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3174 |
}
|
| 3175 |
|
| 3176 |
// TODO: This should probably be in llama.h
|
|
@@ -3294,7 +3372,12 @@ static void llm_load_vocab(
|
|
| 3294 |
|
| 3295 |
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
| 3296 |
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
| 3297 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3298 |
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
| 3299 |
vocab.linefeed_id = vocab.special_pad_id;
|
| 3300 |
} else {
|
|
@@ -3850,10 +3933,14 @@ static bool llm_load_tensors(
|
|
| 3850 |
}
|
| 3851 |
} break;
|
| 3852 |
case LLM_ARCH_BERT:
|
|
|
|
| 3853 |
{
|
| 3854 |
-
model.tok_embd
|
| 3855 |
-
model.type_embd
|
| 3856 |
-
model.
|
|
|
|
|
|
|
|
|
|
| 3857 |
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
| 3858 |
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
| 3859 |
|
|
@@ -3863,29 +3950,38 @@ static bool llm_load_tensors(
|
|
| 3863 |
|
| 3864 |
auto & layer = model.layers[i];
|
| 3865 |
|
| 3866 |
-
|
| 3867 |
-
|
|
|
|
| 3868 |
|
| 3869 |
-
|
| 3870 |
-
|
| 3871 |
|
| 3872 |
-
|
| 3873 |
-
|
|
|
|
|
|
|
|
|
|
| 3874 |
|
| 3875 |
-
layer.
|
| 3876 |
-
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
| 3877 |
|
| 3878 |
-
layer.
|
| 3879 |
-
layer.
|
| 3880 |
|
| 3881 |
-
layer.
|
| 3882 |
-
layer.
|
| 3883 |
|
| 3884 |
-
|
| 3885 |
-
|
|
|
|
| 3886 |
|
| 3887 |
-
|
| 3888 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3889 |
}
|
| 3890 |
} break;
|
| 3891 |
case LLM_ARCH_BLOOM:
|
|
@@ -4364,9 +4460,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
|
| 4364 |
|
| 4365 |
model.hparams.vocab_only = params.vocab_only;
|
| 4366 |
|
| 4367 |
-
|
| 4368 |
-
|
| 4369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4370 |
|
| 4371 |
llm_load_print_meta(ml, model);
|
| 4372 |
|
|
@@ -4683,10 +4791,10 @@ static struct ggml_tensor * llm_build_kqv(
|
|
| 4683 |
struct ggml_tensor * wo_b,
|
| 4684 |
struct ggml_tensor * q_cur,
|
| 4685 |
struct ggml_tensor * kq_mask,
|
|
|
|
| 4686 |
int64_t n_ctx,
|
| 4687 |
int32_t n_tokens,
|
| 4688 |
int32_t n_kv,
|
| 4689 |
-
float max_alibi_bias,
|
| 4690 |
float kq_scale,
|
| 4691 |
const llm_build_cb & cb,
|
| 4692 |
int il) {
|
|
@@ -4716,26 +4824,26 @@ static struct ggml_tensor * llm_build_kqv(
|
|
| 4716 |
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
| 4717 |
}
|
| 4718 |
|
| 4719 |
-
|
| 4720 |
-
|
|
|
|
|
|
|
|
|
|
| 4721 |
kq = ggml_scale(ctx, kq, kq_scale);
|
| 4722 |
cb(kq, "kq_scaled", il);
|
| 4723 |
|
| 4724 |
-
|
| 4725 |
-
|
| 4726 |
-
// TODO: K-shift is likely not working
|
| 4727 |
-
// TODO: change to ggml_add
|
| 4728 |
-
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
|
| 4729 |
-
cb(kq, "kq_scaled_alibi", il);
|
| 4730 |
-
}
|
| 4731 |
|
| 4732 |
kq = ggml_add(ctx, kq, kq_mask);
|
| 4733 |
cb(kq, "kq_masked", il);
|
| 4734 |
|
| 4735 |
kq = ggml_soft_max(ctx, kq);
|
| 4736 |
cb(kq, "kq_soft_max", il);
|
| 4737 |
-
} else
|
| 4738 |
-
|
|
|
|
|
|
|
| 4739 |
cb(kq, "kq_soft_max_ext", il);
|
| 4740 |
}
|
| 4741 |
|
|
@@ -4783,11 +4891,11 @@ static struct ggml_tensor * llm_build_kv(
|
|
| 4783 |
struct ggml_tensor * v_cur,
|
| 4784 |
struct ggml_tensor * q_cur,
|
| 4785 |
struct ggml_tensor * kq_mask,
|
|
|
|
| 4786 |
int64_t n_ctx,
|
| 4787 |
int32_t n_tokens,
|
| 4788 |
int32_t kv_head,
|
| 4789 |
int32_t n_kv,
|
| 4790 |
-
float max_alibi_bias,
|
| 4791 |
float kq_scale,
|
| 4792 |
const llm_build_cb & cb,
|
| 4793 |
int il) {
|
|
@@ -4801,9 +4909,8 @@ static struct ggml_tensor * llm_build_kv(
|
|
| 4801 |
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
|
| 4802 |
|
| 4803 |
struct ggml_tensor * cur;
|
| 4804 |
-
cur = llm_build_kqv(ctx, model, hparams, kv, graph,
|
| 4805 |
-
|
| 4806 |
-
q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
|
| 4807 |
cb(cur, "kqv_out", il);
|
| 4808 |
|
| 4809 |
return cur;
|
|
@@ -4844,7 +4951,7 @@ struct llm_build_context {
|
|
| 4844 |
const int32_t n_orig_ctx;
|
| 4845 |
|
| 4846 |
const bool do_rope_shift;
|
| 4847 |
-
const
|
| 4848 |
|
| 4849 |
const llm_build_cb & cb;
|
| 4850 |
|
|
@@ -4888,7 +4995,7 @@ struct llm_build_context {
|
|
| 4888 |
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
| 4889 |
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
| 4890 |
do_rope_shift (worst_case || kv_self.has_shift),
|
| 4891 |
-
|
| 4892 |
cb (cb),
|
| 4893 |
buf_compute_meta (lctx.buf_compute_meta) {
|
| 4894 |
// all initializations should be done in init()
|
|
@@ -4971,7 +5078,7 @@ struct llm_build_context {
|
|
| 4971 |
}
|
| 4972 |
|
| 4973 |
Qcur = ggml_rope_custom(
|
| 4974 |
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,
|
| 4975 |
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
| 4976 |
ext_factor, attn_factor, beta_fast, beta_slow
|
| 4977 |
);
|
|
@@ -4986,7 +5093,7 @@ struct llm_build_context {
|
|
| 4986 |
|
| 4987 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 4988 |
model.layers[il].wo, model.layers[il].bo,
|
| 4989 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 4990 |
cb(cur, "kqv_out", il);
|
| 4991 |
}
|
| 4992 |
|
|
@@ -5116,6 +5223,10 @@ struct llm_build_context {
|
|
| 5116 |
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5117 |
cb(KQ_mask, "KQ_mask", -1);
|
| 5118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5119 |
// shift the entire K-cache if needed
|
| 5120 |
if (do_rope_shift) {
|
| 5121 |
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
@@ -5164,12 +5275,9 @@ struct llm_build_context {
|
|
| 5164 |
cb(Kcur, "Kcur", il);
|
| 5165 |
|
| 5166 |
|
| 5167 |
-
// apply ALiBi for 13B model
|
| 5168 |
-
const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
|
| 5169 |
-
|
| 5170 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 5171 |
model.layers[il].wo, NULL,
|
| 5172 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 5173 |
cb(cur, "kqv_out", il);
|
| 5174 |
}
|
| 5175 |
|
|
@@ -5293,7 +5401,7 @@ struct llm_build_context {
|
|
| 5293 |
|
| 5294 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 5295 |
model.layers[il].wo, NULL,
|
| 5296 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 5297 |
cb(cur, "kqv_out", il);
|
| 5298 |
}
|
| 5299 |
|
|
@@ -5392,7 +5500,7 @@ struct llm_build_context {
|
|
| 5392 |
|
| 5393 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 5394 |
model.layers[il].wo, model.layers[il].bo,
|
| 5395 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 5396 |
cb(cur, "kqv_out", il);
|
| 5397 |
}
|
| 5398 |
|
|
@@ -5597,7 +5705,7 @@ struct llm_build_context {
|
|
| 5597 |
|
| 5598 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 5599 |
model.layers[il].wo, model.layers[il].bo,
|
| 5600 |
-
Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 5601 |
cb(cur, "kqv_out", il);
|
| 5602 |
}
|
| 5603 |
|
|
@@ -5659,6 +5767,10 @@ struct llm_build_context {
|
|
| 5659 |
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5660 |
cb(KQ_mask, "KQ_mask", -1);
|
| 5661 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5662 |
for (int il = 0; il < n_layer; ++il) {
|
| 5663 |
struct ggml_tensor * inpSA = inpL;
|
| 5664 |
|
|
@@ -5686,7 +5798,7 @@ struct llm_build_context {
|
|
| 5686 |
|
| 5687 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 5688 |
model.layers[il].wo, NULL,
|
| 5689 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 5690 |
cb(cur, "kqv_out", il);
|
| 5691 |
}
|
| 5692 |
|
|
@@ -5736,22 +5848,27 @@ struct llm_build_context {
|
|
| 5736 |
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
| 5737 |
|
| 5738 |
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
|
|
| 5739 |
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
| 5740 |
-
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
| 5741 |
|
| 5742 |
struct ggml_tensor * cur;
|
| 5743 |
struct ggml_tensor * inpL;
|
| 5744 |
|
| 5745 |
// get input vectors with right size
|
|
|
|
| 5746 |
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
| 5747 |
-
struct ggml_tensor *
|
|
|
|
| 5748 |
|
| 5749 |
// construct input embeddings (token, type, position)
|
| 5750 |
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
|
|
|
| 5751 |
// token types are hardcoded to zero ("Sentence A")
|
| 5752 |
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
|
| 5753 |
inpL = ggml_add(ctx0, inpL, type_row0);
|
| 5754 |
-
|
|
|
|
|
|
|
| 5755 |
cb(inpL, "inp_embd", -1);
|
| 5756 |
|
| 5757 |
// embed layer norm
|
|
@@ -5767,7 +5884,7 @@ struct llm_build_context {
|
|
| 5767 |
struct ggml_tensor * cur = inpL;
|
| 5768 |
|
| 5769 |
// self-attention
|
| 5770 |
-
{
|
| 5771 |
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
| 5772 |
cb(Qcur, "Qcur", il);
|
| 5773 |
|
|
@@ -5782,7 +5899,38 @@ struct llm_build_context {
|
|
| 5782 |
|
| 5783 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 5784 |
model.layers[il].wo, model.layers[il].bo,
|
| 5785 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5786 |
cb(cur, "kqv_out", il);
|
| 5787 |
}
|
| 5788 |
|
|
@@ -5790,25 +5938,34 @@ struct llm_build_context {
|
|
| 5790 |
cur = ggml_add(ctx0, cur, inpL);
|
| 5791 |
|
| 5792 |
// attention layer norm
|
| 5793 |
-
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].
|
| 5794 |
|
| 5795 |
struct ggml_tensor * ffn_inp = cur;
|
| 5796 |
cb(ffn_inp, "ffn_inp", il);
|
| 5797 |
|
| 5798 |
// feed-forward network
|
| 5799 |
-
|
| 5800 |
-
|
| 5801 |
-
|
| 5802 |
-
|
| 5803 |
-
|
| 5804 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5805 |
cb(cur, "ffn_out", il);
|
| 5806 |
|
| 5807 |
// attentions bypass the intermediate layer
|
| 5808 |
cur = ggml_add(ctx0, cur, ffn_inp);
|
| 5809 |
|
| 5810 |
// output layer norm
|
| 5811 |
-
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].
|
| 5812 |
|
| 5813 |
// input for next layer
|
| 5814 |
inpL = cur;
|
|
@@ -5817,9 +5974,15 @@ struct llm_build_context {
|
|
| 5817 |
// final output
|
| 5818 |
cur = inpL;
|
| 5819 |
|
| 5820 |
-
// pooling
|
| 5821 |
-
|
| 5822 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5823 |
|
| 5824 |
ggml_build_forward_expand(gf, cur);
|
| 5825 |
|
|
@@ -5843,6 +6006,10 @@ struct llm_build_context {
|
|
| 5843 |
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5844 |
cb(KQ_mask, "KQ_mask", -1);
|
| 5845 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5846 |
inpL = llm_build_norm(ctx0, inpL, hparams,
|
| 5847 |
model.tok_norm,
|
| 5848 |
model.tok_norm_b,
|
|
@@ -5876,7 +6043,7 @@ struct llm_build_context {
|
|
| 5876 |
|
| 5877 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 5878 |
model.layers[il].wo, model.layers[il].bo,
|
| 5879 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 5880 |
cb(cur, "kqv_out", il);
|
| 5881 |
}
|
| 5882 |
|
|
@@ -5936,6 +6103,10 @@ struct llm_build_context {
|
|
| 5936 |
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
| 5937 |
cb(KQ_mask, "KQ_mask", -1);
|
| 5938 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5939 |
for (int il = 0; il < n_layer; ++il) {
|
| 5940 |
struct ggml_tensor * attn_norm;
|
| 5941 |
|
|
@@ -5969,7 +6140,7 @@ struct llm_build_context {
|
|
| 5969 |
|
| 5970 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 5971 |
model.layers[il].wo, NULL,
|
| 5972 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 5973 |
cb(cur, "kqv_out", il);
|
| 5974 |
}
|
| 5975 |
|
|
@@ -6091,7 +6262,7 @@ struct llm_build_context {
|
|
| 6091 |
|
| 6092 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 6093 |
model.layers[il].wo, NULL,
|
| 6094 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 6095 |
cb(cur, "kqv_out", il);
|
| 6096 |
}
|
| 6097 |
|
|
@@ -6206,7 +6377,7 @@ struct llm_build_context {
|
|
| 6206 |
|
| 6207 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 6208 |
model.layers[il].wo, NULL,
|
| 6209 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 6210 |
cb(cur, "kqv_out", il);
|
| 6211 |
}
|
| 6212 |
|
|
@@ -6327,7 +6498,7 @@ struct llm_build_context {
|
|
| 6327 |
|
| 6328 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 6329 |
model.layers[il].wo, model.layers[il].bo,
|
| 6330 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 6331 |
cb(cur, "kqv_out", il);
|
| 6332 |
}
|
| 6333 |
|
|
@@ -6454,7 +6625,7 @@ struct llm_build_context {
|
|
| 6454 |
|
| 6455 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 6456 |
model.layers[il].wo, model.layers[il].bo,
|
| 6457 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 6458 |
cb(cur, "kqv_out", il);
|
| 6459 |
}
|
| 6460 |
|
|
@@ -6557,7 +6728,7 @@ struct llm_build_context {
|
|
| 6557 |
|
| 6558 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 6559 |
model.layers[il].wo, NULL,
|
| 6560 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 6561 |
cb(cur, "kqv_out", il);
|
| 6562 |
}
|
| 6563 |
struct ggml_tensor * sa_out = cur;
|
|
@@ -6656,7 +6827,7 @@ struct llm_build_context {
|
|
| 6656 |
|
| 6657 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 6658 |
model.layers[il].wo, model.layers[il].bo,
|
| 6659 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 6660 |
cb(cur, "kqv_out", il);
|
| 6661 |
}
|
| 6662 |
|
|
@@ -6765,7 +6936,7 @@ struct llm_build_context {
|
|
| 6765 |
|
| 6766 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 6767 |
model.layers[il].wo, model.layers[il].bo,
|
| 6768 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 6769 |
cb(cur, "kqv_out", il);
|
| 6770 |
}
|
| 6771 |
|
|
@@ -6883,7 +7054,7 @@ struct llm_build_context {
|
|
| 6883 |
|
| 6884 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 6885 |
model.layers[il].wo, NULL,
|
| 6886 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 6887 |
cb(cur, "kqv_out", il);
|
| 6888 |
}
|
| 6889 |
|
|
@@ -7002,7 +7173,7 @@ struct llm_build_context {
|
|
| 7002 |
|
| 7003 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 7004 |
model.layers[il].wo, model.layers[il].bo,
|
| 7005 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 7006 |
cb(cur, "kqv_out", il);
|
| 7007 |
}
|
| 7008 |
|
|
@@ -7134,7 +7305,7 @@ struct llm_build_context {
|
|
| 7134 |
|
| 7135 |
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
| 7136 |
model.layers[il].wo, model.layers[il].bo,
|
| 7137 |
-
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv,
|
| 7138 |
cb(cur, "kqv_out", il);
|
| 7139 |
}
|
| 7140 |
|
|
@@ -7249,6 +7420,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|
| 7249 |
result = llm.build_refact();
|
| 7250 |
} break;
|
| 7251 |
case LLM_ARCH_BERT:
|
|
|
|
| 7252 |
{
|
| 7253 |
result = llm.build_bert();
|
| 7254 |
} break;
|
|
@@ -7352,7 +7524,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
| 7352 |
|
| 7353 |
for (int i = 0; i < n_kv; ++i) {
|
| 7354 |
float f;
|
| 7355 |
-
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
|
|
|
|
| 7356 |
f = -INFINITY;
|
| 7357 |
} else {
|
| 7358 |
f = 0;
|
|
@@ -7363,13 +7536,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
| 7363 |
}
|
| 7364 |
}
|
| 7365 |
|
|
|
|
|
|
|
| 7366 |
|
| 7367 |
-
|
| 7368 |
-
|
| 7369 |
-
float * data = (float *) lctx.
|
| 7370 |
|
| 7371 |
-
for (int i = 0; i <
|
| 7372 |
-
data[i] =
|
| 7373 |
}
|
| 7374 |
}
|
| 7375 |
|
|
@@ -7384,6 +7559,49 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
| 7384 |
data[i] = lctx.kv_self.cells[i].delta;
|
| 7385 |
}
|
| 7386 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7387 |
}
|
| 7388 |
|
| 7389 |
// decode a batch of tokens by evaluating the transformer
|
|
@@ -7495,7 +7713,7 @@ static int llama_decode_internal(
|
|
| 7495 |
embeddings = gf->nodes[gf->n_nodes - 3];
|
| 7496 |
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
| 7497 |
}
|
| 7498 |
-
} else if (strcmp(res->name, "
|
| 7499 |
embeddings = res;
|
| 7500 |
res = nullptr;
|
| 7501 |
} else {
|
|
@@ -7615,11 +7833,12 @@ static int llama_decode_internal(
|
|
| 7615 |
if (!lctx.embedding.empty()) {
|
| 7616 |
auto & embedding_out = lctx.embedding;
|
| 7617 |
|
| 7618 |
-
const int64_t
|
|
|
|
| 7619 |
|
| 7620 |
-
embedding_out.resize(
|
| 7621 |
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
|
| 7622 |
-
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(),
|
| 7623 |
ggml_backend_synchronize(embeddings_backend);
|
| 7624 |
}
|
| 7625 |
|
|
@@ -7696,7 +7915,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
|
| 7696 |
switch (llama_vocab_get_type(vocab)) {
|
| 7697 |
case LLAMA_VOCAB_TYPE_SPM: {
|
| 7698 |
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
| 7699 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7700 |
}
|
| 7701 |
case LLAMA_VOCAB_TYPE_WPM:
|
| 7702 |
case LLAMA_VOCAB_TYPE_BPE: {
|
|
@@ -7744,7 +7969,7 @@ struct llm_bigram_spm {
|
|
| 7744 |
};
|
| 7745 |
|
| 7746 |
struct llm_tokenizer_spm {
|
| 7747 |
-
llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
|
| 7748 |
|
| 7749 |
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
| 7750 |
// split string into utf8 chars
|
|
@@ -7819,6 +8044,7 @@ private:
|
|
| 7819 |
|
| 7820 |
if (p == rev_merge.end()) {
|
| 7821 |
// output any symbols that did not form tokens as bytes.
|
|
|
|
| 7822 |
for (int j = 0; j < (int)symbol.n; ++j) {
|
| 7823 |
llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
|
| 7824 |
output.push_back(token_id);
|
|
@@ -8381,17 +8607,18 @@ struct fragment_buffer_variant {
|
|
| 8381 |
token(_token),
|
| 8382 |
raw_text(_dummy),
|
| 8383 |
offset(0),
|
| 8384 |
-
length(0){}
|
|
|
|
| 8385 |
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
|
| 8386 |
:
|
| 8387 |
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
|
| 8388 |
-
token((llama_vocab::id)-1),
|
| 8389 |
raw_text(_raw_text),
|
| 8390 |
offset(_offset),
|
| 8391 |
length(_length){
|
| 8392 |
-
GGML_ASSERT(
|
| 8393 |
-
GGML_ASSERT(
|
| 8394 |
-
GGML_ASSERT(
|
| 8395 |
}
|
| 8396 |
|
| 8397 |
const FRAGMENT_BUFFER_VARIANT_TYPE type;
|
|
@@ -8515,14 +8742,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
| 8515 |
}
|
| 8516 |
|
| 8517 |
std::forward_list<fragment_buffer_variant> fragment_buffer;
|
| 8518 |
-
fragment_buffer.emplace_front(
|
| 8519 |
|
| 8520 |
-
if (special) tokenizer_st_partition(
|
| 8521 |
|
| 8522 |
switch (vocab.type) {
|
| 8523 |
case LLAMA_VOCAB_TYPE_SPM:
|
| 8524 |
{
|
| 8525 |
-
for (const auto & fragment: fragment_buffer) {
|
| 8526 |
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
| 8527 |
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
| 8528 |
|
|
@@ -8550,7 +8777,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
| 8550 |
} break;
|
| 8551 |
case LLAMA_VOCAB_TYPE_BPE:
|
| 8552 |
{
|
| 8553 |
-
for (const auto & fragment: fragment_buffer) {
|
| 8554 |
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
| 8555 |
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
| 8556 |
|
|
@@ -8566,7 +8793,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
| 8566 |
} break;
|
| 8567 |
case LLAMA_VOCAB_TYPE_WPM:
|
| 8568 |
{
|
| 8569 |
-
for (const auto & fragment: fragment_buffer) {
|
| 8570 |
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
| 8571 |
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
| 8572 |
|
|
@@ -10087,20 +10314,20 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
| 10087 |
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
| 10088 |
new_type = GGML_TYPE_Q8_0;
|
| 10089 |
}
|
| 10090 |
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
| 10091 |
new_type = GGML_TYPE_Q5_K;
|
| 10092 |
}
|
| 10093 |
else if (new_type != GGML_TYPE_Q8_0) {
|
| 10094 |
new_type = GGML_TYPE_Q6_K;
|
| 10095 |
}
|
| 10096 |
} else if (name == "token_embd.weight") {
|
| 10097 |
-
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
| 10098 |
new_type = GGML_TYPE_Q2_K;
|
| 10099 |
}
|
| 10100 |
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
| 10101 |
new_type = GGML_TYPE_Q4_K;
|
| 10102 |
}
|
| 10103 |
-
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
| 10104 |
if (name.find("attn_v.weight") != std::string::npos) {
|
| 10105 |
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
| 10106 |
else new_type = GGML_TYPE_Q2_K;
|
|
@@ -10110,6 +10337,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
| 10110 |
if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
|
| 10111 |
++qs.i_ffn_down;
|
| 10112 |
}
|
|
|
|
|
|
|
|
|
|
| 10113 |
} else if (name.find("attn_v.weight") != std::string::npos) {
|
| 10114 |
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
|
| 10115 |
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
|
@@ -10227,6 +10457,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
| 10227 |
}
|
| 10228 |
++qs.i_ffn_up;
|
| 10229 |
}
|
|
|
|
| 10230 |
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
| 10231 |
//}
|
| 10232 |
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
|
|
@@ -10242,7 +10473,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
| 10242 |
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
| 10243 |
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
| 10244 |
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
|
| 10245 |
-
new_type == GGML_TYPE_IQ3_XXS) {
|
| 10246 |
int nx = tensor->ne[0];
|
| 10247 |
int ny = tensor->ne[1];
|
| 10248 |
if (nx % QK_K != 0) {
|
|
@@ -10257,6 +10488,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
| 10257 |
case GGML_TYPE_IQ2_XXS:
|
| 10258 |
case GGML_TYPE_IQ2_XS:
|
| 10259 |
case GGML_TYPE_IQ3_XXS:
|
|
|
|
| 10260 |
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
|
| 10261 |
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
|
| 10262 |
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
|
@@ -10286,19 +10518,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
| 10286 |
|
| 10287 |
// K-quants
|
| 10288 |
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
| 10289 |
-
case LLAMA_FTYPE_MOSTLY_Q2_K:
|
| 10290 |
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
|
| 10291 |
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
| 10292 |
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
| 10293 |
-
case LLAMA_FTYPE_MOSTLY_Q3_K_L:
|
| 10294 |
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
| 10295 |
-
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
|
| 10296 |
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
| 10297 |
-
case LLAMA_FTYPE_MOSTLY_Q5_K_M:
|
| 10298 |
-
case LLAMA_FTYPE_MOSTLY_Q6_K:
|
| 10299 |
-
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
|
| 10300 |
-
case LLAMA_FTYPE_MOSTLY_IQ2_XS
|
| 10301 |
-
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
|
|
|
|
| 10302 |
|
| 10303 |
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
| 10304 |
}
|
|
@@ -10428,7 +10661,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
| 10428 |
quantize &= !params->only_copy;
|
| 10429 |
|
| 10430 |
// do not quantize expert gating tensors
|
| 10431 |
-
quantize &= name.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10432 |
|
| 10433 |
enum ggml_type new_type;
|
| 10434 |
void * new_data;
|
|
@@ -10468,6 +10705,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
| 10468 |
}
|
| 10469 |
if ((new_type == GGML_TYPE_IQ2_XXS ||
|
| 10470 |
new_type == GGML_TYPE_IQ2_XS ||
|
|
|
|
| 10471 |
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
| 10472 |
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
| 10473 |
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
|
@@ -10702,7 +10940,7 @@ static int llama_apply_lora_from_file_internal(
|
|
| 10702 |
{
|
| 10703 |
LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
|
| 10704 |
__func__, ftype);
|
| 10705 |
-
return
|
| 10706 |
}
|
| 10707 |
}
|
| 10708 |
|
|
@@ -10930,6 +11168,7 @@ struct llama_context_params llama_context_default_params() {
|
|
| 10930 |
/*.logits_all =*/ false,
|
| 10931 |
/*.embedding =*/ false,
|
| 10932 |
/*.offload_kqv =*/ true,
|
|
|
|
| 10933 |
};
|
| 10934 |
|
| 10935 |
return result;
|
|
@@ -10990,7 +11229,7 @@ bool llama_mlock_supported(void) {
|
|
| 10990 |
return llama_supports_mlock();
|
| 10991 |
}
|
| 10992 |
|
| 10993 |
-
void llama_backend_init(
|
| 10994 |
ggml_time_init();
|
| 10995 |
|
| 10996 |
// needed to initialize f16 tables
|
|
@@ -11000,15 +11239,17 @@ void llama_backend_init(bool numa) {
|
|
| 11000 |
ggml_free(ctx);
|
| 11001 |
}
|
| 11002 |
|
| 11003 |
-
if (numa) {
|
| 11004 |
-
ggml_numa_init();
|
| 11005 |
-
}
|
| 11006 |
-
|
| 11007 |
#ifdef GGML_USE_MPI
|
| 11008 |
ggml_mpi_backend_init();
|
| 11009 |
#endif
|
| 11010 |
}
|
| 11011 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11012 |
void llama_backend_free(void) {
|
| 11013 |
#ifdef GGML_USE_MPI
|
| 11014 |
ggml_mpi_backend_free();
|
|
@@ -11085,6 +11326,7 @@ struct llama_context * llama_new_context_with_model(
|
|
| 11085 |
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
| 11086 |
cparams.mul_mat_q = params.mul_mat_q;
|
| 11087 |
cparams.offload_kqv = params.offload_kqv;
|
|
|
|
| 11088 |
|
| 11089 |
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
| 11090 |
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
|
|
@@ -11232,14 +11474,14 @@ struct llama_context * llama_new_context_with_model(
|
|
| 11232 |
// resized during inference, reserve maximum
|
| 11233 |
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
|
| 11234 |
|
| 11235 |
-
if (params.embedding){
|
| 11236 |
ctx->embedding.resize(hparams.n_embd);
|
| 11237 |
}
|
| 11238 |
|
| 11239 |
// graph inputs
|
| 11240 |
{
|
| 11241 |
ggml_init_params init_params = {
|
| 11242 |
-
/* .mem_size */ ggml_tensor_overhead()*
|
| 11243 |
/* .mem_buffer */ nullptr,
|
| 11244 |
/* .no_alloc */ true,
|
| 11245 |
};
|
|
@@ -11249,15 +11491,19 @@ struct llama_context * llama_new_context_with_model(
|
|
| 11249 |
ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
|
| 11250 |
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
| 11251 |
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
|
|
|
| 11252 |
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
| 11253 |
-
ctx->
|
|
|
|
| 11254 |
|
| 11255 |
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
| 11256 |
ggml_set_name(ctx->inp_embd, "inp_embd");
|
| 11257 |
ggml_set_name(ctx->inp_pos, "inp_pos");
|
| 11258 |
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
|
|
|
| 11259 |
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
| 11260 |
-
ggml_set_name(ctx->
|
|
|
|
| 11261 |
|
| 11262 |
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
| 11263 |
|
|
@@ -12108,6 +12354,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
|
| 12108 |
return ctx->embedding.data();
|
| 12109 |
}
|
| 12110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12111 |
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
|
| 12112 |
return model->vocab.id_to_token[token].text.c_str();
|
| 12113 |
}
|
|
@@ -12258,6 +12508,123 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|
| 12258 |
return 0;
|
| 12259 |
}
|
| 12260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12261 |
struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
| 12262 |
struct llama_timings result = {
|
| 12263 |
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
|
|
|
|
| 197 |
LLM_ARCH_PERSIMMON,
|
| 198 |
LLM_ARCH_REFACT,
|
| 199 |
LLM_ARCH_BERT,
|
| 200 |
+
LLM_ARCH_NOMIC_BERT,
|
| 201 |
LLM_ARCH_BLOOM,
|
| 202 |
LLM_ARCH_STABLELM,
|
| 203 |
LLM_ARCH_QWEN,
|
|
|
|
| 212 |
};
|
| 213 |
|
| 214 |
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
| 215 |
+
{ LLM_ARCH_LLAMA, "llama" },
|
| 216 |
+
{ LLM_ARCH_FALCON, "falcon" },
|
| 217 |
+
{ LLM_ARCH_GPT2, "gpt2" },
|
| 218 |
+
{ LLM_ARCH_GPTJ, "gptj" },
|
| 219 |
+
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
| 220 |
+
{ LLM_ARCH_MPT, "mpt" },
|
| 221 |
+
{ LLM_ARCH_BAICHUAN, "baichuan" },
|
| 222 |
+
{ LLM_ARCH_STARCODER, "starcoder" },
|
| 223 |
+
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
| 224 |
+
{ LLM_ARCH_REFACT, "refact" },
|
| 225 |
+
{ LLM_ARCH_BERT, "bert" },
|
| 226 |
+
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
| 227 |
+
{ LLM_ARCH_BLOOM, "bloom" },
|
| 228 |
+
{ LLM_ARCH_STABLELM, "stablelm" },
|
| 229 |
+
{ LLM_ARCH_QWEN, "qwen" },
|
| 230 |
+
{ LLM_ARCH_QWEN2, "qwen2" },
|
| 231 |
+
{ LLM_ARCH_PHI2, "phi2" },
|
| 232 |
+
{ LLM_ARCH_PLAMO, "plamo" },
|
| 233 |
+
{ LLM_ARCH_CODESHELL, "codeshell" },
|
| 234 |
+
{ LLM_ARCH_ORION, "orion" },
|
| 235 |
+
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
| 236 |
+
{ LLM_ARCH_MINICPM, "minicpm" },
|
| 237 |
};
|
| 238 |
|
| 239 |
enum llm_kv {
|
|
|
|
| 256 |
LLM_KV_TENSOR_DATA_LAYOUT,
|
| 257 |
LLM_KV_EXPERT_COUNT,
|
| 258 |
LLM_KV_EXPERT_USED_COUNT,
|
| 259 |
+
LLM_KV_POOLING_TYPE,
|
| 260 |
|
| 261 |
LLM_KV_ATTENTION_HEAD_COUNT,
|
| 262 |
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
|
|
|
| 314 |
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
| 315 |
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
| 316 |
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
| 317 |
+
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
|
| 318 |
|
| 319 |
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
| 320 |
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
|
|
|
| 377 |
LLM_TENSOR_ATTN_OUT,
|
| 378 |
LLM_TENSOR_ATTN_NORM,
|
| 379 |
LLM_TENSOR_ATTN_NORM_2,
|
| 380 |
+
LLM_TENSOR_ATTN_OUT_NORM,
|
| 381 |
LLM_TENSOR_ATTN_ROT_EMBD,
|
| 382 |
LLM_TENSOR_FFN_GATE_INP,
|
| 383 |
LLM_TENSOR_FFN_NORM,
|
|
|
|
| 390 |
LLM_TENSOR_FFN_UP_EXP,
|
| 391 |
LLM_TENSOR_ATTN_Q_NORM,
|
| 392 |
LLM_TENSOR_ATTN_K_NORM,
|
| 393 |
+
LLM_TENSOR_LAYER_OUT_NORM,
|
| 394 |
};
|
| 395 |
|
| 396 |
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
|
|
|
| 556 |
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
| 557 |
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
| 558 |
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
| 559 |
+
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
| 560 |
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 561 |
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
| 562 |
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
| 563 |
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
| 564 |
+
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
| 565 |
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 566 |
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 567 |
+
},
|
| 568 |
+
},
|
| 569 |
+
{
|
| 570 |
+
LLM_ARCH_NOMIC_BERT,
|
| 571 |
+
{
|
| 572 |
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 573 |
+
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
| 574 |
+
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
| 575 |
+
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
| 576 |
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
| 577 |
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
| 578 |
+
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
| 579 |
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
| 580 |
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 581 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 582 |
},
|
|
|
|
| 793 |
llm_arch arch;
|
| 794 |
|
| 795 |
std::string operator()(llm_tensor tensor) const {
|
| 796 |
+
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
| 797 |
+
return "__missing__";
|
| 798 |
+
}
|
| 799 |
return LLM_TENSOR_NAMES[arch].at(tensor);
|
| 800 |
}
|
| 801 |
|
| 802 |
std::string operator()(llm_tensor tensor, const std::string & suffix) const {
|
| 803 |
+
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
| 804 |
+
return "__missing__";
|
| 805 |
+
}
|
| 806 |
return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
|
| 807 |
}
|
| 808 |
|
| 809 |
std::string operator()(llm_tensor tensor, int bid) const {
|
| 810 |
+
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
| 811 |
+
return "__missing__";
|
| 812 |
+
}
|
| 813 |
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
|
| 814 |
}
|
| 815 |
|
| 816 |
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
|
| 817 |
+
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
| 818 |
+
return "__missing__";
|
| 819 |
+
}
|
| 820 |
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
|
| 821 |
}
|
| 822 |
|
| 823 |
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
|
| 824 |
+
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
|
| 825 |
+
return "__missing__";
|
| 826 |
+
}
|
| 827 |
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
|
| 828 |
}
|
| 829 |
};
|
|
|
|
| 1034 |
int fd = fileno(file->fp);
|
| 1035 |
int flags = MAP_SHARED;
|
| 1036 |
// prefetch/readahead impairs performance on NUMA systems
|
| 1037 |
+
if (numa) { prefetch = 0; }
|
| 1038 |
#ifdef __linux__
|
| 1039 |
// advise the kernel to read the file sequentially (increases readahead)
|
| 1040 |
if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
|
|
|
|
| 1504 |
MODEL_22M,
|
| 1505 |
MODEL_33M,
|
| 1506 |
MODEL_109M,
|
| 1507 |
+
MODEL_137M,
|
| 1508 |
MODEL_335M,
|
| 1509 |
MODEL_0_5B,
|
| 1510 |
MODEL_1B,
|
|
|
|
| 1557 |
uint32_t n_yarn_orig_ctx;
|
| 1558 |
int32_t rope_scaling_type_train;
|
| 1559 |
|
| 1560 |
+
float f_clamp_kqv = 0.0f;
|
| 1561 |
+
float f_max_alibi_bias = 0.0f;
|
| 1562 |
|
| 1563 |
bool causal_attn = true;
|
| 1564 |
+
bool need_kq_pos = false;
|
| 1565 |
|
| 1566 |
+
uint32_t pooling_type = LLAMA_POOLING_NONE;
|
| 1567 |
|
| 1568 |
bool operator!=(const llama_hparams & other) const {
|
| 1569 |
if (this->vocab_only != other.vocab_only) return true;
|
|
|
|
| 1625 |
|
| 1626 |
bool mul_mat_q;
|
| 1627 |
bool offload_kqv;
|
| 1628 |
+
bool do_pooling;
|
| 1629 |
|
| 1630 |
ggml_backend_sched_eval_callback cb_eval;
|
| 1631 |
void * cb_eval_user_data;
|
|
|
|
| 1641 |
struct ggml_tensor * attn_q_norm_b;
|
| 1642 |
struct ggml_tensor * attn_k_norm;
|
| 1643 |
struct ggml_tensor * attn_k_norm_b;
|
| 1644 |
+
struct ggml_tensor * attn_out_norm;
|
| 1645 |
+
struct ggml_tensor * attn_out_norm_b;
|
| 1646 |
|
| 1647 |
// attention
|
| 1648 |
struct ggml_tensor * wq;
|
|
|
|
| 1661 |
// normalization
|
| 1662 |
struct ggml_tensor * ffn_norm;
|
| 1663 |
struct ggml_tensor * ffn_norm_b;
|
| 1664 |
+
struct ggml_tensor * layer_out_norm;
|
| 1665 |
+
struct ggml_tensor * layer_out_norm_b;
|
| 1666 |
|
| 1667 |
// ff
|
| 1668 |
struct ggml_tensor * ffn_gate; // w1
|
|
|
|
| 1924 |
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
|
| 1925 |
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
| 1926 |
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
| 1927 |
+
struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
|
| 1928 |
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
| 1929 |
+
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
|
| 1930 |
+
struct ggml_tensor * inp_cls; // I32 [n_batch]
|
| 1931 |
|
| 1932 |
#ifdef GGML_USE_MPI
|
| 1933 |
ggml_mpi_context * ctx_mpi = NULL;
|
|
|
|
| 2526 |
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
|
| 2527 |
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
|
| 2528 |
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
| 2529 |
+
case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
|
| 2530 |
default:
|
| 2531 |
{
|
| 2532 |
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
|
|
|
| 2876 |
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
| 2877 |
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
|
| 2878 |
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
|
| 2879 |
+
case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
|
| 2880 |
|
| 2881 |
default: return "unknown, may not work";
|
| 2882 |
}
|
|
|
|
| 2884 |
|
| 2885 |
static const char * llama_model_type_name(e_model type) {
|
| 2886 |
switch (type) {
|
| 2887 |
+
case MODEL_22M: return "22M";
|
| 2888 |
+
case MODEL_33M: return "33M";
|
| 2889 |
+
case MODEL_109M: return "109M";
|
| 2890 |
+
case MODEL_137M: return "137M";
|
| 2891 |
+
case MODEL_0_5B: return "0.5B";
|
| 2892 |
case MODEL_1B: return "1B";
|
| 2893 |
case MODEL_2B: return "2B";
|
| 2894 |
case MODEL_3B: return "3B";
|
|
|
|
| 3058 |
case 40: model.type = e_model::MODEL_13B; break;
|
| 3059 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 3060 |
}
|
| 3061 |
+
|
| 3062 |
+
if (model.type == e_model::MODEL_13B) {
|
| 3063 |
+
// TODO: become GGUF KV parameter
|
| 3064 |
+
hparams.f_max_alibi_bias = 8.0f;
|
| 3065 |
+
}
|
| 3066 |
} break;
|
| 3067 |
case LLM_ARCH_STARCODER:
|
| 3068 |
{
|
|
|
|
| 3090 |
case 32: model.type = e_model::MODEL_1B; break;
|
| 3091 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 3092 |
}
|
| 3093 |
+
|
| 3094 |
+
// TODO: become GGUF KV parameter
|
| 3095 |
+
hparams.f_max_alibi_bias = 8.0f;
|
| 3096 |
} break;
|
| 3097 |
case LLM_ARCH_BERT:
|
| 3098 |
{
|
| 3099 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
| 3100 |
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
| 3101 |
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
| 3102 |
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
| 3103 |
|
| 3104 |
switch (hparams.n_layer) {
|
| 3105 |
case 3:
|
|
|
|
| 3115 |
model.type = e_model::MODEL_335M; break; // bge-large
|
| 3116 |
}
|
| 3117 |
} break;
|
| 3118 |
+
case LLM_ARCH_NOMIC_BERT:
|
| 3119 |
+
{
|
| 3120 |
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
| 3121 |
+
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
| 3122 |
+
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
| 3123 |
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
| 3124 |
+
|
| 3125 |
+
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
| 3126 |
+
model.type = e_model::MODEL_137M;
|
| 3127 |
+
}
|
| 3128 |
+
} break;
|
| 3129 |
case LLM_ARCH_BLOOM:
|
| 3130 |
{
|
| 3131 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
|
|
| 3138 |
case 4096: model.type = e_model::MODEL_7B; break;
|
| 3139 |
} break;
|
| 3140 |
}
|
| 3141 |
+
|
| 3142 |
+
// TODO: become GGUF KV parameter
|
| 3143 |
+
hparams.f_max_alibi_bias = 8.0f;
|
| 3144 |
} break;
|
| 3145 |
case LLM_ARCH_MPT:
|
| 3146 |
{
|
|
|
|
|
|
|
| 3147 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
| 3148 |
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
| 3149 |
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
|
|
|
| 3245 |
}
|
| 3246 |
|
| 3247 |
model.ftype = ml.ftype;
|
| 3248 |
+
|
| 3249 |
+
if (hparams.f_max_alibi_bias > 0.0f) {
|
| 3250 |
+
hparams.need_kq_pos = true;
|
| 3251 |
+
}
|
| 3252 |
}
|
| 3253 |
|
| 3254 |
// TODO: This should probably be in llama.h
|
|
|
|
| 3372 |
|
| 3373 |
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
| 3374 |
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
| 3375 |
+
try {
|
| 3376 |
+
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
| 3377 |
+
} catch (const std::exception & e) {
|
| 3378 |
+
LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
|
| 3379 |
+
vocab.linefeed_id = vocab.special_pad_id;
|
| 3380 |
+
}
|
| 3381 |
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
| 3382 |
vocab.linefeed_id = vocab.special_pad_id;
|
| 3383 |
} else {
|
|
|
|
| 3933 |
}
|
| 3934 |
} break;
|
| 3935 |
case LLM_ARCH_BERT:
|
| 3936 |
+
case LLM_ARCH_NOMIC_BERT:
|
| 3937 |
{
|
| 3938 |
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
| 3939 |
+
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
|
| 3940 |
+
if (model.arch == LLM_ARCH_BERT) {
|
| 3941 |
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
|
| 3942 |
+
}
|
| 3943 |
+
|
| 3944 |
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
| 3945 |
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
| 3946 |
|
|
|
|
| 3950 |
|
| 3951 |
auto & layer = model.layers[i];
|
| 3952 |
|
| 3953 |
+
if (model.arch == LLM_ARCH_BERT) {
|
| 3954 |
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
| 3955 |
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
| 3956 |
|
| 3957 |
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
| 3958 |
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
| 3959 |
|
| 3960 |
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
| 3961 |
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
| 3962 |
+
} else {
|
| 3963 |
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
| 3964 |
+
}
|
| 3965 |
|
| 3966 |
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
|
|
| 3967 |
|
| 3968 |
+
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
|
| 3969 |
+
layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
|
| 3970 |
|
| 3971 |
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
| 3972 |
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
| 3973 |
|
| 3974 |
+
if (model.arch == LLM_ARCH_BERT) {
|
| 3975 |
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
| 3976 |
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
| 3977 |
|
| 3978 |
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
| 3979 |
+
} else {
|
| 3980 |
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
| 3981 |
+
}
|
| 3982 |
+
|
| 3983 |
+
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
| 3984 |
+
layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
|
| 3985 |
}
|
| 3986 |
} break;
|
| 3987 |
case LLM_ARCH_BLOOM:
|
|
|
|
| 4460 |
|
| 4461 |
model.hparams.vocab_only = params.vocab_only;
|
| 4462 |
|
| 4463 |
+
try {
|
| 4464 |
+
llm_load_arch(ml, model);
|
| 4465 |
+
} catch(const std::exception & e) {
|
| 4466 |
+
throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
|
| 4467 |
+
}
|
| 4468 |
+
try {
|
| 4469 |
+
llm_load_hparams(ml, model);
|
| 4470 |
+
} catch(const std::exception & e) {
|
| 4471 |
+
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
|
| 4472 |
+
}
|
| 4473 |
+
try {
|
| 4474 |
+
llm_load_vocab(ml, model);
|
| 4475 |
+
} catch(const std::exception & e) {
|
| 4476 |
+
throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
|
| 4477 |
+
}
|
| 4478 |
|
| 4479 |
llm_load_print_meta(ml, model);
|
| 4480 |
|
|
|
|
| 4791 |
struct ggml_tensor * wo_b,
|
| 4792 |
struct ggml_tensor * q_cur,
|
| 4793 |
struct ggml_tensor * kq_mask,
|
| 4794 |
+
struct ggml_tensor * kq_pos,
|
| 4795 |
int64_t n_ctx,
|
| 4796 |
int32_t n_tokens,
|
| 4797 |
int32_t n_kv,
|
|
|
|
| 4798 |
float kq_scale,
|
| 4799 |
const llm_build_cb & cb,
|
| 4800 |
int il) {
|
|
|
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
}

+ #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
+ #pragma message("      Falling back to ggml_alibi(). Will become an error in Mar 2024")
+ #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
+ if (hparams.f_max_alibi_bias > 0.0f) {
    kq = ggml_scale(ctx, kq, kq_scale);
    cb(kq, "kq_scaled", il);

+   kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
+   cb(kq, "kq_scaled_alibi", il);

    kq = ggml_add(ctx, kq, kq_mask);
    cb(kq, "kq_masked", il);

    kq = ggml_soft_max(ctx, kq);
    cb(kq, "kq_soft_max", il);
+ } else
+ #endif
+ {
+   kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
    cb(kq, "kq_soft_max_ext", il);
}

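A minimal scalar sketch (not ggml code) of the quantity the fused soft-max above is expected to produce for one attention row: softmax of scale*kq plus the mask plus a position-proportional ALiBi bias. The per-head slope value is an assumption here; the real slope schedule lives inside ggml.

  #include <algorithm>
  #include <cmath>
  #include <vector>

  // one row of softmax(scale*kq + mask + slope*pos), purely for illustration
  static std::vector<float> softmax_row_alibi(const std::vector<float> & kq,
                                              const std::vector<float> & mask,
                                              const std::vector<float> & pos,
                                              float scale, float slope) {
      std::vector<float> out(kq.size());
      float max_val = -INFINITY;
      for (size_t i = 0; i < kq.size(); ++i) {
          out[i] = scale*kq[i] + mask[i] + slope*pos[i]; // bias grows with KV position
          max_val = std::max(max_val, out[i]);
      }
      float sum = 0.0f;
      for (float & v : out) { v = std::exp(v - max_val); sum += v; }
      for (float & v : out) { v /= sum; }
      return out;
  }
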
struct ggml_tensor * v_cur,
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
+ struct ggml_tensor * kq_pos,
int64_t n_ctx,
int32_t n_tokens,
int32_t kv_head,
int32_t n_kv,
float kq_scale,
const llm_build_cb & cb,
int il) {

llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);

struct ggml_tensor * cur;
+ cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
+     q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
cb(cur, "kqv_out", il);

return cur;

const int32_t n_orig_ctx;

const bool do_rope_shift;
+ const uint32_t pooling_type;

const llm_build_cb & cb;

kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
n_orig_ctx (cparams.n_yarn_orig_ctx),
do_rope_shift (worst_case || kv_self.has_shift),
+ pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
cb (cb),
buf_compute_meta (lctx.buf_compute_meta) {
// all initializations should be done in init()

}

Qcur = ggml_rope_custom(
+   ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
    ext_factor, attn_factor, beta_fast, beta_slow
);

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, model.layers[il].bo,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);

+ // positions of the tokens in the KV cache
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+ cb(KQ_pos, "KQ_pos", -1);
+
// shift the entire K-cache if needed
if (do_rope_shift) {
    llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);

cb(Kcur, "Kcur", il);

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, NULL,
+   Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, NULL,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, model.layers[il].bo,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, model.layers[il].bo,
+   Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);

+ // positions of the tokens in the KV cache
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+ cb(KQ_pos, "KQ_pos", -1);
+
for (int il = 0; il < n_layer; ++il) {
    struct ggml_tensor * inpSA = inpL;

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, NULL,
+   Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

struct ggml_tensor * cur;
struct ggml_tensor * inpL;

// get input vectors with right size
+ const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+ struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
+ struct ggml_tensor * inp_cls  = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);

// construct input embeddings (token, type, position)
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+
// token types are hardcoded to zero ("Sentence A")
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
inpL = ggml_add(ctx0, inpL, type_row0);
+ if (model.arch == LLM_ARCH_BERT) {
+     inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+ }
cb(inpL, "inp_embd", -1);

// embed layer norm

struct ggml_tensor * cur = inpL;

// self-attention
+ if (model.arch == LLM_ARCH_BERT) {
    struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
    cb(Qcur, "Qcur", il);

    cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
        model.layers[il].wo, model.layers[il].bo,
+       Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+   cb(cur, "kqv_out", il);
+ } else {
+   // compute Q and K and RoPE them
+   cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+   cb(cur, "wqkv", il);
+
+   struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+   struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+   struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+   cb(Qcur, "Qcur", il);
+   cb(Kcur, "Kcur", il);
+   cb(Vcur, "Vcur", il);
+
+   Qcur = ggml_rope_custom(
+       ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+       hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+       ext_factor, attn_factor, beta_fast, beta_slow
+   );
+   cb(Qcur, "Qcur", il);
+
+   Kcur = ggml_rope_custom(
+       ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+       hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+       ext_factor, attn_factor, beta_fast, beta_slow
+   );
+   cb(Kcur, "Kcur", il);
+
+   cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+       model.layers[il].wo, model.layers[il].bo,
+       Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
    cb(cur, "kqv_out", il);
}

cur = ggml_add(ctx0, cur, inpL);

// attention layer norm
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);

struct ggml_tensor * ffn_inp = cur;
cb(ffn_inp, "ffn_inp", il);

// feed-forward network
+ if (model.arch == LLM_ARCH_BERT) {
+     cur = llm_build_ffn(ctx0, cur,
+         model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+         NULL,                      NULL,
+         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+         NULL,
+         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ } else {
+     cur = llm_build_ffn(ctx0, cur,
+         model.layers[il].ffn_up,   NULL,
+         model.layers[il].ffn_gate, NULL,
+         model.layers[il].ffn_down, NULL,
+         NULL,
+         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ }
cb(cur, "ffn_out", il);

// attentions bypass the intermediate layer
cur = ggml_add(ctx0, cur, ffn_inp);

// output layer norm
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);

// input for next layer
inpL = cur;

// final output
cur = inpL;

+ // pooling layer
+ if (pooling_type == LLAMA_POOLING_MEAN) {
+     cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
+ } else if (pooling_type == LLAMA_POOLING_CLS) {
+     cur = ggml_get_rows(ctx0, cur, inp_cls);
+ } else {
+     GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
+ }
+ cb(cur, "result_embd", -1);

ggml_build_forward_expand(gf, cur);

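An illustrative host-side sketch of what the two pooling modes above compute over per-token embeddings laid out as [n_tokens][n_embd]. This is not part of the graph; it only mirrors the arithmetic of LLAMA_POOLING_MEAN and LLAMA_POOLING_CLS for a single sequence.

  #include <cstdint>
  #include <vector>

  // mean over all tokens of one sequence
  static std::vector<float> pool_mean(const std::vector<float> & embd, int64_t n_tokens, int64_t n_embd) {
      std::vector<float> out(n_embd, 0.0f);
      for (int64_t t = 0; t < n_tokens; ++t) {
          for (int64_t e = 0; e < n_embd; ++e) {
              out[e] += embd[t*n_embd + e] / float(n_tokens);
          }
      }
      return out;
  }

  // CLS pooling: just the first token's embedding
  static std::vector<float> pool_cls(const std::vector<float> & embd, int64_t n_embd) {
      return std::vector<float>(embd.begin(), embd.begin() + n_embd);
  }
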
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);

+ // positions of the tokens in the KV cache
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+ cb(KQ_pos, "KQ_pos", -1);
+
inpL = llm_build_norm(ctx0, inpL, hparams,
    model.tok_norm,
    model.tok_norm_b,

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, model.layers[il].bo,
+   Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);

+ // positions of the tokens in the KV cache
+ struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
+ cb(KQ_pos, "KQ_pos", -1);
+
for (int il = 0; il < n_layer; ++il) {
    struct ggml_tensor * attn_norm;

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, NULL,
+   Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, NULL,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, NULL,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, model.layers[il].bo,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, model.layers[il].bo,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
cb(cur, "kqv_out", il);
}

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, NULL,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * sa_out = cur;

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, model.layers[il].bo,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, model.layers[il].bo,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, NULL,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, model.layers[il].bo,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
    model.layers[il].wo, model.layers[il].bo,
+   Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

    result = llm.build_refact();
} break;
case LLM_ARCH_BERT:
+ case LLM_ARCH_NOMIC_BERT:
{
    result = llm.build_bert();
} break;

for (int i = 0; i < n_kv; ++i) {
    float f;
+   if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
+       (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
        f = -INFINITY;
    } else {
        f = 0;

    }
}

+ if (hparams.need_kq_pos) {
+     const int64_t n_kv = kv_self.n;

+     assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
+
+     float * data = (float *) lctx.inp_KQ_pos->data;

+     for (int i = 0; i < n_kv; ++i) {
+         data[i] = float(lctx.kv_self.cells[i].pos);
    }
}

    data[i] = lctx.kv_self.cells[i].delta;
}
}
+
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
+     const int64_t n_tokens = batch.n_tokens;
+
+     GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
+     float * data = (float *) lctx.inp_mean->data;
+
+     memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
+
+     std::vector<uint64_t> sum(n_tokens, 0);
+     for (int i = 0; i < n_tokens; ++i) {
+         const llama_seq_id seq_id = batch.seq_id[i][0];
+         sum[seq_id] += 1;
+     }
+
+     std::vector<float> div(n_tokens, 0.0f);
+     for (int i = 0; i < n_tokens; ++i) {
+         const uint64_t s = sum[i];
+         if (s > 0) {
+             div[i] = 1.0f/float(s);
+         }
+     }
+
+     for (int i = 0; i < n_tokens; ++i) {
+         const llama_seq_id seq_id = batch.seq_id[i][0];
+         data[seq_id*n_tokens + i] = div[seq_id];
+     }
+ }
+
+ if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
+     const int64_t n_tokens = batch.n_tokens;
+
+     GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+     uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+
+     for (int i = 0; i < n_tokens; ++i) {
+         const llama_seq_id seq_id = batch.seq_id[i][0];
+         const llama_pos pos = batch.pos[i];
+         if (pos == 0) {
+             data[seq_id] = i;
+         }
+     }
+ }
}

// decode a batch of tokens by evaluating the transformer

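A toy illustration of the inp_mean buffer filled above, for an assumed batch of three tokens that all belong to sequence 0: the used row holds 1/count, so multiplying the transposed token embeddings by this matrix yields the per-sequence mean.

  #include <cstdio>
  #include <vector>

  int main() {
      const int n_tokens = 3;
      const int seq_id[n_tokens] = {0, 0, 0}; // hypothetical batch layout
      std::vector<float> inp_mean(n_tokens*n_tokens, 0.0f);
      std::vector<int>   count(n_tokens, 0);
      for (int i = 0; i < n_tokens; ++i) count[seq_id[i]]++;
      for (int i = 0; i < n_tokens; ++i) {
          inp_mean[seq_id[i]*n_tokens + i] = 1.0f/count[seq_id[i]];
      }
      for (int r = 0; r < n_tokens; ++r) {
          for (int c = 0; c < n_tokens; ++c) printf("%.2f ", inp_mean[r*n_tokens + c]);
          printf("\n"); // prints 0.33 0.33 0.33 in row 0, zeros elsewhere
      }
      return 0;
  }
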
    embeddings = gf->nodes[gf->n_nodes - 3];
    GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
}
+ } else if (strcmp(res->name, "result_embd") == 0) {
    embeddings = res;
    res = nullptr;
} else {

if (!lctx.embedding.empty()) {
    auto & embedding_out = lctx.embedding;

+   const int64_t embd_pos  = res ? n_embd * (n_tokens-1) : 0;
+   const int64_t embd_size = res ? n_embd : n_embd * n_tokens;

+   embedding_out.resize(embd_size);
    ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
+   ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
    ggml_backend_synchronize(embeddings_backend);
}

switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: {
    const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
+   auto token = vocab.token_to_id.find(buf);
+   if (token != vocab.token_to_id.end()) {
+       return (*token).second;
+   }
+   // Try to fall back to just the byte as a string
+   const char buf2[2] = { (char)ch, 0 };
+   return vocab.token_to_id.at(buf2);
}
case LLAMA_VOCAB_TYPE_WPM:
case LLAMA_VOCAB_TYPE_BPE: {

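A small self-contained sketch of the byte-token key built above: a raw byte such as 0x2E is first looked up as the SPM token string "<0x2E>", and only then as the plain single-character string. The byte value chosen here is just an example.

  #include <cstdio>

  int main() {
      const unsigned char ch = 0x2E; // '.'
      const char hex[] = "0123456789ABCDEF";
      const char buf[7]  = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
      const char buf2[2] = { (char) ch, 0 };
      printf("primary key: %s, fallback key: %s\n", buf, buf2);
      return 0;
  }
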
};

struct llm_tokenizer_spm {
+   llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}

    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        // split string into utf8 chars

    if (p == rev_merge.end()) {
        // output any symbols that did not form tokens as bytes.
+       output.reserve(output.size() + symbol.n);
        for (int j = 0; j < (int)symbol.n; ++j) {
            llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
            output.push_back(token_id);

    token(_token),
    raw_text(_dummy),
    offset(0),
+   length(0) {}
+
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
    :
    type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
+   token((llama_vocab::id) - 1),
    raw_text(_raw_text),
    offset(_offset),
    length(_length){
+       GGML_ASSERT(_offset >= 0);
+       GGML_ASSERT(_length >= 1);
+       GGML_ASSERT(offset + length <= raw_text.length());
    }

const FRAGMENT_BUFFER_VARIANT_TYPE type;

}

std::forward_list<fragment_buffer_variant> fragment_buffer;
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());

+ if (special) tokenizer_st_partition(vocab, fragment_buffer);

switch (vocab.type) {
    case LLAMA_VOCAB_TYPE_SPM:
        {
+           for (const auto & fragment : fragment_buffer) {
                if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                    // without adding this leading whitespace, we do not get the same results as the original tokenizer

        } break;
    case LLAMA_VOCAB_TYPE_BPE:
        {
+           for (const auto & fragment : fragment_buffer) {
                if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                    auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

        } break;
    case LLAMA_VOCAB_TYPE_WPM:
        {
+           for (const auto & fragment : fragment_buffer) {
                if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                    auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
    new_type = GGML_TYPE_Q8_0;
}
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
    new_type = GGML_TYPE_Q5_K;
}
else if (new_type != GGML_TYPE_Q8_0) {
    new_type = GGML_TYPE_Q6_K;
}
} else if (name == "token_embd.weight") {
+   if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
        new_type = GGML_TYPE_Q2_K;
    }
    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
        new_type = GGML_TYPE_Q4_K;
    }
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
    if (name.find("attn_v.weight") != std::string::npos) {
        if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
        else new_type = GGML_TYPE_Q2_K;

        if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
        ++qs.i_ffn_down;
    }
+   else if (name.find("attn_output.weight") != std::string::npos) {
+       if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+   }
} else if (name.find("attn_v.weight") != std::string::npos) {
    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
        new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;

    }
    ++qs.i_ffn_up;
}
+
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S

if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
    new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
    new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
+   new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
    int nx = tensor->ne[0];
    int ny = tensor->ne[1];
    if (nx % QK_K != 0) {

case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;

// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q2_K:    quantized_type = GGML_TYPE_Q2_K;    break;
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L:  quantized_type = GGML_TYPE_Q3_K;    break;
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M:  quantized_type = GGML_TYPE_Q4_K;    break;
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M:  quantized_type = GGML_TYPE_Q5_K;    break;
+ case LLAMA_FTYPE_MOSTLY_Q6_K:    quantized_type = GGML_TYPE_Q6_K;    break;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS:  quantized_type = GGML_TYPE_IQ2_XS;  break;
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ1_S:   quantized_type = GGML_TYPE_IQ1_S ;  break;

default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}

quantize &= !params->only_copy;

// do not quantize expert gating tensors
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+
+ // do not quantize positional embeddings and token types (BERT)
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");

enum ggml_type new_type;
void * new_data;

}
if ((new_type == GGML_TYPE_IQ2_XXS ||
     new_type == GGML_TYPE_IQ2_XS ||
+    new_type == GGML_TYPE_IQ1_S ||
    (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
    LLAMA_LOG_ERROR("\n\n============================================================\n");
    LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);

{
    LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
            __func__, ftype);
+   return 1;
}
}

/*.logits_all  =*/ false,
/*.embedding   =*/ false,
/*.offload_kqv =*/ true,
+ /*.do_pooling =*/ true,
};

return result;

return llama_supports_mlock();
}

+ void llama_backend_init(void) {
ggml_time_init();

// needed to initialize f16 tables

ggml_free(ctx);
}

#ifdef GGML_USE_MPI
ggml_mpi_backend_init();
#endif
}

+ void llama_numa_init(enum ggml_numa_strategy numa) {
+     if (numa != GGML_NUMA_STRATEGY_DISABLED) {
+         ggml_numa_init(numa);
+     }
+ }
+
void llama_backend_free(void) {
#ifdef GGML_USE_MPI
ggml_mpi_backend_free();

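A minimal usage sketch of the split initialization API introduced here: llama_backend_init() no longer takes a NUMA flag, and NUMA setup becomes an optional separate call. The strategy value below is just one member of the ggml_numa_strategy enum referenced above.

  #include "llama.h"

  int main() {
      llama_backend_init();                          // replaces the old llama_backend_init(numa) form
      llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);  // optional NUMA configuration
      // ... load a model, create a context, run inference ...
      llama_backend_free();
      return 0;
  }
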
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.mul_mat_q      = params.mul_mat_q;
cparams.offload_kqv    = params.offload_kqv;
+ cparams.do_pooling     = params.do_pooling;

cparams.n_ctx          = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;

// resized during inference, reserve maximum
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);

+ if (params.embedding) {
    ctx->embedding.resize(hparams.n_embd);
}

// graph inputs
{
    ggml_init_params init_params = {
+       /* .mem_size   */ ggml_tensor_overhead()*8,
        /* .mem_buffer */ nullptr,
        /* .no_alloc   */ true,
    };

    ctx->inp_embd    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
    ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
    ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
+   ctx->inp_KQ_pos  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
    ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
+   ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+   ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);

    ggml_set_name(ctx->inp_tokens,  "inp_tokens");
    ggml_set_name(ctx->inp_embd,    "inp_embd");
    ggml_set_name(ctx->inp_pos,     "inp_pos");
    ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
+   ggml_set_name(ctx->inp_KQ_pos,  "inp_KQ_pos");
    ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
+   ggml_set_name(ctx->inp_mean,    "inp_mean");
+   ggml_set_name(ctx->inp_cls,     "inp_cls");

    ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));

return ctx->embedding.data();
}

+ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+     return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
+ }
+
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
    return model->vocab.id_to_token[token].text.c_str();
}

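A hedged usage sketch for the new accessor: reading one sequence's pooled embedding after a decode, assuming a context created with embedding enabled and do_pooling left at its default. The printing helper and its name are illustrative only.

  #include "llama.h"
  #include <cstdio>

  static void print_embedding(struct llama_context * ctx, const struct llama_model * model, int32_t seq) {
      const int n_embd = llama_n_embd(model);
      // equivalent to llama_get_embeddings(ctx) + seq*n_embd
      const float * embd = llama_get_embeddings_ith(ctx, seq);
      for (int i = 0; i < n_embd; i++) {
          printf("%f ", embd[i]);
      }
      printf("\n");
  }
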
return 0;
}

+ // trim whitespace from the beginning and end of a string
+ static std::string trim(const std::string & str) {
+     size_t start = 0;
+     size_t end = str.size();
+     while (start < end && isspace(str[start])) {
+         start += 1;
+     }
+     while (end > start && isspace(str[end - 1])) {
+         end -= 1;
+     }
+     return str.substr(start, end - start);
+ }
+
+ // Simple version of "llama_apply_chat_template" that only works with strings
+ // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
+ static int32_t llama_chat_apply_template_internal(
+     const std::string & tmpl,
+     const std::vector<const llama_chat_message *> & chat,
+     std::string & dest, bool add_ass) {
+     // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
+     std::stringstream ss;
+     if (tmpl.find("<|im_start|>") != std::string::npos) {
+         // chatml template
+         for (auto message : chat) {
+             ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
+         }
+         if (add_ass) {
+             ss << "<|im_start|>assistant\n";
+         }
+     } else if (tmpl.find("[INST]") != std::string::npos) {
+         // llama2 template and its variants
+         // [variant] support system message
+         bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
+         // [variant] space before + after response
+         bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
+         // [variant] add BOS inside history
+         bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
+         // [variant] trim spaces from the input message
+         bool strip_message = tmpl.find("content.strip()") != std::string::npos;
+         // construct the prompt
+         bool is_inside_turn = true; // skip BOS at the beginning
+         ss << "[INST] ";
+         for (auto message : chat) {
+             std::string content = strip_message ? trim(message->content) : message->content;
+             std::string role(message->role);
+             if (!is_inside_turn) {
+                 is_inside_turn = true;
+                 ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
+             }
+             if (role == "system") {
+                 if (support_system_message) {
+                     ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+                 } else {
+                     // if the model does not support system message, we still include it in the first message, but without <<SYS>>
+                     ss << content << "\n";
+                 }
+             } else if (role == "user") {
+                 ss << content << " [/INST]";
+             } else {
+                 ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
+                 is_inside_turn = false;
+             }
+         }
+         // llama2 templates seem to not care about "add_generation_prompt"
+     } else if (tmpl.find("<|user|>") != std::string::npos) {
+         // zephyr template
+         for (auto message : chat) {
+             ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
+         }
+         if (add_ass) {
+             ss << "<|assistant|>\n";
+         }
+     } else {
+         // template not supported
+         return -1;
+     }
+     dest = ss.str();
+     return dest.size();
+ }
+
+ LLAMA_API int32_t llama_chat_apply_template(
+                 const struct llama_model * model,
+                               const char * tmpl,
+          const struct llama_chat_message * chat,
+                                    size_t n_msg,
+                                      bool add_ass,
+                                      char * buf,
+                                   int32_t length) {
+     std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
+     if (tmpl == nullptr) {
+         GGML_ASSERT(model != nullptr);
+         // load template from model
+         std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+         std::string template_key = "tokenizer.chat_template";
+         int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size());
+         if (res < 0) {
+             // worst case: there is no information about template, we will use chatml by default
+             curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
+         } else {
+             curr_tmpl = std::string(model_template.data(), model_template.size());
+         }
+     }
+     // format the chat to string
+     std::vector<const llama_chat_message *> chat_vec;
+     chat_vec.resize(n_msg);
+     for (size_t i = 0; i < n_msg; i++) {
+         chat_vec[i] = &chat[i];
+     }
+     std::string formatted_chat;
+     int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
+     if (res < 0) {
+         return res;
+     }
+     strncpy(buf, formatted_chat.c_str(), length);
+     return res;
+ }
+
struct llama_timings llama_get_timings(struct llama_context * ctx) {
    struct llama_timings result = {
        /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,

examples/talk-llama/llama.h
CHANGED

@@ -100,6 +100,7 @@ extern "C" {
    LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,  // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
+   LLAMA_FTYPE_MOSTLY_IQ1_S = 24,   // except 1d tensors

    LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};

@@ -112,6 +113,12 @@ extern "C" {
    LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
};

+ enum llama_pooling_type {
+     LLAMA_POOLING_NONE = 0,
+     LLAMA_POOLING_MEAN = 1,
+     LLAMA_POOLING_CLS  = 2,
+ };
+
enum llama_split_mode {
    LLAMA_SPLIT_NONE  = 0, // single GPU
    LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs

@@ -236,6 +243,7 @@ extern "C" {
    bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
    bool embedding;   // embedding mode only
    bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+   bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
};

// model quantization parameters

@@ -297,6 +305,12 @@ extern "C" {
    int32_t n_eval;
};

+ // used in chat template
+ typedef struct llama_chat_message {
+     const char * role;
+     const char * content;
+ } llama_chat_message;
+
// Helpers for getting default parameters
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);

@@ -305,7 +319,10 @@ extern "C" {
    // Initialize the llama + ggml backend
    // If numa is true, use NUMA optimizations
    // Call once at the start of the program
-   LLAMA_API void llama_backend_init(
+   LLAMA_API void llama_backend_init(void);
+
+   //optional:
+   LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);

    // Call once at the end of the program - currently only used for MPI
    LLAMA_API void llama_backend_free(void);

@@ -628,6 +645,10 @@ extern "C" {
    // shape: [n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

+   // Get the embeddings for the ith sequence
+   // llama_get_embeddings(ctx) + i*n_embd
+   LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
+
    //
    // Vocab
    //

@@ -684,6 +705,25 @@ extern "C" {
            char * buf,
            int32_t length);

+   /// Apply chat template. Inspired by hf apply_chat_template() on python.
+   /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
+   /// NOTE: This function only support some known jinja templates. It is not a jinja parser.
+   /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model's default chat template will be used instead.
+   /// @param chat Pointer to a list of multiple llama_chat_message
+   /// @param n_msg Number of llama_chat_message in this chat
+   /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
+   /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
+   /// @param length The size of the allocated buffer
+   /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
+   LLAMA_API int32_t llama_chat_apply_template(
+             const struct llama_model * model,
+                           const char * tmpl,
+      const struct llama_chat_message * chat,
+                                 size_t n_msg,
+                                   bool add_ass,
+                                   char * buf,
+                                int32_t length);
+
    //
    // Grammar
    //

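A hedged usage sketch for the chat-template API declared above. The roles and contents are example data, nullptr selects the model's embedded template as documented, and the buffer sizing follows the recommendation in the header comment.

  #include "llama.h"
  #include <cstring>
  #include <string>
  #include <vector>

  static std::string format_chat(const struct llama_model * model) {
      std::vector<llama_chat_message> chat = {
          { "system", "You are a helpful assistant." },
          { "user",   "Hello!"                       },
      };
      size_t chars = 0;
      for (const auto & m : chat) { chars += strlen(m.content); }
      std::vector<char> buf(2*chars + 256); // recommended: 2 * total characters, plus slack
      const int32_t n = llama_chat_apply_template(model, nullptr /* use the model's template */,
                                                  chat.data(), chat.size(), true, buf.data(), buf.size());
      if (n < 0 || (size_t) n > buf.size()) {
          return std::string(); // unsupported template or buffer too small
      }
      return std::string(buf.data(), n);
  }
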
examples/talk-llama/talk-llama.cpp
CHANGED

@@ -288,7 +288,7 @@ int main(int argc, char ** argv) {

    // llama init

-   llama_backend_init(
+   llama_backend_init();

    auto lmparams = llama_model_default_params();
    if (!params.use_gpu) {

examples/talk-llama/unicode.h
CHANGED

@@ -264,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
        offset += 1;
        return result;
    }
-
+   if (!(utf8[offset + 0] & 0x40)) {
        throw std::invalid_argument("invalid character");
    }
-
-   if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
+   if (!(utf8[offset + 0] & 0x20)) {
+       if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
            throw std::invalid_argument("invalid character");
+       }
        auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
        offset += 2;
        return result;
    }
-
-   if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
+   if (!(utf8[offset + 0] & 0x10)) {
+       if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
            throw std::invalid_argument("invalid character");
+       }
        auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
        offset += 3;
        return result;
    }
-
-   if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
+   if (!(utf8[offset + 0] & 0x08)) {
+       if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
            throw std::invalid_argument("invalid character");
+       }
        auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
        offset += 4;
        return result;

@@ -331,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t
        offset += 1;
        return result;
    }
-
-
-
-   auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
-   offset += 2;
-   return result;
+
+   if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
+       throw std::invalid_argument("invalid character");
    }
-
+
+   auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
+   offset += 2;
+   return result;
}

static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
    std::vector<uint32_t> result;
    size_t offset = 0;
-   while (offset < utf16.size())
+   while (offset < utf16.size()) {
        result.push_back(codepoint_from_utf16(utf16, offset));
+   }
    return result;
}

@@ -361,44 +365,52 @@ static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> &
static std::unordered_map<uint32_t, int> codepoint_type_map() {
    std::unordered_map<uint32_t, int> codepoint_types;
    for (auto p : digit_ranges) {
-       for(auto i = p.first; i <= p.second; ++ i)
+       for (auto i = p.first; i <= p.second; ++ i) {
            codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
+       }
    }
-   for(auto p : letter_ranges) {
-       for(auto i = p.first; i <= p.second; ++ i)
+   for (auto p : letter_ranges) {
+       for (auto i = p.first; i <= p.second; ++ i) {
            codepoint_types[i] = CODEPOINT_TYPE_LETTER;
+       }
    }
-   for(auto p : whitespace_ranges) {
-       for(auto i = p.first; i <= p.second; ++ i)
+   for (auto p : whitespace_ranges) {
+       for (auto i = p.first; i <= p.second; ++ i) {
            codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
+       }
    }
-   for(auto p : accent_mark_ranges) {
-       for(auto i = p.first; i <= p.second; ++ i)
+   for (auto p : accent_mark_ranges) {
+       for (auto i = p.first; i <= p.second; ++ i) {
            codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
+       }
    }
-   for(auto p : punctuation_ranges) {
-       for(auto i = p.first; i <= p.second; ++ i)
+   for (auto p : punctuation_ranges) {
+       for (auto i = p.first; i <= p.second; ++ i) {
            codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
+       }
    }
-   for
-       for (auto i = p.first; i <= p.second; ++i)
+   for (auto p : symbol_ranges) {
+       for (auto i = p.first; i <= p.second; ++i) {
            codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
+       }
    }
-   for(auto p : control_ranges) {
-       for(auto i = p.first; i <= p.second; ++ i)
+   for (auto p : control_ranges) {
+       for (auto i = p.first; i <= p.second; ++ i) {
            codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
+       }
    }
    return codepoint_types;
}

static int codepoint_type(uint32_t cp) {
    static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
-   return codepoint_types
+   return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
}

static int codepoint_type(const std::string & utf8) {
-   if (utf8.length() == 0)
+   if (utf8.length() == 0) {
        return CODEPOINT_TYPE_UNIDENTIFIED;
+   }
    size_t offset = 0;
    return codepoint_type(codepoint_from_utf8(utf8, offset));
}
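A small sketch of driving the decoder above over a whole UTF-8 string; codepoint_type() then classifies each decoded value via the range tables. It assumes unicode.h (codepoint_from_utf8, codepoint_type) is included in the same translation unit; the helper name is illustrative.

  #include <cstdint>
  #include <cstdio>
  #include <string>

  static void dump_codepoints(const std::string & utf8) {
      size_t offset = 0;
      while (offset < utf8.size()) {
          const uint32_t cp = codepoint_from_utf8(utf8, offset); // advances offset past the character
          printf("U+%04X type=%d\n", cp, codepoint_type(cp));
      }
  }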