talk-llama : sync llama.cpp
examples/talk-llama/llama-impl.h CHANGED

@@ -24,3 +24,24 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+//
+// helpers
+//
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return;
+    }
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
+    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
+}
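For quick reference: the new replace_all helper makes a single left-to-right pass and appends into a pre-reserved buffer instead of repeatedly slicing the string. A minimal, self-contained check of the same semantics (the helper body is copied from the hunk above; the main() harness is illustrative and not part of the commit):

#include <cassert>
#include <string>
#include <utility>

static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }
    std::string builder;
    builder.reserve(s.length());
    size_t pos = 0;
    size_t last_pos = 0;
    while ((pos = s.find(search, last_pos)) != std::string::npos) {
        builder.append(s, last_pos, pos - last_pos);
        builder.append(replace);
        last_pos = pos + search.length();
    }
    builder.append(s, last_pos, std::string::npos);
    s = std::move(builder);
}

int main() {
    std::string s = "talk llama, walk llama";
    replace_all(s, "llama", "alpaca");
    assert(s == "talk alpaca, walk alpaca"); // every occurrence replaced in one pass
    return 0;
}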
examples/talk-llama/llama-sampling.cpp CHANGED

@@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra
     constexpr float bucket_low = -10.0f;
     constexpr float bucket_high = 10.0f;
     constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
-    constexpr float …
+    constexpr float bucket_inter = -bucket_low * bucket_scale;
 
     std::vector<int> bucket_idx(candidates->size);
     std::vector<int> histo(nbuckets, 0);
 
     for (int i = 0; i < (int)candidates->size; ++i) {
         const float val = candidates->data[i].logit;
-        int ib = int(bucket_scale * val + …
+        int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
         ib = std::max(0, std::min(nbuckets-1, ib));
         bucket_idx[i] = ib;
         ++histo[ib];
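The bucket_inter constant simply folds the bucket offset into the index computation: int(bucket_scale * val + bucket_inter) equals nbuckets * (val - bucket_low) / (bucket_high - bucket_low), as the inline comment notes. A toy numeric check of the arithmetic (nbuckets = 128 is an assumed value for illustration only, not taken from the diff):

#include <algorithm>
#include <cstdio>

int main() {
    constexpr int   nbuckets     = 128;   // assumed for the example
    constexpr float bucket_low   = -10.0f;
    constexpr float bucket_high  =  10.0f;
    constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low); // 6.4
    constexpr float bucket_inter = -bucket_low * bucket_scale;          // 64.0

    const float val = 2.5f;                           // an example logit
    int ib = int(bucket_scale * val + bucket_inter);  // 6.4 * 2.5 + 64 = 80
    ib = std::max(0, std::min(nbuckets - 1, ib));     // clamp to [0, nbuckets-1]
    std::printf("logit %.2f -> bucket %d\n", val, ib);
    return 0;
}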
examples/talk-llama/llama-vocab.cpp CHANGED

@@ -16,20 +16,6 @@
 // helpers
 //
 
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
 static std::string format(const char * fmt, ...) {
     va_list ap;

@@ -335,6 +321,21 @@ private:
 
 // TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
 
+template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
+class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
+public:
+    using std::priority_queue<T, Container, Compare>::priority_queue;
+
+    T pop_move() {
+        T item = std::move(this->c.front());
+        std::pop_heap(this->c.begin(), this->c.end(), this->comp);
+        this->c.pop_back();
+        return item;
+    }
+
+    void pop() = delete;
+};
+
 struct llm_bigram_bpe {
     struct comparator {
         bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {

@@ -343,7 +344,7 @@ struct llm_bigram_bpe {
     };
 
     using queue_storage = std::vector<llm_bigram_bpe>;
-    using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
+    using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
     llm_symbol::index left;
     llm_symbol::index right;
     std::string text;

@@ -402,6 +403,7 @@ struct llm_tokenizer_bpe {
         case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
         case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
         case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+        case LLAMA_VOCAB_PRE_TYPE_EXAONE:
            regex_exprs = {
                "\\p{N}",
                "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",

@@ -424,6 +426,8 @@ struct llm_tokenizer_bpe {
            };
            break;
         case LLAMA_VOCAB_PRE_TYPE_PORO:
+        case LLAMA_VOCAB_PRE_TYPE_BLOOM:
+        case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
            regex_exprs = {
                " ?[^(\\s|.,!?…。,、।۔،)]+",
            };

@@ -531,8 +535,7 @@ struct llm_tokenizer_bpe {
 
        // build token(s)
        while (!work_queue.empty()) {
-           auto bigram = work_queue.top();
-           work_queue.pop();
+           auto bigram = work_queue.pop_move();
 
            auto & left_symbol = symbols[bigram.left];
            auto & right_symbol = symbols[bigram.right];

@@ -1480,11 +1483,11 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
     return vocab.special_pad_id;
 }
 
-int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) {
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
     return vocab.tokenizer_add_bos;
 }
 
-int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) {
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
     return vocab.tokenizer_add_eos;
 }
 
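The point of llama_priority_queue is to let the BPE work queue move its top element out instead of copying it: std::priority_queue::top() only returns a const reference, so the old top() + pop() pattern copied the bigram (including its std::string text). A standalone sketch of the pop_move semantics with a toy element type (the class body is copied from the hunk above; the main() is illustrative and not part of the commit):

#include <algorithm>
#include <cassert>
#include <functional>
#include <queue>
#include <utility>
#include <vector>

template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
public:
    using std::priority_queue<T, Container, Compare>::priority_queue;

    T pop_move() {
        T item = std::move(this->c.front());                        // move the top element out
        std::pop_heap(this->c.begin(), this->c.end(), this->comp);  // re-heap the remaining elements
        this->c.pop_back();
        return item;
    }

    void pop() = delete;  // the copy-then-discard variant is intentionally unusable
};

int main() {
    llama_priority_queue<int> q;
    q.push(3);
    q.push(1);
    q.push(5);
    assert(q.pop_move() == 5);  // still a max-heap, elements are moved out in order
    assert(q.pop_move() == 3);
    assert(q.pop_move() == 1);
    return 0;
}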
examples/talk-llama/llama-vocab.h CHANGED

@@ -95,8 +95,8 @@ llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
 llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
 llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
 
-int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab);
-int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
 
 llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
examples/talk-llama/llama.cpp CHANGED

The diff for this file is too large to render; see the raw diff.
examples/talk-llama/llama.h CHANGED

@@ -93,15 +93,15 @@ extern "C" {
     LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
     LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
     LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
+    LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
+    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
+    LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
 };
 
-// note: these values should be synchronized with ggml_rope
-// TODO: maybe move this enum to ggml.h (ggml_rope_type)
 enum llama_rope_type {
     LLAMA_ROPE_TYPE_NONE = -1,
-    LLAMA_ROPE_TYPE_NORM = …
-    LLAMA_ROPE_TYPE_NEOX = …
-    LLAMA_ROPE_TYPE_GLM = 4,
+    LLAMA_ROPE_TYPE_NORM = 0,
+    LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
 };
 
 enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file

@@ -504,10 +504,16 @@ extern "C" {
     // Returns true if the model contains an encoder that requires llama_encode() call
     LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
 
+    // Returns true if the model contains a decoder that requires llama_decode() call
+    LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);
+
     // For encoder-decoder models, this function returns id of the token that must be provided
     // to the decoder to start generating output sequence. For other models, it returns -1.
     LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
 
+    // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
+    LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,

@@ -912,11 +918,8 @@ extern "C" {
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
     LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
 
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
-
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
+    LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
+    LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
 
     // Codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
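llama_add_bos_token() and llama_add_eos_token() now return bool directly instead of the old int32_t convention (-1 unknown, 0 false, 1 true), so callers can branch on them without handling an "unknown" case. A hedged usage sketch, not taken from the commit; "model" is assumed to be an already-loaded llama_model pointer:

#include "llama.h"
#include <vector>

// Prepend BOS only when the model's tokenizer metadata asks for it.
static std::vector<llama_token> with_bos(const struct llama_model * model, std::vector<llama_token> tokens) {
    if (llama_add_bos_token(model)) { // plain bool now, no -1 "unknown" state
        tokens.insert(tokens.begin(), llama_token_bos(model));
    }
    return tokens;
}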