//
// interface implementation
//

struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,
    };

    return result;
}
size_t llama_max_devices(void) {
    return 16;
}

bool llama_supports_mmap(void) {
    return llama_mmap::SUPPORTED;
}

bool llama_supports_mlock(void) {
    return llama_mlock::SUPPORTED;
}

bool llama_supports_gpu_offload(void) {
    return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
           llama_supports_rpc();
}

bool llama_supports_rpc(void) {
    return ggml_backend_reg_by_name("RPC") != nullptr;
}
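// Illustrative sketch (not part of this file): callers typically probe these
// capability helpers before choosing model parameters, e.g.
//
//     struct llama_model_params mparams = llama_model_default_params();
//     mparams.use_mmap = llama_supports_mmap();
//     if (!llama_supports_gpu_offload()) {
//         mparams.n_gpu_layers = 0;
//     }
//
// llama_model_default_params(), use_mmap and n_gpu_layers are the public fields
// declared in llama.h; the snippet above is only a usage sketch.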
void llama_backend_init(void) {
    ggml_time_init();

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }
}

void llama_numa_init(enum ggml_numa_strategy numa) {
    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        GGML_ASSERT(dev && "CPU backend is not loaded");
        auto * reg = ggml_backend_dev_backend_reg(dev);
        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
        numa_init_fn(numa);
    }
}

void llama_backend_free(void) {
    ggml_quantize_free();
}

int64_t llama_time_us(void) {
    return ggml_time_us();
}
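// Typical lifecycle around these calls (illustrative sketch only): the backend is
// initialized once per process before any model is loaded and freed on shutdown, e.g.
//
//     llama_backend_init();
//     llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);
//     // ... load models, create contexts, run inference ...
//     llama_backend_free();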
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
    // loading time will be recalculated after the first eval, so
    // we take page faults deferred by mmap() into consideration
    model.t_load_us = 0;
    time_meas tm(model.t_load_us);

    model.t_start_us = tm.t_start_us;

    try {
        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);

        ml.print_info();

        model.hparams.vocab_only = params.vocab_only;

        try {
            model.load_arch(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
        }
        try {
            model.load_hparams(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
        }
        try {
            model.load_vocab(ml);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
        }

        model.load_stats(ml);
        model.print_info();

        if (params.vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
            return 0;
        }

        if (!model.load_tensors(ml)) {
            return -2;
        }
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
        return -1;
    }

    return 0;
}
static struct llama_model * llama_model_load_from_file_impl(
        const std::string & path_model,
        std::vector<std::string> & splits,
        struct llama_model_params params) {
    ggml_time_init();

    unsigned cur_percentage = 0;
    if (params.progress_callback == NULL) {
        params.progress_callback_user_data = &cur_percentage;
        params.progress_callback = [](float progress, void * ctx) {
            unsigned * cur_percentage_p = (unsigned *) ctx;
            unsigned percentage = (unsigned) (100 * progress);
            while (percentage > *cur_percentage_p) {
                *cur_percentage_p = percentage;
                LLAMA_LOG_CONT(".");
                if (percentage >= 100) {
                    LLAMA_LOG_CONT("\n");
                }
            }
            return true;
        };
    }

    llama_model * model = new llama_model(params);

    // create list of devices to use with this model
    if (params.devices) {
        for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
            model->devices.push_back(*dev);
        }
    } else {
        std::vector<ggml_backend_dev_t> rpc_servers;
        // use all available devices
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            switch (ggml_backend_dev_type(dev)) {
                case GGML_BACKEND_DEVICE_TYPE_CPU:
                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
                    // skip CPU backends since they are handled separately
                    break;

                case GGML_BACKEND_DEVICE_TYPE_GPU:
                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                        rpc_servers.push_back(dev);
                    } else {
                        model->devices.push_back(dev);
                    }
                    break;
            }
        }
        // add RPC servers at the front of the list
        if (!rpc_servers.empty()) {
            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
        }
    }

    // if using single GPU mode, remove all except the main GPU
    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
        if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
            LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
            llama_model_free(model);
            return nullptr;
        }
        ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
        model->devices.clear();
        model->devices.push_back(main_gpu);
    }

    for (auto * dev : model->devices) {
        size_t free, total; // NOLINT
        ggml_backend_dev_memory(dev, &free, &total);
        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
    }

    const int status = llama_model_load(path_model, splits, *model, params);
    GGML_ASSERT(status <= 0);
    if (status < 0) {
        if (status == -1) {
            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
        } else if (status == -2) {
            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
        }

        llama_model_free(model);
        return nullptr;
    }

    return model;
}
// deprecated
struct llama_model * llama_load_model_from_file(
        const char * path_model,
        struct llama_model_params params) {
    return llama_model_load_from_file(path_model, params);
}

struct llama_model * llama_model_load_from_file(
        const char * path_model,
        struct llama_model_params params) {
    std::vector<std::string> splits = {};
    return llama_model_load_from_file_impl(path_model, splits, params);
}
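// Illustrative usage sketch (not part of this file): loading a model from a single
// GGUF file with default parameters and an optional cancellation callback; the path
// below is a placeholder.
//
//     struct llama_model_params mparams = llama_model_default_params();
//     mparams.progress_callback = [](float progress, void * user_data) {
//         // returning false here cancels the load (llama_model_load reports -2)
//         return progress < 1.1f;
//     };
//     llama_model * model = llama_model_load_from_file("/path/to/model.gguf", mparams);
//     if (model == nullptr) { /* handle load failure or cancellation */ }
//     // ... use the model ...
//     llama_model_free(model);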
struct llama_model * llama_model_load_from_splits(
        const char ** paths,
        size_t n_paths,
        struct llama_model_params params) {
    std::vector<std::string> splits;
    if (n_paths == 0) {
        LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
        return nullptr;
    }
    for (size_t i = 0; i < n_paths; ++i) {
        splits.push_back(paths[i]);
    }
    return llama_model_load_from_file_impl(splits.front(), splits, params);
}

void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
    llama_model_saver ms(*model);
    ms.add_kv_from_model();
    ms.add_tensors_from_model();
    ms.save(path_model);
}
//
// chat templates
//

int32_t llama_chat_apply_template(
        const char * tmpl,
        const struct llama_chat_message * chat,
        size_t n_msg,
        bool add_ass,
        char * buf,
        int32_t length) {
    const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);

    // format the chat to string
    std::vector<const llama_chat_message *> chat_vec;
    chat_vec.resize(n_msg);
    for (size_t i = 0; i < n_msg; i++) {
        chat_vec[i] = &chat[i];
    }

    std::string formatted_chat;
    llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl);
    if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
        return -1;
    }
    int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass);
    if (res < 0) {
        return res;
    }
    if (buf && length > 0) {
        strncpy(buf, formatted_chat.c_str(), length);
    }
    return res;
}
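// Illustrative usage sketch (not part of this file): applying a chat template to a
// short conversation. When the returned length exceeds the buffer size, the caller is
// expected to retry with a larger buffer.
//
//     llama_chat_message msgs[] = {
//         { "system", "You are a helpful assistant." },
//         { "user",   "Hello!" },
//     };
//     std::vector<char> buf(1024);
//     int32_t n = llama_chat_apply_template("chatml", msgs, 2, true, buf.data(), buf.size());
//     if (n > (int32_t) buf.size()) {
//         buf.resize(n);
//         n = llama_chat_apply_template("chatml", msgs, 2, true, buf.data(), buf.size());
//     }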
//
// model split
//

int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
    if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
        return strlen(split_path);
    }
    return 0;
}

int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
    std::string str_split_path(split_path);
    char postfix[32];
    snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
    std::string str_postfix(postfix);

    // check if split_prefix ends with postfix
    int size_prefix = str_split_path.size() - str_postfix.size();
    if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
        snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
        return size_prefix;
    }
    return 0;
}
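// Illustrative usage sketch (not part of this file): for the prefix "model" and 3 splits,
// llama_split_path produces names such as "model-00001-of-00003.gguf", and
// llama_split_prefix recovers the prefix length from such a name.
//
//     char split_path[256];
//     llama_split_path(split_path, sizeof(split_path), "model", 0, 3);          // "model-00001-of-00003.gguf"
//
//     char split_prefix[256];
//     llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 0, 3); // "model"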
const char * llama_print_system_info(void) {
    static std::string s;
    s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.

    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        auto * reg = ggml_backend_reg_get(i);
        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
        if (get_features_fn) {
            ggml_backend_feature * features = get_features_fn(reg);
            s += ggml_backend_reg_name(reg);
            s += " : ";
            for (; features->name; features++) {
                s += features->name;
                s += " = ";
                s += features->value;
                s += " | ";
            }
        }
    }

    return s.c_str();
}