| #pragma once |
|
|
| #include "llama.h" |
|
|
| #include "llama-impl.h" |
| #include "llama-arch.h" |
| #include "llama-mmap.h" |
|
|
| #include "ggml-cpp.h" |
|
|
| #include <cstddef> |
| #include <map> |
| #include <stdexcept> |
| #include <unordered_map> |
|
|
| using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>; |
|
|
| enum llama_fver { |
| GGUF_FILE_VERSION_V1 = 1, |
| GGUF_FILE_VERSION_V2 = 2, |
| GGUF_FILE_VERSION_V3 = 3, |
| }; |
|
|
| const char * llama_file_version_name(llama_fver version); |
|
|
| struct llama_model_loader { |
| |
| struct llama_tensor_weight { |
| uint16_t idx; |
| size_t offs; |
|
|
| ggml_tensor * tensor; |
|
|
| llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { |
| const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor)); |
| if (tensor_idx < 0) { |
| throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor))); |
| } |
|
|
| offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); |
| if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) { |
| throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor))); |
| } |
| } |
| }; |
|
|
| |
| struct weight_name_comparer { |
| bool operator()(const std::string & a, const std::string & b) const { |
| int a_layer = -1; |
| int b_layer = -1; |
| sscanf(a.c_str(), "blk.%d.", &a_layer); |
| sscanf(b.c_str(), "blk.%d.", &b_layer); |
| if (a_layer != b_layer) { |
| return a_layer < b_layer; |
| } |
| return a < b; |
| } |
| }; |
|
|
| static const int TENSOR_NOT_REQUIRED = 1; |
| static const int TENSOR_DUPLICATED = 2; |
|
|
| int n_kv = 0; |
| int n_tensors = 0; |
| int n_created = 0; |
|
|
| uint64_t n_elements = 0; |
| size_t n_bytes = 0; |
|
|
| bool use_mmap = false; |
| bool check_tensors; |
|
|
| llama_files files; |
| llama_ftype ftype; |
| llama_fver fver; |
|
|
| llama_mmaps mappings; |
|
|
| std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map; |
| std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides; |
|
|
| gguf_context_ptr meta; |
| std::vector<ggml_context_ptr> contexts; |
|
|
| std::string arch_name; |
| LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); |
|
|
| size_t size_done = 0; |
| size_t size_data = 0; |
| std::vector<std::pair<size_t, size_t>> mmaps_used; |
|
|
| llama_model_loader( |
| const std::string & fname, |
| std::vector<std::string> & splits, |
| bool use_mmap, |
| bool check_tensors, |
| const struct llama_model_kv_override * param_overrides_p); |
|
|
| template<typename T> |
| typename std::enable_if<std::is_integral<T>::value, bool>::type |
| get_arr_n(const std::string & key, T & result, bool required = true); |
|
|
| template<typename T> |
| typename std::enable_if<std::is_integral<T>::value, bool>::type |
| get_arr_n(enum llm_kv kid, T & result, bool required = true); |
|
|
| template<typename T> |
| bool get_arr(const std::string & key, std::vector<T> & result, bool required = true); |
|
|
| template<typename T, size_t N_MAX> |
| bool get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required = true); |
|
|
| template<typename T> |
| bool get_arr(enum llm_kv kid, T & result, bool required = true); |
|
|
| template<typename T> |
| bool get_key(const std::string & key, T & result, bool required = true); |
|
|
| template<typename T> |
| bool get_key(enum llm_kv kid, T & result, bool required = true); |
|
|
| template<typename T, size_t N_MAX> |
| bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required = true); |
|
|
| template<typename T> |
| bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true); |
|
|
| std::string get_arch_name() const; |
|
|
| enum llm_arch get_arch() const; |
|
|
| const llama_tensor_weight * get_weight(const char * name) const; |
|
|
| const llama_tensor_weight & require_weight(const char * name) const; |
|
|
| struct ggml_tensor * get_tensor_meta(const char * name) const; |
|
|
| struct ggml_tensor * require_tensor_meta(const std::string & name) const; |
|
|
| const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const; |
|
|
| struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0); |
|
|
| struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true); |
|
|
| void done_getting_tensors() const; |
|
|
| void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr); |
|
|
| void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const; |
|
|
| |
| void load_data_for(struct ggml_tensor * cur) const; |
|
|
| |
| bool load_all_data( |
| struct ggml_context * ctx, |
| llama_buf_map & bufs, |
| llama_mlocks * lmlocks, |
| llama_progress_callback progress_callback, |
| void * progress_callback_user_data); |
|
|
| std::string ftype_name() const; |
|
|
| void print_info() const; |
| }; |
|
|