| #ifndef __T5_HPP__ |
| #define __T5_HPP__ |
|
|
#include <float.h>

#include <algorithm>
#include <cmath>
#include <limits>
#include <map>
#include <memory>
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>

#include "darts.h"
#include "ggml_extend.hpp"
#include "json.hpp"
#include "model.h"
|
|
| |
| |
| |
| |
| |
| |
// SentencePiece-style "metaspace" pre-tokenizer: splits the input on ASCII
// spaces and rejoins the pieces with `replacement` (typically U+2581 "▁").
// When add_prefix_space is set, `replacement` is also prepended to the output.
class MetaspacePreTokenizer {
private:
    std::string replacement;
    bool add_prefix_space;

public:
    MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true)
        : replacement(replacement), add_prefix_space(add_prefix_space) {}

    // Returns `input` with every ' ' separator replaced by `replacement`.
    // Consecutive spaces produce empty pieces, i.e. doubled replacements.
    std::string tokenize(const std::string& input) const {
        std::stringstream reader(input);
        std::string out = add_prefix_space ? replacement : std::string();

        std::string piece;
        bool first_piece = true;
        while (std::getline(reader, piece, ' ')) {
            if (!first_piece) {
                out += replacement;
            }
            out += piece;
            first_piece = false;
        }

        return out;
    }
};
|
|
| using EncodeResult = std::vector<std::pair<std::string, int>>; |
| class T5UniGramTokenizer { |
| public: |
| enum Status { |
| OK, |
| NO_PIECES_LOADED, |
| NO_ENTRY_FOUND, |
| BUILD_DOUBLE_ARRAY_FAILED, |
| PIECE_ALREADY_DEFINED, |
| INVLIAD_JSON |
| }; |
|
|
| protected: |
| MetaspacePreTokenizer pre_tokenizer; |
|
|
| |
| std::vector<std::pair<std::string, float>> piece_score_pairs; |
|
|
| float min_score_ = 0.0; |
| float max_score_ = 0.0; |
| std::unique_ptr<Darts::DoubleArray> trie_; |
|
|
| |
| |
| int trie_results_size_; |
| |
| int unk_id_ = 2; |
| std::string eos_token_ = "</s>"; |
| int eos_id_ = 1; |
| int pad_id_ = 0; |
| |
| Status status_ = OK; |
|
|
| float kUnkPenalty = 10.0; |
|
|
| std::string replacement; |
| bool add_prefix_space = true; |
|
|
| void InitializePieces(const std::string& json_str) { |
| nlohmann::json data; |
|
|
| try { |
| data = nlohmann::json::parse(json_str); |
| } catch (const nlohmann::json::parse_error& e) { |
| status_ = INVLIAD_JSON; |
| return; |
| } |
| if (!data.contains("model")) { |
| status_ = INVLIAD_JSON; |
| return; |
| } |
| nlohmann::json model = data["model"]; |
| if (!model.contains("vocab")) { |
| status_ = INVLIAD_JSON; |
| return; |
| } |
| if (model.contains("unk_id")) { |
| unk_id_ = model["unk_id"]; |
| } |
|
|
| replacement = data["pre_tokenizer"]["replacement"]; |
| add_prefix_space = data["pre_tokenizer"]["add_prefix_space"]; |
|
|
| pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space); |
|
|
| for (const auto& item : model["vocab"]) { |
| if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) { |
| status_ = INVLIAD_JSON; |
| return; |
| } |
| std::string piece = item[0]; |
| float score = item[1]; |
| piece_score_pairs.emplace_back(piece, score); |
| } |
| } |
|
|
| |
| void BuildTrie(std::vector<std::pair<std::string, int>>* pieces) { |
| if (status_ != OK) |
| return; |
|
|
| if (pieces->empty()) { |
| status_ = NO_PIECES_LOADED; |
| return; |
| } |
|
|
| |
| |
| sort(pieces->begin(), pieces->end()); |
|
|
| |
| std::vector<const char*> key(pieces->size()); |
| std::vector<int> value(pieces->size()); |
| for (size_t i = 0; i < pieces->size(); ++i) { |
| key[i] = (*pieces)[i].first.data(); |
| value[i] = (*pieces)[i].second; |
| } |
|
|
| trie_ = std::unique_ptr<Darts::DoubleArray>(new Darts::DoubleArray()); |
| if (trie_->build(key.size(), const_cast<char**>(&key[0]), nullptr, |
| &value[0]) != 0) { |
| status_ = BUILD_DOUBLE_ARRAY_FAILED; |
| return; |
| } |
|
|
| |
| const int kMaxTrieResultsSize = 1024; |
| std::vector<Darts::DoubleArray::result_pair_type> results( |
| kMaxTrieResultsSize); |
| trie_results_size_ = 0; |
| for (const auto& p : *pieces) { |
| const int num_nodes = trie_->commonPrefixSearch( |
| p.first.data(), results.data(), results.size(), p.first.size()); |
| trie_results_size_ = std::max(trie_results_size_, num_nodes); |
| } |
|
|
| if (trie_results_size_ == 0) |
| status_ = NO_ENTRY_FOUND; |
| } |
|
|
| |
| inline float GetScoreInlined(int id) const { |
| return piece_score_pairs[id].second; |
| } |
|
|
| inline bool IsUnusedInlined(int id) const { |
| return false; |
| } |
|
|
| inline bool IsUserDefinedInlined(int id) const { |
| return false; |
| } |
|
|
| inline size_t OneCharLen(const char* src) const { |
| return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| EncodeResult EncodeOptimized(const std::string& normalized) const { |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| if (status() != OK || normalized.empty()) { |
| return {}; |
| } |
| |
| struct BestPathNode { |
| int id = -1; |
| float best_path_score = |
| 0; |
| int starts_at = |
| -1; |
| |
| }; |
| const int size = normalized.size(); |
| const float unk_score = min_score() - kUnkPenalty; |
| |
| std::vector<BestPathNode> best_path_ends_at(size + 1); |
| |
| int starts_at = 0; |
| while (starts_at < size) { |
| std::size_t node_pos = 0; |
| std::size_t key_pos = starts_at; |
| const auto best_path_score_till_here = |
| best_path_ends_at[starts_at].best_path_score; |
| bool has_single_node = false; |
| const int mblen = |
| std::min<int>(OneCharLen(normalized.data() + starts_at), |
| size - starts_at); |
| while (key_pos < size) { |
| const int ret = |
| trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1); |
| if (ret == -2) |
| break; |
| if (ret >= 0) { |
| if (IsUnusedInlined(ret)) |
| continue; |
| |
| auto& target_node = best_path_ends_at[key_pos]; |
| const auto length = (key_pos - starts_at); |
| |
| const auto score = IsUserDefinedInlined(ret) |
| ? (length * max_score_ - 0.1) |
| : GetScoreInlined(ret); |
| const auto candidate_best_path_score = |
| score + best_path_score_till_here; |
| if (target_node.starts_at == -1 || |
| candidate_best_path_score > target_node.best_path_score) { |
| target_node.best_path_score = candidate_best_path_score; |
| target_node.starts_at = starts_at; |
| target_node.id = ret; |
| } |
| if (!has_single_node && length == mblen) { |
| has_single_node = true; |
| } |
| } |
| } |
| if (!has_single_node) { |
| auto& target_node = best_path_ends_at[starts_at + mblen]; |
| const auto candidate_best_path_score = |
| unk_score + best_path_score_till_here; |
| if (target_node.starts_at == -1 || |
| candidate_best_path_score > target_node.best_path_score) { |
| target_node.best_path_score = candidate_best_path_score; |
| target_node.starts_at = starts_at; |
| target_node.id = unk_id_; |
| } |
| } |
| |
| starts_at += mblen; |
| } |
| |
| EncodeResult results; |
| int ends_at = size; |
| while (ends_at > 0) { |
| const auto& node = best_path_ends_at[ends_at]; |
| results.emplace_back( |
| normalized.substr(node.starts_at, ends_at - node.starts_at), node.id); |
| ends_at = node.starts_at; |
| } |
| std::reverse(results.begin(), results.end()); |
| return results; |
| } |
|
|
| public: |
| explicit T5UniGramTokenizer(const std::string& json_str = "") { |
| if (json_str.size() != 0) { |
| InitializePieces(json_str); |
| } else { |
| InitializePieces(ModelLoader::load_t5_tokenizer_json()); |
| } |
|
|
| min_score_ = FLT_MAX; |
| max_score_ = FLT_MIN; |
|
|
| std::vector<std::pair<std::string, int>> pieces; |
| for (int i = 0; i < piece_score_pairs.size(); i++) { |
| const auto& sp = piece_score_pairs[i]; |
|
|
| min_score_ = std::min(min_score_, sp.second); |
| max_score_ = std::max(max_score_, sp.second); |
|
|
| pieces.emplace_back(sp.first, i); |
| } |
|
|
| BuildTrie(&pieces); |
| } |
| ~T5UniGramTokenizer(){}; |
|
|
| std::string Normalize(const std::string& input) const { |
| |
| |
| std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " "); |
| return normalized; |
| } |
|
|
| std::vector<int> Encode(const std::string& input, bool append_eos_if_not_present = true) const { |
| std::string normalized = Normalize(input); |
| normalized = pre_tokenizer.tokenize(normalized); |
| EncodeResult result = EncodeOptimized(normalized); |
| if (result.size() > 0 && append_eos_if_not_present) { |
| auto item = result[result.size() - 1]; |
| if (item.first != eos_token_) { |
| result.emplace_back(eos_token_, eos_id_); |
| } |
| } |
| std::vector<int> tokens; |
| for (auto item : result) { |
| tokens.push_back(item.second); |
| } |
| return tokens; |
| } |
|
|
| void pad_tokens(std::vector<int>& tokens, |
| std::vector<float>& weights, |
| size_t max_length = 0, |
| bool padding = false) { |
| if (max_length > 0 && padding) { |
| size_t orig_token_num = tokens.size() - 1; |
| size_t n = std::ceil(orig_token_num * 1.0 / (max_length - 1)); |
| if (n == 0) { |
| n = 1; |
| } |
| size_t length = max_length * n; |
| LOG_DEBUG("token length: %llu", length); |
| std::vector<int> new_tokens; |
| std::vector<float> new_weights; |
| int token_idx = 0; |
| for (int i = 0; i < length; i++) { |
| if (token_idx >= orig_token_num) { |
| break; |
| } |
| if (i % max_length == max_length - 1) { |
| new_tokens.push_back(eos_id_); |
| new_weights.push_back(1.0); |
| } else { |
| new_tokens.push_back(tokens[token_idx]); |
| new_weights.push_back(weights[token_idx]); |
| token_idx++; |
| } |
| } |
|
|
| new_tokens.push_back(eos_id_); |
| new_weights.push_back(1.0); |
| tokens = new_tokens; |
| weights = new_weights; |
|
|
| if (padding) { |
| int pad_token_id = pad_id_; |
| tokens.insert(tokens.end(), length - tokens.size(), pad_token_id); |
| weights.insert(weights.end(), length - weights.size(), 1.0); |
| } |
| } |
| } |
|
|
| |
| |
| float min_score() const { return min_score_; } |
|
|
| |
| |
| float max_score() const { return max_score_; } |
|
|
| Status status() const { return status_; } |
| }; |
|
|
| class T5LayerNorm : public UnaryBlock { |
| protected: |
| int64_t hidden_size; |
| float eps; |
|
|
| void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") { |
| enum ggml_type wtype = GGML_TYPE_F32; |
| params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); |
| } |
|
|
| public: |
| T5LayerNorm(int64_t hidden_size, |
| float eps = 1e-06f) |
| : hidden_size(hidden_size), |
| eps(eps) {} |
|
|
| struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { |
| struct ggml_tensor* w = params["weight"]; |
| x = ggml_rms_norm(ctx, x, eps); |
| x = ggml_mul(ctx, x, w); |
| return x; |
| } |
| }; |
|
|
| struct T5DenseActDense : public UnaryBlock { |
| public: |
| T5DenseActDense(int64_t model_dim, int64_t ff_dim) { |
| blocks["wi"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false)); |
| blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false)); |
| } |
|
|
| struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { |
| |
| auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]); |
| auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]); |
|
|
| x = wi->forward(ctx, x); |
| x = ggml_relu_inplace(ctx, x); |
| x = wo->forward(ctx, x); |
| return x; |
| } |
| }; |
|
|
| struct T5DenseGatedActDense : public UnaryBlock { |
| public: |
| T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) { |
| blocks["wi_0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false)); |
| blocks["wi_1"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false)); |
| blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false)); |
| } |
|
|
| struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { |
| |
| auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]); |
| auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]); |
| auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]); |
|
|
| auto hidden_gelu = ggml_gelu_inplace(ctx, wi_0->forward(ctx, x)); |
| auto hidden_linear = wi_1->forward(ctx, x); |
| x = ggml_mul_inplace(ctx, hidden_gelu, hidden_linear); |
| x = wo->forward(ctx, x); |
| return x; |
| } |
| }; |
|
|
| struct T5LayerFF : public UnaryBlock { |
| public: |
| T5LayerFF(int64_t model_dim, int64_t ff_dim) { |
| blocks["DenseReluDense"] = std::shared_ptr<GGMLBlock>(new T5DenseGatedActDense(model_dim, ff_dim)); |
| blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim)); |
| } |
|
|
| struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { |
| |
| auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]); |
| auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]); |
|
|
| auto forwarded_states = layer_norm->forward(ctx, x); |
| forwarded_states = DenseReluDense->forward(ctx, forwarded_states); |
| x = ggml_add_inplace(ctx, forwarded_states, x); |
| return x; |
| } |
| }; |
|
|
| class T5Attention : public GGMLBlock { |
| protected: |
| int64_t model_dim; |
| int64_t inner_dim; |
| int64_t num_heads; |
| bool using_relative_attention_bias; |
| int64_t relative_attention_num_buckets = 32; |
| int64_t relative_attention_max_distance = 128; |
|
|
| public: |
| T5Attention(int64_t model_dim, |
| int64_t inner_dim, |
| int64_t num_heads, |
| bool using_relative_attention_bias = false) |
| : model_dim(model_dim), |
| inner_dim(inner_dim), |
| num_heads(num_heads), |
| using_relative_attention_bias(using_relative_attention_bias) { |
| blocks["q"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false)); |
| blocks["k"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false)); |
| blocks["v"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false)); |
| blocks["o"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, model_dim, false)); |
| if (using_relative_attention_bias) { |
| blocks["relative_attention_bias"] = std::shared_ptr<GGMLBlock>(new Embedding(relative_attention_num_buckets, num_heads)); |
| } |
| } |
|
|
| struct ggml_tensor* compute_bias(struct ggml_context* ctx, |
| struct ggml_tensor* relative_position_bucket) { |
| auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]); |
|
|
| auto values = relative_attention_bias->forward(ctx, relative_position_bucket); |
| values = ggml_cont(ctx, ggml_permute(ctx, values, 2, 0, 1, 3)); |
| return values; |
| } |
|
|
| |
| std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx, |
| struct ggml_tensor* x, |
| struct ggml_tensor* past_bias = NULL, |
| struct ggml_tensor* mask = NULL, |
| struct ggml_tensor* relative_position_bucket = NULL) { |
| auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q"]); |
| auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k"]); |
| auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v"]); |
| auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["o"]); |
|
|
| int64_t n_head = num_heads; |
| int64_t d_head = inner_dim / n_head; |
|
|
| auto q = q_proj->forward(ctx, x); |
| auto k = k_proj->forward(ctx, x); |
| auto v = v_proj->forward(ctx, x); |
|
|
| if (using_relative_attention_bias && relative_position_bucket != NULL) { |
| past_bias = compute_bias(ctx, relative_position_bucket); |
| } |
| if (past_bias != NULL) { |
| if (mask != NULL) { |
| mask = ggml_add(ctx, mask, past_bias); |
| } else { |
| mask = past_bias; |
| } |
| } |
|
|
| k = ggml_scale_inplace(ctx, k, sqrt(d_head)); |
|
|
| x = ggml_nn_attention_ext(ctx, q, k, v, num_heads, mask); |
|
|
| x = out_proj->forward(ctx, x); |
| return {x, past_bias}; |
| } |
| }; |
|
|
| struct T5LayerSelfAttention : public GGMLBlock { |
| public: |
| T5LayerSelfAttention(int64_t model_dim, |
| int64_t inner_dim, |
| int64_t ff_dim, |
| int64_t num_heads, |
| bool using_relative_attention_bias) { |
| blocks["SelfAttention"] = std::shared_ptr<GGMLBlock>(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias)); |
| blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim)); |
| } |
|
|
| std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx, |
| struct ggml_tensor* x, |
| struct ggml_tensor* past_bias = NULL, |
| struct ggml_tensor* mask = NULL, |
| struct ggml_tensor* relative_position_bucket = NULL) { |
| |
| auto SelfAttention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]); |
| auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]); |
|
|
| auto normed_hidden_state = layer_norm->forward(ctx, x); |
| auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket); |
| auto output = ret.first; |
| past_bias = ret.second; |
|
|
| x = ggml_add_inplace(ctx, output, x); |
| return {x, past_bias}; |
| } |
| }; |
|
|
| struct T5Block : public GGMLBlock { |
| public: |
| T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) { |
| blocks["layer.0"] = std::shared_ptr<GGMLBlock>(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias)); |
| blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim)); |
| } |
|
|
| std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx, |
| struct ggml_tensor* x, |
| struct ggml_tensor* past_bias = NULL, |
| struct ggml_tensor* mask = NULL, |
| struct ggml_tensor* relative_position_bucket = NULL) { |
| |
| auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]); |
| auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]); |
|
|
| auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket); |
| x = ret.first; |
| past_bias = ret.second; |
| x = layer_1->forward(ctx, x); |
| return {x, past_bias}; |
| } |
| }; |
|
|
| struct T5Stack : public GGMLBlock { |
| int64_t num_layers; |
|
|
| public: |
| T5Stack(int64_t num_layers, |
| int64_t model_dim, |
| int64_t inner_dim, |
| int64_t ff_dim, |
| int64_t num_heads) |
| : num_layers(num_layers) { |
| for (int i = 0; i < num_layers; i++) { |
| blocks["block." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, i == 0)); |
| } |
|
|
| blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim)); |
| } |
|
|
| struct ggml_tensor* forward(struct ggml_context* ctx, |
| struct ggml_tensor* x, |
| struct ggml_tensor* past_bias = NULL, |
| struct ggml_tensor* attention_mask = NULL, |
| struct ggml_tensor* relative_position_bucket = NULL) { |
| |
| for (int i = 0; i < num_layers; i++) { |
| auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]); |
|
|
| auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); |
| x = ret.first; |
| past_bias = ret.second; |
| } |
|
|
| auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]); |
|
|
| x = final_layer_norm->forward(ctx, x); |
| return x; |
| } |
| }; |
|
|
| struct T5 : public GGMLBlock { |
| public: |
| T5(int64_t num_layers, |
| int64_t model_dim, |
| int64_t ff_dim, |
| int64_t num_heads, |
| int64_t vocab_size) { |
| blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(num_layers, model_dim, model_dim, ff_dim, num_heads)); |
| blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(vocab_size, model_dim)); |
| } |
|
|
| struct ggml_tensor* forward(struct ggml_context* ctx, |
| struct ggml_tensor* input_ids, |
| struct ggml_tensor* past_bias = NULL, |
| struct ggml_tensor* attention_mask = NULL, |
| struct ggml_tensor* relative_position_bucket = NULL) { |
| |
|
|
| auto shared = std::dynamic_pointer_cast<Embedding>(blocks["shared"]); |
| auto encoder = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]); |
|
|
| auto x = shared->forward(ctx, input_ids); |
| x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); |
| return x; |
| } |
| }; |
|
|
| struct T5Runner : public GGMLRunner { |
| T5 model; |
| std::vector<int> relative_position_bucket_vec; |
|
|
| T5Runner(ggml_backend_t backend, |
| std::map<std::string, enum ggml_type>& tensor_types, |
| const std::string prefix, |
| int64_t num_layers = 24, |
| int64_t model_dim = 4096, |
| int64_t ff_dim = 10240, |
| int64_t num_heads = 64, |
| int64_t vocab_size = 32128) |
| : GGMLRunner(backend), model(num_layers, model_dim, ff_dim, num_heads, vocab_size) { |
| model.init(params_ctx, tensor_types, prefix); |
| } |
|
|
| std::string get_desc() { |
| return "t5"; |
| } |
|
|
| void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { |
| model.get_param_tensors(tensors, prefix); |
| } |
|
|
| struct ggml_tensor* forward(struct ggml_context* ctx, |
| struct ggml_tensor* input_ids, |
| struct ggml_tensor* relative_position_bucket) { |
| size_t N = input_ids->ne[1]; |
| size_t n_token = input_ids->ne[0]; |
|
|
| auto hidden_states = model.forward(ctx, input_ids, NULL, NULL, relative_position_bucket); |
| return hidden_states; |
| } |
|
|
| struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids) { |
| struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); |
|
|
| input_ids = to_backend(input_ids); |
|
|
| relative_position_bucket_vec = compute_relative_position_bucket(input_ids->ne[0], input_ids->ne[0]); |
|
|
| |
| |
| |
| |
| |
| |
|
|
| auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx, |
| GGML_TYPE_I32, |
| input_ids->ne[0], |
| input_ids->ne[0]); |
| set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); |
|
|
| struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, relative_position_bucket); |
|
|
| ggml_build_forward_expand(gf, hidden_states); |
|
|
| return gf; |
| } |
|
|
| void compute(const int n_threads, |
| struct ggml_tensor* input_ids, |
| ggml_tensor** output, |
| ggml_context* output_ctx = NULL) { |
| auto get_graph = [&]() -> struct ggml_cgraph* { |
| return build_graph(input_ids); |
| }; |
| GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); |
| } |
|
|
| static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position, |
| bool bidirectional = true, |
| int num_buckets = 32, |
| int max_distance = 128) { |
| std::vector<int> relative_buckets(relative_position.size(), 0); |
| std::vector<int> abs_relative_position = relative_position; |
|
|
| if (bidirectional) { |
| num_buckets = num_buckets / 2; |
| for (size_t i = 0; i < relative_position.size(); ++i) { |
| if (relative_position[i] > 0) { |
| relative_buckets[i] += num_buckets; |
| } |
| abs_relative_position[i] = std::abs(relative_position[i]); |
| } |
| } else { |
| for (size_t i = 0; i < relative_position.size(); ++i) { |
| abs_relative_position[i] = std::max(-relative_position[i], 0); |
| } |
| } |
|
|
| int max_exact = num_buckets / 2; |
| std::vector<int> relative_position_if_large(relative_position.size(), 0); |
|
|
| for (size_t i = 0; i < relative_position.size(); ++i) { |
| if (abs_relative_position[i] < max_exact) { |
| relative_buckets[i] += abs_relative_position[i]; |
| } else { |
| float log_pos = std::log(static_cast<float>(abs_relative_position[i]) / max_exact); |
| float log_base = std::log(static_cast<float>(max_distance) / max_exact); |
| relative_position_if_large[i] = max_exact + static_cast<int>((log_pos / log_base) * (num_buckets - max_exact)); |
| relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1); |
| relative_buckets[i] += relative_position_if_large[i]; |
| } |
| } |
|
|
| return relative_buckets; |
| } |
|
|
| std::vector<int> compute_relative_position_bucket(int query_length, |
| int key_length) { |
| std::vector<int> context_position(query_length); |
| std::vector<int> memory_position(key_length); |
|
|
| for (int i = 0; i < query_length; ++i) { |
| context_position[i] = i; |
| } |
| for (int i = 0; i < key_length; ++i) { |
| memory_position[i] = i; |
| } |
|
|
| std::vector<std::vector<int>> relative_position(query_length, std::vector<int>(key_length, 0)); |
| for (int i = 0; i < query_length; ++i) { |
| for (int j = 0; j < key_length; ++j) { |
| relative_position[i][j] = memory_position[j] - context_position[i]; |
| } |
| } |
|
|
| std::vector<int> relative_position_bucket; |
| for (int i = 0; i < query_length; ++i) { |
| std::vector<int> result = _relative_position_bucket(relative_position[i], true); |
| relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end()); |
| } |
|
|
| return relative_position_bucket; |
| } |
| }; |
|
|
| struct T5Embedder { |
| T5UniGramTokenizer tokenizer; |
| T5Runner model; |
|
|
| static std::map<std::string, enum ggml_type> empty_tensor_types; |
|
|
| T5Embedder(ggml_backend_t backend, |
| std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types, |
| const std::string prefix = "", |
| int64_t num_layers = 24, |
| int64_t model_dim = 4096, |
| int64_t ff_dim = 10240, |
| int64_t num_heads = 64, |
| int64_t vocab_size = 32128) |
| : model(backend, tensor_types, prefix, num_layers, model_dim, ff_dim, num_heads, vocab_size) { |
| } |
|
|
| void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { |
| model.get_param_tensors(tensors, prefix); |
| } |
|
|
| void alloc_params_buffer() { |
| model.alloc_params_buffer(); |
| } |
|
|
| std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text, |
| size_t max_length = 0, |
| bool padding = false) { |
| auto parsed_attention = parse_prompt_attention(text); |
|
|
| { |
| std::stringstream ss; |
| ss << "["; |
| for (const auto& item : parsed_attention) { |
| ss << "['" << item.first << "', " << item.second << "], "; |
| } |
| ss << "]"; |
| LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); |
| } |
|
|
| std::vector<int> tokens; |
| std::vector<float> weights; |
| for (const auto& item : parsed_attention) { |
| const std::string& curr_text = item.first; |
| float curr_weight = item.second; |
| std::vector<int> curr_tokens = tokenizer.Encode(curr_text, false); |
| tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); |
| weights.insert(weights.end(), curr_tokens.size(), curr_weight); |
| } |
|
|
| int EOS_TOKEN_ID = 1; |
| tokens.push_back(EOS_TOKEN_ID); |
| weights.push_back(1.0); |
|
|
| tokenizer.pad_tokens(tokens, weights, max_length, padding); |
|
|
| |
| |
| |
| |
|
|
| return {tokens, weights}; |
| } |
|
|
| void test() { |
| struct ggml_init_params params; |
| params.mem_size = static_cast<size_t>(10 * 1024 * 1024); |
| params.mem_buffer = NULL; |
| params.no_alloc = false; |
|
|
| struct ggml_context* work_ctx = ggml_init(params); |
| GGML_ASSERT(work_ctx != NULL); |
|
|
| { |
| |
| |
| |
| |
| |
| |
| std::string text("a lovely cat"); |
| auto tokens_and_weights = tokenize(text, 77, true); |
| std::vector<int>& tokens = tokens_and_weights.first; |
| std::vector<float>& weights = tokens_and_weights.second; |
| for (auto token : tokens) { |
| printf("%d ", token); |
| } |
| printf("\n"); |
| auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); |
| struct ggml_tensor* out = NULL; |
|
|
| int t0 = ggml_time_ms(); |
| model.compute(8, input_ids, &out, work_ctx); |
| int t1 = ggml_time_ms(); |
|
|
| print_ggml_tensor(out); |
| LOG_DEBUG("t5 test done in %dms", t1 - t0); |
| } |
| } |
|
|
| static void load_from_file_and_test(const std::string& file_path) { |
| |
| ggml_backend_t backend = ggml_backend_cpu_init(); |
| ggml_type model_data_type = GGML_TYPE_F32; |
| std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend)); |
| { |
| LOG_INFO("loading from '%s'", file_path.c_str()); |
|
|
| t5->alloc_params_buffer(); |
| std::map<std::string, ggml_tensor*> tensors; |
| t5->get_param_tensors(tensors, ""); |
|
|
| ModelLoader model_loader; |
| if (!model_loader.init_from_file(file_path)) { |
| LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); |
| return; |
| } |
|
|
| bool success = model_loader.load_tensors(tensors, backend); |
|
|
| if (!success) { |
| LOG_ERROR("load tensors from model loader failed"); |
| return; |
| } |
|
|
| LOG_INFO("t5 model loaded"); |
| } |
| t5->test(); |
| } |
| }; |
|
|
| #endif |