| import json |
| import time |
| from subword_nmt.apply_bpe import BPE |
| import ctranslate2 |
|
|
class TranslationServer:
    """Translate text with a BPE-segmented CTranslate2 model.

    The model directory is expected to contain, next to the CTranslate2
    model files, a ``bpe.model`` file (subword-nmt BPE codes, read as UTF-8
    text) and a ``shared_vocabulary.json`` file.
    """

    # Ordered (old, new) substitutions applied to raw model output.
    # Order matters:
    #   * "&amp;" is unescaped after the other entities so that an escaped
    #     ampersand such as "&amp;quot;" is not unescaped twice.
    #   * "@@ " (BPE continuation marker plus the joining space) is removed
    #     before a bare "@@" so that sub-word pieces are glued back together
    #     ("hel@@ lo" -> "hello") instead of being left apart ("hel lo").
    _OUTPUT_REPLACEMENTS = (
        ("▁", " "),
        ("&apos;", "'"),
        ("&quot;", '"'),
        ("&amp;", "&"),
        ("@@ ", ""),
        ("@@", ""),
    )

    def __init__(self, model_path="es_en"):
        """Load the BPE codes, the vocabulary and the translation model.

        Args:
            model_path: Directory holding the CTranslate2 model together
                with ``bpe.model`` and ``shared_vocabulary.json``.

        Raises:
            OSError: If ``bpe.model`` or ``shared_vocabulary.json`` cannot
                be opened (e.g. ``FileNotFoundError``).
            json.JSONDecodeError: If the vocabulary file is not valid JSON.
        """
        self.model_path = model_path
        self.bpe_path = f"{model_path}/bpe.model"
        self.vocab_path = f"{model_path}/shared_vocabulary.json"

        # BPE merge operations used to segment the input text.
        with open(self.bpe_path, "r", encoding="utf-8") as bpe_file:
            self.bpe = BPE(bpe_file)

        # Vocabulary; presumably a JSON list of token strings whose
        # position is the token id -- TODO confirm against the model export.
        with open(self.vocab_path, "r", encoding="utf-8") as vocab_file:
            self.vocab = json.load(vocab_file)

        # Lookup tables in both directions, indexed by enumeration order.
        self.token_to_id = {token: i for i, token in enumerate(self.vocab)}
        self.id_to_token = dict(enumerate(self.vocab))

        self.translator = ctranslate2.Translator(model_path)

    def clean_text(self, text):
        """Return *text* with surrounding whitespace removed and every
        internal run of whitespace collapsed to a single space."""
        # str.split() with no arguments already discards leading/trailing
        # whitespace, so no separate strip() is needed.
        return " ".join(text.split())

    def normalize_output(self, text):
        """Turn a space-joined token string into plain readable text.

        Replaces the "▁" marker with a space, unescapes the ``&apos;``,
        ``&quot;`` and ``&amp;`` entities, removes the "@@" BPE continuation
        markers (joining the pieces they separate) and collapses whitespace.

        Args:
            text: Raw, space-joined model output.

        Returns:
            The normalized string, without leading or trailing whitespace.
        """
        for old, new in self._OUTPUT_REPLACEMENTS:
            text = text.replace(old, new)
        return " ".join(text.split())

    def translate_bpe(self, text, beam_size=5):
        """Segment *text* with BPE, translate it and return normalized text.

        Args:
            text: Source sentence (already cleaned by the caller).
            beam_size: Beam width passed to the translator.

        Returns:
            The best hypothesis as a normalized string, or ``""`` when the
            input contains no tokens.
        """
        tokens = self.bpe.process_line(text).split()
        if not tokens:
            # Nothing to translate; skip the model call entirely.
            return ""

        results = self.translator.translate_batch(
            [tokens],
            beam_size=beam_size,
            length_penalty=1.0,
        )

        # Single-sentence batch: take the top hypothesis of the only result.
        best_hypothesis = results[0].hypotheses[0]
        return self.normalize_output(" ".join(best_hypothesis))

    def translate_text(self, text, beam_size=5):
        """Clean *text* and translate it.

        Args:
            text: Raw source text.
            beam_size: Beam width forwarded to :meth:`translate_bpe`.

        Returns:
            The translated, normalized string.
        """
        return self.translate_bpe(self.clean_text(text), beam_size=beam_size)
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: translate one Spanish sentence with the model
    # stored in the "es_en" directory and print the result.
    sample_sentence = "Estoy llamando desde la oficina de impuestos."
    translation_server = TranslationServer("es_en")
    print(translation_server.translate_text(sample_sentence))
|
|
|
|