from typing import Dict, List, Any
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)
import torch

# Model loaded at module import time. EndpointHandler below loads its own copy
# from `path`, so this module-level instance is not referenced by the handler.
model = AutoModelForCausalLM.from_pretrained(
    "sjster/test_medium",
    trust_remote_code=True,
    quantization_config=None,
    torch_dtype=torch.float,
    device_map="auto",
)

class EndpointHandler:
    def __init__(self, path=""):
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            trust_remote_code=True,
            quantization_config=None,
            torch_dtype=torch.float,
            device_map="auto",
        )
        self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
        # Left-pad with the EOS token and append an EOS token to encoded inputs.
        self.tokenizer.padding_side = "left"
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.add_eos_token = True

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        data args:
            inputs (:obj:`str`): prompt text to generate from
            kwargs
        Return:
            A :obj:`list` of :obj:`dict` that will be serialized and returned
        """
        inputs = data.pop("inputs", data)
        messages = [
            {
                "role": "user",
                "content": inputs,
            },
        ]
        encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
        encoded_length = len(encodeds[0])
        # Move the encoded prompt to the model's device rather than hard-coding "cuda".
        model_inputs = encodeds.to(self.model.device)
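        # Greedy decoding (do_sample=False); return_dict_in_generate=True makes
        # generate() return an output object carrying the sequences plus the
        # per-step scores, attentions, and hidden states requested below.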
        result = self.model.generate(
            model_inputs,
            do_sample=False,
            output_scores=True,
            return_dict_in_generate=True,
            output_attentions=True,
            output_hidden_states=True,
            early_stopping=True,
            max_new_tokens=400,
        )
        x, logits_gen = result.sequences, result.scores
        # Drop the prompt tokens so only the newly generated text is decoded.
        x = x[:, encoded_length:]
        decoded = self.tokenizer.batch_decode(x)

        return [{"outputs": decoded[0]}]
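

# A minimal local smoke test, not part of the Inference Endpoints contract: a
# sketch assuming the "sjster/test_medium" repo used above is reachable and there
# is enough memory to load it. The prompt string is purely illustrative.
if __name__ == "__main__":
    handler = EndpointHandler(path="sjster/test_medium")
    print(handler({"inputs": "Summarize what this endpoint handler does."}))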