| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import builtins |
| | import contextlib |
| | import copy |
| | import functools |
| | import time |
| | import weakref |
| | from collections import OrderedDict |
| | from types import FunctionType, MethodType |
| | from typing import Any, Callable, Dict, List, Optional, Tuple |
| |
|
| | from .utils import is_paddle_available, is_paddlenlp_available |
| |
|
| |
|
def copy_func(f):
    """Return an independent copy of function *f*.

    ``copy.copy`` hands back the very same object for plain functions, so a
    true clone is rebuilt from the underlying code object instead.
    Non-function inputs are delegated to ``copy.copy`` unchanged.
    """
    if not isinstance(f, FunctionType):
        return copy.copy(f)
    clone = FunctionType(
        f.__code__, f.__globals__, f.__name__, f.__defaults__, f.__closure__
    )
    # Carry over the metadata that the FunctionType constructor does not copy.
    clone.__kwdefaults__ = f.__kwdefaults__
    clone.__dict__.update(f.__dict__)
    clone.__annotations__.update(f.__annotations__)
    clone.__qualname__ = f.__qualname__
    return clone
| |
|
| |
|
| | |
def patch_to(cls, as_prop=False, cls_method=False):
    """Decorator: attach the decorated function to class (or classes) `cls`.

    Args:
        cls: a class, or a tuple/list of classes, to patch.
        as_prop: if True, install the function as a read-only `property`.
        cls_method: if True, bind the function to the class object itself
            (classmethod-like binding).

    Returns the pre-existing module-global or builtin of the same name (or
    None) instead of the function, so decorating e.g. `__init__` or `forward`
    does not shadow an existing module-level binding.
    """
    if not isinstance(cls, (tuple, list)):
        cls = (cls,)

    def _inner(f):
        for c_ in cls:
            # Patch a fresh copy per class so each class gets its own
            # function object with its own __qualname__.
            nf = copy_func(f)
            nm = f.__name__

            # Re-copy wrapper metadata (__module__, __doc__, ...) onto the copy.
            for o in functools.WRAPPER_ASSIGNMENTS:
                setattr(nf, o, getattr(f, o))
            nf.__qualname__ = f"{c_.__name__}.{nm}"
            if cls_method:
                # Bind to the class object itself, like a classmethod.
                setattr(c_, nm, MethodType(nf, c_))
            else:
                setattr(c_, nm, property(nf) if as_prop else nf)

        # Deliberately NOT returning `f`: restore whatever name existed at
        # module/builtin scope before decoration (or None).
        return globals().get(nm, builtins.__dict__.get(nm, None))

    return _inner
| |
|
| |
|
| | if is_paddle_available(): |
| | import paddle |
| | import paddle.nn as nn |
| |
|
| | @contextlib.contextmanager |
| | def device_scope(device="cpu"): |
| | new_device = device.replace("cuda", "gpu") |
| | old_device = paddle.get_device() |
| | if str(new_device) == str(old_device): |
| | yield |
| | else: |
| | try: |
| | paddle.set_device(new_device) |
| | yield |
| | finally: |
| | paddle.set_device(old_device) |
| |
|
| | paddle.device_scope = device_scope |
| |
|
| | class RNGStatesTracker: |
| | def __init__(self): |
| | self.states_ = {} |
| |
|
| | def reset(self): |
| | self.states_ = {} |
| |
|
| | def remove(self, generator_name=None): |
| | if generator_name is not None: |
| | del self.states_[generator_name] |
| |
|
| | def manual_seed(self, seed, generator_name=None): |
| | if generator_name is None: |
| | generator_name = str(time.time()) |
| | if generator_name in self.states_: |
| | raise ValueError("state {} already exists".format(generator_name)) |
| | orig_rng_state = paddle.get_cuda_rng_state() |
| | paddle.seed(seed) |
| | self.states_[generator_name] = paddle.get_cuda_rng_state() |
| | paddle.set_cuda_rng_state(orig_rng_state) |
| | return generator_name |
| |
|
| | @contextlib.contextmanager |
| | def rng_state(self, generator_name=None): |
| | if generator_name is not None: |
| | if generator_name not in self.states_: |
| | raise ValueError("state {} does not exist".format(generator_name)) |
| | orig_cuda_rng_state = paddle.get_cuda_rng_state() |
| | paddle.set_cuda_rng_state(self.states_[generator_name]) |
| | try: |
| | yield |
| | finally: |
| | self.states_[generator_name] = paddle.get_cuda_rng_state() |
| | paddle.set_cuda_rng_state(orig_cuda_rng_state) |
| | else: |
| | yield |
| |
|
    # Process-wide singleton tracker; `paddle.Generator` is monkey-patched
    # below to return it so torch-style `generator=` arguments can be honoured.
    RNG_STATE_TRACKER = RNGStatesTracker()

    def get_rng_state_tracker(*args, **kwargs):
        # Accepts and ignores arbitrary arguments so it can stand in for
        # `paddle.Generator(...)` constructor calls.
        return RNG_STATE_TRACKER

    paddle.Generator = get_rng_state_tracker
    # Keep a handle on the original so the patched version can delegate to it.
    randn = paddle.randn

    def randn_pt(shape, dtype=None, name=None, **kwargs):
        # torch-compatible `randn`: honours an optional `generator=` kwarg by
        # sampling under the named RNG state tracked by RNG_STATE_TRACKER.
        generator = kwargs.get("generator", None)
        if generator is None:
            return randn(shape, dtype=dtype, name=name)
        else:
            with get_rng_state_tracker().rng_state(generator):
                return randn(shape, dtype=dtype, name=name)

    paddle.randn = randn_pt
| |
|
| | rand = paddle.rand |
| |
|
| | def rand_pt(shape, dtype=None, name=None, **kwargs): |
| | generator = kwargs.get("generator", None) |
| | if generator is None: |
| | return randn(shape, dtype=dtype, name=name) |
| | else: |
| | with get_rng_state_tracker().rng_state(generator): |
| | return rand(shape, dtype=dtype, name=name) |
| |
|
| | paddle.rand = rand_pt |
| |
|
| | @patch_to(nn.Layer) |
| | def get_sublayer(self, target: str): |
| | if target == "": |
| | return self |
| |
|
| | atoms: List[str] = target.split(".") |
| | mod: nn.Layer = self |
| |
|
| | for item in atoms: |
| | if not hasattr(mod, item): |
| | raise AttributeError(mod.__class__.__name__ + " has no " "attribute `" + item + "`") |
| |
|
| | mod = getattr(mod, item) |
| |
|
| | if not isinstance(mod, nn.Layer): |
| | raise AttributeError("`" + item + "` is not " "an nn.Layer") |
| | return mod |
| |
|
| | class _WrappedHook: |
| | def __init__(self, hook: Callable, module: Optional["nn.Layer"] = None): |
| | self.hook: Callable = hook |
| | functools.update_wrapper(self, hook) |
| |
|
| | self.with_module: bool = False |
| |
|
| | if module is not None: |
| | self.module: weakref.ReferenceType["nn.Layer"] = weakref.ref(module) |
| | self.with_module = True |
| |
|
| | def __call__(self, *args: Any, **kwargs: Any) -> Any: |
| | if self.with_module: |
| | module = self.module() |
| | if module is None: |
| | raise RuntimeError("You are trying to call the hook of a dead Module!") |
| | return self.hook(module, *args, **kwargs) |
| | return self.hook(*args, **kwargs) |
| |
|
| | def __getstate__(self) -> Dict: |
| | result = {"hook": self.hook, "with_module": self.with_module} |
| | if self.with_module: |
| | result["module"] = self.module() |
| |
|
| | return result |
| |
|
| | def __setstate__(self, state: Dict): |
| | self.hook = state["hook"] |
| | self.with_module = state["with_module"] |
| |
|
| | if self.with_module: |
| | if state["module"] is None: |
| | raise RuntimeError("You are trying to revive the hook of a dead Module!") |
| | self.module = weakref.ref(state["module"]) |
| |
|
    # HookRemoveHelper assigns each hook an id and provides a `remove()` handle.
    # NOTE(review): `paddle.fluid` is a legacy namespace — confirm this import
    # path still exists on the targeted Paddle version.
    from paddle.fluid.dygraph.layers import HookRemoveHelper

    @patch_to(nn.Layer)
    def register_load_state_dict_pre_hook(self, hook, with_module=False):
        # Mirrors torch's Module.register_load_state_dict_pre_hook: the hook is
        # invoked with the raw state_dict (and the layer itself first, when
        # with_module=True) before set_state_dict applies it. Returns a handle
        # whose remove() unregisters the hook.
        handle = HookRemoveHelper(self.load_state_dict_pre_hooks)
        self.load_state_dict_pre_hooks[handle._hook_id] = _WrappedHook(hook, self if with_module else None)
        return handle
| |
|
    # Wrap set_state_dict so the pre-hooks registered above run first.
    raw_set_state_dict = nn.Layer.set_state_dict

    @patch_to(nn.Layer)
    def set_state_dict(self, state_dict, use_structured_name: bool = True):
        # Hooks may mutate `state_dict` in place (e.g. key remapping) before
        # the original implementation consumes it.
        for hook in self.load_state_dict_pre_hooks.values():
            hook(state_dict)
        return raw_set_state_dict(self, state_dict, use_structured_name=use_structured_name)

    # Keep the public aliases pointing at the patched implementation.
    nn.Layer.load_dict = nn.Layer.set_state_dict
    nn.Layer.set_dict = nn.Layer.set_state_dict
| |
|
    # Patch Layer.__init__ so every instance owns the hook registry consumed by
    # register_load_state_dict_pre_hook / set_state_dict above.
    raw_init = nn.Layer.__init__

    @patch_to(nn.Layer)
    def __init__(self, name_scope=None, dtype="float32"):
        raw_init(self, name_scope=name_scope, dtype=dtype)
        # OrderedDict keyed by HookRemoveHelper._hook_id -> _WrappedHook.
        self.load_state_dict_pre_hooks = OrderedDict()
| |
|
| |
|
| | if is_paddle_available() and is_paddlenlp_available(): |
| | import paddle |
| |
|
| | import paddlenlp.transformers |
| | from paddlenlp.transformers import PretrainedModel |
| |
|
| | @patch_to(PretrainedModel, as_prop=True) |
| | def dtype(self): |
| | try: |
| | return next(self.named_parameters())[1].dtype |
| | except StopIteration: |
| | return paddle.get_default_dtype() |
| |
|
| | @patch_to(PretrainedModel, as_prop=True) |
| | def device(self): |
| | try: |
| | return next(self.named_parameters())[1].place |
| | except StopIteration: |
| | return paddle.get_device() |
| |
|
| | try: |
| | from paddlenlp.transformers import XLMRobertaTokenizer |
| | except ImportError: |
| | |
| | """Tokenization classes for XLM-RoBERTa model.""" |
| | import os |
| | from shutil import copyfile |
| |
|
| | import sentencepiece as spm |
| |
|
| | from paddlenlp.transformers.tokenizer_utils import ( |
| | AddedToken, |
| | PretrainedTokenizer, |
| | ) |
| | from paddlenlp.utils.log import logger |
| |
|
| | SPIECE_UNDERLINE = "▁" |
| |
|
        class XLMRobertaTokenizer(PretrainedTokenizer):
            """SentencePiece-based tokenizer for XLM-RoBERTa, backported for
            paddlenlp versions that do not ship one.

            Ids are shifted by `fairseq_offset` relative to raw SentencePiece
            piece ids so that the fairseq special tokens <s>/<pad>/</s>/<unk>
            occupy ids 0-3, with <mask> appended at the end of the vocabulary.
            """

            # Name of the SentencePiece model file in a saved vocabulary dir.
            resource_files_names = {"vocab_file": "sentencepiece.bpe.model"}
            pretrained_resource_files_map = {}
            pretrained_init_configuration = {}
            max_model_input_sizes = {
                "xlm-roberta-base": 512,
                "xlm-roberta-large": 512,
                "xlm-roberta-large-finetuned-conll02-dutch": 512,
                "xlm-roberta-large-finetuned-conll02-spanish": 512,
                "xlm-roberta-large-finetuned-conll03-english": 512,
                "xlm-roberta-large-finetuned-conll03-german": 512,
            }
            model_input_names = ["input_ids", "attention_mask"]

            def __init__(
                self,
                vocab_file,
                bos_token="<s>",
                eos_token="</s>",
                sep_token="</s>",
                cls_token="<s>",
                unk_token="<unk>",
                pad_token="<pad>",
                mask_token="<mask>",
                sp_model_kwargs: Optional[Dict[str, Any]] = None,
                **kwargs
            ) -> None:
                # The mask token behaves like a normal word: strip the space before it.
                mask_token = (
                    AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
                )

                self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

                super().__init__(
                    bos_token=bos_token,
                    eos_token=eos_token,
                    unk_token=unk_token,
                    sep_token=sep_token,
                    cls_token=cls_token,
                    pad_token=pad_token,
                    mask_token=mask_token,
                    sp_model_kwargs=self.sp_model_kwargs,
                    **kwargs,
                )

                self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
                self.sp_model.Load(str(vocab_file))
                self.vocab_file = vocab_file

                # The fairseq vocabulary pins the special tokens to the first ids,
                # while SentencePiece reserves id 0 for <unk>; every piece id is
                # therefore shifted by `fairseq_offset` when converting.
                self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

                self.fairseq_offset = 1

                # <mask> sits at the very end of the (offset) vocabulary.
                self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
                self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

            def __getstate__(self):
                # SentencePieceProcessor is not picklable: serialise its proto instead.
                state = self.__dict__.copy()
                state["sp_model"] = None
                state["sp_model_proto"] = self.sp_model.serialized_model_proto()
                return state

            def __setstate__(self, d):
                self.__dict__ = d

                # Backwards-compat for pickles created before `sp_model_kwargs` existed.
                if not hasattr(self, "sp_model_kwargs"):
                    self.sp_model_kwargs = {}

                # Rebuild the SentencePiece processor from the serialised proto.
                self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
                self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

            def build_inputs_with_special_tokens(
                self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
            ) -> List[int]:
                """
                Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
                adding special tokens. An XLM-RoBERTa sequence has the following format:
                - single sequence: `<s> X </s>`
                - pair of sequences: `<s> A </s></s> B </s>`
                Args:
                    token_ids_0 (`List[int]`):
                        List of IDs to which the special tokens will be added.
                    token_ids_1 (`List[int]`, *optional*):
                        Optional second list of IDs for sequence pairs.
                Returns:
                    `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
                """

                if token_ids_1 is None:
                    return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
                cls = [self.cls_token_id]
                sep = [self.sep_token_id]
                # Note the double separator between the two sequences.
                return cls + token_ids_0 + sep + sep + token_ids_1 + sep

            def get_special_tokens_mask(
                self,
                token_ids_0: List[int],
                token_ids_1: Optional[List[int]] = None,
                already_has_special_tokens: bool = False,
            ) -> List[int]:
                """
                Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
                special tokens using the tokenizer `prepare_for_model` method.
                Args:
                    token_ids_0 (`List[int]`):
                        List of IDs.
                    token_ids_1 (`List[int]`, *optional*):
                        Optional second list of IDs for sequence pairs.
                    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                        Whether or not the token list is already formatted with special tokens for the model.
                Returns:
                    `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
                """

                if already_has_special_tokens:
                    return super().get_special_tokens_mask(
                        token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
                    )

                # Masks mirror the layouts produced by build_inputs_with_special_tokens.
                if token_ids_1 is None:
                    return [1] + ([0] * len(token_ids_0)) + [1]
                return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

            def create_token_type_ids_from_sequences(
                self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
            ) -> List[int]:
                """
                Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
                not make use of token type ids, therefore a list of zeros is returned.
                Args:
                    token_ids_0 (`List[int]`):
                        List of IDs.
                    token_ids_1 (`List[int]`, *optional*):
                        Optional second list of IDs for sequence pairs.
                Returns:
                    `List[int]`: List of zeros.
                """

                sep = [self.sep_token_id]
                cls = [self.cls_token_id]

                if token_ids_1 is None:
                    return len(cls + token_ids_0 + sep) * [0]
                return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

            @property
            def vocab_size(self):
                # +1 for the <mask> token appended after the offset SP vocabulary.
                return len(self.sp_model) + self.fairseq_offset + 1

            def get_vocab(self):
                vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
                vocab.update(self.added_tokens_encoder)
                return vocab

            def _tokenize(self, text: str) -> List[str]:
                # SentencePiece does the whole segmentation; pieces come back as str.
                return self.sp_model.encode(text, out_type=str)

            def _convert_token_to_id(self, token):
                """Converts a token (str) in an id using the vocab."""
                if token in self.fairseq_tokens_to_ids:
                    return self.fairseq_tokens_to_ids[token]
                spm_id = self.sp_model.PieceToId(token)

                # SentencePiece returns 0 for out-of-vocabulary pieces; map that
                # to the unk token id instead of shifting it.
                return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

            def _convert_id_to_token(self, index):
                """Converts an index (integer) in a token (str) using the vocab."""
                if index in self.fairseq_ids_to_tokens:
                    return self.fairseq_ids_to_tokens[index]
                return self.sp_model.IdToPiece(index - self.fairseq_offset)

            def convert_tokens_to_string(self, tokens):
                """Converts a sequence of tokens (strings for sub-words) in a single string."""
                out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
                return out_string

            def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
                # Writes (or copies) the SentencePiece model into `save_directory`
                # and returns the output path as a 1-tuple.
                if not os.path.isdir(save_directory):
                    logger.error(f"Vocabulary path ({save_directory}) should be a directory")
                    return
                out_vocab_file = os.path.join(
                    save_directory,
                    (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"],
                )

                # Copy the original model file when it exists elsewhere; otherwise
                # re-serialise the in-memory model.
                if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(
                    self.vocab_file
                ):
                    copyfile(self.vocab_file, out_vocab_file)
                elif not os.path.isfile(self.vocab_file):
                    with open(out_vocab_file, "wb") as fi:
                        content_spiece_model = self.sp_model.serialized_model_proto()
                        fi.write(content_spiece_model)

                return (out_vocab_file,)

        # Register the backport so downstream code can import it from paddlenlp.
        paddlenlp.transformers.XLMRobertaTokenizer = XLMRobertaTokenizer
| |
|
| | |
    # --- BertModel patch: supply a default attention_mask --------------------
    from paddlenlp.transformers import BertModel

    raw_forward = BertModel.forward

    @patch_to(BertModel)
    def forward(
        self,
        input_ids: paddle.Tensor,
        token_type_ids: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Attend over every position when the caller supplies no mask.
        if attention_mask is None:
            attention_mask = paddle.ones_like(input_ids)
        # Delegate positionally — NOTE(review): this assumes raw_forward's
        # parameter order matches the signature above; verify against the
        # installed paddlenlp BertModel.forward.
        return raw_forward(
            self,
            input_ids,
            token_type_ids,
            position_ids,
            attention_mask,
            past_key_values,
            use_cache,
            output_hidden_states,
            output_attentions,
            return_dict,
        )
| |
|