Source code for paddlenlp.transformers.reformer.tokenizer

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sentencepiece as spm

from ..albert.tokenizer import AlbertEnglishTokenizer

__all__ = ['ReformerTokenizer']


class ReformerTokenizer(AlbertEnglishTokenizer):
    """
    Constructs a Reformer tokenizer based on SentencePiece.

    This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer`
    which contains most of the main methods. For more information regarding those
    methods, please refer to this superclass.

    Args:
        sentencepiece_model_file (str):
            The vocabulary file (ends with '.spm') required to instantiate
            a `SentencePiece <https://github.com/google/sentencepiece>`__ tokenizer.
        do_lower_case (bool):
            Whether or not to lowercase the input when tokenizing. Defaults to `False`.
        remove_space (bool):
            Whether or not to remove spaces when tokenizing. Defaults to `True`.
        keep_accents (bool):
            Whether or not to keep accents when tokenizing. Defaults to `False`.
        eos_token (str):
            A special token representing the *eos (end-of-sentence)* token.
            Defaults to "</s>".
        unk_token (str):
            A special token representing the *unknown (out-of-vocabulary)* token.
            An unknown token is set to be `unk_token` in order to be converted
            to an ID. Defaults to "<unk>".
        pad_token (str):
            A special token used to make arrays of tokens the same size for
            batching purposes. Defaults to "<unk>".
    """

    resource_files_names = {"sentencepiece_model_file": "spiece.model"}
    pretrained_resource_files_map = {
        "sentencepiece_model_file": {
            "reformer-crime-and-punishment":
            "http://paddlenlp.bj.bcebos.com/models/transformers/reformer/reformer-crime-and-punishment/spiece.model",
        },
    }
    pretrained_init_configuration = {
        "reformer-crime-and-punishment": {
            "do_lower_case": False
        },
    }

    def __init__(self,
                 sentencepiece_model_file,
                 do_lower_case=False,
                 remove_space=True,
                 keep_accents=False,
                 eos_token="</s>",
                 unk_token="<unk>",
                 pad_token="<unk>",
                 **kwargs):
        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.sentencepiece_model_file = sentencepiece_model_file

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(sentencepiece_model_file)

    def __call__(self,
                 text,
                 text_pair=None,
                 max_seq_len=None,
                 stride=0,
                 is_split_into_words=False,
                 pad_to_max_seq_len=False,
                 truncation_strategy="longest_first",
                 return_position_ids=False,
                 return_token_type_ids=False,
                 return_attention_mask=True,
                 return_length=False,
                 return_overflowing_tokens=False,
                 return_special_tokens_mask=False):
        return super(ReformerTokenizer, self).__call__(
            text, text_pair, max_seq_len, stride, is_split_into_words,
            pad_to_max_seq_len, truncation_strategy, return_position_ids,
            return_token_type_ids, return_attention_mask, return_length,
            return_overflowing_tokens, return_special_tokens_mask)
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequences. A Reformer
        sequence has the following format:

        - single sequence:      ``X``
        - pair of sequences:    ``A B``

        Args:
            token_ids_0 (List[int]):
                List of IDs to which the special tokens will be added.
            token_ids_1 (List[int], optional):
                Optional second list of IDs for sequence pairs. Defaults to None.

        Returns:
            List[int]: List of input_ids with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1
    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Create a token type mask from the two sequences. If `token_ids_1` is
        `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (List[int]):
                List of IDs.
            token_ids_1 (List[int], optional):
                Optional second list of IDs for sequence pairs.

        Returns:
            List[int]: List of token_type_ids according to the given sequence(s).
        """
        if token_ids_1 is None:
            return len(token_ids_0) * [0]
        return len(token_ids_0) * [0] + len(token_ids_1) * [1]
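
# A minimal usage sketch, not part of the original module. It assumes the
# inherited ``from_pretrained`` classmethod can fetch the ``spiece.model``
# file registered in ``pretrained_resource_files_map`` above (network access
# is needed on first use); the sample sentences are illustrative only.
if __name__ == "__main__":
    tokenizer = ReformerTokenizer.from_pretrained(
        "reformer-crime-and-punishment")

    # Plain encoding: SentencePiece subword IDs plus an attention mask.
    encoded = tokenizer("He settled the matter quietly.",
                        return_attention_mask=True)
    print(encoded["input_ids"])
    print(encoded["attention_mask"])

    # Reformer adds no special tokens, so pair inputs are plain concatenation,
    # while the token type mask still distinguishes the two segments.
    ids_a = tokenizer("first segment")["input_ids"]
    ids_b = tokenizer("second segment")["input_ids"]
    assert (tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
            == ids_a + ids_b)
    print(tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b))
    # -> [0] * len(ids_a) + [1] * len(ids_b)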