# encoding=utf-8
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from ..gpt.tokenizer import GPTTokenizer
__all__ = ["BlenderbotSmallTokenizer"]
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"blenderbot_small-90M": 512}
# Copied from paddlenlp.transformers.gpt.tokenizer.get_pairs
def get_pairs(word):
"""
Args:
word (tuple): tuple of symbols (symbols being variable-length strings).
Returns:
set: symbol pairs in a word.
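Example (the result is a set, so ordering may vary):
get_pairs(("h", "e", "l", "l", "o"))
# {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}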
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class BlenderbotSmallTokenizer(GPTTokenizer):
r"""
Constructs a BlenderbotSmall tokenizer based on Byte-Pair-Encoding.
This tokenizer inherits from :class:`~paddlenlp.transformers.GPTTokenizer`,
which contains most of the main methods.
Please refer to the superclass for more information regarding these methods.
Args:
vocab_file (str): File path of the vocabulary file.
merges_file (str): File path of the merges file.
errors (str): The method to handle errors in decoding. Default: "replace".
max_len (int): The specified maximum sequence length. Default: None.
special_tokens (dict): The additional special tokens. Default: None.
bos_token (str): The special token for the beginning of a sequence. Default: "__start__".
eos_token (str): The special token for the end of a sequence. Default: "__end__".
unk_token (str): The special token for unknown tokens. Default: "__unk__".
pad_token (str): The special token for padding. Default: "__null__".
eol_token (str): The special token for newlines. Default: "__newln__".
Examples:
.. code-block:: python
from paddlenlp.transformers import BlenderbotSmallTokenizer
tokenizer = BlenderbotSmallTokenizer.from_pretrained("blenderbot_small-90M")
text = "My friends are cool but they eat too many carbs."
inputs = tokenizer(text)
# above line outputs:
# {'input_ids': [42, 643, 46, 1430, 45, 52, 1176, 146, 177, 753, 2430, 5],
# 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
"""
resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
pretrained_resource_files_map = {
"vocab_file": {
"blenderbot_small-90M": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot_small/blenderbot_small-90M-vocab.json",
},
"merges_file": {
"blenderbot_small-90M": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot_small/blenderbot_small-90M-merges.txt",
},
}
pretrained_init_configuration = {"blenderbot_small-90M": {}}
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
max_len=None,
special_tokens=None,
bos_token="__start__",
eos_token="__end__",
unk_token="__unk__",
pad_token="__null__",
eol_token="__newln__",
**kwargs
):
super(BlenderbotSmallTokenizer, self).__init__(
vocab_file=vocab_file,
merges_file=merges_file,
errors=errors,
max_len=max_len,
special_tokens=special_tokens,
pad_token=pad_token,
eos_token=eos_token,
eol_token=eol_token,
**kwargs,
)
self.pat = r"\S+\n?"  # The token-splitting pattern of BlenderbotSmall differs from that of Blenderbot
self.unk_id = self.encoder[unk_token]
self.eol_token = eol_token
def bpe(self, token):
"""
Apply Byte-Pair-Encoding on token.
The BPE process in BlenderbotSmall differs from that in Blenderbot.
Args:
token (str): The token to be converted.
Returns:
str: Converted token.
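Example (illustrative only; the exact subword split depends on the loaded merges file):
.. code-block:: python
tokenizer.bpe("hello?")
# e.g. 'hel@@ lo ?' -- punctuation is split off and unfinished subwords end with "@@"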
"""
if token in self.cache:
return self.cache[token]
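# Normalize the raw token before BPE: split punctuation and apostrophes into
# separate whitespace-delimited pieces, collapse repeated spaces, and replace
# newlines with the end-of-line token.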
token = re.sub("([.,!?()])", r" \1", token)
token = re.sub("(')", r" \1 ", token)
token = re.sub(r"\s{2,}", " ", token)
if "\n" in token:
token = token.replace("\n", self.eol_token)
tokens = token.split(" ")
words = []
for token in tokens:
if not len(token):
continue
token = token.lower()
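# Represent the word as a tuple of characters, marking the last character
# with "</w>" so end-of-word merges stay distinct from mid-word merges.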
word = tuple(token)
word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
pairs = get_pairs(word)
if not pairs:
words.append(token)
continue
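# Repeatedly merge the adjacent pair with the lowest merge rank until no
# learned merge applies or the word collapses to a single symbol.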
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
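# Rebuild the word, merging every occurrence of (first, second) into one symbol.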
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except ValueError:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
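# Join BPE pieces with the "@@ " continuation marker and strip the
# trailing "</w>" end-of-word marker (4 characters).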
word = "@@ ".join(word)
word = word[:-4]
self.cache[token] = word
words.append(word)
return " ".join(words)
def convert_tokens_to_string(self, tokens):
"""
Converts a sequence of tokens (list of strings) to a single string.
Args:
tokens (list[str]): A sequence of tokens.
Returns:
str: Converted string.
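Example (with illustrative tokens; "@@" marks a subword continued by the next token):
.. code-block:: python
tokenizer.convert_tokens_to_string(["hel@@", "lo", "world"])
# 'hello world'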
"""
return " ".join(tokens).replace("@@ ", "").strip()
def convert_ids_to_string(self, ids, skip_special_tokens=True, clean_up_tokenization_spaces=True):
"""
Converts a sequence of ids (list of integers) to a single string.
Args:
ids (list[int]):
A sequence of ids corresponding to tokens.
skip_special_tokens (bool, optional):
Whether to skip special tokens (i.e. not decode them) when converting. Defaults to `True`.
clean_up_tokenization_spaces (bool, optional):
Whether to clean up simple English tokenization artifacts such as
spaces before punctuation and abbreviated forms. Defaults to `True`.
Returns:
str: Converted string.
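Example (ids taken from the class-level example above; the decoded text depends on the loaded vocabulary):
.. code-block:: python
ids = [42, 643, 46, 1430, 45, 52, 1176, 146, 177, 753, 2430, 5]
tokenizer.convert_ids_to_string(ids)
# 'my friends are cool but they eat too many carbs.'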
"""
tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
output_string = self.convert_tokens_to_string(tokens)
if clean_up_tokenization_spaces:
output_string = (
output_string.replace(" .", ".")
.replace(" ?", "?")
.replace(" !", "!")
.replace(" ,", ",")
.replace(" ' ", "'")
.replace(" n't", "n't")
.replace(" 'm", "'m")
.replace(" 's", "'s")
.replace(" 've", "'ve")
.replace(" 're", "'re")
)
return output_string