Source code for paddlenlp.transformers.blenderbot_small.tokenizer

# encoding=utf-8
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from ..gpt.tokenizer import GPTTokenizer

__all__ = ["BlenderbotSmallTokenizer"]

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"blenderbot_small-90M": 512}


# Copied from paddlenlp.transformers.gpt.tokenizer.get_pairs
def get_pairs(word):
    """
    Args:
        word (tuple): tuple of symbols (symbols being variable-length strings).

    Returns:
        set: symbol pairs in a word.
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
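
# For illustration only (not part of the original source): applied to the
# symbol tuple that ``bpe`` below builds for the token "cool" (with the
# word-final "</w>" marker appended), get_pairs returns every adjacent pair:
#   get_pairs(("c", "o", "o", "l</w>"))
#   -> {("c", "o"), ("o", "o"), ("o", "l</w>")}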


class BlenderbotSmallTokenizer(GPTTokenizer):
    r"""
    Constructs a BlenderbotSmall tokenizer based on Byte-Pair-Encoding.

    This tokenizer inherits from :class:`~paddlenlp.transformers.GPTTokenizer`,
    which contains most of the main methods. Please refer to the superclass for
    more information regarding those methods.

    Args:
        vocab_file (str): File path of the vocabulary.
        merges_file (str): File path of the merges file.
        errors (str): The method to handle errors in decoding.
        max_len (int): The specified maximum sequence length. Default: None.
        special_tokens (dict): The additional special tokens. Default: None.
        bos_token (str): The special token for beginning of sequence. Default: "__start__".
        eos_token (str): The special token for end of sequence. Default: "__end__".
        unk_token (str): The special token for unknown tokens. Default: "__unk__".
        pad_token (str): The special token for padding. Default: "__null__".
        eol_token (str): The special token for newline. Default: "__newln__".

    Examples:
        .. code-block:: python

            from paddlenlp.transformers import BlenderbotSmallTokenizer
            tokenizer = BlenderbotSmallTokenizer.from_pretrained("blenderbot_small-90M")
            text = "My friends are cool but they eat too many carbs."
            inputs = tokenizer(text)
            # above line outputs:
            # {'input_ids': [42, 643, 46, 1430, 45, 52, 1176, 146, 177, 753, 2430, 5],
            # 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
    """

    resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
    pretrained_resource_files_map = {
        "vocab_file": {
            "blenderbot_small-90M": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot_small/blenderbot_small-90M-vocab.json",
        },
        "merges_file": {
            "blenderbot_small-90M": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot_small/blenderbot_small-90M-merges.txt",
        },
    }
    pretrained_init_configuration = {"blenderbot_small-90M": {}}
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        max_len=None,
        special_tokens=None,
        bos_token="__start__",
        eos_token="__end__",
        unk_token="__unk__",
        pad_token="__null__",
        eol_token="__newln__",
        **kwargs
    ):
        super(BlenderbotSmallTokenizer, self).__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            max_len=max_len,
            special_tokens=special_tokens,
            pad_token=pad_token,
            eos_token=eos_token,
            eol_token=eol_token,
            **kwargs,
        )
        self.pat = r"\S+\n?"  # String matching pattern of BlenderbotSmall is different from Blenderbot
        self.unk_id = self.encoder[unk_token]
        self.eol_token = eol_token
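
    # For illustration only (not part of the original source): the pattern
    # r"\S+\n?" set above tokenizes on whitespace, keeping a trailing newline
    # attached to the preceding chunk, e.g.
    #   re.findall(r"\S+\n?", "my friends\nare cool")
    #   -> ['my', 'friends\n', 'are', 'cool']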
    def bpe(self, token):
        """
        Apply Byte-Pair-Encoding on token. The BPE process in BlenderbotSmall
        is different from Blenderbot.

        Args:
            token (str): The token to be converted.

        Returns:
            str: Converted token.
        """
        if token in self.cache:
            return self.cache[token]
        token = re.sub("([.,!?()])", r" \1", token)
        token = re.sub("(')", r" \1 ", token)
        token = re.sub(r"\s{2,}", " ", token)
        if "\n" in token:
            token = token.replace("\n", self.eol_token)
        tokens = token.split(" ")
        words = []
        for token in tokens:
            if not len(token):
                continue
            token = token.lower()
            word = tuple(token)
            word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
            pairs = get_pairs(word)
            if not pairs:
                words.append(token)
                continue
            while True:
                bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
                if bigram not in self.bpe_ranks:
                    break
                first, second = bigram
                new_word = []
                i = 0
                while i < len(word):
                    try:
                        j = word.index(first, i)
                        new_word.extend(word[i:j])
                        i = j
                    except ValueError:
                        new_word.extend(word[i:])
                        break

                    if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                        new_word.append(first + second)
                        i += 2
                    else:
                        new_word.append(word[i])
                        i += 1
                new_word = tuple(new_word)
                word = new_word
                if len(word) == 1:
                    break
                else:
                    pairs = get_pairs(word)
            word = "@@ ".join(word)
            word = word[:-4]
            self.cache[token] = word
            words.append(word)
        return " ".join(words)
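
    # For illustration only (not part of the original source): the regex
    # preprocessing at the top of ``bpe`` separates punctuation and apostrophes
    # before any merges are applied, e.g.
    #   re.sub("([.,!?()])", r" \1", "carbs.")  -> "carbs ."
    #   re.sub("(')", r" \1 ", "they're")       -> "they ' re"
    # The merge results themselves depend on the pretrained merges.txt file.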
    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (list of string) to a single string.

        Args:
            tokens (list[str]): A sequence of tokens.

        Returns:
            str: Converted string.
        """
        return " ".join(tokens).replace("@@ ", "").strip()
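
    # For illustration only (not part of the original source; the token split
    # shown is assumed, since the real split depends on the merges file):
    #   convert_tokens_to_string(["my", "friend@@", "s", "are", "cool"])
    #   -> "my friends are cool"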
    def convert_ids_to_string(self, ids, skip_special_tokens=True, clean_up_tokenization_spaces=True):
        """
        Converts a sequence of ids (list of integers) to a single string.

        Args:
            ids (list[int]): A sequence of ids corresponding to tokens.
            skip_special_tokens (bool, optional):
                Whether to skip and not decode special tokens when converting. Defaults to `True`.
            clean_up_tokenization_spaces (bool, optional):
                Whether to clean up simple English tokenization artifacts such as spaces before
                punctuation and abbreviated forms. Defaults to `True`.

        Returns:
            str: Converted string.
        """
        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
        output_string = self.convert_tokens_to_string(tokens)
        if clean_up_tokenization_spaces:
            output_string = (
                output_string.replace(" .", ".")
                .replace(" ?", "?")
                .replace(" !", "!")
                .replace(" ,", ",")
                .replace(" ' ", "'")
                .replace(" n't", "n't")
                .replace(" 'm", "'m")
                .replace(" 's", "'s")
                .replace(" 've", "'ve")
                .replace(" 're", "'re")
            )
        return output_string
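
    # For illustration only (not part of the original source): with
    # clean_up_tokenization_spaces=True, a detokenized string such as
    # "they ' re cool , are n't they ?" is normalized by the replace chain
    # above to "they're cool, aren't they?".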