# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib

import paddle
import paddle.fluid.core as core
import paddle.nn as nn
from paddle.common_ops_import import LayerHelper

from paddlenlp.transformers import BertTokenizer, ErnieTokenizer, RobertaTokenizer
from paddlenlp.transformers.ppminilm.tokenizer import PPMiniLMTokenizer
from paddlenlp.utils.log import logger

__all__ = ["to_tensor", "to_vocab_buffer", "FasterTokenizer"]


def to_tensor(string_values, name="text"):
    """
    Create a tensor whose value holds a list of strings.
    NOTE: The value is held on the CPU place.

    Args:
        string_values (list[str]): The string values to set into the tensor.
        name (str): The name of the tensor.
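
    Example:
        .. code-block:: python

            # A minimal sketch (assumes a PaddlePaddle build >= 2.2 with
            # string-tensor support); the sample texts are illustrative.
            texts = to_tensor(["hello world", "paddle nlp"])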
"""
    tensor = paddle.Tensor(core.VarDesc.VarType.STRING, [], name, core.VarDesc.VarType.STRINGS, False)
    tensor.value().set_string_list(string_values)
    return tensor


def to_vocab_buffer(vocab_dict, name):
    """
    Create a tensor whose value holds a map whose keys are strings.
    NOTE: The value is held on the CPU place.

    Args:
        vocab_dict (dict): The dict to set into the tensor. Each key is a
            token and each value is the corresponding token index.
        name (str): The name of the tensor.
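
    Example:
        .. code-block:: python

            # A minimal sketch with an illustrative toy vocab; real vocabs
            # come from a pretrained tokenizer's token_to_idx mapping.
            vocab = to_vocab_buffer({"[PAD]": 0, "[UNK]": 1, "hello": 2}, "vocab")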
"""
    tensor = paddle.Tensor(core.VarDesc.VarType.RAW, [], name, core.VarDesc.VarType.VOCAB, True)
    tensor.value().set_vocab(vocab_dict)
    return tensor


class FasterTokenizer(nn.Layer):
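    """
    A ``nn.Layer`` that runs BERT-style tokenization inside the model via the
    ``faster_tokenizer`` op, so raw strings can be fed to the network directly.

    Example:
        .. code-block:: python

            # A minimal sketch; assumes PaddlePaddle >= 2.2.1 and that the
            # "bert-base-chinese" vocab can be downloaded.
            tokenizer = FasterTokenizer.from_pretrained("bert-base-chinese")
            input_ids, seg_ids = tokenizer(["欢迎使用百度飞桨"])
    """
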
    name_map = {
        "bert-base-uncased": BertTokenizer,
        "bert-large-uncased": BertTokenizer,
        "bert-base-cased": BertTokenizer,
        "bert-large-cased": BertTokenizer,
        "bert-base-multilingual-uncased": BertTokenizer,
        "bert-base-multilingual-cased": BertTokenizer,
        "bert-base-chinese": BertTokenizer,
        "bert-wwm-chinese": BertTokenizer,
        "bert-wwm-ext-chinese": BertTokenizer,
        "ernie-1.0": ErnieTokenizer,
        "ernie-2.0-en": ErnieTokenizer,
        "ernie-2.0-large-en": ErnieTokenizer,
        "roberta-wwm-ext": RobertaTokenizer,
        "roberta-wwm-ext-large": RobertaTokenizer,
        "rbt3": RobertaTokenizer,
        "rbtl3": RobertaTokenizer,
        "ppminilm-6l-768h": PPMiniLMTokenizer,
    }

    def __init__(self, vocab, do_lower_case=False, is_split_into_words=False):
        super(FasterTokenizer, self).__init__()
        try:
            self.mod = importlib.import_module("paddle._C_ops")
        except Exception:
            logger.warning(
                f"The paddlepaddle version is {paddle.__version__}, not the latest. "
                "Please upgrade the paddlepaddle package (>= 2.2.1)."
            )
            self.mod = importlib.import_module("paddle.fluid.core.ops")

        vocab_buffer = to_vocab_buffer(vocab, "vocab")
        self.register_buffer("vocab", vocab_buffer, persistable=True)

        self.do_lower_case = do_lower_case
        self.is_split_into_words = is_split_into_words

    def forward(self, text, text_pair=None, max_seq_len=0, pad_to_max_seq_len=False):
        if paddle.in_dynamic_mode():
            # The faster_tokenizer op consumes string tensors, so wrap plain
            # Python lists/tuples first.
            if isinstance(text, (list, tuple)):
                text = to_tensor(list(text))
            if text_pair is not None:
                if isinstance(text_pair, (list, tuple)):
                    text_pair = to_tensor(list(text_pair))
            input_ids, seg_ids = self.mod.faster_tokenizer(
                self.vocab,
                text,
                text_pair,
                "do_lower_case",
                self.do_lower_case,
                "max_seq_len",
                max_seq_len,
                "pad_to_max_seq_len",
                pad_to_max_seq_len,
                "is_split_into_words",
                self.is_split_into_words,
            )
            return input_ids, seg_ids
        # Static graph mode: append the faster_tokenizer op via LayerHelper.
        attrs = {
            "do_lower_case": self.do_lower_case,
            "max_seq_len": max_seq_len,
            "pad_to_max_seq_len": pad_to_max_seq_len,
            "is_split_into_words": self.is_split_into_words,
        }
        helper = LayerHelper("faster_tokenizer")
        input_ids = helper.create_variable_for_type_inference(dtype="int64")
        seg_ids = helper.create_variable_for_type_inference(dtype="int64")
        if text_pair is None:
            helper.append_op(
                type="faster_tokenizer",
                inputs={"Vocab": self.vocab, "Text": text},
                outputs={"InputIds": input_ids, "SegmentIds": seg_ids},
                attrs=attrs,
            )
        else:
            helper.append_op(
                type="faster_tokenizer",
                inputs={"Vocab": self.vocab, "Text": text, "TextPair": text_pair},
                outputs={"InputIds": input_ids, "SegmentIds": seg_ids},
                attrs=attrs,
            )
        return input_ids, seg_ids

    @classmethod
    def from_pretrained(cls, name):
        if name in cls.name_map:
            tokenizer_cls = cls.name_map[name]
            tokenizer = tokenizer_cls.from_pretrained(name)
            faster_tokenizer = cls(tokenizer.vocab.token_to_idx, tokenizer.do_lower_case)
            return faster_tokenizer
        else:
            raise ValueError(
                "Unknown name %s. Currently %s supports %s" % (name, cls.__name__, list(cls.name_map.keys()))
            )
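

# A minimal usage sketch. It assumes PaddlePaddle >= 2.2.1 (for the
# faster_tokenizer op) and network access so ``from_pretrained`` can fetch
# the "bert-base-chinese" vocab; the sample sentences are illustrative only.
if __name__ == "__main__":
    tokenizer = FasterTokenizer.from_pretrained("bert-base-chinese")
    # Tokenizes a batch of raw strings in dynamic mode and returns int64
    # tensors of token ids and segment ids.
    input_ids, seg_ids = tokenizer(["欢迎使用百度飞桨", "今天天气很好"], max_seq_len=16)
    print(input_ids)
    print(seg_ids)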