# Source code for paddlenlp.ops.fast_transformer.transformer.fast_transformer
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlenlp.ops import (
InferBartDecoding,
InferGptDecoding,
InferGptJDecoding,
InferMBartDecoding,
InferMIRODecoding,
InferOptDecoding,
InferPegasusDecoding,
InferT5Decoding,
InferTransformerDecoding,
InferUnifiedDecoding,
)
from paddlenlp.transformers import (
BartPretrainedModel,
CodeGenPreTrainedModel,
GPTChineseTokenizer,
GPTJPretrainedModel,
GPTPretrainedModel,
GPTTokenizer,
InferTransformerModel,
MBartPretrainedModel,
OPTPretrainedModel,
PegasusPretrainedModel,
PositionalEmbedding,
T5PretrainedModel,
TransformerModel,
UnifiedTransformerPretrainedModel,
UNIMOPretrainedModel,
WordEmbedding,
position_encoding_init,
)
from paddlenlp.utils.log import logger
from .encoder import enable_fast_encoder
class FasterTransformer(TransformerModel):
"""
    FasterTransformer is a fast version of the Transformer model for
    generation. It uses a custom op that builds on and enhances NVIDIA
    FasterTransformer to do fast generation.
Args:
src_vocab_size (int):
The size of source vocabulary.
trg_vocab_size (int):
The size of target vocabulary.
max_length (int):
The maximum length of input sequences.
num_encoder_layers (int):
The number of sub-layers to be stacked in the encoder.
num_decoder_layers (int):
The number of sub-layers to be stacked in the decoder.
n_head (int):
            The number of heads used in multi-head attention.
d_model (int):
The dimension for word embeddings, which is also the last dimension of
the input and output of multi-head attention, position-wise feed-forward
networks, encoder and decoder.
d_inner_hid (int):
Size of the hidden layer in position-wise feed-forward networks.
dropout (float):
Dropout rates. Used for pre-process, activation and inside attention.
weight_sharing (bool):
Whether to use weight sharing.
attn_dropout (float):
The dropout probability used in MHA to drop some attention target.
If None, use the value of dropout. Defaults to None.
act_dropout (float):
            The dropout probability used after FFN activation. If None, use
the value of dropout. Defaults to None.
bos_id (int, optional):
            The start token id, which is also used as the padding id. Defaults to 0.
eos_id (int, optional):
The end token id. Defaults to 1.
pad_id (int, optional):
The pad token id. Defaults to None. If it's None, the bos_id will be used as pad_id.
decoding_strategy (str, optional):
            Indicating the strategy of decoding. It can be 'beam_search', 'beam_search_v2',
            'topk_sampling' or 'topp_sampling'. For beam search strategies,
            'v2' would select the top `beam_size * 2` beams and process the top
            `beam_size` alive and finished beams in them separately, while 'v1'
            would only select the top `beam_size` beams and mix up the alive and
            finished beams. 'v2' always searches more and gets better results, since
            the number of alive beams would always be `beam_size` while the number
            of alive beams in 'v1' might decrease when meeting the end token. However,
            'v2' always generates longer results and thus might do more computation
            and be slower.
beam_size (int, optional):
The beam width for beam search. Defaults to 4.
        topk (int, optional):
            The number of highest probability tokens to keep for top-k sampling.
            Defaults to 1.
        topp (float, optional):
            The most probable tokens whose cumulative probability is not less than
            `topp` are kept for top-p sampling. Defaults to 0.0.
max_out_len (int, optional):
The maximum output length. Defaults to 256.
        diversity_rate (float, optional):
            Refer to `A Simple, Fast Diverse Decoding Algorithm for Neural Generation <https://arxiv.org/abs/1611.08562>`_
            for details. A bigger `diversity_rate` leads to more diversity.
            `diversity_rate == 0` is equivalent to naive beam search. Defaults
            to 0 if not set.
        decoding_lib (str, optional):
            The path to the decoding lib. Defaults to None.
use_fp16_decoding(bool, optional):
            Whether to use fp16 for decoding. Defaults to False.
enable_fast_encoder(bool, optional):
            Whether to use the fast version of encoder. This is an experimental option for now.
Defaults to False.
use_fp16_encoder(bool, optional):
Whether to use fp16 for encoder. Only works when enable_fast_encoder is True.
Defaults to False.
        rel_len(bool, optional):
            Indicating whether `max_out_len` is the length relative to that
            of the source text. Only works in 'v2' temporarily. It is suggested
            to set a small `max_out_len` and use `rel_len=True`. Defaults to
            False if not set.
        alpha(float, optional):
            The power number in the length penalty calculation. Only works in 'v2'
            temporarily. Refer to `GNMT <https://arxiv.org/pdf/1609.08144.pdf>`_.
            Defaults to 0.6 if not set.
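
    Example:
        .. code-block::

            # A minimal usage sketch. It assumes the FastGeneration custom op
            # can be JIT-built (or that `decoding_lib` points to a prebuilt
            # lib) and that a trained checkpoint exists; sizes are illustrative.
            import paddle
            from paddlenlp.ops import FasterTransformer

            transformer = FasterTransformer(
                src_vocab_size=30000,
                trg_vocab_size=30000,
                max_length=257,
                num_encoder_layers=6,
                num_decoder_layers=6,
                n_head=8,
                d_model=512,
                d_inner_hid=2048,
                dropout=0.1,
                weight_sharing=True,
                bos_id=0,
                eos_id=1,
                beam_size=4,
                max_out_len=256)
            transformer.load(init_from_params="transformer.pdparams")
            ids = transformer(
                src_word=paddle.randint(low=3, high=30000, shape=[4, 10]))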
"""
def __init__(
self,
src_vocab_size,
trg_vocab_size,
max_length,
num_encoder_layers,
num_decoder_layers,
n_head,
d_model,
d_inner_hid,
dropout,
weight_sharing,
attn_dropout=None,
act_dropout=None,
bos_id=0,
eos_id=1,
pad_id=None,
decoding_strategy="beam_search",
beam_size=4,
topk=1,
topp=0.0,
max_out_len=256,
diversity_rate=0.0,
decoding_lib=None,
use_fp16_decoding=False,
enable_fast_encoder=False,
use_fp16_encoder=False,
rel_len=False,
alpha=0.6,
):
# if decoding_lib is None:
# raise ValueError(
# "The args decoding_lib must be set to use FasterTransformer. ")
# elif not os.path.exists(decoding_lib):
# raise ValueError("The path to decoding lib is not exist.")
args = dict(locals())
args.pop("self")
args.pop("__class__", None)
self.decoding_strategy = args.pop("decoding_strategy")
self.beam_size = args.pop("beam_size")
self.topk = args.pop("topk")
self.topp = args.pop("topp")
self.max_out_len = args.pop("max_out_len")
self.diversity_rate = args.pop("diversity_rate")
self.decoding_lib = args.pop("decoding_lib")
self.use_fp16_decoding = args.pop("use_fp16_decoding")
self.enable_fast_encoder = args.pop("enable_fast_encoder")
self.use_fp16_encoder = args.pop("use_fp16_encoder")
self.rel_len = args.pop("rel_len")
self.alpha = args.pop("alpha")
self.dropout = dropout
self.weight_sharing = weight_sharing
self.trg_vocab_size = trg_vocab_size
self.d_model = d_model
self.bos_id = bos_id
self.pad_id = pad_id if pad_id is not None else self.bos_id
self.max_length = max_length
super(FasterTransformer, self).__init__(**args)
if self.enable_fast_encoder:
logger.warning("enable_fast_encoder is an experimental option and subject to change.")
elif self.use_fp16_encoder:
self.use_fp16_encoder = False
self.decoding_linear = nn.Linear(in_features=d_model, out_features=trg_vocab_size)
if weight_sharing:
self.trg_word_embedding = WordEmbedding(vocab_size=trg_vocab_size, emb_dim=d_model, bos_id=self.bos_id)
self.trg_pos_embedding = PositionalEmbedding(emb_dim=d_model, max_length=max_length)
self.decoding = InferTransformerDecoding(
decoder=self.transformer.decoder,
word_embedding=self.trg_word_embedding.word_embedding,
positional_embedding=self.trg_pos_embedding.pos_encoder,
linear=self.decoding_linear,
num_decoder_layers=num_decoder_layers,
n_head=n_head,
d_model=d_model,
bos_id=bos_id,
eos_id=eos_id,
decoding_strategy=decoding_strategy,
beam_size=beam_size,
topk=topk,
topp=topp,
max_out_len=max_out_len,
diversity_rate=self.diversity_rate,
decoding_lib=self.decoding_lib,
use_fp16_decoding=self.use_fp16_decoding,
rel_len=self.rel_len,
alpha=self.alpha,
)
def forward(self, src_word, trg_word=None):
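        r"""
        Performs fast generation with the custom decoding op.

        Args:
            src_word (Tensor):
                The ids of source sequence words. It is a tensor with shape
                `[batch_size, source_sequence_length]`.
            trg_word (Tensor, optional):
                The ids of target sequence words. Normally, it should NOT be
                given. If it's given, force decoding with the previous output
                token will be triggered. Defaults to None.

        Returns:
            Tensor: An int64 tensor of predicted ids returned by the decoding op.
        """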
src_max_len = paddle.shape(src_word)[-1]
src_slf_attn_bias = (
paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
)
src_pos = paddle.cast(src_word != self.pad_id, dtype=src_word.dtype) * paddle.arange(start=0, end=src_max_len)
# Run encoder
src_emb = self.src_word_embedding(src_word)
src_pos_emb = self.src_pos_embedding(src_pos)
src_emb = src_emb + src_pos_emb
enc_input = F.dropout(src_emb, p=self.dropout, training=False) if self.dropout else src_emb
if self.enable_fast_encoder and self.use_fp16_encoder:
enc_input = paddle.cast(enc_input, dtype="float16")
enc_output = self.transformer.encoder(enc_input, src_slf_attn_bias)
if self.use_fp16_decoding and enc_output.dtype != paddle.float16:
enc_output = paddle.cast(enc_output, dtype="float16")
elif not self.use_fp16_decoding and enc_output.dtype != paddle.float32:
enc_output = paddle.cast(enc_output, dtype="float32")
mem_seq_lens = paddle.sum(paddle.cast(src_word != self.pad_id, dtype="int32"), dtype="int32", axis=1)
ids = self.decoding(enc_output, mem_seq_lens, trg_word=trg_word)
return ids
def load(self, init_from_params=None, state_dict=None):
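        """
        Loads a trained checkpoint into this model, remapping weights (e.g.
        fusing the QKV projections) and casting them to fp16 when
        `use_fp16_decoding` is enabled, as required by the decoding op.

        Args:
            init_from_params (str, optional):
                The path to a dygraph checkpoint (`.pdparams`). Either this or
                `state_dict` must be given.
            state_dict (dict, optional):
                A state dict to load directly. Defaults to None.
        """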
# Load the trained model
if init_from_params is None and state_dict is None:
raise ValueError("Either init_from_params or state_dict must be given to load the infer model. ")
if state_dict is None:
state_dict = paddle.load(init_from_params, return_numpy=True)
else:
for state in state_dict:
                # NOTE: This API is only used in dygraph, so paddle.Tensor is enough.
if isinstance(state_dict[state], paddle.Tensor):
state_dict[state] = state_dict[state].numpy()
# To set weight[padding_idx] to 0.
state_dict["trg_word_embedding.word_embedding.weight"][self.bos_id] = [0] * self.d_model
# Dealing with weight sharing.
if self.weight_sharing:
state_dict["decoding_linear.weight"] = np.transpose(state_dict["trg_word_embedding.word_embedding.weight"])
else:
state_dict["decoding_linear.weight"] = state_dict["linear.weight"]
if self.decoding._fuse_qkv:
for item in self.state_dict():
if "decoder" in item and "self_attn.q_proj" in item:
num_layer = item.split(".")[3]
param_type = item.split(".")[-1]
state_dict["decoding.slf_q_" + param_type + "_" + num_layer] = np.concatenate(
(
state_dict[item],
state_dict["transformer.decoder.layers." + num_layer + ".self_attn.k_proj." + param_type],
state_dict["transformer.decoder.layers." + num_layer + ".self_attn.v_proj." + param_type],
),
axis=-1,
)
if self.use_fp16_decoding:
for item in self.state_dict():
if "decoder" in item or "decoding.slf" in item:
state_dict[item] = np.float16(state_dict[item])
state_dict["decoding_linear.weight"] = np.float16(state_dict["decoding_linear.weight"])
state_dict["trg_word_embedding.word_embedding.weight"] = np.float16(
state_dict["trg_word_embedding.word_embedding.weight"]
)
state_dict["trg_pos_embedding.pos_encoder.weight"] = np.float16(
state_dict["trg_pos_embedding.pos_encoder.weight"]
)
state_dict["decoding_linear.bias"] = np.zeros([self.trg_vocab_size], dtype="float16")
self.load_dict(state_dict)
if self.enable_fast_encoder:
self = enable_fast_encoder(self, use_fp16=self.use_fp16_encoder)
def export_params(self, init_from_params, place):
"""
        This method is used to load a static graph from a dygraph checkpoint,
        or to export an inference model using the static graph.
        It does NOT support the fast encoder.
Args:
init_from_params (string):
The path to dygraph checkpoint.
place (paddle.Place):
The place to execute static graph.
Example:
.. code-block::
paddle.enable_static()
place = "gpu"
place = paddle.set_device(place)
reader.adapt_vocab_size(args)
test_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(test_program, startup_program):
src_word = paddle.static.data(
name="src_word", shape=[None, None], dtype="int64")
# Define model
transformer = FasterTransformer(
src_vocab_size=args.src_vocab_size,
trg_vocab_size=args.trg_vocab_size,
max_length=args.max_length + 1,
num_encoder_layers=args.n_layer,
num_decoder_layers=args.n_layer,
n_head=args.n_head,
d_model=args.d_model,
d_inner_hid=args.d_inner_hid,
dropout=args.dropout,
weight_sharing=args.weight_sharing,
bos_id=args.bos_idx,
eos_id=args.eos_idx,
decoding_strategy=args.decoding_strategy,
beam_size=args.beam_size,
max_out_len=args.max_out_len,
decoding_lib=args.decoding_lib,
use_fp16_decoding=args.use_fp16_decoding,
rel_len=args.use_rel_len,
alpha=args.alpha)
finished_seq = transformer(src_word=src_word)
test_program = test_program.clone(for_test=True)
exe = paddle.static.Executor(place)
exe.run(startup_program)
# Load checkpoint.
transformer.export_params(
init_from_params=os.path.join(args.init_from_params,
"transformer.pdparams"),
place=place)
paddle.static.save_inference_model(
os.path.join(args.inference_model_dir, "transformer"),
feed_vars=src_word,
fetch_vars=finished_seq,
executor=exe,
program=test_program)
"""
# Load the trained model
assert init_from_params, "Please set init_from_params to load the infer model."
model_dict = paddle.load(init_from_params, return_numpy=True)
# To set weight[padding_idx] to 0.
model_dict["trg_word_embedding.word_embedding.weight"][self.bos_id] = [0] * self.d_model
# Dealing with weight sharing.
if self.weight_sharing:
model_dict["decoding_linear.weight"] = np.transpose(model_dict["trg_word_embedding.word_embedding.weight"])
else:
model_dict["decoding_linear.weight"] = model_dict["linear.weight"]
# To avoid a longer length than training, reset the size of position
# encoding to max_length
model_dict["encoder.pos_encoder.weight"] = position_encoding_init(self.max_length, self.d_model)
model_dict["decoder.pos_encoder.weight"] = position_encoding_init(self.max_length, self.d_model)
if self.decoding._fuse_qkv:
for item in self.state_dict():
if "decoder" in item and "self_attn.q_proj" in item:
num_layer = item.split(".")[3]
param_type = item.split(".")[-1]
model_dict["decoding.slf_q_" + param_type + "_" + num_layer] = np.concatenate(
(
model_dict[item],
model_dict["transformer.decoder.layers." + num_layer + ".self_attn.k_proj." + param_type],
model_dict["transformer.decoder.layers." + num_layer + ".self_attn.v_proj." + param_type],
),
axis=-1,
)
if self.use_fp16_decoding:
for item in self.state_dict():
if "decoder" in item or "decoding.slf" in item:
model_dict[item] = np.float16(model_dict[item])
model_dict["decoding_linear.weight"] = np.float16(model_dict["decoding_linear.weight"])
model_dict["trg_word_embedding.word_embedding.weight"] = np.float16(
model_dict["trg_word_embedding.word_embedding.weight"]
)
model_dict["trg_pos_embedding.pos_encoder.weight"] = np.float16(
model_dict["trg_pos_embedding.pos_encoder.weight"]
)
model_dict["decoding_linear.bias"] = np.zeros([self.trg_vocab_size], dtype="float16")
for item in self.state_dict():
param = self
attr_list = item.split(".")
for attr in attr_list:
param = getattr(param, attr)
param_name = param.name
var = paddle.static.global_scope().find_var(param_name).get_tensor()
var.set(model_dict[item], place)
class TransformerGenerator(paddle.nn.Layer):
"""
    The Transformer model for auto-regressive generation with beam search. It wraps
    `FasterTransformer` and `InferTransformerModel`, and automatically chooses to use
    `FasterTransformer` (with JIT building) or the slower version `InferTransformerModel`.
Args:
src_vocab_size (int):
The size of source vocabulary.
trg_vocab_size (int):
The size of target vocabulary.
max_length (int):
The maximum length of input sequences.
num_encoder_layers (int):
The number of sub-layers to be stacked in the encoder.
num_decoder_layers (int):
The number of sub-layers to be stacked in the decoder.
n_head (int):
            The number of heads used in multi-head attention.
d_model (int):
The dimension for word embeddings, which is also the last dimension of
the input and output of multi-head attention, position-wise feed-forward
networks, encoder and decoder.
d_inner_hid (int):
Size of the hidden layer in position-wise feed-forward networks.
dropout (float):
Dropout rates. Used for pre-process, activation and inside attention.
weight_sharing (bool):
Whether to use weight sharing.
bos_id (int, optional):
            The start token id, which is also used as the padding id. Defaults to 0.
        eos_id (int, optional):
            The end token id. Defaults to 1.
        pad_id (int, optional):
            The pad token id. If None, `bos_id` is used as the pad id. Defaults
            to None.
beam_size (int, optional):
The beam width for beam search. Defaults to 4.
max_out_len (int, optional):
The maximum output length. Defaults to 256.
activation (str, optional):
The activation used in FFN. Defaults to "relu".
normalize_before (bool, optional):
Whether to apply pre-normalization. Defaults to True.
        kwargs:
            The keyword arguments can be `output_time_major`, `use_ft`, `use_fp16_decoding`,
            `beam_search_version`, `rel_len`, `alpha` and `diversity_rate`:
            - `output_time_major(bool, optional)`: Indicate the data layout of predicted
              Tensor. If `False`, the data layout would be batch major with shape
              `[batch_size, seq_len, beam_size]`. If `True`, the data layout would
              be time major with shape `[seq_len, batch_size, beam_size]`. Defaults
              to `True`.
- `use_ft(bool, optional)`: Whether to use FastGeneration
for decoding. Default to True if not set.
- `use_fp16_decoding(bool, optional)`: Whether to use fp16
for decoding. Only works when using FastGeneration.
            - `beam_search_version(str, optional)`: Indicating the strategy of
              beam search. It can be 'v1' or 'v2'. 'v2' would select the top
              `beam_size * 2` beams and process the top `beam_size` alive and
              finished beams in them separately, while 'v1' would only select the
              top `beam_size` beams and mix up the alive and finished beams. 'v2' always
              searches more and gets better results, since the alive beams would
              always be `beam_size` while the number of alive beams in 'v1' might
              decrease when meeting the end token. However, 'v2' always generates
              longer results and thus might do more computation and be slower.
            - `rel_len(bool, optional)`: Indicating whether `max_out_len` is
              the length relative to that of the source text. Only works in 'v2'
              temporarily. It is suggested to set a small `max_out_len` and use
              `rel_len=True`. Defaults to False if not set.
- `alpha(float, optional)`: The power number in length penalty
calculation. Refer to `GNMT <https://arxiv.org/pdf/1609.08144.pdf>`_.
Only works in `v2` temporarily. Default to 0.6 if not set.
            - `diversity_rate(float, optional)`: Refer to `A Simple, Fast Diverse
              Decoding Algorithm for Neural Generation <https://arxiv.org/abs/1611.08562>`_
              for details. A bigger `diversity_rate` leads to more diversity.
              `diversity_rate == 0` is equivalent to naive beam search. Defaults
              to 0 if not set. **NOTE**: Only works when using FastGeneration
              temporarily.
"""
def __init__(
self,
src_vocab_size,
trg_vocab_size,
max_length,
num_encoder_layers,
num_decoder_layers,
n_head,
d_model,
d_inner_hid,
dropout,
weight_sharing,
bos_id=0,
eos_id=1,
pad_id=None,
beam_size=4,
max_out_len=256,
activation="relu",
normalize_before=True,
**kwargs
):
logger.warning("TransformerGenerator is an experimental API and subject to change.")
        # `kwargs` can include output_time_major, use_fp16_decoding, topk, topp.
        # The latter three arguments only work when using FastGeneration;
        # topk and topp will be exposed later.
super(TransformerGenerator, self).__init__()
self.d_model = d_model
self.max_length = max_length
self.output_time_major = kwargs.pop("output_time_major", True)
# Only works for FastGeneration.
# TODO: original version supports diversity rate.
diversity_rate = kwargs.pop("diversity_rate", 0.0)
use_fp16_decoding = kwargs.pop("use_fp16_decoding", False)
use_ft = kwargs.pop("use_ft", True)
beam_search_version = kwargs.pop("beam_search_version", "v1")
rel_len = kwargs.pop("rel_len", False)
alpha = kwargs.pop("alpha", 0.6)
        # TODO: The fast version needs updated attrs to support custom
        # activation and normalize_before, both of which are supported in the C++ code.
if use_ft and activation == "relu" and normalize_before:
try:
decoding_strategy = "beam_search_v2" if beam_search_version == "v2" else "beam_search"
self.transformer = FasterTransformer(
src_vocab_size=src_vocab_size,
trg_vocab_size=trg_vocab_size,
max_length=max_length,
num_encoder_layers=num_encoder_layers,
num_decoder_layers=num_decoder_layers,
n_head=n_head,
d_model=d_model,
d_inner_hid=d_inner_hid,
dropout=dropout,
weight_sharing=weight_sharing,
bos_id=bos_id,
eos_id=eos_id,
pad_id=pad_id,
beam_size=beam_size,
max_out_len=max_out_len,
diversity_rate=diversity_rate,
decoding_strategy=decoding_strategy,
use_fp16_decoding=use_fp16_decoding,
rel_len=rel_len,
alpha=alpha,
)
except Exception:
                logger.warning(
                    "Exception occurred when using FastGeneration. "
                    "The original forward will be used instead."
                )
if diversity_rate != 0:
                    logger.warning(
                        "diversity_rate will not take effect, since it is only "
                        "supported by FastGeneration temporarily."
                    )
self.transformer = InferTransformerModel(
src_vocab_size=src_vocab_size,
trg_vocab_size=trg_vocab_size,
max_length=max_length,
num_encoder_layers=num_encoder_layers,
num_decoder_layers=num_decoder_layers,
n_head=n_head,
d_model=d_model,
d_inner_hid=d_inner_hid,
dropout=dropout,
weight_sharing=weight_sharing,
bos_id=bos_id,
eos_id=eos_id,
pad_id=pad_id,
beam_size=beam_size,
max_out_len=max_out_len,
output_time_major=self.output_time_major,
beam_search_version=beam_search_version,
activation=activation,
normalize_before=normalize_before,
rel_len=rel_len,
alpha=alpha,
)
else:
if diversity_rate != 0:
                logger.warning(
                    "diversity_rate will not take effect, since it is only "
                    "supported by FastGeneration temporarily."
                )
self.transformer = InferTransformerModel(
src_vocab_size=src_vocab_size,
trg_vocab_size=trg_vocab_size,
max_length=max_length,
num_encoder_layers=num_encoder_layers,
num_decoder_layers=num_decoder_layers,
n_head=n_head,
d_model=d_model,
d_inner_hid=d_inner_hid,
dropout=dropout,
weight_sharing=weight_sharing,
bos_id=bos_id,
eos_id=eos_id,
pad_id=pad_id,
beam_size=beam_size,
max_out_len=max_out_len,
output_time_major=self.output_time_major,
beam_search_version=beam_search_version,
activation=activation,
normalize_before=normalize_before,
rel_len=rel_len,
alpha=alpha,
)
def forward(self, src_word, trg_word=None):
r"""
Performs decoding for transformer model.
Args:
            src_word (Tensor):
                The ids of source sequence words. It is a tensor with shape
                `[batch_size, source_sequence_length]` and its data type can be
                int32 or int64.
            trg_word (Tensor):
                The ids of target sequence words. Normally, it should NOT be
                given. If it's given, force decoding with the previous output
                token will be triggered. Defaults to None.
        Returns:
            Tensor:
                An int64 tensor indicating the predicted ids. Its shape is
                `[batch_size, seq_len, beam_size]` or `[seq_len, batch_size, beam_size]`
                according to `output_time_major`. When using FastGeneration
                and beam search v2, the beam dimension is doubled to include
                both the top `beam_size` alive and finished beams, so the tensor
                shape is `[batch_size, seq_len, beam_size * 2]` or `[seq_len, batch_size, beam_size * 2]`.
Example:
.. code-block::
import paddle
from paddlenlp.ops import TransformerGenerator
transformer = TransformerGenerator(
src_vocab_size=30000,
trg_vocab_size=30000,
max_length=256,
num_encoder_layers=6,
num_decoder_layers=6,
n_head=8,
d_model=512,
d_inner_hid=2048,
dropout=0.1,
weight_sharing=True,
bos_id=0,
eos_id=1,
beam_size=4,
max_out_len=256)
batch_size = 5
seq_len = 10
transformer(
src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]))
"""
out = self.transformer(src_word, trg_word=trg_word)
# TODO(guosheng): FasterTransformer has an output with layout
# `[seq_len, batch_size, beam_size]`. While the output layout of
# original one is `[batch_size, seq_len, beam_size]`. Maybe we need
# unify them later.
if not self.output_time_major and isinstance(self.transformer, FasterTransformer):
out = paddle.transpose(out, [1, 0, 2])
return out
def load(self, path=None, state_dict=None):
if path is None and state_dict is None:
raise ValueError("Either path or state_dict must be given to load the infer model. ")
if isinstance(self.transformer, FasterTransformer):
self.transformer.load(path, state_dict)
else:
if state_dict is None:
state_dict = paddle.load(path)
self.transformer.load_dict(state_dict)
class FasterOPT(OPTPretrainedModel):
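    """
    A fast generation wrapper for OPT. It is structurally the same as
    `FasterGPT` below but dispatches decoding to `InferOptDecoding`. As with
    the other wrappers in this module, `generate` is an alias of `forward`.
    """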
def __init__(self, model, decoding_lib=None, use_fp16_decoding=False):
super(FasterOPT, self).__init__(model.config)
self._model = model
self.use_fp16_decoding = use_fp16_decoding
self.decoding = InferOptDecoding(model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding)
def forward(
self,
input_ids,
seq_len=None,
attention_mask=None,
top_k=4,
top_p=0.0,
max_length=256,
bos_token_id=None,
eos_token_id=None,
pad_token_id=None,
forced_eos_token_id=None,
temperature=0,
decode_strategy="sample",
num_return_sequences=1,
**model_kwargs
):
if input_ids.dtype == paddle.int64:
input_ids = paddle.cast(input_ids, "int32")
# change top_p to zero if not using top_p sampling for FT
if decode_strategy == "greedy_search":
top_p = 0.0
top_k = 1
if top_p == 1.0:
top_p = 0.0
if seq_len is None:
seq_len = paddle.sum(paddle.cast(input_ids != pad_token_id, dtype="int32"), axis=-1, dtype="int32")
if bos_token_id == pad_token_id and paddle.sum(paddle.any(input_ids == pad_token_id), dtype="int64") > 0:
seq_len = seq_len + 1
if num_return_sequences > 1:
input_ids, model_kwargs = self.expand_inputs_for_generation(
input_ids, expand_size=num_return_sequences, seq_len=seq_len, attention_mask=attention_mask
)
seq_len = model_kwargs["seq_len"]
attention_mask = model_kwargs.get("attention_mask", None)
return self.decoding(
input_ids,
mem_seq_len=seq_len,
attention_mask=attention_mask,
topk=top_k,
topp=top_p,
max_out_len=max_length,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
forced_eos_token_id=forced_eos_token_id,
temperature=temperature,
)
def export_params(self, state_to_load, place):
for item in state_to_load:
param_data = np.array(state_to_load[item])
if self.use_fp16_decoding:
param_data = np.float16(param_data)
param = self
attr_list = item.split(".")
attr_list = ["decoding", "model"] + attr_list
for attr in attr_list:
param = getattr(param, attr)
param_name = param.name
var = paddle.static.global_scope().find_var(param_name).get_tensor()
var.set(param_data, place)
def save_resources(self, tokenizer, path):
vocab_file = os.path.join(path, "vocab.txt")
if isinstance(tokenizer, GPTTokenizer):
with open(vocab_file, "w", encoding="utf-8") as f:
for token in tokenizer.encoder:
f.write(token + "\n")
merges_file = os.path.join(path, "merges.txt")
shutil.copyfile(tokenizer._merges_file, merges_file)
elif isinstance(tokenizer, GPTChineseTokenizer):
tokenizer.save_resources(path)
generate = forward
class FasterGPT(GPTPretrainedModel):
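    """
    A fast generation wrapper for GPT, dispatching decoding to
    `InferGptDecoding`.

    Example:
        .. code-block::

            # A minimal usage sketch; the pretrained model name is an
            # assumption for illustration.
            import paddle
            from paddlenlp.transformers import GPTLMHeadModel, GPTTokenizer
            from paddlenlp.ops import FasterGPT

            model = GPTLMHeadModel.from_pretrained("gpt2-en")
            tokenizer = GPTTokenizer.from_pretrained("gpt2-en")
            model.eval()
            fast_gpt = FasterGPT(model)
            input_ids = paddle.to_tensor(
                [tokenizer("Hello")["input_ids"]], dtype="int64")
            out_ids = fast_gpt(
                input_ids,
                top_k=4,
                max_length=32,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.eos_token_id)
    """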
def __init__(self, model, decoding_lib=None, use_fp16_decoding=False):
super(FasterGPT, self).__init__(model.config)
self._model = model
self.use_fp16_decoding = use_fp16_decoding
self.decoding = InferGptDecoding(model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding)
def forward(
self,
input_ids,
seq_len=None,
attention_mask=None,
top_k=4,
top_p=0.0,
max_length=256,
bos_token_id=None,
eos_token_id=None,
pad_token_id=None,
forced_eos_token_id=None,
temperature=0,
decode_strategy="sample",
num_return_sequences=1,
**model_kwargs
):
if input_ids.dtype == paddle.int64:
input_ids = paddle.cast(input_ids, "int32")
# change top_p to zero if not using top_p sampling for FT
if decode_strategy == "greedy_search":
top_p = 0.0
top_k = 1
if top_p == 1.0:
top_p = 0.0
if seq_len is None:
seq_len = paddle.sum(paddle.cast(input_ids != pad_token_id, dtype="int32"), axis=-1, dtype="int32")
if bos_token_id == pad_token_id and paddle.sum(paddle.any(input_ids == pad_token_id), dtype="int64") > 0:
seq_len = seq_len + 1
if num_return_sequences > 1:
input_ids, model_kwargs = self.expand_inputs_for_generation(
input_ids, expand_size=num_return_sequences, seq_len=seq_len, attention_mask=attention_mask
)
seq_len = model_kwargs["seq_len"]
attention_mask = model_kwargs.get("attention_mask", None)
return self.decoding(
input_ids,
mem_seq_len=seq_len,
attention_mask=attention_mask,
topk=top_k,
topp=top_p,
max_out_len=max_length,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
forced_eos_token_id=forced_eos_token_id,
temperature=temperature,
)
def export_params(self, state_to_load, place):
for item in state_to_load:
param_data = np.array(state_to_load[item])
if self.use_fp16_decoding:
param_data = np.float16(param_data)
param = self
attr_list = item.split(".")
attr_list = ["decoding", "model"] + attr_list
for attr in attr_list:
param = getattr(param, attr)
param_name = param.name
var = paddle.static.global_scope().find_var(param_name).get_tensor()
var.set(param_data, place)
def save_resources(self, tokenizer, path):
vocab_file = os.path.join(path, "vocab.txt")
if isinstance(tokenizer, GPTTokenizer):
with open(vocab_file, "w", encoding="utf-8") as f:
for token in tokenizer.encoder:
f.write(token + "\n")
merges_file = os.path.join(path, "merges.txt")
shutil.copyfile(tokenizer._merges_file, merges_file)
elif isinstance(tokenizer, GPTChineseTokenizer):
tokenizer.save_resources(path)
generate = forward
class FasterUnifiedTransformer(UnifiedTransformerPretrainedModel):
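    """
    A fast generation wrapper for UnifiedTransformer. It reads the attention
    head count, hidden size and layer count from the model config and
    dispatches decoding to `InferUnifiedDecoding`, using a logits mask that
    prevents unk/bos/pad tokens from being generated.
    """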
def __init__(self, model, decoding_lib=None, use_fp16_decoding=False):
super(FasterUnifiedTransformer, self).__init__(model.config)
self._model = model
self._use_fp16_decoding = use_fp16_decoding
self.vocab_size = model.lm_head.decoder_bias.shape[0]
self.unk_token_id = self._model.config.unk_token_id
self.mask_token_id = self._model.config.mask_token_id
self.bos_token_id = self._model.config.bos_token_id
self.pad_token_id = self._model.config.pad_token_id
self.logits_mask = self.generate_logits_mask(use_fp16_decoding)
self._n_head = self._model.config.num_attention_heads
self._hidden_dims = self._model.config.hidden_size
self._normalize_before = self._model.config.normalize_before
self._size_per_head = self._hidden_dims // self._n_head
self._n_layer = self._model.config.num_hidden_layers
self._hidden_act = self._model.config.hidden_act
self.decoding = InferUnifiedDecoding(
model=self._model,
decoding_lib=decoding_lib,
use_fp16_decoding=use_fp16_decoding,
logits_mask=self.logits_mask,
n_head=self._n_head,
hidden_dims=self._hidden_dims,
size_per_head=self._size_per_head,
n_layer=self._n_layer,
unk_id=self.unk_token_id,
mask_id=self.mask_token_id,
normalize_before=self._normalize_before,
hidden_act=self._hidden_act,
)
def prepare_inputs_for_generation(
self, input_ids, token_type_ids, attention_mask, seq_len, position_ids=None, role_ids=None, **kwargs
):
input_ids = input_ids[:, :-1]
if input_ids.dtype == paddle.int64:
input_ids = paddle.cast(input_ids, dtype="int32")
if token_type_ids.dtype == paddle.int64:
token_type_ids = paddle.cast(token_type_ids, dtype="int32")
decoder_type_ids = token_type_ids[:, -1:]
token_type_ids = token_type_ids[:, :-1]
        # TODO(guosheng): The attention_mask of UnifiedTransformer uses 0/-INF
        # and is 4D, while we now want to use 1/0 masks to unify all models
        # and tokenizers.
attention_mask = attention_mask[:, :, :-1, :-1] if attention_mask.ndim == 4 else attention_mask[:, :-1, :-1]
attention_mask = paddle.cast(attention_mask == 0, dtype="float16" if self._use_fp16_decoding else "float32")
seq_len = seq_len - 1
if seq_len.dtype == paddle.int64:
seq_len = paddle.cast(seq_len, dtype="int32")
if position_ids is not None:
if position_ids.dtype == paddle.int64:
position_ids = paddle.cast(position_ids, dtype="int32")
decoder_position_ids = position_ids[:, -1:]
position_ids = position_ids[:, :-1]
else:
decoder_position_ids = None
field_values = {}
if role_ids is not None:
if role_ids.dtype == paddle.int64:
role_ids = paddle.cast(role_ids, dtype="int32")
decoder_role_ids = role_ids[:, -1:]
role_ids = role_ids[:, :-1]
else:
decoder_role_ids = None
field_values["input_ids"] = input_ids
field_values["token_type_ids"] = token_type_ids
field_values["attention_mask"] = attention_mask
field_values["seq_len"] = seq_len
field_values["decoder_type_ids"] = decoder_type_ids
field_values["position_ids"] = position_ids
field_values["decoder_position_ids"] = decoder_position_ids
field_values["role_ids"] = role_ids
field_values["decoder_role_ids"] = decoder_role_ids
return field_values
def generate_logits_mask(self, use_fp16_decoding):
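        """
        Builds a `[vocab_size]` logits mask that assigns a large negative value
        to the unk/bos/pad positions so these tokens are never generated. A
        finite -1e4 is used under fp16 to avoid overflow, versus -1e9 for fp32.
        """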
# pre-process distribution
logits_mask = np.zeros(shape=[self.vocab_size], dtype=np.float32)
if use_fp16_decoding:
logits_mask[self.unk_token_id] = -1e4
logits_mask[self.bos_token_id] = -1e4
logits_mask[self.pad_token_id] = -1e4
else:
logits_mask[self.unk_token_id] = -1e9
logits_mask[self.bos_token_id] = -1e9
logits_mask[self.pad_token_id] = -1e9
logits_mask_t = paddle.assign(logits_mask)
if use_fp16_decoding:
return paddle.cast(logits_mask_t, dtype="float16")
else:
return logits_mask_t
def forward(
self,
input_ids,
token_type_ids,
attention_mask,
seq_len=None,
role_ids=None,
position_ids=None,
max_length=128,
min_length=0,
top_k=4,
top_p=0.0,
decode_strategy="sampling",
bos_token_id=None,
eos_token_id=None,
pad_token_id=None,
num_beams=4,
diversity_rate=0.0,
temperature=1.0,
num_return_sequences=1,
length_penalty=0.6,
early_stopping=False,
forced_eos_token_id=None,
**model_kwargs
):
if seq_len is None:
            assert input_ids is not None, "You have to specify input_ids when seq_len is not provided."
seq_len = paddle.sum(
paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, keepdim=True, dtype="int32"
)
if decode_strategy.startswith("beam_search"):
input_ids, model_kwargs = self.expand_inputs_for_generation(
input_ids,
expand_size=num_beams,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask,
seq_len=seq_len,
role_ids=role_ids,
)
elif decode_strategy == "sampling":
input_ids, model_kwargs = self.expand_inputs_for_generation(
input_ids,
expand_size=num_return_sequences,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask,
seq_len=seq_len,
role_ids=role_ids,
)
elif decode_strategy == "greedy_search":
model_kwargs = {
"token_type_ids": token_type_ids,
"position_ids": position_ids,
"attention_mask": attention_mask,
"seq_len": seq_len,
"role_ids": role_ids,
}
else:
raise ValueError("Only greedy search, beam search and sampling are supported. ")
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
seq_len = model_inputs.pop("seq_len")
decoder_type_ids = model_inputs.pop("decoder_type_ids")
role_ids = model_inputs.pop("role_ids", None)
decoder_role_ids = model_inputs.pop("decoder_role_ids", None)
position_ids = model_inputs.pop("position_ids", None)
decoder_position_ids = model_inputs.pop("decoder_position_ids", None)
return self.decoding(
input_ids=model_inputs["input_ids"],
attn_mask=model_inputs["attention_mask"],
memory_seq_lens=seq_len,
type_id=model_inputs["token_type_ids"],
decoder_type_id=decoder_type_ids,
role_id=role_ids,
decoder_role_id=decoder_role_ids,
position_id=position_ids,
decoder_position_id=decoder_position_ids,
beam_size=num_beams,
diversity_rate=diversity_rate,
topk=top_k,
topp=top_p,
decoding_strategy=decode_strategy,
max_out_len=max_length,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
temperature=temperature,
length_penalty=length_penalty,
pos_bias=True,
forced_eos_token_id=forced_eos_token_id,
early_stopping=early_stopping,
min_length=min_length,
)
generate = forward
class FasterUNIMOText(UNIMOPretrainedModel):
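    """
    A fast generation wrapper for UNIMO-text. Like `FasterUnifiedTransformer`,
    it dispatches decoding to `InferUnifiedDecoding` with a logits mask over
    the unk/bos/pad tokens. When constructed with `trans_out=True`, the output
    ids are transposed to batch-major layout.
    """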
def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, **kwargs):
super(FasterUNIMOText, self).__init__(model.config)
self._model = model
self._use_fp16_decoding = use_fp16_decoding
self.unk_token_id = self._model.config.unk_token_id
self.mask_token_id = self._model.config.mask_token_id
self.bos_token_id = self._model.config.bos_token_id
self.pad_token_id = self._model.config.pad_token_id
self.vocab_size = model.lm_head.decoder_bias.shape[0]
self.logits_mask = self.generate_logits_mask(use_fp16_decoding)
self._n_head = self._model.config.num_attention_heads
self._hidden_dims = self._model.config.hidden_size
self._normalize_before = self._model.config.normalize_before
self._size_per_head = self._hidden_dims // self._n_head
self._n_layer = self._model.config.num_hidden_layers
self._hidden_act = self._model.config.hidden_act
self.trans_out = kwargs.get("trans_out", False)
self.decoding = InferUnifiedDecoding(
model=self._model,
decoding_lib=decoding_lib,
use_fp16_decoding=use_fp16_decoding,
logits_mask=self.logits_mask,
n_head=self._n_head,
hidden_dims=self._hidden_dims,
size_per_head=self._size_per_head,
n_layer=self._n_layer,
unk_id=self.unk_token_id,
mask_id=self.mask_token_id,
normalize_before=self._normalize_before,
hidden_act=self._hidden_act,
)
def prepare_inputs_for_generation(self, input_ids, token_type_ids, attention_mask, **kwargs):
input_ids = input_ids[:, :-1]
if input_ids.dtype == paddle.int64:
input_ids = paddle.cast(input_ids, dtype="int32")
if token_type_ids.dtype == paddle.int64:
token_type_ids = paddle.cast(token_type_ids, dtype="int32")
decoder_type_ids = token_type_ids[:, -1:]
token_type_ids = token_type_ids[:, :-1]
attention_mask = attention_mask[:, :, :-1, :-1]
attention_mask = paddle.cast(attention_mask == 0, dtype="float16" if self._use_fp16_decoding else "float32")
seq_len = kwargs.get("seq_len") - 1
if seq_len.dtype == paddle.int64:
seq_len = paddle.cast(seq_len, dtype="int32")
return {
"input_ids": input_ids,
"token_type_ids": token_type_ids,
"attention_mask": attention_mask,
"seq_len": seq_len,
"decoder_type_ids": decoder_type_ids,
}
def generate_logits_mask(self, use_fp16_decoding):
        # Pre-process the distribution: mask unk/bos/pad logits with a large
        # negative value (finite -1e4 under fp16 to avoid overflow).
logits_mask = np.zeros(shape=[self.vocab_size], dtype=np.float32)
if use_fp16_decoding:
logits_mask[self.unk_token_id] = -1e4
logits_mask[self.bos_token_id] = -1e4
logits_mask[self.pad_token_id] = -1e4
else:
logits_mask[self.unk_token_id] = -1e9
logits_mask[self.bos_token_id] = -1e9
logits_mask[self.pad_token_id] = -1e9
logits_mask_t = paddle.assign(logits_mask)
if use_fp16_decoding:
return paddle.cast(logits_mask_t, dtype="float16")
else:
return logits_mask_t
def forward(
self,
input_ids,
token_type_ids,
attention_mask,
seq_len=None,
max_length=128,
min_length=0,
top_k=4,
top_p=0.0,
num_beams=4,
decode_strategy="sampling",
bos_token_id=None,
eos_token_id=None,
pad_token_id=None,
diversity_rate=0.0,
temperature=1.0,
num_return_sequences=1,
length_penalty=0.6,
early_stopping=False,
forced_eos_token_id=None,
position_ids=None,
**model_kwargs
):
if seq_len is None:
            assert input_ids is not None, "You have to specify input_ids when seq_len is not provided."
seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32")
if decode_strategy.startswith("beam_search"):
input_ids, model_kwargs = self.expand_inputs_for_generation(
input_ids,
expand_size=num_beams,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask,
seq_len=seq_len,
)
elif decode_strategy == "sampling":
input_ids, model_kwargs = self.expand_inputs_for_generation(
input_ids,
expand_size=num_return_sequences,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask,
seq_len=seq_len,
)
elif decode_strategy == "greedy_search":
model_kwargs = {
"token_type_ids": token_type_ids,
"position_ids": position_ids,
"attention_mask": attention_mask,
"seq_len": seq_len,
}
else:
raise ValueError("Only greedy search, beam search and sampling are supported. ")
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
seq_len = model_inputs.pop("seq_len")
decoder_type_ids = model_inputs.pop("decoder_type_ids")
ids, output_scores = self.decoding(
input_ids=model_inputs["input_ids"],
attn_mask=model_inputs["attention_mask"],
memory_seq_lens=seq_len,
type_id=model_inputs["token_type_ids"],
decoder_type_id=decoder_type_ids,
beam_size=num_beams,
diversity_rate=diversity_rate,
topk=top_k,
topp=top_p,
decoding_strategy=decode_strategy,
max_out_len=max_length,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
temperature=temperature,
length_penalty=length_penalty,
forced_eos_token_id=forced_eos_token_id,
pos_bias=False,
early_stopping=early_stopping,
min_length=min_length,
)
if self.trans_out:
if decode_strategy.startswith("beam_search"):
ids = ids.transpose([1, 2, 0])
else:
ids = ids.transpose([1, 0])
return ids, output_scores
generate = forward
class FasterMIRO(UNIMOPretrainedModel):
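    """
    A fast generation wrapper for MIRO (a UNIMO-style model). It mirrors
    `FasterUNIMOText` but dispatches decoding to `InferMIRODecoding`.
    """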
def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, **kwargs):
super(FasterMIRO, self).__init__(model.config)
self._model = model
self._use_fp16_decoding = use_fp16_decoding
self.unk_token_id = self._model.config.unk_token_id
self.mask_token_id = self._model.config.mask_token_id
self.bos_token_id = self._model.config.bos_token_id
self.pad_token_id = self._model.config.pad_token_id
self.vocab_size = model.lm_head.decoder_bias.shape[0]
self.logits_mask = self.generate_logits_mask(use_fp16_decoding)
self._n_head = self._model.config.num_attention_heads
self._hidden_dims = self._model.config.hidden_size
self._normalize_before = self._model.config.normalize_before
self._size_per_head = self._hidden_dims // self._n_head
self._n_layer = self._model.config.num_hidden_layers
self._hidden_act = self._model.config.hidden_act
self.trans_out = kwargs.get("trans_out", False)
self.decoding = InferMIRODecoding(
model=self._model,
decoding_lib=decoding_lib,
use_fp16_decoding=use_fp16_decoding,
logits_mask=self.logits_mask,
n_head=self._n_head,
hidden_dims=self._hidden_dims,
size_per_head=self._size_per_head,
n_layer=self._n_layer,
unk_id=self.unk_token_id,
mask_id=self.mask_token_id,
normalize_before=self._normalize_before,
hidden_act=self._hidden_act,
)
def prepare_inputs_for_generation(self, input_ids, token_type_ids, attention_mask, **kwargs):
input_ids = input_ids[:, :-1]
if input_ids.dtype == paddle.int64:
input_ids = paddle.cast(input_ids, dtype="int32")
if token_type_ids.dtype == paddle.int64:
token_type_ids = paddle.cast(token_type_ids, dtype="int32")
decoder_type_ids = token_type_ids[:, -1:]
token_type_ids = token_type_ids[:, :-1]
attention_mask = attention_mask[:, :, :-1, :-1]
attention_mask = paddle.cast(attention_mask == 0, dtype="float16" if self._use_fp16_decoding else "float32")
seq_len = kwargs.get("seq_len") - 1
if seq_len.dtype == paddle.int64:
seq_len = paddle.cast(seq_len, dtype="int32")
return {
"input_ids": input_ids,
"token_type_ids": token_type_ids,
"attention_mask": attention_mask,
"seq_len": seq_len,
"decoder_type_ids": decoder_type_ids,
}
def generate_logits_mask(self, use_fp16_decoding):
        # Pre-process the distribution: mask unk/bos/pad logits with a large
        # negative value (finite -1e4 under fp16 to avoid overflow).
logits_mask = np.zeros(shape=[self.vocab_size], dtype=np.float32)
if use_fp16_decoding:
logits_mask[self.unk_token_id] = -1e4
logits_mask[self.bos_token_id] = -1e4
logits_mask[self.pad_token_id] = -1e4
else:
logits_mask[self.unk_token_id] = -1e9
logits_mask[self.bos_token_id] = -1e9
logits_mask[self.pad_token_id] = -1e9
logits_mask_t = paddle.assign(logits_mask)
if use_fp16_decoding:
return paddle.cast(logits_mask_t, dtype="float16")
else:
return logits_mask_t
def forward(
self,
input_ids,
token_type_ids,
attention_mask,
seq_len=None,
max_length=128,
min_length=0,
top_k=4,
top_p=0.0,
num_beams=4,
decode_strategy="sampling",
bos_token_id=None,
eos_token_id=None,
pad_token_id=None,
diversity_rate=0.0,
temperature=1.0,
num_return_sequences=1,
length_penalty=0.6,
early_stopping=False,
forced_eos_token_id=None,
position_ids=None,
**model_kwargs
):
if seq_len is None:
            assert input_ids is not None, "You have to specify input_ids when seq_len is not provided."
seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32")
if decode_strategy.startswith("beam_search"):
input_ids, model_kwargs = self.expand_inputs_for_generation(
input_ids,
expand_size=num_beams,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask,
seq_len=seq_len,
)
elif decode_strategy == "sampling":
input_ids, model_kwargs = self.expand_inputs_for_generation(
input_ids,
expand_size=num_return_sequences,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask,
seq_len=seq_len,
)
elif decode_strategy == "greedy_search":
model_kwargs = {
"token_type_ids": token_type_ids,
"position_ids": position_ids,
"attention_mask": attention_mask,
"seq_len": seq_len,
}
else:
raise ValueError("Only greedy search, beam search and sampling are supported. ")
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
seq_len = model_inputs.pop("seq_len")
decoder_type_ids = model_inputs.pop("decoder_type_ids")
ids, output_scores = self.decoding(
input_ids=model_inputs["input_ids"],
attn_mask=model_inputs["attention_mask"],
memory_seq_lens=seq_len,
type_id=model_inputs["token_type_ids"],
decoder_type_id=decoder_type_ids,
beam_size=num_beams,
diversity_rate=diversity_rate,
topk=top_k,
topp=top_p,
decoding_strategy=decode_strategy,
max_out_len=max_length,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
temperature=temperature,
length_penalty=length_penalty,
forced_eos_token_id=forced_eos_token_id,
pos_bias=False,
early_stopping=early_stopping,
min_length=min_length,
)
if self.trans_out:
if decode_strategy.startswith("beam_search"):
ids = ids.transpose([1, 2, 0])
else:
ids = ids.transpose([1, 0])
return ids, output_scores
generate = forward
class FasterBART(BartPretrainedModel):
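    """
    A fast generation wrapper for BART, dispatching decoding to
    `InferBartDecoding` and, by default, replacing the encoder with its fast
    counterpart.

    Example:
        .. code-block::

            # A minimal usage sketch; the pretrained model name is an
            # assumption for illustration.
            import paddle
            from paddlenlp.transformers import BartForConditionalGeneration, BartTokenizer
            from paddlenlp.ops import FasterBART

            model = BartForConditionalGeneration.from_pretrained("bart-base")
            tokenizer = BartTokenizer.from_pretrained("bart-base")
            model.eval()
            fast_bart = FasterBART(model)
            input_ids = paddle.to_tensor([tokenizer("Hello")["input_ids"]])
            outputs = fast_bart(input_ids, num_beams=4, max_length=20)
    """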
enable_faster_encoder_func = enable_fast_encoder
def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, enable_fast_encoder=True):
super(FasterBART, self).__init__(model.config)
self.use_fp16_decoding = use_fp16_decoding
self._model = model
if use_fp16_decoding:
weight_attr = paddle.ParamAttr(initializer=nn.initializer.Assign(model.bart.encoder.embed_tokens.weight))
model.bart.encoder.embed_tokens = nn.Embedding(
*model.bart.encoder.embed_tokens.weight.shape, weight_attr=weight_attr
)
self.encoder = model.bart.get_encoder()
self.decoder = model.bart.get_decoder()
self.pad_token_id = model.bart.config["pad_token_id"]
self.enable_fast_encoder = enable_fast_encoder
self.decoding = InferBartDecoding(
model=self._model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding
)
if self.enable_fast_encoder:
            # Must call `enable_fast_encoder` in `__init__` when converting dygraph to static graph.
self.encoder = FasterBART.enable_faster_encoder_func(self.encoder)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def forward(
self,
input_ids=None,
encoder_output=None,
seq_len=None,
num_beams=4,
top_k=1,
top_p=0.0,
temperature=1.0,
decode_strategy="beam_search",
bos_token_id=None,
eos_token_id=None,
pad_token_id=None,
decoder_start_token_id=None,
min_length=0,
max_length=20,
diversity_rate=0.0,
length_penalty=0.6,
num_return_sequences=1,
early_stopping=False,
forced_eos_token_id=None,
**model_kwargs
):
if encoder_output is None:
assert input_ids is not None, "You have to specify either input_ids or encoder_output."
encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[
"encoder_output"
]
if seq_len is None:
            assert input_ids is not None, "You have to specify input_ids when seq_len is not provided."
seq_len = paddle.sum(
paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, keepdim=True, dtype="int32"
)
if self.use_fp16_decoding:
encoder_output = paddle.cast(encoder_output, "float16")
if decode_strategy.startswith("beam_search") and num_beams > 1:
encoder_output, expanded_kwargs = self.expand_inputs_for_generation(
encoder_output, expand_size=num_beams, seq_len=seq_len
)
seq_len = expanded_kwargs["seq_len"]
elif decode_strategy == "sampling" and num_return_sequences > 1:
encoder_output, expanded_kwargs = self.expand_inputs_for_generation(
encoder_output, expand_size=num_return_sequences, seq_len=seq_len
)
seq_len = expanded_kwargs["seq_len"]
if decoder_start_token_id is not None:
bos_token_id = decoder_start_token_id
return self.decoding(
enc_output=encoder_output,
memory_seq_lens=seq_len,
beam_size=num_beams,
top_k=top_k,
decoding_strategy=decode_strategy,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
top_p=top_p,
max_out_len=max_length,
min_out_len=min_length,
temperature=temperature,
diversity_rate=diversity_rate,
alpha=length_penalty,
early_stopping=early_stopping,
forced_eos_token_id=forced_eos_token_id,
)
generate = forward
class FasterMBART(MBartPretrainedModel):
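    """
    A fast generation wrapper for MBART, dispatching decoding to
    `InferMBartDecoding`. Unlike `FasterBART`, its `forward` accepts
    `forced_bos_token_id` (typically the target-language token for
    multilingual translation).
    """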
enable_faster_encoder_func = enable_fast_encoder
def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, enable_fast_encoder=False):
super(FasterMBART, self).__init__(model.config)
self.use_fp16_decoding = use_fp16_decoding
self._model = model
if use_fp16_decoding:
weight_attr = paddle.ParamAttr(initializer=nn.initializer.Assign(model.mbart.encoder.embed_tokens.weight))
model.mbart.encoder.embed_tokens = nn.Embedding(
*model.mbart.encoder.embed_tokens.weight.shape, weight_attr=weight_attr
)
self.encoder = model.mbart.get_encoder()
self.decoder = model.mbart.get_decoder()
self.pad_token_id = model.mbart.config["pad_token_id"]
self.enable_fast_encoder = enable_fast_encoder
self.decoding = InferMBartDecoding(
model=self._model,
decoding_lib=decoding_lib,
use_fp16_decoding=use_fp16_decoding,
hidden_act=model.mbart.config["activation_function"],
)
if self.enable_fast_encoder:
            # Must call `enable_fast_encoder` in `__init__` when converting dygraph to static graph.
self.encoder = FasterMBART.enable_faster_encoder_func(self.encoder)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def forward(
self,
input_ids=None,
encoder_output=None,
seq_len=None,
forced_bos_token_id=None,
num_beams=4,
top_k=1,
top_p=0.0,
decode_strategy="beam_search_v3",
bos_token_id=None,
eos_token_id=None,
pad_token_id=None,
decoder_start_token_id=None,
max_length=256,
diversity_rate=0.0,
length_penalty=0.6,
temperature=1.0,
num_return_sequences=1,
early_stopping=False,
forced_eos_token_id=None,
**model_kwargs
):
bos_token_id = bos_token_id if bos_token_id is not None else getattr(self._model, "bos_token_id", None)
eos_token_id = eos_token_id if eos_token_id is not None else getattr(self._model, "eos_token_id", None)
pad_token_id = pad_token_id if pad_token_id is not None else getattr(self._model, "pad_token_id", None)
decoder_start_token_id = (
decoder_start_token_id
if decoder_start_token_id is not None
else getattr(self._model, "decoder_start_token_id", None)
)
        # (gongenlei) Do not enable the fast encoder temporarily.
if encoder_output is None:
assert input_ids is not None, "You have to specify either input_ids or encoder_output."
encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[
"encoder_output"
]
batch_size = paddle.shape(encoder_output)[0]
if seq_len is None:
            assert input_ids is not None, "You have to specify input_ids when seq_len is not provided."
seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32")
if self.use_fp16_decoding:
encoder_output = paddle.cast(encoder_output, "float16")
if decode_strategy.startswith("beam_search") and num_beams > 1:
encoder_output, expanded_kwargs = self.expand_inputs_for_generation(
encoder_output, expand_size=num_beams, seq_len=seq_len
)
seq_len = expanded_kwargs["seq_len"]
elif decode_strategy == "sampling" and num_return_sequences > 1:
encoder_output, expanded_kwargs = self.expand_inputs_for_generation(
encoder_output, expand_size=num_return_sequences, seq_len=seq_len
)
seq_len = expanded_kwargs["seq_len"]
if decoder_start_token_id is not None:
bos_token_id = decoder_start_token_id
if not isinstance(forced_bos_token_id, type(input_ids)):
if forced_bos_token_id is not None:
if decode_strategy == "sampling":
forced_bos_token_id = paddle.full(
[batch_size * num_return_sequences, 1], forced_bos_token_id, dtype="int32"
)
else:
forced_bos_token_id = paddle.full([batch_size, 1], forced_bos_token_id, dtype="int32")
else:
forced_bos_token_id = paddle.zeros([0])
elif decode_strategy == "sampling":
num_samples = paddle.shape(encoder_output)[0]
forced_bos_token_id = paddle.expand(forced_bos_token_id, shape=[num_samples, 1])
return self.decoding(
enc_output=encoder_output,
memory_seq_lens=seq_len,
beam_size=num_beams,
trg_word=forced_bos_token_id,
top_k=top_k,
top_p=top_p,
decoding_strategy=decode_strategy,
diversity_rate=diversity_rate,
max_out_len=max_length,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
alpha=length_penalty,
temperature=temperature,
early_stopping=early_stopping,
)
generate = forward
class FasterGPTJ(GPTJPretrainedModel):
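    """
    A fast generation wrapper for GPT-J, dispatching decoding to
    `InferGptJDecoding`. In addition to the common sampling arguments, it
    supports `repetition_penalty` and `min_length`.
    """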
def __init__(self, model, decoding_lib=None, use_fp16_decoding=False):
super(FasterGPTJ, self).__init__(model.config)
self._model = model
self.use_fp16_decoding = use_fp16_decoding
self.decoding = InferGptJDecoding(model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding)
def forward(
self,
input_ids,
seq_len=None,
attention_mask=None,
top_k=4,
top_p=0.0,
min_length=0,
max_length=256,
bos_token_id=None,
eos_token_id=None,
pad_token_id=None,
forced_eos_token_id=None,
temperature=0,
repetition_penalty=1.0,
decode_strategy="sampling",
num_return_sequences=1,
**model_kwargs
):
if input_ids.dtype == paddle.int64:
input_ids = paddle.cast(input_ids, "int32")
# change top_p to zero if not using top_p sampling for FT
if decode_strategy == "greedy_search":
top_p = 0.0
top_k = 1
if top_p == 1.0:
top_p = 0.0
if seq_len is None:
seq_len = paddle.sum(paddle.cast(input_ids != pad_token_id, dtype="int32"), axis=-1, dtype="int32")
if num_return_sequences > 1:
input_ids, model_kwargs = self.expand_inputs_for_generation(
input_ids, expand_size=num_return_sequences, seq_len=seq_len, attention_mask=attention_mask
)
seq_len = model_kwargs["seq_len"]
attention_mask = model_kwargs.get("attention_mask", None)
return self.decoding(
input_ids,
mem_seq_len=seq_len,
attention_mask=attention_mask,
topk=top_k,
topp=top_p,
max_out_len=max_length,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
forced_eos_token_id=forced_eos_token_id,
temperature=temperature,
repetition_penalty=repetition_penalty,
min_length=min_length,
)
generate = forward
class FasterCodeGen(CodeGenPreTrainedModel):
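    """
    A fast generation wrapper for CodeGen. It reuses `InferGptJDecoding` with
    `transpose_qkv=True` to account for CodeGen's QKV weight layout.
    """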
def __init__(self, model, decoding_lib=None, use_fp16_decoding=False):
super(FasterCodeGen, self).__init__(model.config)
self._model = model
self.use_fp16_decoding = use_fp16_decoding
self.decoding = InferGptJDecoding(
model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding, transpose_qkv=True
)
def forward(
self,
input_ids,
seq_len=None,
attention_mask=None,
top_k=4,
top_p=0.0,
min_length=0,
max_length=256,
bos_token_id=None,
eos_token_id=None,
pad_token_id=None,
forced_eos_token_id=None,
temperature=0,
repetition_penalty=1.0,
decode_strategy="sampling",
num_return_sequences=1,
**model_kwargs
):
if input_ids.dtype == paddle.int64:
input_ids = paddle.cast(input_ids, "int32")
# change top_p to zero if not using top_p sampling for FT
if decode_strategy == "greedy_search":
top_p = 0.0
top_k = 1
if top_p == 1.0:
top_p = 0.0
if seq_len is None:
seq_len = paddle.sum(paddle.cast(input_ids != pad_token_id, dtype="int32"), axis=-1, dtype="int32")
if num_return_sequences > 1:
input_ids, model_kwargs = self.expand_inputs_for_generation(
input_ids, expand_size=num_return_sequences, seq_len=seq_len, attention_mask=attention_mask
)
seq_len = model_kwargs["seq_len"]
attention_mask = model_kwargs.get("attention_mask", None)
return self.decoding(
input_ids,
mem_seq_len=seq_len,
attention_mask=attention_mask,
topk=top_k,
topp=top_p,
max_out_len=max_length,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
forced_eos_token_id=forced_eos_token_id,
temperature=temperature,
repetition_penalty=repetition_penalty,
min_length=min_length,
)
generate = forward
class FasterPegasus(PegasusPretrainedModel):
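    """
    A fast generation wrapper for Pegasus, dispatching decoding to
    `InferPegasusDecoding`. The fast encoder is not supported yet. When
    constructed with `trans_out=True`, the output ids are transposed to
    batch-major layout.
    """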
enable_faster_encoder_func = enable_fast_encoder
def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, enable_fast_encoder=False, **kwargs):
super(FasterPegasus, self).__init__(model.config)
self.use_fp16_decoding = use_fp16_decoding
self._model = model
self.encoder = model.get_encoder()
self.decoder = model.get_decoder()
self.pad_token_id = model.pegasus.config["pad_token_id"]
self.enable_fast_encoder = enable_fast_encoder
self.trans_out = kwargs.get("trans_out", False)
self.decoding = InferPegasusDecoding(
model=self._model,
decoding_lib=decoding_lib,
use_fp16_decoding=use_fp16_decoding,
hidden_act=model.pegasus.config["activation_function"],
)
# TODO(gongenlei): Support faster_encoder
# if self.enable_fast_encoder:
# # Must use `enable_fast_encoder` in `__init__` when dygraph to static graph.
# self.encoder = FasterPegasus.enable_faster_encoder_func(self.encoder)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def forward(
self,
input_ids=None,
encoder_output=None,
seq_len=None,
min_length=0,
max_length=256,
num_beams=4,
decode_strategy="beam_search_v3",
decoder_start_token_id=None,
bos_token_id=None,
eos_token_id=None,
pad_token_id=None,
diversity_rate=0.0,
length_penalty=0.6,
top_k=1,
top_p=0.0,
temperature=1.0,
num_return_sequences=1,
early_stopping=False,
forced_bos_token_id=None,
forced_eos_token_id=None,
**model_kwargs
):
bos_token_id = bos_token_id if bos_token_id is not None else getattr(self._model, "bos_token_id", None)
eos_token_id = eos_token_id if eos_token_id is not None else getattr(self._model, "eos_token_id", None)
pad_token_id = pad_token_id if pad_token_id is not None else getattr(self._model, "pad_token_id", None)
decoder_start_token_id = (
decoder_start_token_id
if decoder_start_token_id is not None
else getattr(self._model, "decoder_start_token_id", None)
)
if encoder_output is None:
assert input_ids is not None, "You have to specify either input_ids or encoder_output."
encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[
"encoder_output"
]
if seq_len is None:
            assert input_ids is not None, "You have to specify input_ids when seq_len is not provided."
seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32")
if self.use_fp16_decoding:
encoder_output = paddle.cast(encoder_output, "float16")
if decode_strategy.startswith("beam_search") and num_beams > 1:
encoder_output, expanded_kwargs = self.expand_inputs_for_generation(
encoder_output, expand_size=num_beams, seq_len=seq_len
)
seq_len = expanded_kwargs["seq_len"]
elif decode_strategy == "sampling" and num_return_sequences > 1:
encoder_output, expanded_kwargs = self.expand_inputs_for_generation(
encoder_output, expand_size=num_return_sequences, seq_len=seq_len
)
seq_len = expanded_kwargs["seq_len"]
if decoder_start_token_id is not None:
bos_token_id = decoder_start_token_id
ids = self.decoding(
enc_output=encoder_output,
memory_seq_lens=seq_len,
beam_size=num_beams,
top_k=top_k,
top_p=top_p,
decoding_strategy=decode_strategy,
max_out_len=max_length,
min_out_len=min_length,
diversity_rate=diversity_rate,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
alpha=length_penalty,
temperature=temperature,
early_stopping=early_stopping,
forced_eos_token_id=forced_eos_token_id,
)
if self.trans_out:
if decode_strategy.startswith("beam_search"):
ids = ids.transpose([1, 2, 0])
else:
ids = ids.transpose([1, 0])
return ids
generate = forward
class FasterT5(T5PretrainedModel):
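    """
    A fast generation wrapper for T5, dispatching decoding to
    `InferT5Decoding`. For fp16 decoding, the encoder token embedding is
    rebuilt as a plain `nn.Embedding` holding the same weights.
    """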
def __init__(self, model, decoding_lib=None, use_fp16_decoding=False):
super(FasterT5, self).__init__(model.config)
self.use_fp16_decoding = use_fp16_decoding
self._model = model
if use_fp16_decoding:
weight_attr = paddle.ParamAttr(initializer=nn.initializer.Assign(model.encoder.embed_tokens.weight))
model.encoder.embed_tokens = nn.Embedding(
*model.encoder.embed_tokens.weight.shape, weight_attr=weight_attr
)
self.encoder = model.t5.get_encoder()
self.decoder = model.t5.get_decoder()
self.pad_token_id = model.t5.config["pad_token_id"]
self.decoding = InferT5Decoding(
model=self._model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding
)
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def forward(
self,
input_ids=None,
encoder_output=None,
seq_len=None,
max_length=128,
min_length=0,
top_k=4,
top_p=0.0,
num_beams=4,
decode_strategy="sampling",
decoder_start_token_id=None,
bos_token_id=None,
eos_token_id=None,
pad_token_id=None,
diversity_rate=0.0,
temperature=1.0,
num_return_sequences=1,
length_penalty=0.6,
early_stopping=False,
forced_eos_token_id=None,
**model_kwargs
):
bos_token_id = bos_token_id if bos_token_id is not None else getattr(self._model, "bos_token_id", None)
eos_token_id = eos_token_id if eos_token_id is not None else getattr(self._model, "eos_token_id", None)
pad_token_id = pad_token_id if pad_token_id is not None else getattr(self._model, "pad_token_id", None)
if encoder_output is None:
assert input_ids is not None, "You have to specify either input_ids or encoder_output."
encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[
"encoder_output"
]
if isinstance(encoder_output, (list, tuple)):
encoder_output = encoder_output[0]
if seq_len is None:
            assert input_ids is not None, "You have to specify input_ids when seq_len is not provided."
seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32")
if self.use_fp16_decoding:
encoder_output = paddle.cast(encoder_output, "float16")
if decode_strategy.startswith("beam_search") and num_beams > 1:
encoder_output, expanded_kwargs = self.expand_inputs_for_generation(
encoder_output, expand_size=num_beams, seq_len=seq_len
)
seq_len = expanded_kwargs["seq_len"]
elif decode_strategy == "sampling" and num_return_sequences > 1:
encoder_output, expanded_kwargs = self.expand_inputs_for_generation(
encoder_output, expand_size=num_return_sequences, seq_len=seq_len
)
seq_len = expanded_kwargs["seq_len"]
if decoder_start_token_id is not None:
bos_token_id = decoder_start_token_id
return self.decoding(
enc_output=encoder_output,
memory_seq_lens=seq_len,
beam_size=num_beams,
top_k=top_k,
top_p=top_p,
decoding_strategy=decode_strategy,
max_out_len=max_length,
diversity_rate=diversity_rate,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
alpha=length_penalty,
temperature=temperature,
early_stopping=early_stopping,
)
generate = forward