Source code for paddlenlp.transformers.transformer.modeling

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import (
    TransformerDecoder,
    TransformerDecoderLayer,
    TransformerEncoder,
    TransformerEncoderLayer,
)
from paddle.utils import map_structure

__all__ = [
    "position_encoding_init",
    "WordEmbedding",
    "PositionalEmbedding",
    "CrossEntropyCriterion",
    "TransformerDecodeCell",
    "TransformerBeamSearchDecoder",
    "TransformerModel",
    "InferTransformerModel",
    "LabelSmoothedCrossEntropyCriterion",
]


def position_encoding_init(n_position, d_pos_vec, dtype="float32"):
    """
    Generates the initial values for the sinusoidal position encoding table.
    This method follows the implementation in tensor2tensor, but is slightly
    different from the description in "Attention Is All You Need".

    Args:
        n_position (int):
            The largest position for sequences, that is, the maximum length
            of source or target sequences.
        d_pos_vec (int):
            The size of positional embedding vector.
        dtype (str, optional):
            The output `numpy.array`'s data type. Defaults to "float32".

    Returns:
        numpy.array:
            The embedding table of sinusoidal position encoding with shape
            `[n_position, d_pos_vec]`.

    Example:
        .. code-block::

            from paddlenlp.transformers import position_encoding_init

            max_length = 256
            emb_dim = 512
            pos_table = position_encoding_init(max_length, emb_dim)
    """
    channels = d_pos_vec
    position = np.arange(n_position)
    num_timescales = channels // 2
    log_timescale_increment = np.log(float(1e4) / float(1)) / (num_timescales - 1)
    inv_timescales = np.exp(np.arange(num_timescales) * -log_timescale_increment)
    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales, 0)
    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
    signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], "constant")
    position_enc = signal
    return position_enc.astype(dtype)
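Note that the tensor2tensor convention concatenates the sine and cosine halves
instead of interleaving them as in the original paper. A minimal sketch (not
part of the module) that makes this layout visible:

import numpy as np

from paddlenlp.transformers import position_encoding_init

# For position 0, every timescale gives sin(0) == 0 and cos(0) == 1, so the
# first half of the row is all zeros and the second half is all ones.
table = position_encoding_init(n_position=4, d_pos_vec=8)
half = 8 // 2
assert np.allclose(table[0, :half], 0.0)
assert np.allclose(table[0, half:], 1.0)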
class WordEmbedding(nn.Layer):
    r"""
    Word Embedding Layer of Transformer.

    This layer automatically constructs a 2D embedding matrix based on the
    size of the vocabulary (`vocab_size`) and the size of each embedding
    vector (`emb_dim`). It looks up the embedding vectors of the ids provided
    by the input `word`.

    After the lookup, the embeddings are multiplied by `sqrt(d_model)`, which
    is `sqrt(emb_dim)` in this interface.

    .. math::

        Out = embedding(word) * sqrt(emb\_dim)

    Args:
        vocab_size (int):
            The size of the vocabulary.
        emb_dim (int):
            Dimensionality of each embedding vector.
        bos_id (int, optional):
            The start token id, which is also used as the padding id.
            Defaults to 0.
    """

    def __init__(self, vocab_size, emb_dim, bos_id=0):
        super(WordEmbedding, self).__init__()
        self.emb_dim = emb_dim
        self.word_embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=bos_id,
            weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal(0.0, emb_dim ** (-0.5))),
        )
    def forward(self, word):
        r"""
        Computes word embedding.

        Args:
            word (Tensor):
                The input ids which indicate the sequences' words, with shape
                `[batch_size, sequence_length]`, whose data type can be
                int or int64.

        Returns:
            Tensor:
                The (scaled) embedding tensor of shape
                `(batch_size, sequence_length, emb_dim)` whose data type can
                be float32 or float64.

        Example:
            .. code-block::

                import paddle
                from paddlenlp.transformers import WordEmbedding

                word_embedding = WordEmbedding(
                    vocab_size=30000,
                    emb_dim=512,
                    bos_id=0)
                batch_size = 5
                sequence_length = 10
                src_words = paddle.randint(low=3, high=30000, shape=[batch_size, sequence_length])
                src_emb = word_embedding(src_words)
        """
        word_emb = self.emb_dim**0.5 * self.word_embedding(word)
        return word_emb
class PositionalEmbedding(nn.Layer):
    """
    This layer produces sinusoidal positional embeddings of any length.
    In its `forward()` method, this layer looks up the embedding vectors of
    the ids provided by the input `pos`.

    Args:
        emb_dim (int):
            The size of each embedding vector.
        max_length (int):
            The maximum length of sequences.
    """

    def __init__(self, emb_dim, max_length):
        super(PositionalEmbedding, self).__init__()
        self.emb_dim = emb_dim
        self.pos_encoder = nn.Embedding(
            num_embeddings=max_length,
            embedding_dim=self.emb_dim,
            weight_attr=paddle.ParamAttr(
                initializer=paddle.nn.initializer.Assign(position_encoding_init(max_length, self.emb_dim))
            ),
        )
    def forward(self, pos):
        r"""
        Computes positional embedding.

        Args:
            pos (Tensor):
                The input position ids with shape
                `[batch_size, sequence_length]` whose data type can be
                int or int64.

        Returns:
            Tensor:
                The positional embedding tensor of shape
                `(batch_size, sequence_length, emb_dim)` whose data type can
                be float32 or float64.

        Example:
            .. code-block::

                import paddle
                from paddlenlp.transformers import PositionalEmbedding

                pos_embedding = PositionalEmbedding(
                    emb_dim=512,
                    max_length=256)
                batch_size = 5
                pos = paddle.tile(paddle.arange(start=0, end=50), repeat_times=[batch_size, 1])
                pos_emb = pos_embedding(pos)
        """
        pos_emb = self.pos_encoder(pos)
        pos_emb.stop_gradient = True
        return pos_emb
class CrossEntropyCriterion(nn.Layer):
    """
    Computes the cross entropy loss for given input with or without label
    smoothing.

    Args:
        label_smooth_eps (float, optional):
            The weight used to mix up the original ground-truth distribution
            and the fixed distribution. Defaults to None. If given, label
            smoothing will be applied on `label`.
        pad_idx (int, optional):
            The token id used to pad variant sequences. Defaults to 0.
    """

    def __init__(self, label_smooth_eps=None, pad_idx=0):
        super(CrossEntropyCriterion, self).__init__()
        self.label_smooth_eps = label_smooth_eps
        self.pad_idx = pad_idx
    def forward(self, predict, label):
        r"""
        Computes cross entropy loss with or without label smoothing.

        Args:
            predict (Tensor):
                The predicted results of `TransformerModel` with shape
                `[batch_size, sequence_length, vocab_size]` whose data type
                can be float32 or float64.
            label (Tensor):
                The label for the corresponding results with shape
                `[batch_size, sequence_length, 1]`.

        Returns:
            tuple:
                A tuple with items: (`sum_cost`, `avg_cost`, `token_num`).

                With the corresponding fields:

                - `sum_cost` (Tensor):
                    The sum of loss of the current batch whose data type can
                    be float32 or float64.
                - `avg_cost` (Tensor):
                    The average loss of the current batch whose data type can
                    be float32 or float64. The relation between `sum_cost`
                    and `avg_cost` can be described as:

                    .. math::

                        avg\_cost = sum\_cost / token\_num

                - `token_num` (Tensor):
                    The number of tokens of the current batch. Its data type
                    can be float32 or float64.

        Example:
            .. code-block::

                import paddle
                from paddlenlp.transformers import CrossEntropyCriterion

                criterion = CrossEntropyCriterion(label_smooth_eps=0.1, pad_idx=0)
                batch_size = 1
                seq_len = 2
                vocab_size = 30000
                predict = paddle.rand(shape=[batch_size, seq_len, vocab_size])
                label = paddle.randint(
                    low=3,
                    high=vocab_size,
                    shape=[batch_size, seq_len, 1])

                criterion(predict, label)
        """
        weights = paddle.cast(label != self.pad_idx, dtype=paddle.get_default_dtype())
        if self.label_smooth_eps:
            label = paddle.squeeze(label, axis=[2])
            label = F.label_smooth(
                label=F.one_hot(x=label, num_classes=predict.shape[-1]), epsilon=self.label_smooth_eps
            )

        cost = F.cross_entropy(
            input=predict, label=label, reduction="none", soft_label=True if self.label_smooth_eps else False
        )
        weighted_cost = cost * weights
        sum_cost = paddle.sum(weighted_cost)
        token_num = paddle.sum(weights)
        token_num.stop_gradient = True
        avg_cost = sum_cost / token_num
        return sum_cost, avg_cost, token_num
def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=True):
    if target.dim() == lprobs.dim() - 1:
        target = target.unsqueeze(-1)
    num_tokens = paddle.shape(lprobs)[0]
    index = paddle.arange(0, num_tokens, dtype="int64").unsqueeze(-1)
    index = paddle.concat([index, target], axis=-1)
    index.stop_gradient = True
    log_probs = -lprobs
    # Negative log-likelihood of the gold token, plus the smoothing term
    # summed over the whole vocabulary.
    nll_loss = paddle.gather_nd(log_probs, index=index).unsqueeze(-1)
    smooth_loss = log_probs.sum(axis=-1, keepdim=True)
    # Mask out padding positions so they contribute nothing to the loss.
    pad_mask = paddle.cast(target != ignore_index, dtype=paddle.get_default_dtype())
    nll_loss = nll_loss * pad_mask
    smooth_loss = smooth_loss * pad_mask

    if reduce:
        nll_loss = nll_loss.sum()
        smooth_loss = smooth_loss.sum()
    eps_i = epsilon / (lprobs.shape[-1] - 1)
    loss = (1.0 - epsilon - eps_i) * nll_loss + eps_i * smooth_loss
    token_num = paddle.sum(pad_mask)
    return loss, loss / token_num, token_num
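`label_smoothed_nll_loss` carries no docstring; a hedged usage sketch on
flattened log-probabilities, assuming this module's namespace (shapes are
illustrative only):

import paddle
import paddle.nn.functional as F

# 8 flattened target tokens over a 30000-word vocabulary.
lprobs = F.log_softmax(paddle.rand([8, 30000]), axis=-1)
target = paddle.randint(low=3, high=30000, shape=[8, 1])
loss, avg_loss, token_num = label_smoothed_nll_loss(
    lprobs, target, epsilon=0.1, ignore_index=0)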
class LabelSmoothedCrossEntropyCriterion(nn.Layer):
    def __init__(self, label_smoothing, padding_idx=0):
        super().__init__()
        self.eps = label_smoothing
        self.padding_idx = padding_idx
    def forward(self, predict, label, reduce=True):
        return self.compute_loss(predict, label, reduce=reduce)
    def get_lprobs_and_target(self, predict, label):
        lprobs = paddle.nn.functional.log_softmax(predict, axis=-1)
        return lprobs.reshape([-1, lprobs.shape[-1]]), label.reshape([-1])

    def compute_loss(self, predict, label, reduce=True):
        lprobs, label = self.get_lprobs_and_target(predict, label)
        return label_smoothed_nll_loss(lprobs, label, self.eps, ignore_index=self.padding_idx, reduce=reduce)
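`LabelSmoothedCrossEntropyCriterion` has no docstring example; a minimal
sketch mirroring the `CrossEntropyCriterion` example above (all sizes
illustrative, assuming the class is importable like the others):

import paddle

criterion = LabelSmoothedCrossEntropyCriterion(label_smoothing=0.1, padding_idx=0)
batch_size, seq_len, vocab_size = 2, 6, 30000
predict = paddle.rand([batch_size, seq_len, vocab_size])
label = paddle.randint(low=3, high=vocab_size, shape=[batch_size, seq_len, 1])
sum_cost, avg_cost, token_num = criterion(predict, label)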
class TransformerDecodeCell(nn.Layer):
    """
    This layer wraps a Transformer decoder combined with an embedding layer
    and an output layer to produce logits from ids and positions.

    Args:
        decoder (callable):
            Can be a `paddle.nn.TransformerDecoder` instance, or a wrapper
            that includes an embedding layer accepting ids and positions and
            an output layer transforming decoder output to logits.
        word_embedding (callable, optional):
            Can be a `WordEmbedding` instance or a callable that accepts ids
            as argument and returns embeddings. It can be None if `decoder`
            includes an embedding layer. Defaults to None.
        pos_embedding (callable, optional):
            Can be a `PositionalEmbedding` instance or a callable that
            accepts positions as argument and returns embeddings. It can be
            None if `decoder` includes a positional embedding layer.
            Defaults to None.
        linear (callable, optional):
            Can be a `paddle.nn.Linear` instance or a callable to transform
            decoder output to logits. Defaults to None.
        dropout (float, optional):
            The dropout rate for the results of `word_embedding` and
            `pos_embedding`. Defaults to 0.1.
    """

    def __init__(self, decoder, word_embedding=None, pos_embedding=None, linear=None, dropout=0.1):
        super(TransformerDecodeCell, self).__init__()
        self.decoder = decoder
        self.word_embedding = word_embedding
        self.pos_embedding = pos_embedding
        self.linear = linear
        self.dropout = dropout
    def forward(self, inputs, states, static_cache, trg_src_attn_bias, memory, **kwargs):
        r"""
        Produces logits.

        Args:
            inputs (Tensor|tuple|list):
                A tuple/list including target ids and positions. If
                `word_embedding` is None, it should be a Tensor which is the
                input of the decoder.
            states (list):
                A list whose every element is an instance of
                `paddle.nn.MultiHeadAttention.Cache` for the corresponding
                decoder layer. It can be produced by
                `paddle.nn.TransformerDecoder.gen_cache`.
            static_cache (list):
                A list whose every element is an instance of
                `paddle.nn.MultiHeadAttention.StaticCache` for the
                corresponding decoder layer. It can be produced by
                `paddle.nn.TransformerDecoder.gen_cache`.
            trg_src_attn_bias (Tensor):
                A tensor used in self attention to prevent attention to some
                unwanted positions, usually the subsequent positions. It is a
                tensor with shape broadcasted to
                `[batch_size, n_head, target_length, target_length]`, where
                the unwanted positions have `-INF` values and the others have
                0 values. The data type should be float32 or float64. It can
                be None if nothing is to be prevented from being attended to.
            memory (Tensor):
                The output of the Transformer encoder. It is a tensor with
                shape `[batch_size, source_length, d_model]` and its data
                type can be float32 or float64.

        Returns:
            tuple:
                A tuple with items: `(outputs, new_states)`.

                With the corresponding fields:

                - `outputs` (Tensor):
                    A float32 or float64 3D tensor representing logits shaped
                    `[batch_size, sequence_length, vocab_size]`.
                - `new_states` (list):
                    It has the same structure and data type as `states`,
                    while its length is one larger since the intermediate
                    results of the current step are concatenated.

        Example:
            .. code-block::

                import paddle
                from paddlenlp.transformers import TransformerDecodeCell
                from paddlenlp.transformers import TransformerBeamSearchDecoder

                def decoder():
                    # do decoder
                    pass

                cell = TransformerDecodeCell(decoder())

                decode = TransformerBeamSearchDecoder(
                    cell, start_token=0, end_token=1, beam_size=4,
                    var_dim_in_state=2)
        """
        if states and static_cache:
            states = list(zip(states, static_cache))

        if self.word_embedding:
            if not isinstance(inputs, (list, tuple)):
                inputs = (inputs,)

            word_emb = self.word_embedding(inputs[0])
            pos_emb = self.pos_embedding(inputs[1])
            word_emb = word_emb + pos_emb
            inputs = F.dropout(word_emb, p=self.dropout, training=False) if self.dropout else word_emb

            cell_outputs, new_states = self.decoder(inputs, memory, None, trg_src_attn_bias, states)
        else:
            cell_outputs, new_states = self.decoder(inputs, memory, None, trg_src_attn_bias, states)

        if self.linear:
            cell_outputs = self.linear(cell_outputs)

        new_states = [cache[0] for cache in new_states]

        return cell_outputs, new_states
class TransformerBeamSearchDecoder(nn.decode.BeamSearchDecoder):
    """
    This layer is a subclass of `BeamSearchDecoder` to make beam search adapt
    to the Transformer decoder.

    Args:
        cell (`TransformerDecodeCell`):
            An instance of `TransformerDecodeCell`.
        start_token (int):
            The start token id.
        end_token (int):
            The end token id.
        beam_size (int):
            The beam width used in beam search.
        var_dim_in_state (int):
            Indicates which dimension of states is variant.
    """

    def __init__(self, cell, start_token, end_token, beam_size, var_dim_in_state):
        super(TransformerBeamSearchDecoder, self).__init__(cell, start_token, end_token, beam_size)
        self.cell = cell
        self.var_dim_in_state = var_dim_in_state

    def _merge_batch_beams_with_var_dim(self, c):
        # The initial length of the cache is 0, and it increases as decoding
        # carries on, thus it needs to be reshaped elaborately.
        var_dim_in_state = self.var_dim_in_state + 1  # count in beam dim
        c = paddle.transpose(c, list(range(var_dim_in_state, len(c.shape))) + list(range(0, var_dim_in_state)))
        c = paddle.reshape(
            c,
            [0] * (len(c.shape) - var_dim_in_state)
            + [self.batch_size * self.beam_size]
            + [int(size) for size in c.shape[-var_dim_in_state + 2 :]],
        )
        c = paddle.transpose(
            c,
            list(range((len(c.shape) + 1 - var_dim_in_state), len(c.shape)))
            + list(range(0, (len(c.shape) + 1 - var_dim_in_state))),
        )
        return c

    def _split_batch_beams_with_var_dim(self, c):
        var_dim_size = paddle.shape(c)[self.var_dim_in_state]
        c = paddle.reshape(
            c,
            [-1, self.beam_size]
            + [int(size) for size in c.shape[1 : self.var_dim_in_state]]
            + [var_dim_size]
            + [int(size) for size in c.shape[self.var_dim_in_state + 1 :]],
        )
        return c
    @staticmethod
    def tile_beam_merge_with_batch(t, beam_size):
        r"""
        Tiles the batch dimension of a tensor. Specifically, this function
        takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of
        minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have
        a shape `[batch_size * beam_size, s0, s1, ...]` composed of minibatch
        entries `t[0], t[0], ..., t[1], t[1], ...`, where each minibatch
        entry is repeated `beam_size` times.

        Args:
            t (list|tuple):
                A nested structure of tensors with shape `[batch_size, ...]`.
            beam_size (int):
                The beam width used in beam search.

        Returns:
            Tensor:
                A tensor with shape `[batch_size * beam_size, ...]`, whose
                data type is the same as `t`.

        Example:
            .. code-block::

                import paddle
                from paddlenlp.transformers import TransformerBeamSearchDecoder

                t = paddle.rand(shape=[10, 10])
                TransformerBeamSearchDecoder.tile_beam_merge_with_batch(t, beam_size=4)
        """
        return map_structure(lambda x: nn.decode.BeamSearchDecoder.tile_beam_merge_with_batch(x, beam_size), t)
    def step(self, time, inputs, states, **kwargs):
        r"""
        Performs a beam search decoding step, which uses the cell to get
        probabilities, and follows a beam search step to calculate scores and
        select candidate token ids.

        Args:
            time (Tensor):
                An `int64` tensor with shape `[1]` provided by the caller,
                representing the current time step number of decoding.
            inputs (Tensor):
                A tensor variable. It is the same as `initial_inputs`
                returned by `initialize()` for the first decoding step and
                `next_inputs` returned by `step()` for the others.
            states (Tensor):
                A structure of tensor variables. It is the same as the
                `initial_cell_states` returned by `initialize()` for the
                first decoding step and `next_states` returned by `step()`
                for the others.
            kwargs (dict, optional):
                Additional keyword arguments, provided by the caller
                `dynamic_decode`.

        Returns:
            tuple:
                Returns a tuple (`beam_search_output, beam_search_state,
                next_inputs, finished`). `beam_search_state` and
                `next_inputs` have the same structure, shape and data type as
                the input arguments `states` and `inputs` separately.
                `beam_search_output` is a namedtuple (including `scores`,
                `predicted_ids` and `parent_ids` as fields) of tensor
                variables, where `scores`, `predicted_ids` and `parent_ids`
                each has a tensor value shaped `[batch_size, beam_size]` with
                data types float32, int64 and int64 respectively. `finished`
                is a bool tensor with shape `[batch_size, beam_size]`.
        """
        # Steps for decoding.
        # Compared to RNN, Transformer has 3D data at every decoding step.
        inputs = paddle.reshape(inputs, [-1, 1])  # token
        pos = paddle.ones_like(inputs) * time  # pos

        cell_states = map_structure(self._merge_batch_beams_with_var_dim, states.cell_states)

        cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, **kwargs)

        # Squeeze to adapt to BeamSearchDecoder, which uses 2D logits.
        cell_outputs = map_structure(lambda x: paddle.squeeze(x, [1]) if len(x.shape) == 3 else x, cell_outputs)
        cell_outputs = map_structure(self._split_batch_beams, cell_outputs)
        next_cell_states = map_structure(self._split_batch_beams_with_var_dim, next_cell_states)

        beam_search_output, beam_search_state = self._beam_search_step(
            time=time, logits=cell_outputs, next_cell_states=next_cell_states, beam_state=states
        )

        if kwargs.get("trg_word", None) is not None:
            if paddle.in_dynamic_mode():
                if paddle.shape(kwargs.get("trg_word"))[1] > time:
                    beam_search_output, beam_search_state = self.force_decoding(
                        beam_search_output, beam_search_state, kwargs.get("trg_word"), kwargs.get("trg_length"), time
                    )
            else:

                def condition(trg_word, time):
                    return paddle.shape(trg_word)[1] > time

                def default_fn(beam_search_output, beam_search_state):
                    return beam_search_output, beam_search_state

                from functools import partial

                beam_search_output, beam_search_state = paddle.static.nn.case(
                    [
                        (
                            condition(kwargs.get("trg_word"), time),
                            partial(
                                self.force_decoding,
                                beam_search_output=beam_search_output,
                                beam_search_state=beam_search_state,
                                trg_word=kwargs.get("trg_word"),
                                trg_length=kwargs.get("trg_length"),
                                time=time,
                            ),
                        )
                    ],
                    default=partial(
                        default_fn, beam_search_output=beam_search_output, beam_search_state=beam_search_state
                    ),
                )

        next_inputs, finished = (beam_search_output.predicted_ids, beam_search_state.finished)

        return (beam_search_output, beam_search_state, next_inputs, finished)
    def force_decoding(self, beam_search_output, beam_search_state, trg_word, trg_length, time):
        batch_size = paddle.shape(beam_search_output.predicted_ids)[0]
        beam_size = paddle.shape(beam_search_output.predicted_ids)[1]

        ids_dtype = beam_search_output.predicted_ids.dtype
        scores_dtype = beam_search_output.scores.dtype
        parent_ids = paddle.zeros(shape=[batch_size, 1], dtype=ids_dtype)
        scores = paddle.ones(shape=[batch_size, beam_size], dtype=scores_dtype) * -1e4
        scores = paddle.scatter(
            scores.flatten(),
            paddle.arange(0, batch_size * beam_size, step=beam_size, dtype="int64"),
            paddle.zeros([batch_size]),
        ).reshape([batch_size, beam_size])

        force_position = paddle.unsqueeze(trg_length > time, [1])
        # NOTE: When the data type of the input of paddle.tile is bool and
        # static mode is enabled, its stop_gradient must be True.
        force_position.stop_gradient = True
        force_position = paddle.tile(force_position, [1, beam_size])

        crt_trg_word = paddle.slice(trg_word, axes=[1], starts=[time], ends=[time + 1])
        crt_trg_word = paddle.tile(crt_trg_word, [1, beam_size])

        predicted_ids = paddle.where(force_position, crt_trg_word, beam_search_output.predicted_ids)
        scores = paddle.where(force_position, scores, beam_search_output.scores)
        parent_ids = paddle.where(force_position, parent_ids, beam_search_output.parent_ids)

        cell_states = beam_search_state.cell_states
        log_probs = paddle.where(force_position, scores, beam_search_state.log_probs)
        finished = beam_search_state.finished
        lengths = beam_search_state.lengths

        return self.OutputWrapper(scores, predicted_ids, parent_ids), self.StateWrapper(
            cell_states, log_probs, finished, lengths
        )
class TransformerModel(nn.Layer):
    """
    The Transformer model.

    This model is a Paddle `paddle.nn.Layer <https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/nn/Layer_cn.html>`__
    subclass. Use it as a regular Paddle Layer and refer to the Paddle
    documentation for all matters related to general usage and behavior.

    Args:
        src_vocab_size (int):
            The size of the source vocabulary.
        trg_vocab_size (int):
            The size of the target vocabulary.
        max_length (int):
            The maximum length of input sequences.
        num_encoder_layers (int):
            The number of sub-layers to be stacked in the encoder.
        num_decoder_layers (int):
            The number of sub-layers to be stacked in the decoder.
        n_head (int):
            The number of heads used in multi-head attention.
        d_model (int):
            The dimension of word embeddings, which is also the last
            dimension of the input and output of multi-head attention,
            position-wise feed-forward networks, encoder and decoder.
        d_inner_hid (int):
            Size of the hidden layer in position-wise feed-forward networks.
        dropout (float):
            Dropout rate. Used for pre-processing, activations and inside
            attention.
        weight_sharing (bool):
            Whether to use weight sharing.
        attn_dropout (float):
            The dropout probability used in MHA to drop some attention
            targets. If None, the value of `dropout` is used.
            Defaults to None.
        act_dropout (float):
            The dropout probability used after FFN activation. If None, the
            value of `dropout` is used. Defaults to None.
        bos_id (int, optional):
            The start token id, which is also used as the padding id.
            Defaults to 0.
        eos_id (int, optional):
            The end token id. Defaults to 1.
        pad_id (int, optional):
            The pad token id. Defaults to None. If None, `bos_id` is used as
            the pad id.
        activation (str, optional):
            The activation used in FFN. Defaults to "relu".
        normalize_before (bool, optional):
            Whether to apply pre-normalization. Defaults to True.
    """

    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        max_length,
        num_encoder_layers,
        num_decoder_layers,
        n_head,
        d_model,
        d_inner_hid,
        dropout,
        weight_sharing,
        attn_dropout=None,
        act_dropout=None,
        bos_id=0,
        eos_id=1,
        pad_id=None,
        activation="relu",
        normalize_before=True,
    ):
        super(TransformerModel, self).__init__()
        self.trg_vocab_size = trg_vocab_size
        self.emb_dim = d_model
        self.bos_id = bos_id
        self.eos_id = eos_id
        self.pad_id = pad_id if pad_id is not None else self.bos_id
        self.dropout = dropout

        self.src_word_embedding = WordEmbedding(vocab_size=src_vocab_size, emb_dim=d_model, bos_id=self.pad_id)
        self.src_pos_embedding = PositionalEmbedding(emb_dim=d_model, max_length=max_length)
        if weight_sharing:
            assert (
                src_vocab_size == trg_vocab_size
            ), "Vocabularies in source and target should be same for weight sharing."
            self.trg_word_embedding = self.src_word_embedding
            self.trg_pos_embedding = self.src_pos_embedding
        else:
            self.trg_word_embedding = WordEmbedding(vocab_size=trg_vocab_size, emb_dim=d_model, bos_id=self.pad_id)
            self.trg_pos_embedding = PositionalEmbedding(emb_dim=d_model, max_length=max_length)

        if not normalize_before:
            encoder_layer = TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_head,
                dim_feedforward=d_inner_hid,
                dropout=dropout,
                activation=activation,
                attn_dropout=attn_dropout,
                act_dropout=act_dropout,
                normalize_before=normalize_before,
            )
            encoder_with_post_norm = TransformerEncoder(encoder_layer, num_encoder_layers)

            decoder_layer = TransformerDecoderLayer(
                d_model=d_model,
                nhead=n_head,
                dim_feedforward=d_inner_hid,
                dropout=dropout,
                activation=activation,
                attn_dropout=attn_dropout,
                act_dropout=act_dropout,
                normalize_before=normalize_before,
            )
            decoder_with_post_norm = TransformerDecoder(decoder_layer, num_decoder_layers)

        self.transformer = paddle.nn.Transformer(
            d_model=d_model,
            nhead=n_head,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=d_inner_hid,
            dropout=dropout,
            attn_dropout=attn_dropout,
            act_dropout=act_dropout,
            activation=activation,
            normalize_before=normalize_before,
            custom_encoder=None if normalize_before else encoder_with_post_norm,
            custom_decoder=None if normalize_before else decoder_with_post_norm,
        )

        if weight_sharing:
            self.linear = lambda x: paddle.matmul(
                x=x, y=self.trg_word_embedding.word_embedding.weight, transpose_y=True
            )
        else:
            self.linear = nn.Linear(in_features=d_model, out_features=trg_vocab_size, bias_attr=False)
    def forward(self, src_word, trg_word):
        r"""
        The Transformer forward method. The inputs are source and target
        sequences, and it returns logits.

        Args:
            src_word (Tensor):
                The ids of source sequence words. It is a tensor with shape
                `[batch_size, source_sequence_length]` and its data type can
                be int or int64.
            trg_word (Tensor):
                The ids of target sequence words. It is a tensor with shape
                `[batch_size, target_sequence_length]` and its data type can
                be int or int64.

        Returns:
            Tensor:
                Output tensor of the final layer of the model whose data type
                can be float32 or float64 with shape
                `[batch_size, sequence_length, vocab_size]`.

        Example:
            .. code-block::

                import paddle
                from paddlenlp.transformers import TransformerModel

                transformer = TransformerModel(
                    src_vocab_size=30000,
                    trg_vocab_size=30000,
                    max_length=257,
                    num_encoder_layers=6,
                    num_decoder_layers=6,
                    n_head=8,
                    d_model=512,
                    d_inner_hid=2048,
                    dropout=0.1,
                    weight_sharing=True,
                    bos_id=0,
                    eos_id=1)

                batch_size = 5
                seq_len = 10
                predict = transformer(
                    src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]),
                    trg_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]))
        """
        src_max_len = paddle.shape(src_word)[-1]
        trg_max_len = paddle.shape(trg_word)[-1]
        src_slf_attn_bias = (
            paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4
        )
        src_slf_attn_bias.stop_gradient = True
        trg_slf_attn_bias = self.transformer.generate_square_subsequent_mask(trg_max_len)
        trg_slf_attn_bias.stop_gradient = True
        trg_src_attn_bias = src_slf_attn_bias
        src_pos = paddle.cast(src_word != self.pad_id, dtype=src_word.dtype) * paddle.arange(
            start=0, end=src_max_len, dtype=src_word.dtype
        )
        trg_pos = paddle.cast(trg_word != self.pad_id, dtype=trg_word.dtype) * paddle.arange(
            start=0, end=trg_max_len, dtype=trg_word.dtype
        )

        with paddle.static.amp.fp16_guard():
            src_emb = self.src_word_embedding(src_word)
            src_pos_emb = self.src_pos_embedding(src_pos)
            src_emb = src_emb + src_pos_emb
            enc_input = F.dropout(src_emb, p=self.dropout, training=self.training) if self.dropout else src_emb

            trg_emb = self.trg_word_embedding(trg_word)
            trg_pos_emb = self.trg_pos_embedding(trg_pos)
            trg_emb = trg_emb + trg_pos_emb
            dec_input = F.dropout(trg_emb, p=self.dropout, training=self.training) if self.dropout else trg_emb

            dec_output = self.transformer(
                enc_input,
                dec_input,
                src_mask=src_slf_attn_bias,
                tgt_mask=trg_slf_attn_bias,
                memory_mask=trg_src_attn_bias,
            )

            predict = self.linear(dec_output)

        return predict
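A hedged end-to-end training sketch (not part of the module) wiring
`TransformerModel` to `CrossEntropyCriterion` for a single optimization step;
the hyperparameters and the shift-by-one teacher forcing convention below are
illustrative assumptions:

import paddle
from paddlenlp.transformers import CrossEntropyCriterion, TransformerModel

model = TransformerModel(
    src_vocab_size=30000, trg_vocab_size=30000, max_length=257,
    num_encoder_layers=6, num_decoder_layers=6, n_head=8, d_model=512,
    d_inner_hid=2048, dropout=0.1, weight_sharing=True, bos_id=0, eos_id=1)
criterion = CrossEntropyCriterion(label_smooth_eps=0.1, pad_idx=0)
optimizer = paddle.optimizer.Adam(parameters=model.parameters())

src = paddle.randint(low=3, high=30000, shape=[5, 10])
trg = paddle.randint(low=3, high=30000, shape=[5, 10])
# Teacher forcing: feed trg[:, :-1], predict trg shifted by one position.
logits = model(src_word=src, trg_word=trg[:, :-1])
sum_cost, avg_cost, token_num = criterion(logits, trg[:, 1:].unsqueeze(-1))
avg_cost.backward()
optimizer.step()
optimizer.clear_grad()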
class InferTransformerModel(TransformerModel):
    """
    The Transformer model for auto-regressive generation.

    Args:
        src_vocab_size (int):
            The size of the source vocabulary.
        trg_vocab_size (int):
            The size of the target vocabulary.
        max_length (int):
            The maximum length of input sequences.
        num_encoder_layers (int):
            The number of sub-layers to be stacked in the encoder.
        num_decoder_layers (int):
            The number of sub-layers to be stacked in the decoder.
        n_head (int):
            The number of heads used in multi-head attention.
        d_model (int):
            The dimension of word embeddings, which is also the last
            dimension of the input and output of multi-head attention,
            position-wise feed-forward networks, encoder and decoder.
        d_inner_hid (int):
            Size of the hidden layer in position-wise feed-forward networks.
        dropout (float):
            Dropout rate. Used for pre-processing, activations and inside
            attention.
        weight_sharing (bool):
            Whether to use weight sharing.
        attn_dropout (float):
            The dropout probability used in MHA to drop some attention
            targets. If None, the value of `dropout` is used.
            Defaults to None.
        act_dropout (float):
            The dropout probability used after FFN activation. If None, the
            value of `dropout` is used. Defaults to None.
        bos_id (int, optional):
            The start token id, which is also used as the padding id.
            Defaults to 0.
        eos_id (int, optional):
            The end token id. Defaults to 1.
        pad_id (int, optional):
            The pad token id. Defaults to None. If None, `bos_id` is used as
            the pad id.
        beam_size (int, optional):
            The beam width for beam search. Defaults to 4.
        max_out_len (int, optional):
            The maximum output length. Defaults to 256.
        output_time_major (bool, optional):
            Indicates the data layout of the predicted Tensor. If `False`,
            the data layout would be batch major with shape
            `[batch_size, seq_len, beam_size]`. If `True`, the data layout
            would be time major with shape
            `[seq_len, batch_size, beam_size]`. Defaults to `False`.
        beam_search_version (str):
            Specifies the beam search version. It should be one of
            [`v1`, `v2`]. If `v2`, `alpha` (defaults to 0.6) needs to be set
            for the length penalty. Defaults to `v1`.
        activation (str, optional):
            The activation used in FFN. Defaults to "relu".
        normalize_before (bool, optional):
            Whether to apply pre-normalization. Defaults to True.
        kwargs:
            The key word arguments can be `rel_len` and `alpha`:

            - `rel_len (bool, optional)`: Indicates whether `max_out_len` is
              the length relative to that of the source text. Only works in
              `v2` temporarily. It is suggested to set a small `max_out_len`
              and use `rel_len=True`. Defaults to False if not set.
            - `alpha (float, optional)`: The power number in the length
              penalty calculation. Refer to
              `GNMT <https://arxiv.org/pdf/1609.08144.pdf>`_. Only works in
              `v2` temporarily. Defaults to 0.6 if not set.
    """

    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        max_length,
        num_encoder_layers,
        num_decoder_layers,
        n_head,
        d_model,
        d_inner_hid,
        dropout,
        weight_sharing,
        attn_dropout=None,
        act_dropout=None,
        bos_id=0,
        eos_id=1,
        pad_id=None,
        beam_size=4,
        max_out_len=256,
        output_time_major=False,
        beam_search_version="v1",
        activation="relu",
        normalize_before=True,
        **kwargs
    ):
        args = dict(locals())
        args.pop("self")
        args.pop("__class__", None)
        self.beam_size = args.pop("beam_size")
        self.max_out_len = args.pop("max_out_len")
        self.output_time_major = args.pop("output_time_major")
        self.dropout = dropout
        self.beam_search_version = args.pop("beam_search_version")
        kwargs = args.pop("kwargs")
        if self.beam_search_version == "v2":
            self.alpha = kwargs.get("alpha", 0.6)
            self.rel_len = kwargs.get("rel_len", False)
        super(InferTransformerModel, self).__init__(**args)

        cell = TransformerDecodeCell(
            self.transformer.decoder, self.trg_word_embedding, self.trg_pos_embedding, self.linear, self.dropout
        )

        self.decode = TransformerBeamSearchDecoder(cell, bos_id, eos_id, beam_size, var_dim_in_state=2)
    def forward(self, src_word, trg_word=None):
        r"""
        The Transformer forward method.

        Args:
            src_word (Tensor):
                The ids of source sequence words. It is a tensor with shape
                `[batch_size, source_sequence_length]` and its data type can
                be int or int64.
            trg_word (Tensor):
                The ids of target sequence words. Normally, it should NOT be
                given. If it is given, force decoding with the given previous
                output tokens will be triggered. Defaults to None.

        Returns:
            Tensor:
                An int64 tensor containing the predicted ids. Its shape is
                `[batch_size, seq_len, beam_size]` or
                `[seq_len, batch_size, beam_size]` according to
                `output_time_major`.

        Example:
            .. code-block::

                import paddle
                from paddlenlp.transformers import InferTransformerModel

                transformer = InferTransformerModel(
                    src_vocab_size=30000,
                    trg_vocab_size=30000,
                    max_length=256,
                    num_encoder_layers=6,
                    num_decoder_layers=6,
                    n_head=8,
                    d_model=512,
                    d_inner_hid=2048,
                    dropout=0.1,
                    weight_sharing=True,
                    bos_id=0,
                    eos_id=1,
                    beam_size=4,
                    max_out_len=256)

                batch_size = 5
                seq_len = 10
                transformer(
                    src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len]))
        """
        if trg_word is not None:
            trg_length = paddle.sum(paddle.cast(trg_word != self.pad_id, dtype="int32"), axis=-1)
        else:
            trg_length = None

        if self.beam_search_version == "v1":
            src_max_len = paddle.shape(src_word)[-1]
            src_slf_attn_bias = (
                paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4
            )
            trg_src_attn_bias = src_slf_attn_bias
            src_pos = paddle.cast(src_word != self.pad_id, dtype=src_word.dtype) * paddle.arange(
                start=0, end=src_max_len, dtype=src_word.dtype
            )

            # Run encoder
            src_emb = self.src_word_embedding(src_word)
            src_pos_emb = self.src_pos_embedding(src_pos)
            src_emb = src_emb + src_pos_emb
            enc_input = F.dropout(src_emb, p=self.dropout, training=False) if self.dropout else src_emb
            enc_output = self.transformer.encoder(enc_input, src_slf_attn_bias)

            # Init states (caches) for transformer, need to be updated according to selected beam
            incremental_cache, static_cache = self.transformer.decoder.gen_cache(enc_output, do_zip=True)

            static_cache, enc_output, trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
                (static_cache, enc_output, trg_src_attn_bias), self.beam_size
            )

            rs, _ = nn.decode.dynamic_decode(
                decoder=self.decode,
                inits=incremental_cache,
                max_step_num=self.max_out_len,
                memory=enc_output,
                trg_src_attn_bias=trg_src_attn_bias,
                static_cache=static_cache,
                is_test=True,
                output_time_major=self.output_time_major,
                trg_word=trg_word,
                trg_length=trg_length,
            )

            return rs

        elif self.beam_search_version == "v2":
            finished_seq, finished_scores = self.beam_search_v2(
                src_word, self.beam_size, self.max_out_len, self.alpha, trg_word, trg_length
            )
            if self.output_time_major:
                finished_seq = finished_seq.transpose([2, 0, 1])
            else:
                finished_seq = finished_seq.transpose([0, 2, 1])
            return finished_seq
    def beam_search_v2(self, src_word, beam_size=4, max_len=None, alpha=0.6, trg_word=None, trg_length=None):
        """
        Beam search with the alive and finished two queues, each with a
        capacity of beam size. It includes `grow_topk`, `grow_alive` and
        `grow_finish` as steps.

        1. `grow_topk` selects the top `2*beam_size` candidates to avoid all
           getting EOS.
        2. `grow_alive` selects the top `beam_size` non-EOS candidates as the
           inputs of the next decoding step.
        3. `grow_finish` compares the already finished candidates in the
           finished queue with the newly added finished candidates from
           `grow_topk`, and selects the top `beam_size` finished candidates.
        """

        def expand_to_beam_size(tensor, beam_size):
            tensor = paddle.unsqueeze(tensor, axis=1)
            tile_dims = [1] * len(tensor.shape)
            tile_dims[1] = beam_size
            return paddle.tile(tensor, tile_dims)

        def merge_beam_dim(tensor):
            shape = tensor.shape
            return paddle.reshape(tensor, [shape[0] * shape[1]] + list(shape[2:]))

        # Run encoder
        src_max_len = paddle.shape(src_word)[-1]
        src_slf_attn_bias = (
            paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4
        )
        src_slf_attn_bias.stop_gradient = True
        src_pos = paddle.cast(src_word != self.pad_id, dtype=src_word.dtype) * paddle.arange(
            start=0, end=src_max_len, dtype=src_word.dtype
        )
        src_emb = self.src_word_embedding(src_word)
        src_pos_emb = self.src_pos_embedding(src_pos)
        src_emb = src_emb + src_pos_emb
        enc_input = F.dropout(src_emb, p=self.dropout, training=self.training) if self.dropout else src_emb
        enc_output = self.transformer.encoder(enc_input, src_slf_attn_bias)

        # Constants
        inf = float(1.0 * 1e7)
        batch_size = enc_output.shape[0]
        max_len = (
            (enc_output.shape[1] + 20)
            if max_len is None
            else (enc_output.shape[1] + max_len if self.rel_len else max_len)
        )

        # Initialize states of beam search.
        # Init for the alive queue.
        initial_log_probs = paddle.assign(np.array([[0.0] + [-inf] * (beam_size - 1)], dtype="float32"))
        alive_log_probs = paddle.tile(initial_log_probs, [batch_size, 1])
        alive_seq = paddle.tile(
            paddle.cast(paddle.assign(np.array([[[self.bos_id]]])), src_word.dtype), [batch_size, beam_size, 1]
        )

        # Init for the finished queue.
        finished_scores = paddle.assign(np.array([[-inf] * beam_size], dtype="float32"))
        finished_scores = paddle.tile(finished_scores, [batch_size, 1])
        finished_seq = paddle.tile(
            paddle.cast(paddle.assign(np.array([[[self.bos_id]]])), src_word.dtype), [batch_size, beam_size, 1]
        )
        finished_flags = paddle.zeros_like(finished_scores)

        # Initialize inputs and states of the transformer decoder.
        # Init inputs for the decoder, shaped `[batch_size*beam_size, ...]`.
        pre_word = paddle.reshape(alive_seq[:, :, -1], [batch_size * beam_size, 1])
        trg_src_attn_bias = src_slf_attn_bias
        trg_src_attn_bias = merge_beam_dim(expand_to_beam_size(trg_src_attn_bias, beam_size))
        enc_output = merge_beam_dim(expand_to_beam_size(enc_output, beam_size))

        # Init states (caches) for transformer, need to be updated according to selected beam.
        caches = self.transformer.decoder.gen_cache(enc_output, do_zip=False)

        if trg_word is not None:
            scores_dtype = finished_scores.dtype
            scores = paddle.ones(shape=[batch_size, beam_size * 2], dtype=scores_dtype) * -1e4
            scores = paddle.scatter(
                scores.flatten(),
                paddle.arange(0, batch_size * beam_size * 2, step=beam_size * 2, dtype=finished_seq.dtype),
                paddle.zeros([batch_size]),
            )
            scores = paddle.reshape(scores, [batch_size, beam_size * 2])

        def update_states(caches, topk_coordinates, beam_size, batch_size):
            new_caches = []
            for cache in caches:
                k = gather_2d(cache[0].k, topk_coordinates, beam_size, batch_size, need_unmerge=True)
                v = gather_2d(cache[0].v, topk_coordinates, beam_size, batch_size, need_unmerge=True)
                new_caches.append((nn.MultiHeadAttention.Cache(k, v), cache[1]))
            return new_caches

        def get_topk_coordinates(beam_idx, beam_size, batch_size, dtype="int64"):
            batch_pos = paddle.arange(batch_size * beam_size, dtype=dtype) // beam_size
            batch_pos = paddle.reshape(batch_pos, [batch_size, beam_size])
            topk_coordinates = paddle.stack([batch_pos, beam_idx], axis=2)
            return topk_coordinates

        def gather_2d(tensor_nd, topk_coordinates, beam_size, batch_size, need_unmerge=False):
            new_tensor_nd = (
                paddle.reshape(tensor_nd, shape=[batch_size, beam_size] + list(tensor_nd.shape[1:]))
                if need_unmerge
                else tensor_nd
            )
            topk_seq = paddle.gather_nd(new_tensor_nd, topk_coordinates)
            return merge_beam_dim(topk_seq) if need_unmerge else topk_seq

        def early_finish(alive_log_probs, finished_scores, finished_in_finished):
            max_length_penalty = np.power(((5.0 + max_len) / 6.0), alpha)
            lower_bound_alive_scores = alive_log_probs[:, 0] / max_length_penalty
            lowest_score_of_finished_in_finished = paddle.min(finished_scores * finished_in_finished, 1)
            lowest_score_of_finished_in_finished += (1.0 - paddle.max(finished_in_finished, 1)) * -inf
            bound_is_met = paddle.all(
                paddle.greater_than(lowest_score_of_finished_in_finished, lower_bound_alive_scores)
            )
            return bound_is_met

        def grow_topk(i, logits, alive_seq, alive_log_probs, states):
            """
            This function takes the current alive sequences, and grows them
            to topk sequences where k = 2 * beam_size.
            """
            logits = paddle.reshape(logits, [batch_size, beam_size, -1])
            candidate_log_probs = paddle.log(F.softmax(logits, axis=2))
            log_probs = paddle.add(candidate_log_probs, alive_log_probs.unsqueeze(-1))

            # The length penalty is given by ((5 + len(decode)) / 6) ^ -\alpha.
            # Please refer to https://arxiv.org/abs/1609.08144.
            length_penalty = paddle.pow((5.0 + i + 1.0) / 6.0, alpha)
            curr_scores = log_probs / length_penalty
            flat_curr_scores = paddle.reshape(curr_scores, [batch_size, -1])

            topk_scores, topk_ids = paddle.topk(flat_curr_scores, k=beam_size * 2)
            if topk_ids.dtype != alive_seq.dtype:
                topk_ids = paddle.cast(topk_ids, dtype=alive_seq.dtype)

            if trg_word is not None:
                topk_ids, topk_scores = force_decoding_v2(topk_ids, topk_scores, i)

            topk_log_probs = topk_scores * length_penalty

            topk_beam_index = topk_ids // self.trg_vocab_size
            topk_ids = topk_ids % self.trg_vocab_size

            topk_coordinates = get_topk_coordinates(topk_beam_index, beam_size * 2, batch_size, dtype=alive_seq.dtype)
            topk_seq = gather_2d(alive_seq, topk_coordinates, beam_size, batch_size)
            topk_seq = paddle.concat([topk_seq, paddle.reshape(topk_ids, list(topk_ids.shape[:]) + [1])], axis=2)
            states = update_states(states, topk_coordinates, beam_size, batch_size)
            eos = paddle.full(shape=paddle.shape(topk_ids), dtype=alive_seq.dtype, fill_value=self.eos_id)
            topk_finished = paddle.cast(paddle.equal(topk_ids, eos), "float32")

            # topk_seq: [batch_size, 2*beam_size, i+1]
            # topk_log_probs, topk_scores, topk_finished: [batch_size, 2*beam_size]
            return topk_seq, topk_log_probs, topk_scores, topk_finished, states

        def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished, states):
            """
            Given sequences and scores, will gather the top k = beam_size
            sequences.
            """
            curr_scores += curr_finished * -inf
            _, topk_indexes = paddle.topk(curr_scores, k=beam_size)
            if topk_indexes.dtype != curr_seq.dtype:
                topk_indexes = paddle.cast(topk_indexes, dtype=curr_seq.dtype)
            topk_coordinates = get_topk_coordinates(topk_indexes, beam_size, batch_size, dtype=curr_seq.dtype)
            alive_seq = gather_2d(curr_seq, topk_coordinates, beam_size, batch_size)
            alive_log_probs = gather_2d(curr_log_probs, topk_coordinates, beam_size, batch_size)
            states = update_states(states, topk_coordinates, beam_size * 2, batch_size)
            return alive_seq, alive_log_probs, states

        def grow_finished(finished_seq, finished_scores, finished_flags, curr_seq, curr_scores, curr_finished):
            """
            Given sequences and scores, will gather the top k = beam_size
            finished sequences.
            """
            # finished scores
            finished_seq = paddle.concat(
                [
                    finished_seq,
                    paddle.full(shape=[batch_size, beam_size, 1], dtype=finished_seq.dtype, fill_value=self.eos_id),
                ],
                axis=2,
            )
            curr_scores += (1.0 - curr_finished) * -inf
            curr_finished_seq = paddle.concat([finished_seq, curr_seq], axis=1)
            curr_finished_scores = paddle.concat([finished_scores, curr_scores], axis=1)
            curr_finished_flags = paddle.concat([finished_flags, curr_finished], axis=1)
            _, topk_indexes = paddle.topk(curr_finished_scores, k=beam_size)
            if topk_indexes.dtype != curr_seq.dtype:
                topk_indexes = paddle.cast(topk_indexes, dtype=curr_seq.dtype)
            topk_coordinates = get_topk_coordinates(topk_indexes, beam_size, batch_size, dtype=curr_seq.dtype)
            finished_seq = gather_2d(curr_finished_seq, topk_coordinates, beam_size, batch_size)
            finished_scores = gather_2d(curr_finished_scores, topk_coordinates, beam_size, batch_size)
            finished_flags = gather_2d(curr_finished_flags, topk_coordinates, beam_size, batch_size)
            return finished_seq, finished_scores, finished_flags

        def force_decoding_v2(topk_ids, topk_scores, time):
            beam_size = topk_ids.shape[1]
            if trg_word.shape[1] > time:
                force_position = paddle.unsqueeze(trg_length > time, [1])
                force_position.stop_gradient = True
                force_position = paddle.tile(force_position, [1, beam_size])

                crt_trg_word = paddle.slice(trg_word, axes=[1], starts=[time], ends=[time + 1])
                crt_trg_word = paddle.tile(crt_trg_word, [1, beam_size])

                topk_ids = paddle.where(force_position, crt_trg_word, topk_ids)
                topk_scores = paddle.where(force_position, scores, topk_scores)

            return topk_ids, topk_scores

        def inner_loop(i, pre_word, alive_seq, alive_log_probs, finished_seq, finished_scores, finished_flags, caches):
            trg_pos = paddle.full(shape=paddle.shape(pre_word), dtype=alive_seq.dtype, fill_value=i)
            trg_emb = self.trg_word_embedding(pre_word)
            trg_pos_emb = self.trg_pos_embedding(trg_pos)
            trg_emb = trg_emb + trg_pos_emb
            dec_input = F.dropout(trg_emb, p=self.dropout, training=self.training) if self.dropout else trg_emb
            logits, caches = self.transformer.decoder(dec_input, enc_output, None, trg_src_attn_bias, caches)
            logits = self.linear(logits)
            topk_seq, topk_log_probs, topk_scores, topk_finished, states = grow_topk(
                i, logits, alive_seq, alive_log_probs, caches
            )
            alive_seq, alive_log_probs, states = grow_alive(
                topk_seq, topk_scores, topk_log_probs, topk_finished, states
            )
            caches = states
            finished_seq, finished_scores, finished_flags = grow_finished(
                finished_seq, finished_scores, finished_flags, topk_seq, topk_scores, topk_finished
            )
            pre_word = paddle.reshape(alive_seq[:, :, -1], [batch_size * beam_size, 1])
            return (i + 1, pre_word, alive_seq, alive_log_probs, finished_seq, finished_scores, finished_flags, caches)

        def is_not_finish(
            i, pre_word, alive_seq, alive_log_probs, finished_seq, finished_scores, finished_flags, caches
        ):
            return paddle.greater_than(i < max_len, early_finish(alive_log_probs, finished_scores, finished_flags))

        (
            _,
            pre_word,
            alive_seq,
            alive_log_probs,
            finished_seq,
            finished_scores,
            finished_flags,
            caches,
        ) = paddle.static.nn.while_loop(
            is_not_finish,
            inner_loop,
            [
                paddle.zeros(shape=[1], dtype="int64"),
                pre_word,
                alive_seq,
                alive_log_probs,
                finished_seq,
                finished_scores,
                finished_flags,
                caches,
            ],
        )

        # (gongenlei) `paddle.where` doesn't support broadcast, so we need to
        # use `paddle.unsqueeze` and `paddle.tile` to make condition.shape the
        # same as X.shape. But when converting dygraph to static graph,
        # `paddle.tile` will raise an error.
        finished_flags = paddle.cast(finished_flags, dtype=finished_seq.dtype)
        neg_finished_flags = 1 - finished_flags
        finished_seq = paddle.multiply(finished_seq, finished_flags.unsqueeze(-1)) + paddle.multiply(
            alive_seq, neg_finished_flags.unsqueeze(-1)
        )
        finished_scores = paddle.multiply(
            finished_scores, paddle.cast(finished_flags, dtype=finished_scores.dtype)
        ) + paddle.multiply(alive_log_probs, paddle.cast(neg_finished_flags, dtype=alive_log_probs.dtype))
        return finished_seq, finished_scores
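As a quick sanity check on the GNMT length penalty used in `grow_topk` above,
a small numeric sketch (the per-token log-probability is a made-up value for
illustration only):

alpha = 0.6
for length in (1, 10, 50):
    lp = ((5.0 + length) / 6.0) ** alpha  # length penalty from grow_topk
    log_prob = -1.2 * length              # assumed per-token log-probability
    # Dividing by lp softens the bias toward short hypotheses that raw
    # summed log-probabilities would otherwise have.
    print(length, round(lp, 3), round(log_prob / lp, 3))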