# Source code for paddlenlp.transformers.bart.modeling

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List, Optional, Tuple, Union

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import Tensor
from paddle.nn import Embedding, Layer, MultiHeadAttention

from ...utils.converter import StateDictNameMapping, init_name_mappings
from ...utils.env import CONFIG_NAME
from ...utils.log import logger
from .. import PretrainedModel, register_base_model
from ..model_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    ModelOutput,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
    convert_encoder_output,
)
from .configuration import (
    BART_PRETRAINED_INIT_CONFIGURATION,
    BART_PRETRAINED_RESOURCE_FILES_MAP,
    BartConfig,
)

# Public names exported by this module on a star-import.
__all__ = [
    "BartModel",
    "BartPretrainedModel",
    "BartEncoder",
    "BartDecoder",
    "BartClassificationHead",
    "BartForSequenceClassification",
    "BartForQuestionAnswering",
    "BartForConditionalGeneration",
]

# Short aliases for the cache types nested in paddle's `MultiHeadAttention`;
# they are used below in the type annotations of the `cache` arguments.
Cache = MultiHeadAttention.Cache
StaticCache = MultiHeadAttention.StaticCache


def shift_tokens_right(input_ids, decoder_start_token_id):
    """
    Build decoder inputs by shifting `input_ids` one position to the right.

    Args:
        input_ids (Tensor):
            Token ids, indexed as `[row, position]`.
        decoder_start_token_id (int):
            Id written into the first position of every row.

    Returns:
        Tensor: A new tensor shaped like `input_ids` whose column 0 holds
        `decoder_start_token_id` and whose remaining columns hold
        `input_ids[:, :-1]`; the last token of every row is dropped.
    """
    shifted = paddle.zeros_like(input_ids)
    # The two slice assignments touch disjoint columns, so their order is irrelevant.
    shifted[:, 0] = decoder_start_token_id
    shifted[:, 1:] = input_ids[:, :-1].clone()
    return shifted


class BartPretrainedModel(PretrainedModel):
    """
    Abstract base class shared by all pretrained Bart models.

    It provides the Bart related `model_config_file`, `pretrained_init_configuration`,
    `pretrained_resource_files_map`, `base_model_prefix` and `config_class` used for
    downloading and loading pretrained models.
    See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details.
    """

    model_config_file = CONFIG_NAME
    pretrained_init_configuration = BART_PRETRAINED_INIT_CONFIGURATION
    pretrained_resource_files_map = BART_PRETRAINED_RESOURCE_FILES_MAP
    base_model_prefix = "bart"
    config_class = BartConfig

    @classmethod
    def _get_name_mappings(cls, config: BartConfig) -> List[StateDictNameMapping]:
        """
        Build the ordered parameter-name mapping table for `config`.

        Every row is `[source_name, target_name]` with an optional third
        element `"transpose"`; rows are handed positionally to
        :class:`StateDictNameMapping` together with their index in the table.
        Target names follow the attribute names of the layers defined in this
        file (`encoder.encoder.layers.N.linear1`, `cross_attn`, `norm1`, ...).

        Args:
            config (BartConfig):
                Supplies `num_encoder_layers`, `num_decoder_layers` and
                `architectures`.

        Returns:
            List[StateDictNameMapping]: One mapping per table row, in table order.
        """

        def plain(src, dst):
            # weight + bias rows without the "transpose" tag (layer norms)
            return [[f"{src}.weight", f"{dst}.weight"], [f"{src}.bias", f"{dst}.bias"]]

        def linear(src, dst):
            # weight + bias rows of a linear layer; only the weight carries "transpose"
            return [[f"{src}.weight", f"{dst}.weight", "transpose"], [f"{src}.bias", f"{dst}.bias"]]

        def attention(src, dst):
            # k/v/q/out projections of one attention module, in that order
            rows = []
            for proj in ("k_proj", "v_proj", "q_proj", "out_proj"):
                rows.extend(linear(f"{src}.{proj}", f"{dst}.{proj}"))
            return rows

        def embeddings(side):
            # positional embedding + embedding layer norm of the encoder or decoder
            rows = [[f"{side}.embed_positions.weight", f"{side}.{side}_embed_positions.weight"]]
            rows.extend(plain(f"{side}.layernorm_embedding", f"{side}.{side}_layernorm_embedding"))
            return rows

        model_mappings = ["shared.weight"]

        encoder_layer_count = config.num_encoder_layers or 0
        decoder_layer_count = config.num_decoder_layers or 0

        if encoder_layer_count:
            model_mappings.extend(embeddings("encoder"))
            for idx in range(encoder_layer_count):
                src = f"encoder.layers.{idx}"
                dst = f"encoder.encoder.layers.{idx}"
                model_mappings.extend(attention(f"{src}.self_attn", f"{dst}.self_attn"))
                model_mappings.extend(linear(f"{src}.fc1", f"{dst}.linear1"))
                model_mappings.extend(linear(f"{src}.fc2", f"{dst}.linear2"))
                model_mappings.extend(plain(f"{src}.self_attn_layer_norm", f"{dst}.norm1"))
                model_mappings.extend(plain(f"{src}.final_layer_norm", f"{dst}.norm2"))

        if decoder_layer_count:
            model_mappings.extend(embeddings("decoder"))
            for idx in range(decoder_layer_count):
                src = f"decoder.layers.{idx}"
                dst = f"decoder.decoder.layers.{idx}"
                model_mappings.extend(attention(f"{src}.self_attn", f"{dst}.self_attn"))
                model_mappings.extend(attention(f"{src}.encoder_attn", f"{dst}.cross_attn"))
                model_mappings.extend(linear(f"{src}.fc1", f"{dst}.linear1"))
                model_mappings.extend(linear(f"{src}.fc2", f"{dst}.linear2"))
                model_mappings.extend(plain(f"{src}.self_attn_layer_norm", f"{dst}.norm1"))
                model_mappings.extend(plain(f"{src}.encoder_attn_layer_norm", f"{dst}.norm2"))
                model_mappings.extend(plain(f"{src}.final_layer_norm", f"{dst}.norm3"))

        init_name_mappings(model_mappings)

        # base-model prefix "BartModel": every other architecture nests the base
        # model, so the rows built so far get a prefix on both sides.
        if "BartModel" not in config.architectures:
            for row in model_mappings:
                row[0] = "model." + row[0]
                row[1] = "bart." + row[1]

        # Task-head rows are appended after prefixing and therefore stay unprefixed.
        head_rows = (
            (
                "BartForQuestionAnswering",
                [
                    ["qa_outputs.weight", "classifier.weight", "transpose"],
                    ["qa_outputs.bias", "classifier.bias"],
                ],
            ),
            (
                "BartForSequenceClassification",
                [
                    ["classification_head.dense.weight", "classifier.dense.weight", "transpose"],
                    ["classification_head.dense.bias", "classifier.dense.bias"],
                    ["classification_head.out_proj.weight", "classifier.out_proj.weight", "transpose"],
                    ["classification_head.out_proj.bias", "classifier.out_proj.bias"],
                ],
            ),
            (
                "BartForConditionalGeneration",
                [
                    ["lm_head.weight", "lm_head_weight"],
                    ["final_logits_bias", "final_logits_bias"],
                ],
            ),
        )
        for architecture, rows in head_rows:
            if architecture in config.architectures:
                model_mappings.extend(rows)

        return [StateDictNameMapping(*row, index=position) for position, row in enumerate(model_mappings)]

    def _init_weights(self, layer):
        """
        Initialization hook.

        Re-draws the weight of every `nn.Linear` / `nn.Embedding` layer from a
        normal distribution with mean 0 and standard deviation
        `self.config.init_std`; other layers are left untouched.
        """
        if not isinstance(layer, (nn.Linear, nn.Embedding)):
            return
        weight = layer.weight
        # In the dygraph mode, use the `set_value` to reset the parameter directly,
        # and reset the `state_dict` to update parameter in static mode.
        if not isinstance(weight, paddle.Tensor):
            return
        weight.set_value(
            paddle.tensor.normal(
                mean=0.0,
                std=self.config.init_std,
                shape=weight.shape,
            )
        )
class BartLearnedPositionalEmbedding(Embedding):
    """
    Learned positional embedding table with a fixed maximum size.
    """

    def __init__(self, num_embeddings, embedding_dim):
        # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
        # and adjust num_embeddings appropriately. Other models dont have this hack
        self.offset = 2
        table_size = num_embeddings + self.offset
        super().__init__(table_size, embedding_dim)

    def forward(self, input_ids_shape: Tuple, past_key_values_length: int = 0) -> Tensor:
        """
        Look up the embeddings of a contiguous run of positions.

        Args:
            input_ids_shape (Tuple):
                Expected to be `[bsz x seqlen]`; only its first two entries are read.
            past_key_values_length (int, optional):
                Index of the first position to embed. Defaults to 0.

        Returns:
            Tensor: Embeddings of positions
            `past_key_values_length .. past_key_values_length + seqlen - 1`,
            each looked up at `position + self.offset`.
        """
        _, seq_len = input_ids_shape[:2]
        first = past_key_values_length
        positions = paddle.arange(first, first + seq_len, dtype="int64")
        shifted_positions = positions + self.offset
        # (gongenlei) For dygraph to static graph
        return Embedding.forward(self, shifted_positions)
class BartEncoder(BartPretrainedModel):
    """
    The Transformer Encoder of BartModel. The arguments of BartEncoder can see :class:`BartModel`.
    """

    def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
        """
        Args:
            config (BartConfig):
                Model configuration.
            embed_tokens (nn.Embedding, optional):
                Token embedding layer to reuse. When `None`, a new
                `nn.Embedding(config.vocab_size, config.d_model)` is created.
        """
        super().__init__(config)
        self.init_std = config.init_std
        self.pad_token_id = config.pad_token_id
        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
        # Token embeddings are multiplied by sqrt(d_model) only when `scale_embedding` is set.
        self.embed_scale = (config.d_model**0.5) if config.scale_embedding else 1.0
        self.encoder_embed_positions = BartLearnedPositionalEmbedding(config.max_position_embeddings, config.d_model)

        self.encoder_dropout = nn.Dropout(config.dropout)
        self.encoder_layernorm_embedding = nn.LayerNorm(config.d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.d_model,
            nhead=config.encoder_attention_heads,
            dim_feedforward=config.encoder_ffn_dim,
            dropout=config.dropout,
            activation=config.activation_function,
            attn_dropout=config.attention_dropout,
            act_dropout=config.activation_dropout,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, config.encoder_layers)

    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tensor, Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        """
        The BartEncoder forward method, overrides the `__call__()` special method.

        Args:
            input_ids (Tensor, optional):
                See :class:`BartModel`. Mutually exclusive with `inputs_embeds`.
            attention_mask (Tensor, optional):
                See :class:`BartModel`. A 2D mask is expanded to
                `[batch_size, 1, 1, sequence_length]` and converted to an additive
                mask (`(1 - mask) * -1e4`). When it is `None` and `input_ids` is
                given, a mask is built from the positions equal to `pad_token_id`.
                When it is `None` and only `inputs_embeds` is given, no mask is applied.
            inputs_embeds (Tensor, optional):
                See :class:`BartModel`. Mutually exclusive with `input_ids`.
            output_attentions (bool, optional):
                See :class:`BartModel`. Falls back to `config.output_attentions`.
            output_hidden_states (bool, optional):
                See :class:`BartModel`. Falls back to `config.output_hidden_states`.
            return_dict (bool, optional):
                See :class:`BartModel`. Falls back to `config.use_return_dict`.
            **kwargs:
                Accepted for call compatibility; not used by this method.

        Returns:
            An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if
            `return_dict=True`. Otherwise it returns a tuple of tensors corresponding
            to ordered and not None (depending on the input arguments) fields of
            :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`.
            Especially, When `return_dict=output_hidden_states=output_attentions=False`,
            returns tensor `encoder_outputs` which is the output at the last layer of the model.
            Its data type should be float32 and has a shape of [batch_size, sequence_length, d_model].

        Raises:
            ValueError: If both or neither of `input_ids` and `inputs_embeds` are given.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Exactly one of `input_ids` / `inputs_embeds` must be provided (same rule as BartDecoder).
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            inputs_shape = paddle.shape(input_ids)
            input_ids = input_ids.reshape((-1, inputs_shape[-1]))
        elif inputs_embeds is not None:
            inputs_shape = paddle.shape(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        inputs_embed_pos = self.encoder_embed_positions(inputs_shape)
        hidden_states = inputs_embeds + inputs_embed_pos
        hidden_states = self.encoder_layernorm_embedding(hidden_states)
        encoder_input = self.encoder_dropout(hidden_states)

        if attention_mask is None:
            # A padding mask can only be derived from token ids; with `inputs_embeds`
            # alone the mask stays `None` and nothing is masked.
            if input_ids is not None:
                attention_mask = (
                    paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2])
                    * -1e4
                )
        # For 2D attention_mask from tokenizer
        elif attention_mask.ndim == 2:
            attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype())
            attention_mask = (1.0 - attention_mask) * -1e4
            attention_mask.stop_gradient = True

        encoder_output = self.encoder(
            encoder_input,
            src_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return encoder_output
class BartDecoder(BartPretrainedModel):
    """
    The Transformer Decoder of BartModel. The arguments of BartDecoder can see :class:`BartModel`.
    """

    def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
        """
        Args:
            config (BartConfig):
                Model configuration.
            embed_tokens (nn.Embedding, optional):
                Token embedding layer to reuse. When `None`, a new
                `nn.Embedding(config.vocab_size, config.d_model)` is created.
        """
        super().__init__(config)
        self.init_std = config.init_std
        if embed_tokens is None:
            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
        else:
            self.embed_tokens = embed_tokens
        # Token embeddings are multiplied by sqrt(d_model) only when `scale_embedding` is set.
        self.embed_scale = (config.d_model**0.5) if config.scale_embedding else 1.0
        self.decoder_embed_positions = BartLearnedPositionalEmbedding(config.max_position_embeddings, config.d_model)
        self.decoder_dropout = nn.Dropout(config.dropout)
        self.decoder_layernorm_embedding = nn.LayerNorm(config.d_model)

        layer_template = nn.TransformerDecoderLayer(
            d_model=config.d_model,
            nhead=config.decoder_attention_heads,
            dim_feedforward=config.decoder_ffn_dim,
            dropout=config.dropout,
            activation=config.activation_function,
            attn_dropout=config.attention_dropout,
            act_dropout=config.activation_dropout,
        )
        self.decoder = nn.TransformerDecoder(layer_template, config.decoder_layers)

    def forward(
        self,
        decoder_input_ids: Optional[Tensor] = None,
        decoder_attention_mask: Optional[Tensor] = None,
        encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None,
        memory_mask: Optional[Tensor] = None,
        decoder_inputs_embeds: Optional[Tensor] = None,
        cache: Optional[List[Tuple[Cache, StaticCache]]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tensor, Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        """
        The BartDecoder forward method, overrides the `__call__()` special method.

        Args:
            decoder_input_ids (Tensor, optional):
                See :class:`BartModel`. Mutually exclusive with `decoder_inputs_embeds`.
            decoder_attention_mask (Tensor, optional):
                See :class:`BartModel`. When `None`, a square mask with `-inf`
                strictly above the diagonal (a causal mask) is built.
            encoder_output (Tensor, optional):
                See :class:`BartModel`. If it is not of the same type as the
                decoder input tensor, its element `[0]` is used as memory.
            memory_mask (Tensor, optional):
                See :class:`BartModel`.
            decoder_inputs_embeds (Tensor, optional):
                See :class:`BartModel`. Mutually exclusive with `decoder_input_ids`.
            cache (Tensor, optional):
                See :class:`BartModel`. When given, the position offset is read
                from axis 2 of `cache[0][0].k`.
            output_attentions (bool, optional):
                See :class:`BartModel`.
            output_hidden_states (bool, optional):
                See :class:`BartModel`.
            return_dict (bool, optional):
                See :class:`BartModel`.

        Returns:
            An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if
            `return_dict=True`. Otherwise it returns a tuple of tensors corresponding
            to ordered and not None (depending on the input arguments) fields of
            :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`.
            Especially, When `return_dict=output_hidden_states=output_attentions=False`,
            returns tensor `decoder_outputs` which is the output at the last layer of the model.
            Its data type should be float32 and has a shape of [batch_size, sequence_length, d_model].

        Raises:
            ValueError: If both or neither of `decoder_input_ids` and
                `decoder_inputs_embeds` are given.
        """
        # retrieve input_ids and inputs_embeds
        has_ids = decoder_input_ids is not None
        has_embeds = decoder_inputs_embeds is not None
        if has_ids and has_embeds:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        if has_ids:
            inputs_shape = paddle.shape(decoder_input_ids)
            decoder_input_ids = decoder_input_ids.reshape((-1, inputs_shape[-1]))
        elif has_embeds:
            inputs_shape = paddle.shape(decoder_inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if decoder_attention_mask is None:
            target_length = inputs_shape[-1]
            all_blocked = paddle.full((target_length, target_length), -np.inf, dtype=paddle.get_default_dtype())
            # Keep -inf only strictly above the diagonal.
            decoder_attention_mask = paddle.tensor.triu(all_blocked, 1)

        if decoder_inputs_embeds is None:
            decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) * self.embed_scale

        # With a cache, positions continue after the tokens that are already cached.
        if cache is not None:
            past_key_values_length = paddle.shape(cache[0][0].k)[2]
        else:
            past_key_values_length = 0

        position_embeds = self.decoder_embed_positions(inputs_shape, past_key_values_length)
        normalized = self.decoder_layernorm_embedding(decoder_inputs_embeds + position_embeds)
        decoder_input = self.decoder_dropout(normalized)

        if isinstance(encoder_output, type(decoder_input)):
            memory = encoder_output
        else:
            memory = encoder_output[0]

        return self.decoder(
            tgt=decoder_input,
            memory=memory,
            tgt_mask=decoder_attention_mask,
            memory_mask=memory_mask,
            cache=cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
@register_base_model
class BartModel(BartPretrainedModel):
    r"""
    The bare Bart Model transformer outputting raw hidden-states.

    This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`.
    Refer to the superclass documentation for the generic methods.

    This model is also a Paddle `paddle.nn.Layer <https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/nn/Layer_cn.html>`__ subclass.
    Use it as a regular Paddle Layer and refer to the Paddle documentation for all matter related to
    general usage and behavior.

    Args:
        config (:class:`BartConfig`):
            An instance of BartConfig used to construct BartModel.
    """

    def __init__(self, config: BartConfig):
        super().__init__(config)
        self.init_std = config.init_std
        self.pad_token_id = config.pad_token_id
        self.decoder_start_token_id = config.decoder_start_token_id
        # One token embedding table shared by the encoder and the decoder.
        self.shared = nn.Embedding(config.vocab_size, config.d_model)
        self.encoder = BartEncoder(config, self.shared)
        self.decoder = BartDecoder(config, self.shared)

    def get_encoder(self):
        """Return the :class:`BartEncoder` sub-module."""
        return self.encoder

    def get_decoder(self):
        """Return the :class:`BartDecoder` sub-module."""
        return self.decoder

    def get_input_embeddings(self):
        """Return the token embedding layer shared by the encoder and the decoder."""
        return self.shared

    def set_input_embeddings(self, value):
        """
        Replace the shared token embedding layer.

        The encoder and the decoder each hold their own reference to the
        embedding layer, so all three references are rebound; updating only
        `self.shared` would leave both sub-modules on the old table.
        """
        self.shared = value
        self.encoder.embed_tokens = value
        self.decoder.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        decoder_input_ids: Optional[Tensor] = None,
        decoder_attention_mask: Optional[Tensor] = None,
        encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None,
        inputs_embeds: Optional[Tensor] = None,
        decoder_inputs_embeds: Optional[Tensor] = None,
        use_cache: Optional[bool] = None,
        cache: Optional[List[Tuple[Cache, StaticCache]]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqModelOutput]:
        r"""
        The BartModel forward method, overrides the `__call__()` special method.

        Args:
            input_ids (Tensor, optional):
                Indices of input sequence tokens in the vocabulary. They are
                numerical representations of tokens that build the input sequence.
                Its data type should be `int64` and it has a shape of [batch_size, sequence_length].
            attention_mask (Tensor, optional):
                Mask used in multi-head attention to avoid performing attention to some unwanted positions,
                usually the paddings or the subsequent positions.
                Its data type can be int, float and bool.
                When the data type is bool, the `masked` tokens have `False` values and the others have `True` values.
                When the data type is int, the `masked` tokens have `0` values and the others have `1` values.
                When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values.
                It is a tensor with shape broadcasted to `[batch_size, encoder_attention_heads, sequence_length, sequence_length]`.
                For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length],
                [batch_size, encoder_attention_heads, sequence_length, sequence_length].
                Defaults to `None`. In that case a padding mask is derived from `input_ids`
                when they are given; otherwise nothing is prevented from being attended to.
            decoder_input_ids (Tensor, optional):
                Indices of decoder input sequence tokens in the vocabulary.
                Its data type should be `int64` and it has a shape of [batch_size, sequence_length].
                Defaults to `None`, which means no `decoder_input_ids` is provided, the model will create the tensor
                by shifting the `input_ids` to the right.
            decoder_attention_mask (Tensor, optional):
                Mask used in multi-head attention to avoid performing attention to some unwanted positions in `decoder_input_ids`.
                Its data type and shape is the same as `attention_mask`. Defaults to `None`.
            encoder_output (tuple, optional):
                The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional).
                The data type of `last_hidden_state` is float32 and its shape is `[batch_size, sequence_length, d_model]`.
                `hidden_states` is hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`.
                For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, d_model`].
                `attentions` is attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`.
                For all element in the tuple, its data type should be float32 and its shape is [`batch_size, num_attention_heads, sequence_length, sequence_length`].
            inputs_embeds (Tensor, optional):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation
                of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over
                how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
                Default to None.
            decoder_inputs_embeds (Tensor, optional):
                Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
                representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `cache` is used,
                optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`).
                This is useful if you want more control over how to convert `decoder_input_ids` indices
                into associated vectors than the model's internal embedding lookup matrix. Default to None.
            use_cache (bool, optional):
                Whether or not to use cache. Defaults to `False`. If set to `True`, key value states will be returned and
                can be used to speed up decoding.
            cache (list, optional):
                It is a list, and each element in the list is a tuple `(incremental_cache, static_cache)`.
                See `TransformerDecoder.gen_cache <https://github.com/PaddlePaddle/Paddle/blob/release/2.1/python/paddle/nn/layer/transformer.py#L1060>`__ for more details.
                It is only used for inference and should be None for training.
                Default to `None`. It is ignored (reset to `None`) unless `use_cache` is truthy.
            output_attentions (bool, optional):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
                tensors for more detail. Defaults to `False`.
            output_hidden_states (bool, optional):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
                more detail. Defaults to `False`.
            return_dict (bool, optional):
                Whether to return a :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput` object.
                If `False`, the output will be a tuple of tensors. When `None`, `config.use_return_dict` is used.

        Returns:
            An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput` if
            `return_dict=True`. Otherwise it returns a tuple made of the decoder outputs
            followed by the encoder outputs.
            Especially, When `return_dict=output_hidden_states=output_attentions=False`,
            the first element is the tensor `decoder_output`, which is the output at the last layer of the model.
            Its data type should be float32 and has a shape of [batch_size, sequence_length, d_model].

        Raises:
            ValueError: If none of `input_ids`, `inputs_embeds` and `encoder_output` is given,
                if the decoder inputs must be derived but `input_ids` is `None`,
                or if `attention_mask` has an unsupported number of dimensions.

        Example:
            .. code-block::

                import paddle
                from paddlenlp.transformers import BartModel, BartTokenizer

                tokenizer = BartTokenizer.from_pretrained('bart-base')
                model = BartModel.from_pretrained('bart-base')

                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
                inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()}
                output = model(**inputs)
        """
        # Resolve the output format once, exactly like BartEncoder does, so that the
        # encoder, the decoder and the packing code below all agree on it.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # different to other models, Bart automatically creates decoder_input_ids from
        # input_ids if no decoder_input_ids are provided
        if input_ids is None and inputs_embeds is None and encoder_output is None:
            raise ValueError("You have to specify either input_ids or encoder_output")

        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are "
                    "passed, `input_ids` cannot be `None`. Please pass either "
                    "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = shift_tokens_right(input_ids, self.decoder_start_token_id)

        if attention_mask is None:
            if input_ids is not None:
                # only generate attention_mask when input_ids is specified
                attention_mask = (
                    paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2])
                    * -1e4
                )
            elif inputs_embeds is not None:
                logger.warning("provided inputs_embeds without attention_mask")
            # Otherwise only `encoder_output` was given: run without any mask.
        # For 2D attention_mask from tokenizer
        elif attention_mask.ndim == 2:
            attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype())
            attention_mask = (1.0 - attention_mask) * -1e4
            attention_mask.stop_gradient = True

        # Tensor type of the current execution mode; used below to tell a bare
        # tensor apart from a tuple / ModelOutput.
        input_type = type(decoder_input_ids) if decoder_input_ids is not None else type(decoder_inputs_embeds)

        if encoder_output is None:
            encoder_output = self.encoder(
                input_ids,
                attention_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_output, ModelOutput):
            if isinstance(encoder_output, input_type):
                encoder_output = (encoder_output,)
            encoder_output = convert_encoder_output(encoder_output)

        if isinstance(encoder_output, input_type):
            encoder_last_hidden_state = encoder_output
        else:
            encoder_last_hidden_state = encoder_output[0]

        if use_cache:
            if cache is None:
                cache = self.decoder.decoder.gen_cache(encoder_last_hidden_state)
        else:
            cache = None

        # Cross-attention mask: keep only the last query row of the encoder mask so
        # that it broadcasts over every decoder position.
        memory_mask = attention_mask
        if attention_mask is not None:
            if attention_mask.ndim == 4:
                memory_mask = attention_mask[:, :, -1:, :]
            elif attention_mask.ndim == 3:
                memory_mask = attention_mask[:, -1:, :].unsqueeze([1])
            elif attention_mask.ndim == 2:
                memory_mask = attention_mask.unsqueeze([1, 2])
            else:
                raise ValueError("Invalid attention mask shape. ")

        decoder_output = self.decoder(
            decoder_input_ids,
            decoder_attention_mask,
            encoder_last_hidden_state,
            memory_mask,
            cache=cache,
            decoder_inputs_embeds=decoder_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            # Normalise bare tensors to 1-tuples so both parts can be concatenated.
            if isinstance(decoder_output, input_type):
                decoder_output = (decoder_output,)
            if isinstance(encoder_output, input_type):
                encoder_output = (encoder_output,)
            return decoder_output + encoder_output

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_output.last_hidden_state,
            past_key_values=decoder_output.past_key_values,
            decoder_hidden_states=decoder_output.hidden_states,
            decoder_attentions=decoder_output.attentions,
            cross_attentions=decoder_output.cross_attentions,
            encoder_last_hidden_state=encoder_output.last_hidden_state,
            encoder_hidden_states=encoder_output.hidden_states,
            encoder_attentions=encoder_output.attentions,
        )
class BartClassificationHead(Layer):
    """
    Head for sentence-level classification tasks:
    dropout -> dense -> tanh -> dropout -> output projection.
    """

    def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
        """
        Args:
            input_dim (int): Size of the incoming hidden states.
            inner_dim (int): Output size of the intermediate dense layer.
            num_classes (int): Output size of the final projection.
            pooler_dropout (float): Dropout probability used before both linear layers.
        """
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, hidden_states: Tensor) -> Tensor:
        """
        Args:
            hidden_states (Tensor):
                Hidden states of the classification model.

        Returns:
            Tensor: Output of `out_proj`, whose last dimension has size `num_classes`.
        """
        inner = self.dense(self.dropout(hidden_states))
        activated = F.tanh(inner)
        # The same dropout layer is applied a second time before the projection.
        return self.out_proj(self.dropout(activated))
class BartForSequenceClassification(BartPretrainedModel):
    r"""
    Bart Model with a linear layer on top of the pooled output,
    designed for sequence classification/regression tasks like GLUE tasks.

    Args:
        config (:class:`BartConfig`):
            An instance of BartConfig used to construct BartForSequenceClassification.
    """

    def __init__(self, config: BartConfig):
        super().__init__(config)
        self.bart = BartModel(config)
        self.num_labels = config.num_labels
        # Fall back to the model-wide dropout when no classifier-specific value is configured.
        self.classifier = BartClassificationHead(
            config.d_model,
            config.d_model,
            config.num_labels,
            config.classifier_dropout if config.classifier_dropout is not None else config.dropout,
        )

    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        decoder_input_ids: Optional[Tensor] = None,
        decoder_attention_mask: Optional[Tensor] = None,
        encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None,
        inputs_embeds: Optional[Tensor] = None,
        decoder_inputs_embeds: Optional[Tensor] = None,
        use_cache: Optional[bool] = None,
        cache: Optional[List[Tuple[Cache, StaticCache]]] = None,
        labels: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tensor, Tuple, Seq2SeqSequenceClassifierOutput]:
        r"""
        The BartForSequenceClassification forward method, overrides the __call__() special method.

        Args:
            input_ids (Tensor, optional):
                See :class:`BartModel`.
            attention_mask (Tensor, optional):
                See :class:`BartModel`.
            decoder_input_ids (Tensor, optional):
                See :class:`BartModel`.
            decoder_attention_mask (Tensor, optional):
                See :class:`BartModel`.
            encoder_output (Tensor, optional):
                See :class:`BartModel`.
            inputs_embeds (Tensor, optional):
                See :class:`BartModel`.
            decoder_inputs_embeds (Tensor, optional):
                See :class:`BartModel`.
            use_cache (bool, optional):
                See :class:`BartModel`. Forced to `False` when `labels` is provided,
                which can save memory during training.
            cache (Tensor, optional):
                See :class:`BartModel`.
            labels (Tensor, optional):
                Labels for computing the sequence classification/regression loss.
                Indices should be in `[0, ..., num_labels - 1]`. If `num_labels > 1`
                a classification loss is computed (Cross-Entropy).
                Default to `None`.
            output_attentions (bool, optional):
                See :class:`BartModel`.
            output_hidden_states (bool, optional):
                See :class:`BartModel`.
            return_dict (bool, optional):
                See :class:`BartModel`.

        Returns:
            An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqSequenceClassifierOutput` if
            `return_dict=True`. Otherwise it returns a tuple of tensors corresponding
            to ordered and not None (depending on the input arguments) fields of
            :class:`~paddlenlp.transformers.model_outputs.Seq2SeqSequenceClassifierOutput`.
            Especially, when `return_dict=output_hidden_states=output_attentions=False` and `labels=None`,
            returns tensor `logits`, a tensor of the input text classification logits.
            Shape as `[batch_size, num_labels]` and dtype as float32.

        Raises:
            ValueError: If the examples in `input_ids` do not all contain the same
                number of `eos_token_id` tokens.

        Example:
            .. code-block::

                import paddle
                from paddlenlp.transformers import BartForSequenceClassification, BartTokenizer

                tokenizer = BartTokenizer.from_pretrained('bart-base')
                model = BartForSequenceClassification.from_pretrained('bart-base')

                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
                inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()}
                logits = model(**inputs)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            # Only warn when the caller actually asked for caching; otherwise the
            # message would be emitted on every training step for no reason.
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            logger.warning(
                f"{self.__class__.__name__} will not detect eos tokens in `inputs_embeds`. Results may be "
                "unexpected if using eos tokens in conjunction with `inputs_embeds.`"
            )

        outputs = self.bart(
            input_ids,
            attention_mask,
            decoder_input_ids,
            decoder_attention_mask,
            encoder_output,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            cache=cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        output = outputs[0]
        output_shape = paddle.shape(output)
        if input_ids is not None:
            # Keep only the hidden states located at <eos> positions.
            eos_mask = paddle.cast(input_ids == self.bart.config["eos_token_id"], dtype="int64")
            if len(paddle.unique(paddle.sum(eos_mask, axis=1))) > 1:
                raise ValueError("All examples must have the same number of <eos> tokens.")

            # TODO(gongenlei): support bool tensor index
            output = output.masked_select(eos_mask.unsqueeze(-1).astype("bool").tile([1, 1, output_shape[-1]]))
        # The last remaining position of each example is used as the sentence representation.
        sentence_representation = output.reshape([output_shape[0], -1, output_shape[-1]])[:, -1, :]
        logits = self.classifier(sentence_representation)

        loss = None
        if labels is not None:
            # Infer the problem type once from the label count and label dtype,
            # and remember it on the config for subsequent calls.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = paddle.nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = paddle.nn.CrossEntropyLoss()
                loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,)))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = paddle.nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            if len(outputs) == 2:
                return (loss, logits) if loss is not None else logits
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )
class BartForQuestionAnswering(BartPretrainedModel):
    r"""
    Bart Model with a linear layer on top of the hidden-states output to
    compute `span_start_logits` and `span_end_logits`, designed for question-answering tasks like SQuAD.

    Args:
        config (:class:`BartConfig`):
            An instance of BartConfig used to construct BartForQuestionAnswering.
    """

    def __init__(self, config: BartConfig):
        super().__init__(config)
        self.bart = BartModel(config)
        # Two outputs per token: one start logit and one end logit.
        self.classifier = nn.Linear(config.d_model, 2)

    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        decoder_input_ids: Optional[Tensor] = None,
        decoder_attention_mask: Optional[Tensor] = None,
        encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None,
        inputs_embeds: Optional[Tensor] = None,
        decoder_inputs_embeds: Optional[Tensor] = None,
        use_cache: Optional[bool] = None,
        cache: Optional[List[Tuple[Cache, StaticCache]]] = None,
        start_positions: Optional[Tensor] = None,
        end_positions: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]:
        r"""
        The BartForQuestionAnswering forward method, overrides the __call__() special method.

        Args:
            input_ids (Tensor, optional):
                See :class:`BartModel`.
            attention_mask (Tensor, optional):
                See :class:`BartModel`.
            decoder_input_ids (Tensor, optional):
                See :class:`BartModel`.
            decoder_attention_mask (Tensor, optional):
                See :class:`BartModel`.
            encoder_output (Tensor, optional):
                See :class:`BartModel`.
            inputs_embeds (Tensor, optional):
                See :class:`BartModel`.
            decoder_inputs_embeds (Tensor, optional):
                See :class:`BartModel`.
            use_cache (bool, optional):
                See :class:`BartModel`. Forced to `False` when `start_positions` and `end_positions`
                are provided, which can save memory during training.
            cache (Tensor, optional):
                See :class:`BartModel`.
            start_positions (Tensor, optional):
                Labels for position (index) of the start of the labelled span for computing the token
                classification loss. Positions are clamped to the length of the sequence (sequence_length).
                Positions outside of the sequence are not taken into account for computing the loss.
                A tensor of shape `(batch_size, )`. Default to `None`.
            end_positions (Tensor, optional):
                Labels for position (index) of the end of the labelled span for computing the token
                classification loss. Positions are clamped to the length of the sequence (sequence_length).
                Positions outside of the sequence are not taken into account for computing the loss.
                A tensor of shape `(batch_size, )`. Default to `None`.
            output_attentions (bool, optional):
                See :class:`BartModel`.
            output_hidden_states (bool, optional):
                See :class:`BartModel`.
            return_dict (bool, optional):
                See :class:`BartModel`.

        Returns:
            An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqQuestionAnsweringModelOutput` if
            `return_dict=True`. Otherwise it returns a tuple of tensors corresponding
            to ordered and not None (depending on the input arguments) fields of
            :class:`~paddlenlp.transformers.model_outputs.Seq2SeqQuestionAnsweringModelOutput`.
            Especially, when `return_dict=output_hidden_states=output_attentions=False` and
            `start_positions=end_positions=None`, returns tuple (`start_logits`, `end_logits`).

            With the fields:

            - `start_logits` (Tensor):
                A tensor of the input token classification logits, indicates the start position of the labelled span.
                Its data type should be float32 and its shape is [batch_size, sequence_length].

            - `end_logits` (Tensor):
                A tensor of the input token classification logits, indicates the end position of the labelled span.
                Its data type should be float32 and its shape is [batch_size, sequence_length].

        Example:
            .. code-block::

                import paddle
                from paddlenlp.transformers import BartForQuestionAnswering, BartTokenizer

                tokenizer = BartTokenizer.from_pretrained('bart-base')
                model = BartForQuestionAnswering.from_pretrained('bart-base')

                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
                inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()}
                outputs = model(**inputs)
                start_logits = outputs[0]
                end_logits = outputs[1]
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        has_span_labels = start_positions is not None and end_positions is not None
        if has_span_labels:
            # Only warn when caching was actually requested by the caller.
            if use_cache:
                logger.warning(
                    "The `use_cache` argument is changed to `False` since `start_positions` and `end_positions` are provided."
                )
            use_cache = False

        outputs = self.bart(
            input_ids,
            attention_mask,
            decoder_input_ids,
            decoder_attention_mask,
            encoder_output,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            cache=cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = self.classifier(outputs[0])
        # Move the start/end axis to the front so it can be unstacked into two tensors.
        logits = paddle.transpose(logits, perm=[2, 0, 1])
        start_logits, end_logits = paddle.unstack(x=logits, axis=0)

        total_loss = None
        if has_span_labels:
            # Multi-GPU gathering may add a trailing dimension; drop it from each
            # tensor independently (the end check must look at `end_positions`).
            if start_positions.ndim > 1:
                start_positions = start_positions.squeeze(-1)
            if end_positions.ndim > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs; clamp them
            # to `ignored_index` so the loss skips those terms.
            ignored_index = paddle.shape(start_logits)[1]
            start_positions = start_positions.clip(0, ignored_index)
            end_positions = end_positions.clip(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            outputs = (start_logits, end_logits) + (outputs[1:] if len(outputs) > 2 else ())
            # Compare against None: a loss tensor equal to zero is still a valid loss
            # and must not be dropped from the returned tuple.
            return ((total_loss,) + outputs) if total_loss is not None else outputs

        return Seq2SeqQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )
class BartForConditionalGeneration(BartPretrainedModel):
    r"""
    Bart Model with a `language modeling` head on top.

    Args:
        config (:class:`BartConfig`):
            An instance of BartConfig used to construct BartForConditionalGeneration.
    """

    def __init__(self, config: BartConfig):
        super().__init__(config)
        self.bart = BartModel(config)
        # Output projection weight, created with the same dtype as the shared embedding.
        self.lm_head_weight = self.create_parameter(
            shape=[config.vocab_size, config.d_model], dtype=self.bart.shared.weight.dtype, is_bias=False
        )
        self.register_buffer("final_logits_bias", paddle.zeros((1, config.vocab_size)))

    def get_encoder(self):
        """Return the encoder of the wrapped :class:`BartModel`."""
        return self.bart.get_encoder()

    def get_decoder(self):
        """Return the decoder of the wrapped :class:`BartModel`."""
        return self.bart.get_decoder()

    def prepare_fast_entry(self, kwargs):
        """
        Build the `FasterBART` forward entry and cache it on `self._fast_entry`.

        Args:
            kwargs (dict): Generation arguments. The keys read here are
                `decode_strategy`, `use_fp16_decoding`, `decoding_lib`,
                `enable_fast_encoder`, `top_k`, `top_p`, `repetition_penalty`,
                `min_length` and `forced_bos_token_id`.

        Returns:
            The bound `forward` of the created `FasterBART` instance.

        Raises:
            AttributeError: If the requested generation options are not
                supported by the fast version.
        """
        from paddlenlp.ops import FasterBART

        decode_strategy = kwargs.get("decode_strategy")
        use_fp16_decoding = kwargs.get("use_fp16_decoding", False)
        decoding_lib = kwargs.get("decoding_lib", None)
        enable_fast_encoder = kwargs.get("enable_fast_encoder", True)
        if decode_strategy == "sampling" and kwargs.get("top_k") != 0 and kwargs.get("top_p") != 1:
            raise AttributeError(
                "Only topk sampling or topp sampling are supported. "
                "Topk sampling and topp sampling cannot be both applied in the fast version."
            )
        # Missing keys are treated as the neutral value, consistent with the
        # `.get` lookups above, instead of failing with a KeyError.
        if kwargs.get("repetition_penalty", 1.0) != 1.0:
            # repetition_penalty is not supported yet in the fast version
            raise AttributeError("'repetition_penalty != 1' is not supported yet in the fast version")
        if kwargs.get("min_length", 0) != 0:
            # min_length is not supported yet in the fast version
            raise AttributeError("'min_length != 0' is not supported yet in the fast version")
        if kwargs.get("forced_bos_token_id") is not None:
            # forced_bos_token_id is not supported yet in the fast version
            raise AttributeError("'forced_bos_token_id != None' is not supported yet in the fast version")
        self._fast_entry = FasterBART(
            self,
            use_fp16_decoding=use_fp16_decoding,
            decoding_lib=decoding_lib,
            enable_fast_encoder=enable_fast_encoder,
        ).forward
        return self._fast_entry

    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        decoder_input_ids: Optional[Tensor] = None,
        decoder_attention_mask: Optional[Tensor] = None,
        encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None,
        inputs_embeds: Optional[Tensor] = None,
        decoder_inputs_embeds: Optional[Tensor] = None,
        use_cache: Optional[bool] = None,
        cache: Optional[List[Tuple[Cache, StaticCache]]] = None,
        labels: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tensor, Tuple, Seq2SeqLMOutput]:
        r"""
        The BartForConditionalGeneration forward method, overrides the __call__() special method.

        Args:
            input_ids (Tensor, optional):
                See :class:`BartModel`.
            attention_mask (Tensor, optional):
                See :class:`BartModel`.
            decoder_input_ids (Tensor, optional):
                See :class:`BartModel`.
            decoder_attention_mask (Tensor, optional):
                See :class:`BartModel`.
            encoder_output (Tensor, optional):
                See :class:`BartModel`.
            inputs_embeds (Tensor, optional):
                See :class:`BartModel`.
            decoder_inputs_embeds (Tensor, optional):
                See :class:`BartModel`.
            use_cache (bool, optional):
                See :class:`BartModel`. Forced to `False` when `labels` is provided.
            cache (Tensor, optional):
                See :class:`BartModel`.
            labels (Tensor, optional):
                Labels for computing the masked language modeling loss. Indices should either be in
                `[0, ..., vocab_size - 1]` or -100 (see `input_ids` docstring). Tokens with indices set to
                `-100` are ignored (masked), the loss is only computed for the tokens with labels in
                `[0, ..., vocab_size - 1]`.
                A tensor of shape `(batch_size, sequence_length)`. Default to `None`.
            output_attentions (bool, optional):
                See :class:`BartModel`.
            output_hidden_states (bool, optional):
                See :class:`BartModel`.
            return_dict (bool, optional):
                See :class:`BartModel`.

        Returns:
            An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput` if
            `return_dict=True`. Otherwise it returns a tuple of tensors corresponding
            to ordered and not None (depending on the input arguments) fields of
            :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`.
            Especially, when `use_cache=return_dict=output_hidden_states=output_attentions=False` and
            `labels=None`, returns tensor `lm_logits`.

            With the fields:

            - `lm_logits` (Tensor):
                The language modeling logits.
                Its data type should be float32 and has a shape of [batch_size, sequence_length, vocab_size].

        Example:
            .. code-block::

                import paddle
                from paddlenlp.transformers import BartForConditionalGeneration, BartTokenizer

                tokenizer = BartTokenizer.from_pretrained('bart-base')
                model = BartForConditionalGeneration.from_pretrained('bart-base')

                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
                inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()}
                outputs = model(**inputs)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False

        outputs = self.bart(
            input_ids,
            attention_mask,
            decoder_input_ids,
            decoder_attention_mask,
            encoder_output,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            cache=cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Project decoder hidden states onto the vocabulary and add the logits bias buffer.
        lm_logits = paddle.tensor.matmul(outputs[0], self.lm_head_weight, transpose_y=True) + self.final_logits_bias

        masked_lm_loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.reshape((-1, self.bart.config["vocab_size"])), labels.reshape((-1,)))

        if not return_dict:
            if len(outputs) == 2:
                return (masked_lm_loss, lm_logits) if masked_lm_loss is not None else lm_logits
            outputs = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + outputs) if masked_lm_loss is not None else outputs

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels):
        """Build decoder input ids by shifting `labels` right with the configured decoder start token."""
        return shift_tokens_right(labels, self.bart.config["decoder_start_token_id"])

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        attention_mask=None,
        decoder_attention_mask=None,
        cache=None,
        use_cache=False,
        encoder_output=None,
        **kwargs
    ):
        """
        Assemble the keyword arguments passed to `forward` for one generation step.

        When `cache` is given, only the last decoder token (and the matching
        slice of `decoder_attention_mask`, if any) is kept.

        Returns:
            dict: Keyword arguments for `forward`; `input_ids` is always `None`.
        """
        # cut decoder_input_ids if past is used
        if cache is not None:
            decoder_input_ids = decoder_input_ids[:, -1].unsqueeze(-1)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask[:, :, -1, :].unsqueeze(2)

        return {
            "input_ids": None,
            "decoder_input_ids": decoder_input_ids,
            "encoder_output": encoder_output,
            "decoder_attention_mask": decoder_attention_mask,
            "attention_mask": attention_mask,
            "use_cache": use_cache,
            "cache": cache,
        }

    def __getattr__(self, name):
        """
        Fall back to the wrapped base model for attributes not found on this layer.

        Raises:
            AttributeError: If the attribute is found neither here nor on the
                base model, or if the base model attribute itself is missing.
        """
        try:
            return super().__getattr__(name)
        except AttributeError:
            # If the base model itself cannot be found (e.g. it has not been
            # assigned yet), delegating would call this method again with the
            # same name and recurse without bound; surface the error instead.
            if name == self.base_model_prefix:
                raise
            return getattr(getattr(self, self.base_model_prefix), name)