# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Optional, Tuple

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import Tensor
from paddle.nn import Layer

from .. import PretrainedModel, register_base_model
from ..activations import ACT2FN
from .configuration import (
    PROPHETNET_PRETRAINED_INIT_CONFIGURATION,
    PROPHETNET_PRETRAINED_RESOURCE_FILES_MAP,
    ProphetNetConfig,
)

# Public names exported by this module.
__all__ = [
    "ProphetNetModel",
    "ProphetNetPretrainedModel",
    "ProphetNetEncoder",
    "ProphetNetDecoder",
    "ProphetNetForConditionalGeneration",
]

def ngram_attention_bias(sequence_length, ngram, dtype):
    """
    Compute the self-attention bias for the n-gram predict streams.

    Returns a tensor of shape ``(ngram, sequence_length, 2 * sequence_length)``:
    the left half is the (shifted) causal bias over the main stream, the right
    half exposes only each stream's own diagonal token; masked entries are -inf.
    """
    left_block = paddle.ones((ngram, sequence_length, sequence_length), dtype=dtype) * float("-inf")
    right_block = left_block.detach().clone()
    # create bias: per stream, open the matching diagonal on the predict half
    # and the visible causal prefix on the main half
    for stream_idx in range(ngram):
        right_block[stream_idx] = right_block[stream_idx].fill_diagonal_(0, wrap=False)
        left_block[stream_idx] = paddle.triu(left_block[stream_idx], diagonal=-stream_idx + 1)

    # the first main-stream position is always visible
    left_block[:, :, 0] = 0

    # BUG FIX: the original block built the bias but never returned it
    return paddle.concat([left_block, right_block], axis=2)

def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_bidirectional=False):
    """
    Map relative positions to bucket indices (T5-style), as used by ProphetNet.

    Half of the buckets cover exact small offsets; the other half are
    logarithmically spaced up to ``max_distance``. For more detail, see paper.

    Args:
        num_buckets (int): total number of relative-position buckets.
        max_distance (int): offsets beyond this share the last bucket.
        relative_positions (Tensor): integer tensor of relative offsets.
        is_bidirectional (bool): split buckets between the two directions.

    Returns:
        Tensor: integer bucket ids with the same shape as ``relative_positions``.
    """
    inv_relative_positions = -relative_positions
    rel_positions_bucket = 0

    if is_bidirectional:
        # half the buckets for each direction
        num_buckets = num_buckets // 2
        rel_positions_bucket = (
            rel_positions_bucket
            + paddle.cast(inv_relative_positions < 0, dtype="int64") * num_buckets
        )
        inv_relative_positions = paddle.abs(inv_relative_positions)
    else:
        # unidirectional: negative (future) offsets all collapse to 0
        inv_relative_positions = (
            paddle.cast(inv_relative_positions > 0, dtype=inv_relative_positions.dtype)
            * inv_relative_positions
        )

    max_exact = num_buckets // 2
    is_small = inv_relative_positions < max_exact
    # log-spaced bucket index for offsets beyond the exact range
    val_if_large = max_exact + paddle.log(
        paddle.cast(inv_relative_positions, dtype="float32") / max_exact
    ) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
    val_if_large_num_buckets = paddle.ones_like(val_if_large) * (num_buckets - 1)
    # clamp to the last bucket
    val_if_large_lt = paddle.cast(val_if_large < val_if_large_num_buckets, dtype="float32")
    val_if_large = paddle.cast(
        val_if_large_lt * val_if_large + (1 - val_if_large_lt) * val_if_large_num_buckets,
        dtype="int64",
    )
    rel_positions_bucket = rel_positions_bucket + paddle.where(
        is_small, paddle.cast(inv_relative_positions, dtype="int64"), val_if_large
    )
    return rel_positions_bucket

def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids):
    """
    Compute both main-stream and predict-stream relative position buckets.

    Args:
        num_buckets (int): number of relative buckets.
        max_distance (int): maximum bucketed distance.
        position_ids (Tensor): shape ``[batch, seq_len]``.

    Returns:
        tuple: ``(main_relative_position_buckets, predict_relative_position_buckets)``.
    """
    # main stream: pairwise offsets position_j - position_i
    main_stream_relative_positions = paddle.tile(
        paddle.unsqueeze(position_ids, axis=1), repeat_times=[1, position_ids.shape[-1], 1]
    )
    main_stream_relative_positions = main_stream_relative_positions - paddle.unsqueeze(position_ids, axis=-1)

    # predicting stream: keys are [positions - 1, positions] concatenated
    predicting_stream_relative_positions = paddle.unsqueeze(
        paddle.concat([position_ids - 1, position_ids], axis=-1), axis=1
    )
    predicting_stream_relative_positions = paddle.tile(
        predicting_stream_relative_positions, repeat_times=[1, position_ids.shape[-1], 1]
    )
    predicting_stream_relative_positions = predicting_stream_relative_positions - paddle.unsqueeze(
        position_ids, axis=-1
    )

    # get both position buckets
    main_relative_position_buckets = compute_relative_buckets(
        num_buckets, max_distance, main_stream_relative_positions, is_bidirectional=False
    )
    predict_relative_position_buckets = compute_relative_buckets(
        num_buckets, max_distance, predicting_stream_relative_positions, is_bidirectional=False
    )
    return main_relative_position_buckets, predict_relative_position_buckets

class ProphetNetPretrainedModel(PretrainedModel):
    """
    An abstract class for pretrained ProphetNet models. It provides ProphetNet related
    `model_config_file`, `pretrained_init_configuration`, `resource_files_names`,
    `pretrained_resource_files_map` and `base_model_prefix` for downloading and
    loading pretrained models.
    """

    pretrained_init_configuration = PROPHETNET_PRETRAINED_INIT_CONFIGURATION
    pretrained_resource_files_map = PROPHETNET_PRETRAINED_RESOURCE_FILES_MAP
    base_model_prefix = "prophetnet"
    config_class = ProphetNetConfig

    def _init_weights(self, layer):
        """Initialize Linear weights from N(0, init_std) and zero the biases."""
        if isinstance(layer, nn.Linear):
            layer.weight.set_value(
                paddle.normal(
                    mean=0.0,
                    std=self.config.init_std,
                    shape=layer.weight.shape,
                )
            )
            if layer.bias is not None:
                layer.bias.set_value(paddle.zeros_like(layer.bias))

    def _shift_right(self, input_ids):
        """Prepend `decoder_start_token_id` and drop the last token of `input_ids`."""
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        assert decoder_start_token_id is not None, (
            "self.config.decoder_start_token_id has to be defined. "
            "In ProphetNet it is usually set to the pad_token_id."
        )

        # shift inputs to the right
        shifted_input_ids = paddle.zeros_like(input_ids)
        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
        shifted_input_ids[..., 0] = decoder_start_token_id

        # replace possible -100 values in labels by `pad_token_id`
        assert pad_token_id is not None, "self.config.pad_token_id has to be defined."
        shifted_input_ids = paddle.where(
            shifted_input_ids == -100,
            paddle.full_like(shifted_input_ids, pad_token_id),
            shifted_input_ids,
        )

        assert paddle.all(
            shifted_input_ids >= 0
        ).item(), "Verify that `shifted_input_ids` has only positive values"

        return shifted_input_ids

class ProphetNetPositionalEmbeddings(nn.Embedding):
    """
    Learned positional embeddings for ProphetNet.

    Position ids are offset by the padding index (fairseq convention), so the
    padding position maps to the padding embedding.
    """

    def __init__(self, config: ProphetNetConfig):
        self.max_length = config.max_position_embeddings
        super(ProphetNetPositionalEmbeddings, self).__init__(
            config.max_position_embeddings, config.hidden_size, config.pad_token_id
        )

    def forward(self, inputs_shape, attention_mask=None, past_key_values=None, position_ids=None):
        assert (position_ids is None) or (
            self._padding_idx is None
        ), "If position_ids is pre-computed then padding_idx should not be set."

        if position_ids is None:
            if past_key_values is not None:
                # position_ids is the same for every token when decoding a single step
                # Without the int() cast, it doesn't work in some cases when exporting to ONNX
                prev_num_input_ids = past_key_values[0][0].shape[2]
                num_input_ids = inputs_shape[1] + prev_num_input_ids
                position_ids = paddle.ones((1, 1), dtype="int64") * int(self._padding_idx + num_input_ids)
            else:
                if attention_mask is None:
                    attention_mask = paddle.ones(inputs_shape, dtype="int64")

                # retrieve position_ids from input_ids / attention_mask
                position_ids = (
                    paddle.cast(paddle.cumsum(attention_mask, axis=1), dtype=attention_mask.dtype)
                    * attention_mask
                ).astype("int64") + self._padding_idx

                # make sure position_ids are not bigger then max_length
                position_ids = paddle.clip(position_ids, min=0, max=self.max_length - 1)

        return super().forward(position_ids), position_ids

    def _forward(self, position_ids):
        # embed pre-computed position ids directly (used for the predict stream)
        return super().forward(position_ids)

class ProphetNetAttention(Layer):
    """
    Multi-headed attention from 'Attention Is All You Need' paper.

    Serves both as encoder self-attention (``key_value_states is None``) and as
    decoder cross-attention (``key_value_states`` holds encoder output).
    """

    def __init__(self, hidden_size, attention_dropout, dropout, num_attn_heads: int):
        super().__init__()
        hidden_size = hidden_size

        self.attention_dropout = attention_dropout
        self.dropout = dropout
        self.num_attn_heads = num_attn_heads
        self.head_dim = hidden_size // num_attn_heads

        assert (
            self.head_dim * num_attn_heads == hidden_size
        ), "hidden_size must be divisible by num_attn_heads"

        self.key_proj = nn.Linear(hidden_size, hidden_size)
        self.value_proj = nn.Linear(hidden_size, hidden_size)
        self.query_proj = nn.Linear(hidden_size, hidden_size)

        self.out_proj = nn.Linear(hidden_size, hidden_size)

    def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int):
        # [B, T, C] -> [B, heads, T, head_dim]
        return paddle.transpose(
            paddle.reshape(tensor, (bsz, seq_len, self.num_attn_heads, self.head_dim)), (0, 2, 1, 3)
        )

    def forward(
        self,
        hidden_states,
        key_value_states: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        past_key_value: Optional[Tuple[Tensor]] = None,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """
        Args:
            hidden_states: query input of shape ``[B, T, C]``.
            key_value_states: encoder output for cross-attention, or None.
            attention_mask: additive bias of shape ``[B*heads, 1, S]`` or None.
            past_key_value: cached (key, value) for cross-attention reuse.

        Returns:
            tuple of (attn_output ``[B, T, C]``, past_key_value).
        """
        batch_size, tgt_len, hidden_size = hidden_states.shape

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None
        assert hidden_states.shape == [
            batch_size,
            tgt_len,
            hidden_size,
        ], f"Size of hidden states should be {batch_size, tgt_len, hidden_size}, but is {hidden_states.shape}"

        # scale the query once, ahead of the dot product
        query_states = self.query_proj(hidden_states) / (self.head_dim**0.5)

        # previous time steps are cached - no need to recompute key and value if they are static
        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.key_proj(key_value_states), -1, batch_size)
            value_states = self._shape(self.value_proj(key_value_states), -1, batch_size)
        else:
            # self_attention
            key_states = self._shape(self.key_proj(hidden_states), -1, batch_size)
            value_states = self._shape(self.value_proj(hidden_states), -1, batch_size)

        if is_cross_attention:
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)

        # project states into the correct shape
        proj_shape = (batch_size * self.num_attn_heads, -1, self.head_dim)
        query_states = paddle.reshape(self._shape(query_states, tgt_len, batch_size), proj_shape)
        key_states = paddle.reshape(key_states, proj_shape)
        value_states = paddle.reshape(value_states, proj_shape)

        src_len = key_states.shape[1]
        attn_weights = paddle.bmm(query_states, key_states.transpose((0, 2, 1)))
        assert attn_weights.shape == [
            batch_size * self.num_attn_heads,
            tgt_len,
            src_len,
        ], f"`attn_weights` should be of size {batch_size * self.num_attn_heads, tgt_len, src_len}, but is of size {attn_weights.shape}"

        if attention_mask is not None:
            assert attention_mask.shape == [
                batch_size * self.num_attn_heads,
                1,
                src_len,
            ], f"`attention_mask` should be of size {batch_size * self.num_attn_heads, 1, src_len}, but is of size {attention_mask.shape}"
            attn_weights = attn_weights + attention_mask

        attn_weights = F.softmax(attn_weights, axis=-1)

        attn_probs = F.dropout(attn_weights, p=self.attention_dropout, training=self.training)

        attn_output = paddle.bmm(attn_probs, value_states)
        assert attn_output.shape == [
            batch_size * self.num_attn_heads,
            tgt_len,
            self.head_dim,
        ], f"`attn_output` should be of shape {batch_size * self.num_attn_heads, tgt_len, self.head_dim}, but is of shape {attn_output.shape}"

        # merge heads back into the hidden dimension
        attn_output = paddle.reshape(
            paddle.transpose(
                paddle.reshape(attn_output, (batch_size, self.num_attn_heads, tgt_len, self.head_dim)),
                (0, 2, 1, 3),
            ),
            (batch_size, tgt_len, hidden_size),
        )

        attn_output = self.out_proj(attn_output)

        attn_output = F.dropout(attn_output, p=self.dropout, training=self.training)
        return attn_output, past_key_value

class ProphetNetFeedForward(Layer):
    """
    Residual two-layer feed-forward block from the original Transformer:
    expand to ``ffn_dim``, apply the configured activation and dropout,
    then project back to ``hidden_size`` with a final dropout.
    """

    def __init__(self, hidden_size, activation_function, activation_dropout, dropout, ffn_dim: int):
        super(ProphetNetFeedForward, self).__init__()
        self.activation_fn = ACT2FN[activation_function]
        self.intermediate = nn.Linear(hidden_size, ffn_dim)
        self.output = nn.Linear(ffn_dim, hidden_size)
        self.activation_dropout = activation_dropout
        self.dropout = dropout

    def forward(self, hidden_states):
        # expand -> activate -> dropout -> contract -> dropout
        activated = self.activation_fn(self.intermediate(hidden_states))
        activated = F.dropout(activated, p=self.activation_dropout, training=self.training)
        projected = self.output(activated)
        return F.dropout(projected, p=self.dropout, training=self.training)

class ProphetNetNgramSelfAttention(Layer):
    # NOTE(review): this class appears to have lost many source lines
    # (unbalanced parentheses and dangling fragments below). The code is kept
    # byte-identical; comments only mark the apparent gaps.
    # TODO: restore from upstream version history.
    """
    Main-stream plus n-gram predict-stream self-attention with relative
    position embeddings (see the ProphetNet paper).
    """

    def __init__(
        self,
        hidden_size,
        num_buckets,
        relative_max_distance,
        dropout,
        attention_dropout,
        ngram,
    ):
        super(ProphetNetNgramSelfAttention, self).__init__()

        self.hidden_size = hidden_size

        self.num_buckets = num_buckets
        self.relative_max_distance = relative_max_distance
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.ngram = ngram

        # NOTE(review): the assert condition and the head-count attributes
        # (presumably num_attn_heads / head_dim) are missing here — confirm
        # against upstream before use.
        assert (
        ), "config.hidden_size must be divisible by num_attn_heads"
        # key, value, query projection
        self.key_proj = nn.Linear(hidden_size, hidden_size)
        self.value_proj = nn.Linear(hidden_size, hidden_size)
        self.query_proj = nn.Linear(hidden_size, hidden_size)

        # out projection
        self.out_proj = nn.Linear(hidden_size, hidden_size)

        # rel position embeddings
        self.relative_pos_embeddings = nn.Linear(hidden_size, self.num_buckets * self.num_attn_heads)

    def _shape(self, tensor, seq_len, batch_size):
        # NOTE(review): the reshape/transpose expression is missing; only a
        # dangling ")" remains.
        )

    def forward(
        self,
        hidden_states,
        past_key_value: Optional[Tuple[Tensor]] = None,
        main_relative_position_buckets=None,
        predict_relative_position_buckets=None,
        position_ids=None,
    ):
        batch_size, ngram_sequence_length, hidden_size = hidden_states.shape

        assert hidden_states.shape == [
            batch_size,
            ngram_sequence_length,
            hidden_size,
        ], f"`hidden_states` should be of shape {batch_size, ngram_sequence_length, hidden_size}, but is of shape {hidden_states.shape}"

        # project
        query_states = self.query_proj(hidden_states)
        key_states = self.key_proj(hidden_states)
        value_states = self.value_proj(hidden_states)

        # normalize
        # NOTE(review): the query scaling statement appears to be missing here.

        # reshape
        query_states = self._shape(query_states, ngram_sequence_length, batch_size)
        key_states = self._shape(key_states, -1, batch_size)
        value_states = self._shape(value_states, -1, batch_size)

        # chunk into main stream and predict stream
        hidden_states_list = paddle.chunk(hidden_states, 1 + self.ngram, axis=1)

        query_states_list = paddle.chunk(query_states, 1 + self.ngram, axis=1)
        key_states_list = paddle.chunk(key_states, 1 + self.ngram, axis=1)
        value_states_list = paddle.chunk(value_states, 1 + self.ngram, axis=1)

        main_hidden_states, hidden_states_predict_list = hidden_states_list[0], hidden_states_list[1:]
        main_query_states, predict_query_states_list = query_states_list[0], query_states_list[1:]
        main_key_states, predict_key_states_list = key_states_list[0], key_states_list[1:]
        main_value_states, predict_value_states_list = value_states_list[0], value_states_list[1:]

        if past_key_value is not None:
            # NOTE(review): the statements concatenating cached keys/values are
            # missing from this branch.

            # Update cache
            past_key_value = (
            )

        # get seq_length of main stream only
        sequence_length = ngram_sequence_length // (1 + self.ngram)

        # MAIN-STREAM
        # main attn weights
        # NOTE(review): the main_attn_weights bmm computation is missing.

        # retrieve relative position embeddings for each layer -> see paper for more details
        main_relative_pos_embeddings = self.get_main_relative_pos_embeddings(
            main_hidden_states, main_attn_weights, position_ids, main_relative_position_buckets
        )

        main_attn_weights = main_attn_weights + main_relative_pos_embeddings

        main_attn_probs = F.softmax(main_attn_weights, axis=-1, dtype=main_attn_weights.dtype)

        main_attn_probs = F.dropout(main_attn_probs, p=self.attention_dropout, training=self.training)
        # project to attn_output
        # NOTE(review): the bmm of probs and values is missing.

        # reshape so that num_heads dim is merged into last `head_dim` axis
        (0, 2, 1, 3),
        ),
        (batch_size, 1, sequence_length, hidden_size),
        )
        main_attn_output = self.out_proj(main_attn_output)

        # PREDICT-STREAM
        # NOTE(review): the predict key/value stacking expressions are partially
        # missing below (dangling brackets).
        )
        [
        for key in predict_key_states_list
        ],
        axis=0,
        )

        # [ngram, T, B, C]
        paddle.concat(hidden_states_predict_list, axis=0), (self.ngram, sequence_length, batch_size, hidden_size)
        )

        [
        for v_p in predict_value_states_list
        ],
        axis=0,
        )

        # retrieve relative position embeddings for each layer -> see paper for more details
        predict_relative_pos_embeddings = self.get_predict_relative_pos_embeddings(
            predict_hidden_states, predict_attn_weights, position_ids, predict_relative_position_buckets
        )

        predict_attn_weights = predict_attn_weights + predict_relative_pos_embeddings

        )

        predict_attn_probs = F.softmax(predict_attn_weights, axis=-1, dtype=predict_attn_weights.dtype)

        predict_attn_probs = F.dropout(predict_attn_probs, p=self.attention_dropout, training=self.training)
        # project to attention output
        # NOTE(review): the einsum/bmm producing predict_attn_output is missing.

        # reshape so that num_heads dim is merged into last `head_dim` axis
        # [ngram, B, T, C]
        ),
        (1, 0, 3, 2, 4),
        ),
        (batch_size, self.ngram, sequence_length, hidden_size),
        )
        predict_attn_output = self.out_proj(predict_attn_output)

        # concat to single attn output
        # [B, 1+ngram*T, C]
        paddle.concat([main_attn_output, predict_attn_output], axis=1), (batch_size, -1, hidden_size)
        )
        # reshape into better form for `config.output_attentions`
        (1, 0, 2, 3, 4),
        )

        attn_output = F.dropout(attn_output, p=self.dropout, training=self.training)

        return attn_output, main_attn_probs, predict_attn_probs, past_key_value

    def get_main_relative_pos_embeddings(
        self, hidden_states, attn_weights, position_ids, main_relative_position_buckets
    ):
        # input hidden_states [B,T,C], input attn_weights [T*head,T,S], input position_ids [B,T] or [1,1]

        if main_relative_position_buckets is None:
            batch_size, sequence_length = hidden_states.shape[:2]
            # NOTE(review): the relative_positions tile expression is missing
            # around this fragment.
            repeat_times=[batch_size, sequence_length, 1],
            )
            )  # [B, T, s]
            main_relative_position_buckets = compute_relative_buckets(
                self.num_buckets, self.relative_max_distance, relative_positions, False
            )

        # NOTE(review): the rel_pos_embeddings projection/reshape is missing.
        ),
        (0, 3, 1, 2),
        rel_pos_embeddings = rel_pos_embeddings.reshape(attn_weights.shape[:2] + [-1])  # [B*head,T,Buckets]

        (-1, main_relative_position_buckets.shape[-1]),
        ),
        rel_pos_embeddings, (-1, rel_pos_embeddings.shape[-1])

        main_relative_position_buckets.unsqueeze(2), repeat_times=[1, 1, 2]
        )
        repeat_times=[1, main_relative_position_buckets_index.shape[1]],
        )

        (attn_weights.shape[:2] + [-1]),
        )
        return main_relative_pos_embeddings

    def get_predict_relative_pos_embeddings(
        self, hidden_states, attn_weights, position_ids, predict_relative_position_buckets
    ):
        # input hidden_states [ngram, T,B,C],
        # input position_ids [B,T] or [1,1],
        # input predict_relative_position_buckets [B,T, 2*T] or None
        sequence_length, batch_size = hidden_states.shape[1:3]

        if predict_relative_position_buckets is None:
            key_sequence_length = attn_weights.shape[-1]
            assert (
                position_ids[0][0] == key_sequence_length - 1
            ), "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
            # NOTE(review): the relative_positions construction is missing here.
            repeat_times=[batch_size, sequence_length, 1],
            )

            )
            predict_relative_position_buckets = compute_relative_buckets(
                self.num_buckets, self.relative_max_distance, relative_positions, False
            )

        hidden_states = paddle.transpose(hidden_states, (0, 2, 1, 3))  # [ngram, B, T, C]
        self.relative_pos_embeddings(hidden_states),
        )  # [ngram, B, T, bucket, head]
        paddle.transpose(rel_pos_embeddings, (0, 1, 4, 2, 3)),
        (self.ngram * batch_size * self.num_attn_heads, sequence_length, -1),

        )  # [ngram, B, head*T, S]

        predict_relative_position_buckets.unsqueeze(2), repeat_times=[1, 1, 2]
        )
        repeat_times=[1, predict_relative_position_buckets_index.shape[1]],
        )

        (self.ngram, batch_size * self.num_attn_heads, sequence_length, -1),
        )  # [ngram, B*head, T, S]

        return predict_relative_pos_embeddings

class ProphetNetEncoderLayer(Layer):
    """
    Encoder block for ProphetNet: self-attention and a feed-forward network,
    each wrapped in a residual connection followed by LayerNorm.
    """

    def __init__(self, config: ProphetNetConfig):
        super(ProphetNetEncoderLayer, self).__init__()
        # 1st residual block
        self.self_attn = ProphetNetAttention(
            config.hidden_size,
            config.attention_dropout,
            config.dropout,
            config.num_encoder_attention_heads,
        )
        self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size)

        # 2nd residual block
        self.feed_forward = ProphetNetFeedForward(
            config.hidden_size,
            config.activation_function,
            config.activation_dropout,
            config.dropout,
            config.encoder_ffn_dim,
        )
        self.feed_forward_layer_norm = nn.LayerNorm(config.hidden_size)

    def forward(self, hidden_states, attention_mask=None):
        """Run one encoder block over ``hidden_states`` ([B, T, C])."""
        # 1st residual block
        attention_output, _ = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask)
        hidden_states = self.self_attn_layer_norm(attention_output + hidden_states)

        # 2nd residual block
        feed_forward_output = self.feed_forward(hidden_states)
        hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
        return hidden_states

class ProphetNetDecoderLayer(Layer):
    """
    Decoder block for ProphetNet: n-gram self-attention, optional
    cross-attention over encoder states, and a feed-forward network,
    each wrapped in residual + LayerNorm.
    """

    def __init__(self, config: ProphetNetConfig):
        super(ProphetNetDecoderLayer, self).__init__()
        # 1st residual block
        self.self_attn = ProphetNetNgramSelfAttention(
            config.hidden_size,
            config.num_buckets,
            config.relative_max_distance,
            config.dropout,
            config.attention_dropout,
            config.ngram,
        )
        self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size)

        # 2nd residual block
        self.cross_attn = ProphetNetAttention(
            config.hidden_size,
            config.attention_dropout,
            config.dropout,
            config.num_decoder_attention_heads,
        )
        self.cross_attn_layer_norm = nn.LayerNorm(config.hidden_size)

        # 3rd residual block
        self.feed_forward = ProphetNetFeedForward(
            config.hidden_size,
            config.activation_function,
            config.activation_dropout,
            config.dropout,
            config.decoder_ffn_dim,
        )
        self.feed_forward_layer_norm = nn.LayerNorm(config.hidden_size)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        main_relative_position_buckets=None,
        predict_relative_position_buckets=None,
        position_ids=None,
        past_key_value=None,
        use_cache: bool = True,
        attention_mask=None,
        encoder_attn_mask=None,
        extended_predict_attention_mask=None,
    ):
        """
        Returns ``(hidden_states,)`` plus ``(present_key_value,)`` when
        ``use_cache`` is True. The extra mask kwargs have defaults, so the
        original keyword-based call sites remain compatible.
        """
        # 1st residual block
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # NOTE(review): attention_mask / extended_predict_attention_mask are not
        # forwarded here because the (corrupted) self-attention signature does
        # not accept them — restore once ProphetNetNgramSelfAttention is fixed.
        ngram_attention_output, self_attn_weights, self_attn_weights_ngram, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            main_relative_position_buckets=main_relative_position_buckets,
            predict_relative_position_buckets=predict_relative_position_buckets,
            position_ids=position_ids,
        )
        hidden_states = self.self_attn_layer_norm(hidden_states + ngram_attention_output)

        # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
        cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
        if encoder_hidden_states is not None:
            # 2nd residual block
            attention_output, cross_attn_present_key_value = self.cross_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attn_mask,
                past_key_value=cross_attn_past_key_value,
            )
            hidden_states = self.cross_attn_layer_norm(attention_output + hidden_states)

            # add cross-attn to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_present_key_value

        # 3rd residual block
        feed_forward_output = self.feed_forward(hidden_states)
        hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)

        outputs = (hidden_states,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs

class ProphetNetEncoder(ProphetNetPretrainedModel):
    r"""
    The stand-alone encoder part of :class:`ProphetNetModel`.

    Args:
        word_embeddings (:obj:`paddle.nn.Embedding` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
            The word embedding parameters. This can be used to initialize the encoder with
            pre-defined word embeddings instead of randomly initialized word embeddings.
        config (:obj:`ProphetNetConfig`): Model configuration.
    """

    def __init__(self, word_embeddings, config: ProphetNetConfig):
        super(ProphetNetEncoder, self).__init__(config)
        self.init_std = config.init_std
        if word_embeddings is not None:
            self.word_embeddings = word_embeddings
        else:
            self.word_embeddings = nn.Embedding(
                config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
            )

        self.position_embeddings = ProphetNetPositionalEmbeddings(config)
        self.embeddings_layer_norm = nn.LayerNorm(config.hidden_size)

        self.layers = nn.LayerList([ProphetNetEncoderLayer(config) for _ in range(config.num_encoder_layers)])

    def forward(self, input_ids=None, attention_mask=None):
        """
        Encode ``input_ids`` ([B, T]) into hidden states ([B, T, C]).
        ``attention_mask`` uses 1 for real tokens and 0 for padding.
        """
        if input_ids is None:
            raise ValueError("Input_ids cannot be None.")
        inputs_embeds = self.word_embeddings(input_ids)

        # build the additive attention bias: 0 where attended, -10000 where
        # masked, tiled once per attention head
        if attention_mask is not None:
            extended_attention_mask = (
                paddle.tile(
                    1.0 - attention_mask.unsqueeze(1),
                    repeat_times=[self.config.num_encoder_attention_heads, 1, 1],
                )
            ) * -10000.0
            extended_attention_mask = paddle.cast(extended_attention_mask, inputs_embeds.dtype)
        else:
            extended_attention_mask = None

        position_embeddings, position_ids = self.position_embeddings(inputs_embeds.shape[:2])

        hidden_states = inputs_embeds + position_embeddings
        hidden_states = self.embeddings_layer_norm(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.config.dropout, training=self.training)

        for idx, encoder_layer in enumerate(self.layers):
            hidden_states = encoder_layer(hidden_states, attention_mask=extended_attention_mask)
        return hidden_states

class ProphetNetDecoder(ProphetNetPretrainedModel):
    """
    The stand-alone decoder part of :class:`ProphetNetModel`, producing both
    the main-stream and the n-gram predict-stream hidden states.

    Args:
        word_embeddings (paddle.nn.Embedding, optional): shared embedding table;
            a fresh one is created when None.
        config (ProphetNetConfig): Model configuration.
    """

    def __init__(self, word_embeddings, config: ProphetNetConfig):
        super(ProphetNetDecoder, self).__init__(config)
        self.init_std = config.init_std
        self.ngram = config.ngram
        self.num_buckets = config.num_buckets
        self.relative_max_distance = config.relative_max_distance
        self.dropout = config.dropout
        self.max_target_positions = config.max_position_embeddings
        if word_embeddings is not None:
            self.word_embeddings = word_embeddings
        else:
            self.word_embeddings = nn.Embedding(
                config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
            )

        self.position_embeddings = ProphetNetPositionalEmbeddings(config)

        self.ngram_embeddings = nn.Embedding(self.ngram, config.hidden_size)
        self.layers = nn.LayerList([ProphetNetDecoderLayer(config) for _ in range(config.num_decoder_layers)])
        self.embeddings_layer_norm = nn.LayerNorm(config.hidden_size)

    def forward(
        self,
        input_ids=None,
        encoder_hidden_states=None,
        past_key_values=None,
        use_cache=True,
        attention_mask=None,
        encoder_attention_mask=None,
    ):
        """
        Returns ``(last_hidden_state, last_hidden_state_ngram[, present_key_values])``.
        The mask kwargs default to None, keeping the original keyword call sites
        compatible.
        """
        if input_ids is None:
            raise ValueError("Decoder input_ids cannot be None.")
        inputs_embeds = self.word_embeddings(input_ids)
        batch_size, sequence_length = inputs_embeds.shape[:2]

        main_stream_pos_embed, position_ids = self.position_embeddings(
            (batch_size, sequence_length), past_key_values=past_key_values
        )

        if past_key_values is not None:
            # single-step decoding: relative buckets are not used
            main_relative_position_buckets, predict_relative_position_buckets = None, None
        else:
            main_relative_position_buckets, predict_relative_position_buckets = self.compute_buffered_relative_buckets(
                position_ids
            )
        predicting_stream_pos_embed = self.position_embeddings._forward(position_ids + 1)

        # add position embeddings to the main stream
        hidden_states = inputs_embeds + main_stream_pos_embed

        ngram_embeddings = self.ngram_embeddings.weight

        if past_key_values is not None:
            assert (
                hidden_states.shape[1] == 1
            ), "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"

            ngram_hidden_states = [
                paddle.tile(
                    (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed), repeat_times=[batch_size, 1, 1]
                )
                for ngram in range(self.ngram)
            ]
            extended_attention_mask = None
            extended_predict_attention_mask = None
        else:
            ngram_hidden_states = [
                (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed) for ngram in range(self.ngram)
            ]
            extended_attention_mask = self.prepare_attention_mask(hidden_states, attention_mask)
            extended_predict_attention_mask = self.prepare_predict_attention_mask(hidden_states, attention_mask)

        # prepare the additive encoder attention bias for cross-attention
        if encoder_attention_mask is not None:
            extended_encoder_attention_mask = (
                1.0
                - paddle.tile(
                    encoder_attention_mask.unsqueeze(1),
                    repeat_times=[self.config.num_decoder_attention_heads, 1, 1],
                )
            ) * -10000.0
            extended_encoder_attention_mask = paddle.cast(extended_encoder_attention_mask, inputs_embeds.dtype)
        else:
            extended_encoder_attention_mask = None

        hidden_states = paddle.concat([hidden_states] + ngram_hidden_states, axis=1)

        if self.embeddings_layer_norm:
            hidden_states = self.embeddings_layer_norm(hidden_states)

        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)

        present_key_values = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            past_key_value = past_key_values[idx] if past_key_values is not None else None

            layer_outputs = decoder_layer(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                main_relative_position_buckets=main_relative_position_buckets,
                predict_relative_position_buckets=predict_relative_position_buckets,
                position_ids=position_ids,
                past_key_value=past_key_value,
                use_cache=use_cache,
                attention_mask=extended_attention_mask,
                encoder_attn_mask=extended_encoder_attention_mask,
                extended_predict_attention_mask=extended_predict_attention_mask,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                present_key_values += (layer_outputs[1],)

        last_hidden_state = hidden_states[:, :sequence_length]  # 1-gram
        last_hidden_state_ngram = hidden_states[:, sequence_length:] if self.ngram > 0 else None  # 2-gram
        return tuple(v for v in [last_hidden_state, last_hidden_state_ngram, present_key_values] if v is not None)

    def compute_buffered_relative_buckets(self, position_ids):
        """Compute (and cache) main/predict relative buckets, sliced to the batch."""
        batch_size, sequence_length = position_ids.shape

        if not hasattr(self, "_main_relative_buckets") or self._main_relative_buckets is None:
            # buckets depend only on positions, so compute them once for the
            # maximum target length and reuse
            position_ids = paddle.arange(1, self.max_target_positions, dtype="int64").unsqueeze(0)
            self._main_relative_buckets, self._predict_relative_buckets = compute_all_stream_relative_buckets(
                self.num_buckets, self.relative_max_distance, position_ids
            )

        # buffer relative buckets
        main_relative_buckets = paddle.tile(
            self._main_relative_buckets[:, :sequence_length, :sequence_length], repeat_times=[batch_size, 1, 1]
        )
        predict_relative_buckets = paddle.tile(
            paddle.concat(
                [
                    self._predict_relative_buckets[:, :sequence_length, :sequence_length],
                    self._predict_relative_buckets[
                        :, :sequence_length, self.max_target_positions : self.max_target_positions + sequence_length
                    ],
                ],
                axis=2,
            ),
            repeat_times=[batch_size, 1, 1],
        )

        return main_relative_buckets, predict_relative_buckets

    def prepare_attention_mask(self, hidden_states, attention_mask):
        """Build the additive causal (+ padding) bias for the main stream."""
        batch_size, seq_length = hidden_states.shape[:2]

        # get causal mask
        causal_mask = paddle.full(
            (self.max_target_positions, self.max_target_positions), -float("inf"), dtype=hidden_states.dtype
        )
        causal_mask = paddle.triu(causal_mask, 1)
        extended_causal_mask = paddle.expand(
            causal_mask[:seq_length, :seq_length].unsqueeze(0),
            shape=[batch_size, seq_length, seq_length],
        )

        # add usual attention mask
        if attention_mask is not None:
            extended_attention_mask = (1.0 - attention_mask.unsqueeze(1)) * -10000.0
            extended_attention_mask = extended_causal_mask + extended_attention_mask
        else:
            extended_attention_mask = extended_causal_mask
        return paddle.cast(
            paddle.tile(extended_attention_mask, repeat_times=[self.config.num_decoder_attention_heads, 1, 1]),
            dtype=hidden_states.dtype,
        )

    def prepare_predict_attention_mask(self, hidden_states, attention_mask):
        """Build the additive n-gram bias for the predict streams."""
        batch_size, seq_length = hidden_states.shape[:2]

        # get causal mask
        predict_causal_mask = ngram_attention_bias(
            self.max_target_positions, self.ngram, hidden_states.dtype
        )
        predict_causal_mask = paddle.concat(
            [
                predict_causal_mask[:, :seq_length, :seq_length],
                predict_causal_mask[
                    :, :seq_length, self.max_target_positions : self.max_target_positions + seq_length
                ],
            ],
            axis=-1,
        )
        extended_predict_causal_mask = paddle.expand(
            predict_causal_mask.unsqueeze(1),
            shape=[predict_causal_mask.shape[0], batch_size] + predict_causal_mask.shape[1:],
        )

        # add usual attention mask
        if attention_mask is not None:
            extended_attention_mask = (1.0 - attention_mask.unsqueeze(0).unsqueeze(2)) * -10000.0
            # predicted stream attention_mask should always be 0
            extended_attention_mask = paddle.concat(
                [extended_attention_mask, paddle.zeros_like(extended_attention_mask)], axis=-1
            )
            extended_predict_attention_mask = extended_predict_causal_mask + extended_attention_mask
        else:
            extended_predict_attention_mask = extended_predict_causal_mask
        return paddle.cast(
            paddle.tile(
                extended_predict_attention_mask,
                repeat_times=[1, self.config.num_decoder_attention_heads, 1, 1],
            ),
            dtype=hidden_states.dtype,
        )

@register_base_model
class ProphetNetModel(ProphetNetPretrainedModel):
    """
    The bare ProphetNet model outputting raw decoder hidden states (main and
    n-gram predict streams) plus the encoder output, without any head on top.
    """

    def __init__(self, config: ProphetNetConfig):
        super(ProphetNetModel, self).__init__(config)
        self.init_std = config.init_std
        self.eps = config.eps
        self.disable_ngram_loss = config.disable_ngram_loss
        self.decoder_start_token_id = config.decoder_start_token_id
        # embedding table shared by encoder and decoder
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        )

        self.encoder = ProphetNetEncoder(self.word_embeddings, config)

        self.decoder = ProphetNetDecoder(self.word_embeddings, config)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def get_input_embeddings(self):
        return self.word_embeddings

    def set_input_embeddings(self, value):
        self.word_embeddings = value

    def forward(
        self,
        input_ids=None,
        decoder_input_ids=None,
        encoder_output: Optional[Tuple] = None,
        use_cache=True,
        past_key_values=None,
        attention_mask=None,
        decoder_attention_mask=None,
    ):
        """
        Run encoder (unless ``encoder_output`` is given) and decoder.
        When the masks are None they are derived from the pad token.
        Returns the decoder outputs tuple with ``encoder_output`` appended.
        """
        assert input_ids is not None, "input_ids should be " "specified when generating attention_mask"
        if attention_mask is None:
            attention_mask = paddle.cast(input_ids != self.config.pad_token_id, dtype="int64")

        assert decoder_input_ids is not None, (
            "decoder_input_ids should be " "specified when generating decoder_attention_mask"
        )
        if decoder_attention_mask is None:
            decoder_attention_mask = paddle.cast(decoder_input_ids != self.config.pad_token_id, dtype="int64")

        if encoder_output is None:
            encoder_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            encoder_hidden_states=encoder_output,
            use_cache=use_cache,
            past_key_values=past_key_values,
            attention_mask=decoder_attention_mask,
            encoder_attention_mask=attention_mask,
        )
        return decoder_outputs + (encoder_output,)

class Linear_wo_bias(Layer):
    """A linear projection without a bias term: ``y = x @ W``."""

    def __init__(self, in_features, out_features, weight_attr=None, name=None):
        super(Linear_wo_bias, self).__init__()
        self._dtype = self._helper.get_default_dtype()
        self._weight_attr = weight_attr
        self.weight = self.create_parameter(
            shape=[in_features, out_features], attr=self._weight_attr, dtype=self._dtype, is_bias=False
        )
        self.name = name

    def forward(self, input):
        # bias-free affine map
        return F.linear(x=input, weight=self.weight, name=self.name)

    def extra_repr(self):
        suffix = ", name={}".format(self.name) if self.name else ""
        return "in_features={}, out_features={}, dtype={}{}".format(
            self.weight.shape[0], self.weight.shape[1], self._dtype, suffix
        )

class ProphetNetForConditionalGeneration(ProphetNetPretrainedModel):
    """
    ProphetNet with a (bias-free) language-modeling head for sequence-to-sequence
    generation. The head projects each of the n-gram predict streams to the
    vocabulary; the first stream provides the standard LM logits.
    """

    def __init__(self, config: ProphetNetConfig):
        super(ProphetNetForConditionalGeneration, self).__init__(config)
        self.prophetnet = ProphetNetModel(config)
        self.padding_idx = config.pad_token_id
        # vocabulary projection applied to every predict stream
        self.linear = Linear_wo_bias(config.hidden_size, config.vocab_size)

    def forward(
        self,
        input_ids=None,
        decoder_input_ids=None,
        encoder_output=None,
        labels=None,
        use_cache=True,
        past_key_values=None,
    ):
        """
        Returns ``(logits, past_key_values, predict_logits)`` when ``use_cache``
        is True, else ``(logits, predict_logits)``. ``logits`` are the 1-gram
        stream logits of shape ``[B, T, vocab_size]``.
        """
        if labels is not None and decoder_input_ids is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)
        outputs = self.prophetnet(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            encoder_output=encoder_output,
            use_cache=use_cache,
            past_key_values=past_key_values,
        )

        batch_size, sequence_length = decoder_input_ids.shape

        # outputs[1] stacks the n-gram predict streams along the sequence axis
        predicting_streams = paddle.reshape(outputs[1], (batch_size, self.config.ngram, sequence_length, -1))
        predict_logits = self.linear(predicting_streams)

        # the first (1-gram) stream carries the standard LM logits
        logits = predict_logits[:, 0]
        if use_cache:
            past_key_values = outputs[2]
            return logits, past_key_values, predict_logits
        else:
            return logits, predict_logits

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        cache=None,
        use_cache=None,
        encoder_output=None,
    ):
        """Assemble the kwargs for one generation step."""
        assert encoder_output is not None, "`encoder_output` have to be passed for generation."
        if cache is not None:
            # with a cache, only the last generated token is fed back in
            decoder_input_ids = decoder_input_ids[:, -1].unsqueeze(-1)

        # first step, decoder_cached_states are empty
        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "decoder_input_ids": decoder_input_ids,
            "encoder_output": encoder_output,
            "use_cache": use_cache,
            "past_key_values": cache,
        }

    def prepare_decoder_input_ids_from_labels(self, labels):
        return self._shift_right(labels)

    def get_encoder(self):
        return self.prophetnet.encoder

    def get_decoder(self):
        return self.prophetnet.decoder

    def __getattr__(self, name):
        try:
            return super().__getattr__(name)
        except AttributeError:
            # fall back to the wrapped base model's attributes
            return getattr(getattr(self, self.base_model_prefix), name)