# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.nn import TransformerEncoder, TransformerEncoderLayer
from paddlenlp.ops.ext_utils import LOADED_EXT, load
from paddlenlp.ops.fast_transformer.transformer.decoding import transfer_param
from paddlenlp.utils.log import logger
from .decoding import run_custom
def encoder_layer_forward(self, src, src_mask, cache=None, sequence_id_offset=None, trt_seq_len=None):
"""
Redefines `forward` function of `paddle.nn.TransformerEncoderLayer` for
integrating FastGeneration for inference.
The original `forward` function will not be replaced unless
`enable_fast_encoder` is called on the model that holds this layer. After
the replacement, `paddle.nn.TransformerEncoderLayer` objects still have the
same member variables as before.
After inference, `disable_fast_encoder` could be called to restore the
`forward` function of `paddle.nn.TransformerEncoder` and
`paddle.nn.TransformerEncoderLayer`.
Args:
src (Tensor):
The input of Transformer encoder layer. It is a tensor with shape
`[batch_size, sequence_length, d_model]`. The data type should be
float32 or float16.
src_mask (Tensor, optional):
A tensor used in multi-head attention to prevent attention to some
unwanted positions, usually the paddings or the subsequent
positions. It is a tensor with shape `[batch_size, 1, 1, sequence_length]`.
When the data type is bool, the unwanted positions have `False`
values and the others have `True` values. When the data type is int,
the unwanted positions have value 0 and the others have value 1.
When the data type is float, the unwanted positions have `-INF`
values and the others have value 0. It can be None when nothing
needs to be prevented from being attended to. Defaults to None.
Returns:
src (Tensor|tuple):
It is a tensor that has the same shape and data type as the input `src`,
representing the output of the Transformer encoder layer. Or a tuple if
`cache` is not None: besides the encoder layer output, the tuple includes
the new cache, which is the same as the input `cache` argument except
that the `incremental_cache` in it has an incremental length. See
`paddle.nn.MultiHeadAttention.gen_cache` and
`paddle.nn.MultiHeadAttention.forward` for more details.
"""
if cache is not None:
raise NotImplementedError("cache in encoder is not supported now")
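# Call the fused FastGeneration encoder op for this single layer. Each weight
# is passed as a one-element list because the custom op takes per-layer lists.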
src = infer_transformer_encoder(
input=[src],
attn_mask=[src_mask],
q_weight=[self.self_attn.q_proj.weight],
q_bias=[self.self_attn.q_proj.bias],
k_weight=[self.self_attn.k_proj.weight],
k_bias=[self.self_attn.k_proj.bias],
v_weight=[self.self_attn.v_proj.weight],
v_bias=[self.self_attn.v_proj.bias],
attn_out_weight=[self.self_attn.out_proj.weight],
attn_out_bias=[self.self_attn.out_proj.bias],
norm1_weight=[self.norm1.weight],
norm1_bias=[self.norm1.bias],
norm2_weight=[self.norm2.weight],
norm2_bias=[self.norm2.bias],
ffn_inter_weight=[self.linear1.weight],
ffn_inter_bias=[self.linear1.bias],
ffn_out_weight=[self.linear2.weight],
ffn_out_bias=[self.linear2.bias],
# sequence_id_offset=paddle.to_tensor([]),
# trt_seqlen_offset=paddle.to_tensor([]),
# amax_list=paddle.to_tensor([]), # int8 mode is not supported.
n_head=self._config["nhead"],
size_per_head=self._config["d_model"] // self._config["nhead"],
use_gelu=self._config["activation"] == "gelu",
normalize_before=self._config["normalize_before"] is True,
)
return src
def encoder_forward(self, src, src_mask=None, cache=None):
"""
Redefines `forward` function of `paddle.nn.TransformerEncoder` for
integrating FastGeneration for inference.
The original `forward` function will not be replaced unless
`enable_fast_encoder` is called on the model that holds this encoder. After
the replacement, `paddle.nn.TransformerEncoder` objects still have the same
member variables as before.
After inference, `disable_fast_encoder` could be called to restore the
`forward` function of `paddle.nn.TransformerEncoder` and
`paddle.nn.TransformerEncoderLayer`.
Args:
src (Tensor):
The input of Transformer encoder. It is a tensor
with shape `[batch_size, sequence_length, d_model]`. The data
type should be float32 or float16.
src_mask (Tensor, optional):
A tensor used in multi-head attention to prevent attention to
some unwanted positions, usually the paddings or the subsequent
positions. It is a tensor with shape `[batch_size, 1, 1, sequence_length]`.
The data type must be float. The unwanted positions have `-INF` (or any
other non-zero) values and the wanted positions must be 0.0. Defaults to None.
Returns:
output (Tensor|tuple):
It is a tensor that has the same shape and data type as `src`,
representing the output of the Transformer encoder. Or a tuple if
`cache` is not None: besides the encoder output, the tuple includes
the new cache, which is the same as the input `cache` argument except
that the `incremental_cache` in it has an incremental length. See
`paddle.nn.MultiHeadAttention.gen_cache` and
`paddle.nn.MultiHeadAttention.forward` for more details.
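Examples:
A minimal sketch (tensor names are illustrative) of building the float
mask expected here from a 0/1 padding mask:
.. code-block:: python
import paddle
# 1 marks a real token, 0 marks padding: shape [batch_size, seq_len]
pad_mask = paddle.to_tensor([[1, 1, 1, 0]], dtype="float32")
# Wanted positions become 0.0, padding positions become -INF
src_mask = paddle.where(
pad_mask > 0,
paddle.zeros_like(pad_mask),
paddle.full_like(pad_mask, float("-inf")),
)
# Expand to [batch_size, 1, 1, seq_len] as expected by the fast encoder
src_mask = paddle.unsqueeze(src_mask, axis=[1, 2])
# `src_mask` can then be passed together with `src` to an encoder on
# which `enable_fast_encoder` has been applied.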
"""
if cache is not None:
raise NotImplementedError("cache in encoder is not supported now")
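# Convert the float additive mask (`-INF` for unwanted positions) into a 0/1
# mask with the same dtype as `src`, then expand it to
# [batch_size, 1, sequence_length, sequence_length] via an outer product with
# its transpose.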
if src_mask.dtype == paddle.float16:
src_mask = paddle.cast(src_mask, dtype="float32")
src_mask = src_mask == 0.0
if src_mask.dtype != src.dtype:
src_mask = paddle.cast(src_mask, src.dtype)
if len(src_mask.shape) == 4:
# transpose_src_mask: [batch_size, 1, sequence_length, 1]
transpose_src_mask = paddle.transpose(src_mask, perm=[0, 1, 3, 2])
# src_mask: [batch_size, 1, sequence_length, sequence_length]
src_mask = src_mask * transpose_src_mask
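# Collect the per-layer weights once and cache them on the encoder object, so
# later calls can pass them straight to the fused op without rebuilding the lists.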
if getattr(self, "q_weight", None) is None:
self.q_weight = []
self.q_bias = []
self.k_weight = []
self.k_bias = []
self.v_weight = []
self.v_bias = []
self.attn_out_weight = []
self.attn_out_bias = []
self.norm1_weight = []
self.norm1_bias = []
self.norm2_weight = []
self.norm2_bias = []
self.ffn_inter_weight = []
self.ffn_inter_bias = []
self.ffn_out_weight = []
self.ffn_out_bias = []
for layer in self.layers:
self.q_weight.append(layer.self_attn.q_proj.weight)
self.q_bias.append(layer.self_attn.q_proj.bias)
self.k_weight.append(layer.self_attn.k_proj.weight)
self.k_bias.append(layer.self_attn.k_proj.bias)
self.v_weight.append(layer.self_attn.v_proj.weight)
self.v_bias.append(layer.self_attn.v_proj.bias)
self.attn_out_weight.append(layer.self_attn.out_proj.weight)
self.attn_out_bias.append(layer.self_attn.out_proj.bias)
self.norm1_weight.append(layer.norm1.weight)
self.norm1_bias.append(layer.norm1.bias)
self.norm2_weight.append(layer.norm2.weight)
self.norm2_bias.append(layer.norm2.bias)
self.ffn_inter_weight.append(layer.linear1.weight)
self.ffn_inter_bias.append(layer.linear1.bias)
self.ffn_out_weight.append(layer.linear2.weight)
self.ffn_out_bias.append(layer.linear2.bias)
output = infer_transformer_encoder(
input=[src],
attn_mask=[src_mask],
q_weight=self.q_weight,
q_bias=self.q_bias,
k_weight=self.k_weight,
k_bias=self.k_bias,
v_weight=self.v_weight,
v_bias=self.v_bias,
attn_out_weight=self.attn_out_weight,
attn_out_bias=self.attn_out_bias,
norm1_weight=self.norm1_weight,
norm1_bias=self.norm1_bias,
norm2_weight=self.norm2_weight,
norm2_bias=self.norm2_bias,
ffn_inter_weight=self.ffn_inter_weight,
ffn_inter_bias=self.ffn_inter_bias,
ffn_out_weight=self.ffn_out_weight,
ffn_out_bias=self.ffn_out_bias,
# sequence_id_offset=paddle.to_tensor([]),
# trt_seqlen_offset=paddle.to_tensor([]),
# amax_list=paddle.to_tensor([]), # int8 mode is not supported.
n_head=self.layers[0]._config["nhead"],
size_per_head=self.layers[0]._config["d_model"] // self.layers[0]._config["nhead"],
use_gelu=self.layers[0]._config["activation"] == "gelu",
normalize_before=self.layers[0]._config["normalize_before"] is True,
)
if self.norm is not None:
output = self.norm(output)
return output
def enable_fast_encoder(self, use_fp16=False, encoder_lib=None):
"""
Compiles the fused encoder operator integrated with FastGeneration via
JIT (Just-In-Time) compilation, and replaces the `forward` function of the
`paddle.nn.TransformerEncoder` and `paddle.nn.TransformerEncoderLayer`
sub-layers of `self` to support inference with FastGeneration.
Examples:
.. code-block:: python
from paddlenlp.ops import enable_fast_encoder, disable_fast_encoder
model.eval()
model = enable_fast_encoder(model)
enc_out = model(src, src_mask)
model = disable_fast_encoder(model)
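# Optionally (a sketch, assuming a GPU build of the fused op is available):
# convert the encoder weights to float16 in place and run the fp16 kernel.
# model = enable_fast_encoder(model, use_fp16=True)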
"""
def init_func(layer):
if isinstance(layer, TransformerEncoderLayer):
is_usable = True
if layer._config["bias_attr"] is False:
logger.warning(
"`False` for paddle.nn.TransformerEncoder's parameter `bias_attr` "
"is not supported by FastGeneration for now. The original forward "
"will be used instead."
)
is_usable = False
if layer._config["activation"] not in ("relu", "gelu"):
logger.warning("Only 'relu' or 'gelu' is supported by now. " "The original forward will be involved.")
is_usable = False
if is_usable:
layer.forward = layer._ft_forward
elif isinstance(layer, TransformerEncoder):
layer.forward = layer._ft_forward
if use_fp16:
convert_to_fp16(layer)
if not self.training:
try:
# Pass the decoding lib to avoid rebuilding the encoder.
# TODO: check whether the decoding lib already contains the encoder or not.
if encoder_lib is not None:
if "FastGeneration" not in LOADED_EXT.keys():
ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(encoder_lib)
LOADED_EXT["FastGeneration"] = ops
else:
load("FastGeneration", verbose=True)
except Exception:
logger.warning("Exception occurs when using FasterEncoder. " "The original forward will be involved. ")
return self
for layer in self.children():
layer.apply(init_func)
return self
def disable_fast_encoder(self):
"""
Restores the original `forward` function of the `paddle.nn.TransformerEncoder`
and `paddle.nn.TransformerEncoderLayer` sub-layers of `self`.
Examples:
.. code-block:: python
from paddlenlp.ops import enable_fast_encoder, disable_fast_encoder
model.eval()
model = enable_fast_encoder(model)
enc_out = model(src, src_mask)
model = disable_fast_encoder(model)
"""
def init_func(layer):
if isinstance(layer, (TransformerEncoderLayer, TransformerEncoder)):
layer.forward = layer._ori_forward
for layer in self.children():
layer.apply(init_func)
return self
def convert_to_fp16(transformer_encoder):
"""Convert paddle.nn.TransformerEncoder's parameter from float32 to float16
Args:
transformer_encoder (obeject, paddle.nn.TransformerEncoder):
The object to be converted to float16 inplaced, it must be an isinstance
of paddle.nn.TransformerEncoder.
"""
if not isinstance(transformer_encoder, paddle.nn.TransformerEncoder):
logger.warning(
"transformer_encoder is not an instance of paddle.nn.TransformerEncoder, "
"return itself with no parameter conversion."
)
return transformer_encoder
else:
encoder_layers = transformer_encoder.layers
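# `transfer_param` casts each parameter to float16; `restore_data=True` keeps
# the trained values instead of re-initializing them, and `is_bias=True` marks
# bias parameters.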
for mod in encoder_layers:
mod.norm1.weight = transfer_param(mod.norm1.weight, restore_data=True)
mod.norm1.bias = transfer_param(mod.norm1.bias, is_bias=True, restore_data=True)
mod.norm2.weight = transfer_param(mod.norm2.weight, restore_data=True)
mod.norm2.bias = transfer_param(mod.norm2.bias, is_bias=True, restore_data=True)
mod.linear1.weight = transfer_param(mod.linear1.weight, restore_data=True)
mod.linear1.bias = transfer_param(mod.linear1.bias, is_bias=True, restore_data=True)
mod.self_attn.q_proj.weight = transfer_param(mod.self_attn.q_proj.weight, restore_data=True)
mod.self_attn.q_proj.bias = transfer_param(mod.self_attn.q_proj.bias, is_bias=True, restore_data=True)
mod.self_attn.k_proj.weight = transfer_param(mod.self_attn.k_proj.weight, restore_data=True)
mod.self_attn.k_proj.bias = transfer_param(mod.self_attn.k_proj.bias, is_bias=True, restore_data=True)
mod.self_attn.v_proj.weight = transfer_param(mod.self_attn.v_proj.weight, restore_data=True)
mod.self_attn.v_proj.bias = transfer_param(mod.self_attn.v_proj.bias, is_bias=True, restore_data=True)
mod.self_attn.out_proj.weight = transfer_param(mod.self_attn.out_proj.weight, restore_data=True)
mod.self_attn.out_proj.bias = transfer_param(mod.self_attn.out_proj.bias, is_bias=True, restore_data=True)
mod.linear2.weight = transfer_param(mod.linear2.weight, restore_data=True)
mod.linear2.bias = transfer_param(mod.linear2.bias, is_bias=True, restore_data=True)
logger.info("Convert transformer_encoder's parameters from float32 to float16 successfully.")