# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ..ernie.modeling import ErniePretrainedModel
__all__ = ['ErnieDualEncoder', 'ErnieCrossEncoder']
class ErnieEncoder(ErniePretrainedModel):
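    """
    Shared encoder used by ``ErnieDualEncoder`` and ``ErnieCrossEncoder``: an
    ERNIE backbone followed by a dropout layer and a linear classification head.
    """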
def __init__(self, ernie, dropout=None, num_classes=2):
super(ErnieEncoder, self).__init__()
self.ernie = ernie # allow ernie to be config
self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
self.classifier = nn.Linear(768, num_classes)
self.apply(self.init_weights)
def init_weights(self, layer):
""" Initialization hook """
if isinstance(layer, nn.LayerNorm):
layer._epsilon = 1e-5
def forward(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None):
        sequence_output, pooled_output = self.ernie(input_ids,
                                                     token_type_ids=token_type_ids,
                                                     position_ids=position_ids,
                                                     attention_mask=attention_mask)
        return sequence_output, pooled_output
class ErnieDualEncoder(nn.Layer):
"""
    This class encapsulates two ErnieEncoder models into a single model, so
    that query embeddings and title embeddings can be obtained with one model,
    and the two encoders can be trained at the same time.
Example:
.. code-block::
import paddle
from paddlenlp.transformers import ErnieDualEncoder, ErnieTokenizer
model = ErnieDualEncoder("rocketqa-zh-dureader-query-encoder", "rocketqa-zh-dureader-para-encoder")
tokenizer = ErnieTokenizer.from_pretrained("rocketqa-zh-dureader-query-encoder")
inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()}
# Get query embedding
query_embedding = model.get_pooled_embedding(**inputs)
# Get title embedding
title_embedding = model.get_pooled_embedding(**inputs, is_query=False)
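            # Score the (query, title) pair as the inner product of the two
            # [CLS] embeddings; the same inputs are reused here only for brevity.
            score = model.cosine_sim(inputs["input_ids"], inputs["input_ids"],
                                     query_token_type_ids=inputs["token_type_ids"],
                                     title_token_type_ids=inputs["token_type_ids"])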
"""
def __init__(self,
query_model_name_or_path,
title_model_name_or_path=None,
share_parameters=False,
dropout=None,
use_cross_batch=False):
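        """
        Builds the query encoder from ``query_model_name_or_path``. The title
        encoder either shares the query encoder's parameters when
        ``share_parameters`` is True, or is loaded from
        ``title_model_name_or_path`` when one is given.
        """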
super().__init__()
        self.query_ernie, self.title_ernie = None, None
        self.use_cross_batch = use_cross_batch
        # Rank of the current process (0 when not running distributed); used in
        # forward() to locate this card's titles among the gathered embeddings.
        self.rank = paddle.distributed.get_rank()
self.query_ernie = ErnieEncoder.from_pretrained(
query_model_name_or_path)
if share_parameters:
self.title_ernie = self.query_ernie
elif title_model_name_or_path is not None:
self.title_ernie = ErnieEncoder.from_pretrained(
title_model_name_or_path)
def get_semantic_embedding(self, data_loader):
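        """
        Iterates over ``data_loader`` in eval mode and, with gradients
        disabled, yields the pooled [CLS] embedding of every batch (computed
        with the query encoder).
        """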
self.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
input_ids = paddle.to_tensor(input_ids)
token_type_ids = paddle.to_tensor(token_type_ids)
text_embeddings = self.get_pooled_embedding(
input_ids, token_type_ids=token_type_ids)
yield text_embeddings
def get_pooled_embedding(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None,
is_query=True):
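        """
        Returns the [CLS] token embedding produced by the query encoder when
        ``is_query`` is True, otherwise by the title encoder.
        """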
        assert (is_query and self.query_ernie is not None) or (not is_query and self.title_ernie is not None), \
            "Please make sure the `is_query` argument is consistent with how the ErnieDualEncoder was initialized."
if is_query:
sequence_output, _ = self.query_ernie(input_ids, token_type_ids,
position_ids, attention_mask)
else:
sequence_output, _ = self.title_ernie(input_ids, token_type_ids,
position_ids, attention_mask)
return sequence_output[:, 0]
def cosine_sim(self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None):
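        """
        Scores each (query, title) pair as the inner product of their [CLS]
        embeddings, which equals the cosine similarity when the embeddings
        are L2-normalized.
        """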
query_cls_embedding = self.get_pooled_embedding(query_input_ids,
query_token_type_ids,
query_position_ids,
query_attention_mask)
title_cls_embedding = self.get_pooled_embedding(title_input_ids,
title_token_type_ids,
title_position_ids,
title_attention_mask,
is_query=False)
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding,
axis=-1)
return cosine_sim
    def forward(self,
query_input_ids,
pos_title_input_ids,
neg_title_input_ids,
is_prediction=False,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
pos_title_token_type_ids=None,
pos_title_position_ids=None,
pos_title_attention_mask=None,
neg_title_token_type_ids=None,
neg_title_position_ids=None,
neg_title_attention_mask=None):
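        """
        Computes the in-batch negative ranking loss: every query is scored
        against the concatenation of all positive and negative title
        embeddings (gathered from all cards when ``use_cross_batch`` is True),
        and cross-entropy is taken with the matching positive title as the
        label. When ``is_prediction`` is True, only the query/positive-title
        scores and the two embeddings are returned.
        """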
query_cls_embedding = self.get_pooled_embedding(query_input_ids,
query_token_type_ids,
query_position_ids,
query_attention_mask)
pos_title_cls_embedding = self.get_pooled_embedding(
pos_title_input_ids, pos_title_token_type_ids,
pos_title_position_ids, pos_title_attention_mask)
neg_title_cls_embedding = self.get_pooled_embedding(
neg_title_input_ids, neg_title_token_type_ids,
neg_title_position_ids, neg_title_attention_mask)
all_title_cls_embedding = paddle.concat(
x=[pos_title_cls_embedding, neg_title_cls_embedding], axis=0)
if is_prediction:
logits = paddle.dot(query_cls_embedding, pos_title_cls_embedding)
outputs = {
"probs": logits,
"q_rep": query_cls_embedding,
"p_rep": pos_title_cls_embedding
}
return outputs
if self.use_cross_batch:
tensor_list = []
paddle.distributed.all_gather(tensor_list, all_title_cls_embedding)
all_title_cls_embedding = paddle.concat(x=tensor_list, axis=0)
# multiply
logits = paddle.matmul(query_cls_embedding,
all_title_cls_embedding,
transpose_y=True)
batch_size = query_cls_embedding.shape[0]
labels = paddle.arange(batch_size * self.rank * 2,
batch_size * (self.rank * 2 + 1),
dtype='int64')
labels = paddle.reshape(labels, shape=[-1, 1])
accuracy = paddle.metric.accuracy(input=logits, label=labels)
loss = F.cross_entropy(input=logits, label=labels)
outputs = {"loss": loss, "accuracy": accuracy}
return outputs
class ErnieCrossEncoder(nn.Layer):
"""
Example:
.. code-block::
import paddle
from paddlenlp.transformers import ErnieCrossEncoder, ErnieTokenizer
model = ErnieCrossEncoder("rocketqa-zh-dureader-cross-encoder")
tokenizer = ErnieTokenizer.from_pretrained("rocketqa-zh-dureader-cross-encoder")
inputs = tokenizer("你们好", text_pair="你好")
inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()}
            # Get the matching probability of the text pair.
            prob = model.matching(**inputs)
"""
def __init__(self,
pretrain_model_name_or_path,
num_classes=2,
dropout=None):
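        """
        Loads an ``ErnieEncoder`` whose ``num_classes``-way classification head
        scores the concatenated (query, title) input.
        """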
super().__init__()
        self.ernie = ErnieEncoder.from_pretrained(pretrain_model_name_or_path,
                                                  num_classes=num_classes,
                                                  dropout=dropout)
def matching(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None,
                 return_prob_distribution=False):
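        """
        Returns the probability that the (query, title) pair matches. When
        ``return_prob_distribution`` is True, the full softmax distribution
        over classes is returned instead of the positive-class probability.
        """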
_, pooled_output = self.ernie(input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask)
pooled_output = self.ernie.dropout(pooled_output)
cls_embedding = self.ernie.classifier(pooled_output)
probs = F.softmax(cls_embedding, axis=1)
        if return_prob_distribution:
return probs
return probs[:, 1]
    def forward(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None,
labels=None):
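        """
        Computes the cross-entropy loss and accuracy of the pair-matching
        classification against ``labels``.
        """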
probs = self.matching(input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask,
                              return_prob_distribution=True)
accuracy = paddle.metric.accuracy(input=probs, label=labels)
        loss = F.cross_entropy(input=probs, label=labels)
outputs = {"loss": loss, "accuracy": accuracy}
return outputs