# Source code for paddlenlp.metrics.distinct

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle

__all__ = ["Distinct"]


class Distinct(paddle.metric.Metric):
    """
    `Distinct` is an algorithm for evaluating the textual diversity of the
    generated text by calculating the number of distinct n-grams. The larger
    the number of distinct n-grams, the higher the diversity of the text. See
    details at https://arxiv.org/abs/1510.03055.

    :class:`Distinct` could be used as a :class:`paddle.metric.Metric` class,
    or an ordinary class. When :class:`Distinct` is used as a
    :class:`paddle.metric.Metric` class, a function is needed to transform
    the network output to a string list.

    Args:
        n_size (int, optional):
            Number of gram for :class:`Distinct` metric. Defaults to 2.
        trans_func (callable, optional):
            `trans_func` transforms the network output to a string list.
            Defaults to None.

            .. note::
                When :class:`Distinct` is used as a
                :class:`paddle.metric.Metric` class, `trans_func` must be
                provided. Please note that the input of `trans_func` is
                numpy array.
        name (str, optional):
            Name of :class:`paddle.metric.Metric` instance.
            Defaults to "distinct".

    Examples:
        1. Using as a general evaluation object.

        .. code-block:: python

            from paddlenlp.metrics import Distinct
            distinct = Distinct()
            cand = ["The","cat","The","cat","on","the","mat"]
            #update the states
            distinct.add_inst(cand)
            print(distinct.score())
            # 0.8333333333333334

        2. Using as an instance of `paddle.metric.Metric`.

        .. code-block:: python

            import numpy as np
            from functools import partial
            import paddle
            from paddlenlp.transformers import BertTokenizer
            from paddlenlp.metrics import Distinct

            def trans_func(logits, tokenizer):
                '''Transform the network output `logits` to string list.'''
                # [batch_size, seq_len]
                token_ids = np.argmax(logits, axis=-1).tolist()
                cand_list = []
                for ids in token_ids:
                    tokens = tokenizer.convert_ids_to_tokens(ids)
                    strings = tokenizer.convert_tokens_to_string(tokens)
                    cand_list.append(strings.split())
                return cand_list

            paddle.seed(2021)
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            distinct = Distinct(trans_func=partial(trans_func, tokenizer=tokenizer))
            batch_size, seq_len, vocab_size = 4, 16, tokenizer.vocab_size
            logits = paddle.rand([batch_size, seq_len, vocab_size])

            distinct.update(logits.numpy())
            print(distinct.accumulate()) # 1.0
    """

    def __init__(self, n_size=2, trans_func=None, name="distinct"):
        super(Distinct, self).__init__()
        self._name = name
        # Set of unique n-grams seen so far; each n-gram is stored as its
        # tokens joined by a single space.
        self.diff_ngram = set()
        # Total number of n-grams seen so far (kept as a float, as before).
        self.count = 0.0
        self.n_size = n_size
        self.trans_func = trans_func

    def update(self, output, *args):
        """
        Updates the metrics states. This method firstly will use
        :meth:`trans_func` method to process the `output` to get the tokenized
        candidate sentence list. Then call :meth:`add_inst` method to process
        the candidate list one by one.

        Args:
            output (numpy.ndarray|Tensor):
                The outputs of model.
            args (tuple):
                The additional inputs. They are accepted but not used.

        Raises:
            AssertionError: If `trans_func` was not provided when the
                instance was created.
        """
        if isinstance(output, paddle.Tensor):
            output = output.numpy()

        # Raise explicitly rather than using an `assert` statement so the
        # check still runs under `python -O`; the exception type and message
        # are the same as before.
        if self.trans_func is None:
            raise AssertionError(
                "The `update` method requires user "
                "to provide `trans_func` when initializing `Distinct`."
            )

        for cand in self.trans_func(output):
            self.add_inst(cand)

    def add_inst(self, cand):
        """
        Updates the states based on the candidate.

        Args:
            cand (list):
                Tokenized candidate sentence generated by model.
        """
        n_size = self.n_size
        # A candidate shorter than `n_size` yields an empty range and so
        # contributes no n-grams.
        for start in range(len(cand) - n_size + 1):
            ngram = " ".join(cand[start:start + n_size])
            self.count += 1
            self.diff_ngram.add(ngram)

    def reset(self):
        """Resets states and result."""
        self.diff_ngram = set()
        self.count = 0.0

    def accumulate(self):
        """
        Calculates the final distinct score.

        Returns:
            float: The final distinct score, i.e. the number of distinct
            n-grams divided by the total number of n-grams. Returns 0.0 when
            no n-gram has been added yet.
        """
        # Guard against division by zero: nothing has been counted right
        # after construction or `reset`, or when every candidate was shorter
        # than `n_size`.
        if not self.count:
            return 0.0
        return len(self.diff_ngram) / self.count

    def score(self):
        """
        The function is the same as :meth:`accumulate` method.

        Returns:
            float: The final distinct score.
        """
        return self.accumulate()

    def name(self):
        """
        Returns the metric name.

        Returns:
            str: The metric name.
        """
        return self._name