# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from .utils import default_trans_func
__all__ = ["RougeL", "RougeLForDuReader"]
class RougeN:
def __init__(self, n):
self.n = n
def _get_ngrams(self, words):
"""Calculates word n-grams for multiple sentences."""
ngram_set = set()
max_index_ngram_start = len(words) - self.n
for i in range(max_index_ngram_start + 1):
ngram_set.add(tuple(words[i : i + self.n]))
return ngram_set
def score(self, evaluated_sentences_ids, reference_sentences_ids):
overlapping_count, reference_count = self.compute(evaluated_sentences_ids, reference_sentences_ids)
return overlapping_count / reference_count
def compute(self, evaluated_sentences_ids, reference_sentences_ids):
"""
Args:
evaluated_sentences (list): the sentences ids predicted by the model.
reference_sentences (list): the referenced sentences ids. Its size should be same as evaluated_sentences.
Returns:
overlapping_count (int): the overlapping n-gram count.
reference_count (int): the reference sentences n-gram count.
"""
if len(evaluated_sentences_ids) <= 0 or len(reference_sentences_ids) <= 0:
raise ValueError("Collections must contain at least 1 sentence.")
reference_count = 0
overlapping_count = 0
for evaluated_sentence_ids, reference_sentence_ids in zip(evaluated_sentences_ids, reference_sentences_ids):
evaluated_ngrams = self._get_ngrams(evaluated_sentence_ids)
reference_ngrams = self._get_ngrams(reference_sentence_ids)
reference_count += len(reference_ngrams)
# Gets the overlapping ngrams between evaluated and reference
overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
overlapping_count += len(overlapping_ngrams)
return overlapping_count, reference_count
def accumulate(self):
"""
This function returns the mean precision, recall and f1 score for all accumulated minibatches.
Returns:
float: mean precision, recall and f1 score.
"""
rouge_score = self.overlapping_count / self.reference_count
return rouge_score
def reset(self):
"""
Reset function empties the evaluation memory for previous mini-batches.
"""
self.overlapping_count = 0
self.reference_count = 0
def name(self):
"""
Return name of metric instance.
"""
return "Rouge-%s" % self.n
def update(self, overlapping_count, reference_count):
"""
Args:
"""
self.overlapping_count += overlapping_count
self.reference_count += reference_count
class Rouge1(RougeN):
def __init__(self):
super(Rouge1, self).__init__(n=1)
class Rouge2(RougeN):
def __init__(self):
super(Rouge2, self).__init__(n=2)
[docs]class RougeL(paddle.metric.Metric):
r"""
Rouge-L is Recall-Oriented Understudy for Gisting Evaluation based on Longest Common Subsequence (LCS).
Longest common subsequence problem takes into account sentence level structure
similarity naturally and identifies longest co-occurring
in sequence n-grams automatically.
.. math::
R_{LCS} & = \frac{LCS(C,S)}{len(S)}
P_{LCS} & = \frac{LCS(C,S)}{len(C)}
F_{LCS} & = \frac{(1 + \gamma^2)R_{LCS}P_{LCS}}{R_{LCS}} + \gamma^2{R_{LCS}}
where `C` is the candidate sentence, and `S` is the reference sentence.
Args:
trans_func (callable, optional): `trans_func` transforms the network
output to string to calculate.
vocab (dict|paddlenlp.data.vocab, optional): Vocab for target language.
If `trans_func` is None and RougeL is used as `paddle.metric.Metric`
instance, `default_trans_func` will be performed and `vocab` must
be provided.
gamma (float): A hyperparameter to decide the weight of recall. Defaults to 1.2.
name (str, optional): Name of `paddle.metric.Metric` instance. Defaults to "rouge-l".
Examples:
.. code-block:: python
from paddlenlp.metrics import RougeL
rougel = RougeL()
cand = ["The","cat","The","cat","on","the","mat"]
ref_list = [["The","cat","is","on","the","mat"], ["There","is","a","cat","on","the","mat"]]
rougel.add_inst(cand, ref_list)
print(rougel.score()) # 0.7800511508951408
"""
def __init__(self, trans_func=None, vocab=None, gamma=1.2, name="rouge-l", *args, **kwargs):
super(RougeL, self).__init__(*args, **kwargs)
self.gamma = gamma
self.inst_scores = []
self._name = name
self.vocab = vocab
self.trans_func = trans_func
[docs] def lcs(self, string, sub):
"""
Calculate the length of longest common subsequence of string and sub.
Args:
string (str):
The string to be calculated, usually longer the sub string.
sub (str):
The sub string to be calculated.
Returns:
float: Returns the length of the longest common subsequence of string and sub.
"""
if len(string) < len(sub):
sub, string = string, sub
lengths = np.zeros((len(string) + 1, len(sub) + 1))
for j in range(1, len(sub) + 1):
for i in range(1, len(string) + 1):
if string[i - 1] == sub[j - 1]:
lengths[i][j] = lengths[i - 1][j - 1] + 1
else:
lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1])
return lengths[len(string)][len(sub)]
[docs] def add_inst(self, cand, ref_list):
"""
Update the states based on the a pair of candidate and references.
Args:
cand (str): The candidate sentence generated by model.
ref_list (list): List of ground truth sentences.
"""
precs, recalls = [], []
for ref in ref_list:
basic_lcs = self.lcs(cand, ref)
prec = basic_lcs / len(cand) if len(cand) > 0.0 else 0.0
rec = basic_lcs / len(ref) if len(ref) > 0.0 else 0.0
precs.append(prec)
recalls.append(rec)
prec_max = max(precs)
rec_max = max(recalls)
if prec_max != 0 and rec_max != 0:
score = ((1 + self.gamma**2) * prec_max * rec_max) / float(rec_max + self.gamma**2 * prec_max)
else:
score = 0.0
self.inst_scores.append(score)
[docs] def update(self, output, label, seq_mask=None):
if self.trans_func is None:
if self.vocab is None:
raise AttributeError(
"The `update` method requires users to provide `trans_func` or `vocab` when initializing RougeL."
)
cand_list, ref_list = default_trans_func(output, label, seq_mask, self.vocab)
else:
cand_list, ref_list = self.trans_func(output, label, seq_mask)
if len(cand_list) != len(ref_list):
raise ValueError("Length error! Please check the output of network.")
for i in range(len(cand_list)):
self.add_inst(cand_list[i], ref_list[i])
[docs] def accumulate(self):
"""
Calculate the final rouge-l metric.
"""
return 1.0 * sum(self.inst_scores) / len(self.inst_scores)
def score(self):
return self.accumulate()
[docs] def reset(self):
self.inst_scores = []
[docs] def name(self):
return self._name
[docs]class RougeLForDuReader(RougeL):
"""
Rouge-L metric with bonus for DuReader contest.
Please refer to `DuReader Homepage<https://ai.baidu.com//broad/subordinate?dataset=dureader>`_ for more details.
Args:
alpha (float, optional): Weight of YesNo dataset when adding bonus for DuReader contest. Defaults to 1.0.
beta (float, optional): Weight of Entity dataset when adding bonus for DuReader contest. Defaults to 1.0.
"""
def __init__(self, alpha=1.0, beta=1.0, gamma=1.2):
super(RougeLForDuReader, self).__init__(gamma)
self.alpha = alpha
self.beta = beta
[docs] def add_inst(self, cand, ref_list, yn_label=None, yn_ref=None, entity_ref=None):
precs, recalls = [], []
for i, ref in enumerate(ref_list):
basic_lcs = self.lcs(cand, ref)
yn_bonus, entity_bonus = 0.0, 0.0
if yn_ref is not None and yn_label is not None:
yn_bonus = self.add_yn_bonus(cand, ref, yn_label, yn_ref[i])
elif entity_ref is not None:
entity_bonus = self.add_entity_bonus(cand, entity_ref)
p_denom = len(cand) + self.alpha * yn_bonus + self.beta * entity_bonus
r_denom = len(ref) + self.alpha * yn_bonus + self.beta * entity_bonus
prec = (basic_lcs + self.alpha * yn_bonus + self.beta * entity_bonus) / p_denom if p_denom > 0.0 else 0.0
rec = (basic_lcs + self.alpha * yn_bonus + self.beta * entity_bonus) / r_denom if r_denom > 0.0 else 0.0
precs.append(prec)
recalls.append(rec)
prec_max = max(precs)
rec_max = max(recalls)
if prec_max != 0 and rec_max != 0:
score = ((1 + self.gamma**2) * prec_max * rec_max) / float(rec_max + self.gamma**2 * prec_max)
else:
score = 0.0
self.inst_scores.append(score)
def add_yn_bonus(self, cand, ref, yn_label, yn_ref):
if yn_label != yn_ref:
return 0.0
lcs_ = self.lcs(cand, ref)
return lcs_
def add_entity_bonus(self, cand, entity_ref):
lcs_ = 0.0
for ent in entity_ref:
if ent in cand:
lcs_ += len(ent)
return lcs_