Source code for paddlenlp.taskflow.dialogue

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib
from collections import deque

import numpy as np
import paddle

from ..data import Pad
from ..transformers import UnifiedTransformerLMHeadModel, UnifiedTransformerTokenizer
from .task import Task

usage = r"""
           from paddlenlp import Taskflow

           # Non-interactive mode
           dialogue = Taskflow("dialogue")
           dialogue(["吃饭了吗"])
           '''
           ['刚吃完饭,你在干什么呢?']
           '''
           dialogue(["你好", "吃饭了吗"], ["你是谁?"])
           '''
           ['吃过了,你呢', '我是李明啊']
           '''

           dialogue = Taskflow("dialogue")
           # Enter the interactive mode (type exit to quit)
           dialogue.interactive_mode(max_turn=3)

           '''
           [Human]:你好
           [Bot]:你好,很高兴认识你,我想问你一下,你喜欢运动吗?
           [Human]:喜欢
           [Bot]:那你喜欢什么运动啊?
           [Human]:篮球,你喜欢篮球吗
           [Bot]:当然了,我很喜欢打篮球的。
           '''
         """


class DialogueTask(Task):
    """
    Task of Chinese open domain dialogue.

    Args:
        task(string): The name of the task.
        model(string): The model name in the task.
        kwargs (dict, optional): Additional keyword arguments passed along to the specific task.
    """

    resource_files_names = {
        "model_state": "model_state.pdparams",
        "model_config": "model_config.json",
    }
    resource_files_urls = {
        "plato-mini": {
            "model_state": [
                "https://bj.bcebos.com/paddlenlp/taskflow/dialogue/plato-mini/model_state.pdparams",
                "450be85b9b7f0bc03b12252a75af04f3",
            ],
            "model_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/dialogue/plato-mini/model_config.json",
                "5e853fda9a9b573815ad112e494a65af",
            ],
        },
        "__internal_testing__/tiny-random-plato": {
            "model_state": [
                "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-plato/model_state.pdparams",
                "fda5d068908505cf0c3a46125eb4d39e",
            ],
            "model_config": [
                "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-plato/config.json",
                "3664e658d5273a132f2e7345a8cafa53",
            ],
        },
    }

    def __init__(self, task, model, batch_size=1, max_seq_len=512, **kwargs):
        super().__init__(task=task, model=model, **kwargs)
        self._static_mode = False
        self._usage = usage
        if not self._custom_model:
            self._check_task_files()
        self._construct_tokenizer(self._task_path if self._custom_model else model)
        self._batch_size = batch_size
        self._max_seq_len = max_seq_len
        self._interactive_mode = False
        if self._static_mode:
            self._get_inference_model()
        else:
            self._construct_model(self._task_path if self._custom_model else model)

    def _construct_input_spec(self):
        """
        Construct the input spec for converting the dygraph model to a static model.
        """
        self._input_spec = [
            paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"),
            paddle.static.InputSpec(shape=[None], dtype="int64", name="token_type_ids"),
        ]

    def _construct_model(self, model):
        """
        Construct the inference model for the predictor.
        """
        model_instance = UnifiedTransformerLMHeadModel.from_pretrained(model, from_hf_hub=self.from_hf_hub)
        model_instance.eval()
        self._model = model_instance

    def _construct_tokenizer(self, model):
        """
        Construct the tokenizer for the predictor.
        """
        self._tokenizer = UnifiedTransformerTokenizer.from_pretrained(model, from_hf_hub=self.from_hf_hub)

    def _batchify_fn(self, batch_examples):
        # padding = False if self._batch_size == 1 else True
        pad_func = Pad(pad_val=self._tokenizer.pad_token_id, pad_right=False, dtype="int64")

        def pad_mask(batch_attention_mask):
            batch_size = len(batch_attention_mask)
            max_len = max(map(len, batch_attention_mask))
            attention_mask = np.ones((batch_size, max_len, max_len), dtype="float32") * -1e4
            for i, mask_data in enumerate(attention_mask):
                seq_len = len(batch_attention_mask[i])
                mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype="float32")
            # In order to ensure the correct broadcasting mechanism, expand one
            # dimension to the second dimension (n_head of Transformer).
            attention_mask = np.expand_dims(attention_mask, axis=1)
            return attention_mask

        input_ids = pad_func([example["input_ids"] for example in batch_examples])
        token_type_ids = pad_func([example["token_type_ids"] for example in batch_examples])
        position_ids = pad_func([example["position_ids"] for example in batch_examples])
        attention_mask = pad_mask([example["attention_mask"] for example in batch_examples])
        return input_ids, token_type_ids, position_ids, attention_mask

    def _check_input_text(self, inputs):
        if self._interactive_mode:
            if isinstance(inputs, str):
                self.context.append(inputs.strip())
                inputs = [list(self.context)]
                return inputs
            else:
                raise ValueError("In the interactive mode, the input data should be a string.")
        elif not isinstance(inputs[0], list):
            raise ValueError("If not in the interactive mode, the input data should be a list.")
        return inputs

    def _batchify(self, data, max_seq_len, batch_size):
        """
        Generate input batches.
        """
        padding = False if batch_size == 1 else True
        pad_func = Pad(pad_val=self._tokenizer.pad_token_id, pad_right=False, dtype=np.int64)

        def pad_mask(batch_attention_mask):
            batch_size = len(batch_attention_mask)
            max_len = max(map(len, batch_attention_mask))
            attention_mask = np.ones((batch_size, max_len, max_len), dtype="float32") * -1e4
            for i, mask_data in enumerate(attention_mask):
                seq_len = len(batch_attention_mask[i])
                mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype="float32")
            # In order to ensure the correct broadcasting mechanism, expand one
            # dimension to the second dimension (n_head of Transformer).
            attention_mask = np.expand_dims(attention_mask, axis=1)
            return attention_mask

        def _parse_batch(batch_examples):
            if padding:
                input_ids = pad_func([example["input_ids"] for example in batch_examples])
                token_type_ids = pad_func([example["token_type_ids"] for example in batch_examples])
                position_ids = pad_func([example["position_ids"] for example in batch_examples])
                attention_mask = pad_mask([example["attention_mask"] for example in batch_examples])
            else:
                input_ids = np.asarray([example["input_ids"] for example in batch_examples], dtype=np.int64)
                token_type_ids = np.asarray([example["token_type_ids"] for example in batch_examples], dtype=np.int64)
                position_ids = np.asarray([example["position_ids"] for example in batch_examples], dtype=np.int64)
                attention_mask = np.asarray([example["attention_mask"] for example in batch_examples])
                attention_mask = np.expand_dims(attention_mask, 0)
            return input_ids, token_type_ids, position_ids, attention_mask

        examples = []
        for texts in data:
            examples.append(self._convert_text_to_input(texts, max_seq_len))

        # Separates data into some batches.
        one_batch = []
        for example in examples:
            one_batch.append(example)
            if len(one_batch) == batch_size:
                yield _parse_batch(one_batch)
                one_batch = []
        if one_batch:
            yield _parse_batch(one_batch)

    def _convert_text_to_input(self, texts, max_seq_len):
        """
        Convert input strings to tokens.
        """
        return self._tokenizer.dialogue_encode(
            texts, max_seq_len=max_seq_len, add_start_token_as_response=True, is_split_into_words=False
        )

    def _preprocess(self, inputs):
        """
        Transform the raw text to the model inputs, two steps involved:
            1) Transform the raw text to token ids.
            2) Generate the other model inputs from the raw text and token ids.
        """
        inputs = self._check_input_text(inputs)
        # Get the config from the kwargs
        num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0  # noqa: F841
        lazy_load = self.kwargs["lazy_load"] if "lazy_load" in self.kwargs else False  # noqa: F841
        batches = self._batchify(inputs, self._max_seq_len, self._batch_size)
        outputs = {}
        outputs["batches"] = batches
        outputs["text"] = inputs
        return outputs

    def _run_model(self, inputs):
        """
        Run the task model from the outputs of the `_preprocess` function.
        """
        all_ids = []
        all_scores = []
        for batch in inputs["batches"]:
            input_ids, token_type_ids, position_ids, attention_mask = map(paddle.to_tensor, batch)
            ids, scores = self._model.generate(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                attention_mask=attention_mask,
                max_length=64,
                min_length=1,
                decode_strategy="sampling",
                temperature=1.0,
                top_k=5,
                top_p=1.0,
                num_beams=0,
                length_penalty=1.0,
                early_stopping=False,
                use_fast=False,
                num_return_sequences=1,
            )
            all_ids.extend([ids])
            all_scores.extend([scores])
        inputs["ids"] = all_ids
        inputs["scores"] = all_scores
        return inputs

    def _post_process_response(self, token_ids, tokenizer):
        """
        Post-process the decoded sequence. Truncate from the first <eos>.
        """
        eos_pos = len(token_ids)
        for i, tok_id in enumerate(token_ids):
            if tok_id == tokenizer.sep_token_id:
                eos_pos = i
                break
        token_ids = token_ids[:eos_pos]
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        tokens = tokenizer.merge_subword(tokens)
        return token_ids, tokens

    @contextlib.contextmanager
    def interactive_mode(self, max_turn=3):
        """
        Enter the interactive mode.
        """
        self._interactive_mode = True
        self.max_turn = max_turn
        self.context = deque(maxlen=self.max_turn)
        yield
        self.context.clear()
        self._interactive_mode = False
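
    # Illustrative note: `deque(maxlen=max_turn)` keeps only the most recent `max_turn`
    # utterances (user inputs appended in `_check_input_text` and bot replies appended
    # in `_postprocess`), so older context falls out of the rolling window automatically
    # as the conversation grows.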

    def _get_in_turn_repetition(self, pred, is_cn=False):
        """
        Get in-turn repetition.
        """
        if len(pred) == 0:
            return 1.0
        if isinstance(pred[0], str):
            pred = [tok.lower() for tok in pred]
            if is_cn:
                pred = "".join(pred)
        tri_grams = set()
        for i in range(len(pred) - 2):
            tri_gram = tuple(pred[i : i + 3])
            if tri_gram in tri_grams:
                return True
            tri_grams.add(tri_gram)
        return False

    def _select_response(self, ids, scores, tokenizer, max_dec_len=None, num_return_sequences=1, keep_space=True):
        """
        Select response with the highest score.
        """
        ids = ids.numpy().tolist()
        scores = scores.numpy()
        if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0:
            raise ValueError(
                "the length of `ids` is {}, but the `num_return_sequences` is {}".format(
                    len(ids), num_return_sequences
                )
            )

        group = []
        tmp = []
        for pred, score in zip(ids, scores):
            pred_token_ids, pred_tokens = self._post_process_response(pred, tokenizer)
            num_token = len(pred_token_ids)
            if keep_space:
                response = " ".join(pred_tokens)
            else:
                response = "".join(pred_tokens)

            in_turn_repetition = self._get_in_turn_repetition(pred_tokens, True) or self._get_in_turn_repetition(
                pred_token_ids
            )
            # not ending
            if max_dec_len is not None and num_token >= max_dec_len:
                score -= 1e3
            elif in_turn_repetition:
                score -= 1e3
            tmp.append([response, score])
            if len(tmp) == num_return_sequences:
                group.append(tmp)
                tmp = []

        results = []
        for preds in group:
            preds = sorted(preds, key=lambda x: -x[1])
            results.append(preds[0][0])
        return results

    def _postprocess(self, inputs):
        all_ids = inputs["ids"]
        all_scores = inputs["scores"]
        texts = inputs["text"]
        results = []
        for ids, scores, text in zip(all_ids, all_scores, texts):
            results.extend(
                self._select_response(ids, scores, self._tokenizer, num_return_sequences=1, keep_space=False)
            )
        if self._interactive_mode:
            self.context.append(results[0].strip())
        return results
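

# A small sketch of the batching convention used by `_batchify` above (illustrative):
# token ids are left-padded (`Pad(..., pad_right=False)`), and `pad_mask` returns a
# float32 mask of shape [batch_size, 1, max_len, max_len] filled with -1e4 at padded
# positions, with each example's own attention mask copied into the bottom-right
# seq_len x seq_len block; the extra second dimension broadcasts over attention heads.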