paddlenlp.taskflow.knowledge_mining 源代码

# coding:utf-8
# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from collections import OrderedDict

import numpy as np
import paddle

from ..datasets import load_dataset
from ..transformers import ErnieCtmNptagModel, ErnieCtmTokenizer, ErnieCtmWordtagModel
from ..transformers.ernie_ctm.configuration import ErnieCtmConfig
from .task import Task
from .utils import (
    BurkhardKellerTree,
    Customization,
    DataCollatorForErnieCtm,
    TermTree,
    WordTagRelationExtractor,
    add_docstrings,
)

LABEL_TO_SCHEMA = {
    "人物类_实体": ["人物|E", "虚拟角色|E", "演艺团体|E"],
    "人物类_概念": ["人物|C", "虚拟角色|C"],
    "作品类_实体": ["作品与出版物|E"],
    "作品类_概念": ["作品与出版物|C", "文化类"],
    "组织机构类": ["组织机构"],
    "组织机构类_企事业单位": ["企事业单位", "品牌", "组织机构"],
    "组织机构类_医疗卫生机构": ["医疗卫生机构", "组织机构"],
    "组织机构类_国家机关": ["国家机关", "组织机构"],
    "组织机构类_体育组织机构": ["体育组织机构", "组织机构"],
    "组织机构类_教育组织机构": ["教育组织机构", "组织机构"],
    "组织机构类_军事组织机构": ["军事组织机构", "组织机构"],
    "物体类": ["物体与物品", "品牌", "虚拟物品", "虚拟物品"],
    "物体类_兵器": ["兵器"],
    "物体类_化学物质": ["物体与物品", "化学术语"],
    "其他角色类": ["角色"],
    "文化类": ["文化", "作品与出版物|C", "体育运动项目", "语言文字"],
    "文化类_语言文字": ["语言学术语"],
    "文化类_奖项赛事活动": ["奖项赛事活动", "特殊日", "事件"],
    "文化类_制度政策协议": ["制度政策协议", "法律法规"],
    "文化类_姓氏与人名": ["姓氏与人名"],
    "生物类": ["生物"],
    "生物类_植物": ["植物", "生物"],
    "生物类_动物": ["动物", "生物"],
    "品牌名": ["品牌", "企事业单位"],
    "场所类": ["区域场所", "居民服务机构", "医疗卫生机构"],
    "场所类_交通场所": ["交通场所", "设施"],
    "位置方位": ["位置方位"],
    "世界地区类": ["世界地区", "区域场所", "政权朝代"],
    "饮食类": ["饮食", "生物类", "药物"],
    "饮食类_菜品": ["饮食"],
    "饮食类_饮品": ["饮食"],
    "药物类": ["药物", "生物类"],
    "药物类_中药": ["药物", "生物类"],
    "医学术语类": ["医药学术语"],
    "术语类_生物体": ["生物学术语"],
    "疾病损伤类": ["疾病损伤", "动物疾病", "医药学术语"],
    "疾病损伤类_植物病虫害": ["植物病虫害", "医药学术语"],
    "宇宙类": ["天文学术语"],
    "事件类": ["事件", "奖项赛事活动"],
    "时间类": ["时间阶段", "政权朝代"],
    "术语类": ["术语"],
    "术语类_符号指标类": ["编码符号指标", "术语"],
    "信息资料": ["生活用语"],
    "链接地址": ["生活用语"],
    "个性特征": ["个性特点", "生活用语"],
    "感官特征": ["生活用语"],
    "场景事件": ["场景事件", "情绪", "态度", "个性特点"],
    "介词": ["介词"],
    "介词_方位介词": ["介词"],
    "助词": ["助词"],
    "代词": ["代词"],
    "连词": ["连词"],
    "副词": ["副词"],
    "疑问词": ["疑问词"],
    "肯定词": ["肯定否定词"],
    "否定词": ["肯定否定词"],
    "数量词": ["数量词", "量词"],
    "叹词": ["叹词"],
    "拟声词": ["拟声词"],
    "修饰词": ["修饰词", "生活用语"],
    "外语单词": ["日文假名", "词汇用语"],
    "汉语拼音": ["汉语拼音"],
}

usage = r"""
            from paddlenlp import Taskflow

            # 默认使用WordTag词类知识标注工具
            wordtag = Taskflow("knowledge_mining", model="wordtag")
            wordtag("《孤女》是2010年九州出版社出版的小说，作者是余兼羽")
            '''
            [{'text': '《孤女》是2010年九州出版社出版的小说，作者是余兼羽', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '孤女', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 2}, {'item': '》', 'offset': 3, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 4, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '2010年', 'offset': 5, 'wordtag_label': '时间类', 'length': 5, 'termid': '时间阶段_cb_2010年'}, {'item': '九州出版社', 'offset': 10, 'wordtag_label': '组织机构类', 'length': 5, 'termid': '组织机构_eb_九州出版社'}, {'item': '出版', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_出版'}, {'item': '的', 'offset': 17, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '小说', 'offset': 18, 'wordtag_label': '作品类_概念', 'length': 2, 'termid': '小说_cb_小说'}, {'item': '，', 'offset': 20, 'wordtag_label': 'w', 'length': 1}, {'item': '作者', 'offset': 21, 'wordtag_label': '人物类_概念', 'length': 2, 'termid': '人物_cb_作者'}, {'item': '是', 'offset': 23, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '余兼羽', 'offset': 24, 'wordtag_label': '人物类_实体', 'length': 3}]}]
            '''

            wordtag= Taskflow("knowledge_mining", batch_size=2)
            wordtag(["热梅茶是一道以梅子为主要原料制作的茶饮",
                    "《孤女》是2010年九州出版社出版的小说，作者是余兼羽"])
            '''
            [{'text': '热梅茶是一道以梅子为主要原料制作的茶饮', 'items': [{'item': '热梅茶', 'offset': 0, 'wordtag_label': '饮食类_饮品', 'length': 3}, {'item': '是', 'offset': 3, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '一道', 'offset': 4, 'wordtag_label': '数量词', 'length': 2}, {'item': '以', 'offset': 6, 'wordtag_label': '介词', 'length': 1, 'termid': '介词_cb_以'}, {'item': '梅子', 'offset': 7, 'wordtag_label': '饮食类', 'length': 2, 'termid': '饮食_cb_梅'}, {'item': '为', 'offset': 9, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_为'}, {'item': '主要原料', 'offset': 10, 'wordtag_label': '物体类', 'length': 4, 'termid': '物品_cb_主要原料'}, {'item': '制作', 'offset': 14, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_制作'}, {'item': '的', 'offset': 16, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '茶饮', 'offset': 17, 'wordtag_label': '饮食类_饮品', 'length': 2, 'termid': '饮品_cb_茶饮'}]}, {'text': '《孤女》是2010年九州出版社出版的小说，作者是余兼羽', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '孤女', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 2}, {'item': '》', 'offset': 3, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 4, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '2010年', 'offset': 5, 'wordtag_label': '时间类', 'length': 5, 'termid': '时间阶段_cb_2010年'}, {'item': '九州出版社', 'offset': 10, 'wordtag_label': '组织机构类', 'length': 5, 'termid': '组织机构_eb_九州出版社'}, {'item': '出版', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_出版'}, {'item': '的', 'offset': 17, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '小说', 'offset': 18, 'wordtag_label': '作品类_概念', 'length': 2, 'termid': '小说_cb_小说'}, {'item': '，', 'offset': 20, 'wordtag_label': 'w', 'length': 1}, {'item': '作者', 'offset': 21, 'wordtag_label': '人物类_概念', 'length': 2, 'termid': '人物_cb_作者'}, {'item': '是', 'offset': 23, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '余兼羽', 'offset': 24, 'wordtag_label': '人物类_实体', 'length': 3}]}]
            '''

            # 使用WordTag-IE进行信息抽取
            wordtag = Taskflow("knowledge_mining", model="wordtag", with_ie=True)
            '''
            [[{'text': '《忘了所有》是一首由王杰作词、作曲并演唱的歌曲，收录在专辑同名《忘了所有》中，由波丽佳音唱片于1996年08月31日发行。', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '忘了所有', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 4}, {'item': '》', 'offset': 5, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 6, 'wordtag_label': '肯定词', 'length': 1}, {'item': '一首', 'offset': 7, 'wordtag_label': '数量词_单位数量词', 'length': 2}, {'item': '由', 'offset': 9, 'wordtag_label': '介词', 'length': 1}, {'item': '王杰', 'offset': 10, 'wordtag_label': '人物类_实体', 'length': 2}, {'item': '作词', 'offset': 12, 'wordtag_label': '场景事件', 'length': 2}, {'item': '、', 'offset': 14, 'wordtag_label': 'w', 'length': 1}, {'item': '作曲', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2}, {'item': '并', 'offset': 17, 'wordtag_label': '连词', 'length': 1}, {'item': '演唱', 'offset': 18, 'wordtag_label': '场景事件', 'length': 2}, {'item': '的', 'offset': 20, 'wordtag_label': '助词', 'length': 1}, {'item': '歌曲', 'offset': 21, 'wordtag_label': '作品类_概念', 'length': 2}, {'item': '，', 'offset': 23, 'wordtag_label': 'w', 'length': 1}, {'item': '收录', 'offset': 24, 'wordtag_label': '场景事件', 'length': 2}, {'item': '在', 'offset': 26, 'wordtag_label': '介词', 'length': 1}, {'item': '专辑', 'offset': 27, 'wordtag_label': '作品类_概念', 'length': 2}, {'item': '同名', 'offset': 29, 'wordtag_label': '场景事件', 'length': 2}, {'item': '《', 'offset': 31, 'wordtag_label': 'w', 'length': 1}, {'item': '忘了所有', 'offset': 32, 'wordtag_label': '作品类_实体', 'length': 4}, {'item': '》', 'offset': 36, 'wordtag_label': 'w', 'length': 1}, {'item': '中', 'offset': 37, 'wordtag_label': '词汇用语', 'length': 1}, {'item': '，', 'offset': 38, 'wordtag_label': 'w', 'length': 1}, {'item': '由', 'offset': 39, 'wordtag_label': '介词', 'length': 1}, {'item': '波丽佳音', 'offset': 40, 'wordtag_label': '人物类_实体', 'length': 4}, {'item': '唱片', 'offset': 44, 'wordtag_label': '作品类_概念', 'length': 2}, {'item': '于', 'offset': 46, 'wordtag_label': '介词', 'length': 1}, {'item': '1996年08月31日', 'offset': 47, 'wordtag_label': '时间类_具体时间', 'length': 11}, {'item': '发行', 'offset': 58, 'wordtag_label': '场景事件', 'length': 2}, {'item': '。', 'offset': 60, 'wordtag_label': 'w', 'length': 1}]}], [[{'HEAD_ROLE': {'item': '王杰', 'offset': 10, 'type': '人物类_实体'}, 'TAIL_ROLE': [{'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}], 'GROUP': '创作', 'TRIG': [{'item': '作词', 'offset': 12}, {'item': '作曲', 'offset': 15}, {'item': '演唱', 'offset': 18}], 'SRC': 'REVERSE'}, {'HEAD_ROLE': {'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}, 'TAIL_ROLE': [{'item': '王杰', 'offset': 10, 'type': '人物类_实体'}], 'GROUP': '创作者', 'SRC': 'HTG', 'TRIG': [{'item': '作词', 'offset': 12}, {'item': '作曲', 'offset': 15}, {'item': '演唱', 'offset': 18}]}, {'HEAD_ROLE': {'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}, 'TAIL_ROLE': [{'item': '歌曲', 'offset': 21, 'type': '作品类_概念'}], 'GROUP': '类型', 'SRC': 'TAIL'}, {'HEAD_ROLE': {'item': '忘了所有', 'offset': 32, 'type': '作品类_实体'}, 'TAIL_ROLE': [{'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}], 'GROUP': '收录', 'TRIG': [{'item': '收录', 'offset': 24}], 'SRC': 'REVERSE'}, {'HEAD_ROLE': {'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}, 'TAIL_ROLE': [{'item': '忘了所有', 'offset': 32, 'type': '作品类_实体'}], 'GROUP': '收录于', 'SRC': 'HGT', 'TRIG': [{'item': '收录', 'offset': 24}]}, {'HEAD_ROLE': {'item': '忘了所有', 'offset': 32, 'type': '作品类_实体'}, 'TAIL_ROLE': [{'item': '王杰', 'type': '人物类_实体', 'offset': 10}], 'GROUP': '创作者', 'TRIG': [{'item': '专辑', 'offset': 27}], 'SRC': 'REVERSE'}, {'HEAD_ROLE': {'item': '王杰', 'type': '人物类_实体', 'offset': 10}, 'TAIL_ROLE': [{'item': '忘了所有', 'offset': 32, 'type': '作品类_实体'}], 'GROUP': '创作', 'SRC': 'HGT', 'TRIG': [{'item': '专辑', 'offset': 27}]}, {'HEAD_ROLE': {'item': '忘了所有', 'type': '作品类_实体', 'offset': 32}, 'TAIL_ROLE': [{'item': '唱片', 'offset': 44, 'type': '作品类_概念'}], 'GROUP': '类型', 'SRC': 'TAIL'}]]]
            '''

            # 切换为NPTag名词短语标注工具
            nptag = Taskflow("knowledge_mining", model="nptag")
            nptag("糖醋排骨")
            '''
            [{'text': '糖醋排骨', 'label': '菜品'}]
            '''

            nptag(["糖醋排骨", "红曲霉菌"])
            '''
            [{'text': '糖醋排骨', 'label': '菜品'}, {'text': '红曲霉菌', 'label': '微生物'}]
            '''

            # 输出粗粒度类别标签`category`，即WordTag的词汇标签。
            nptag = Taskflow("knowledge_mining", model="nptag", linking=True)
            nptag(["糖醋排骨", "红曲霉菌"])
            '''
            [{'text': '糖醋排骨', 'label': '菜品', 'category': '饮食类_菜品'}, {'text': '红曲霉菌', 'label': '微生物', 'category': '生物类_微生物'}]
            '''
         """


[文档]@add_docstrings(usage)
class WordTagTask(Task):
    """
    This the NER(Named Entity Recognition) task that convert the raw text to entities. And the task with the `wordtag`
    model will link the more meesage with the entity.
    Args:
        task(string): The name of task.
        model(string): The model name in the task.
        kwargs (dict, optional): Additional keyword arguments passed along to the specific task.

    """

    resource_files_names = {
        "model_state": "model_state.pdparams",
        "model_config": "config.json",
        "termtree_schema": "termtree_type.csv",
        "termtree_data": "termtree_data",
        "tags": "tags.txt",
        "spo_config": "spo_config.pkl",
        "vocab_file": "vocab.txt",
        "special_tokens_map": "special_tokens_map.json",
        "tokenizer_config": "tokenizer_config.json",
    }
    resource_files_urls = {
        "wordtag": {
            "model_state": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.5/model_state.pdparams",
                "c7c9cef72f73ee22c70c26ef11393025",
            ],
            "model_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/config.json",
                "b9f307b3fa03ad98c08ecb5249c15dfa",
            ],
            "termtree_schema": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/termtree_type.csv",
                "062cb9ac24f4135bf836e2a2fc5a1209",
            ],
            "termtree_data": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/termtree_data",
                "a0efe723f84cf90540ac727be5b62e59",
            ],
            "tags": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/tags.txt",
                "f33feedd01d478b03bac81be19b48d00",
            ],
            "spo_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/spo_config.pkl",
                "07a0b8d0422198d8c4c0f70e68963275",
            ],
            "vocab_file": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/vocab.txt",
                "54aa6e2eeb0478c2d18a2343b008590c",
            ],
            "special_tokens_map": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/special_tokens_map.json",
                "58104269e4f141a258bdb2ed06aa599f",
            ],
            "tokenizer_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/tokenizer_config.json",
                "e3f2756e72e24e3bb298303fb9a171f7",
            ],
        }
    }

    def __init__(
        self,
        model,
        task,
        tag_path=None,
        term_schema_path=None,
        term_data_path=None,
        user_dict=None,
        linking=True,
        spo_config_path=None,
        with_ie=False,
        **kwargs
    ):
        super().__init__(model=model, task=task, **kwargs)
        self._tag_path = tag_path
        self._term_schema_path = term_schema_path
        self._term_data_path = term_data_path
        self._user_dict = user_dict
        self._linking = linking
        self._spo_config_path = spo_config_path
        self._with_ie = with_ie
        self._check_task_files()
        self._load_task_resources()
        self._construct_tokenizer(model)
        self._usage = usage
        self._summary_num = 2
        self._get_inference_model()

        if self._user_dict:
            self._custom = Customization()
            self._custom.load_customization(self._user_dict)
        else:
            self._custom = None
        self._num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0
        self._batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1
        self._lazy_load = self.kwargs["lazy_load"] if "lazy_load" in self.kwargs else False
        self._max_seq_len = self.kwargs["max_seq_len"] if "max_seq_len" in self.kwargs else 512
        self._split_sentence = self.kwargs["split_sentence"] if "split_sentence" in self.kwargs else False
        if self._with_ie:
            self._ie_extractor = WordTagRelationExtractor.from_pkl(self._spo_config_path)

    @property
    def summary_num(self):
        """
        Number of model summary token
        """
        return self._summary_num

    @property
    def linking(self):
        """
        Whether to do term linking.
        """
        return self._linking

    @staticmethod
    def _load_labels(tag_path):
        tags_to_idx = {}
        all_tags = []
        i = 0
        with open(tag_path, encoding="utf-8") as fp:
            for line in fp:
                line = line.strip()
                tag = line.split("-")[-1]
                if tag not in all_tags:
                    all_tags.append(tag)
                tags_to_idx[line] = i
                i += 1
        idx_to_tags = dict(zip(*(tags_to_idx.values(), tags_to_idx.keys())))
        return tags_to_idx, idx_to_tags, all_tags

    def _load_task_resources(self):
        """
        Load the resource of this task.
        """
        if self._tag_path is None:
            self._tag_path = os.path.join(self._task_path, "tags.txt")
            self._tags_to_index, self._index_to_tags, self._all_tags = self._load_labels(self._tag_path)

        if self._term_schema_path is None:
            self._term_schema_path = os.path.join(self._task_path, "termtree_type.csv")
        if self._term_data_path is None:
            self._term_data_path = os.path.join(self._task_path, "termtree_data")

        if self._linking is True:
            self._termtree = TermTree.from_dir(self._term_schema_path, self._term_data_path, self._linking)

        if self._spo_config_path is None:
            self._spo_config_path = os.path.join(self._task_path, "spo_config.pkl")

    def _preprocess_text(self, input_texts):
        """
        Create the dataset and dataloader for the predict.
        """
        max_predict_len = self._max_seq_len - self.summary_num - 1
        filter_input_texts = []
        for input_text in input_texts:
            if not (isinstance(input_text, str) and len(input_text) > 0):
                continue
            filter_input_texts.append(input_text)
        input_texts = filter_input_texts

        short_input_texts, self.input_mapping = self._auto_splitter(
            input_texts, max_predict_len, split_sentence=self._split_sentence
        )

        def read(inputs):
            for text in inputs:
                tokenized_output = self._tokenizer(
                    list(text), return_length=True, is_split_into_words=True, max_length=self._max_seq_len
                )
                yield {
                    "input_ids": tokenized_output["input_ids"],
                    "token_type_ids": tokenized_output["token_type_ids"],
                    "seq_len": tokenized_output["seq_len"],
                }

        infer_ds = load_dataset(read, inputs=short_input_texts, lazy=self._lazy_load)

        data_collator = DataCollatorForErnieCtm(self._tokenizer, model="wordtag")

        batch_sampler = paddle.io.BatchSampler(dataset=infer_ds, batch_size=self._batch_size, shuffle=False)

        infer_data_loader = paddle.io.DataLoader(
            dataset=infer_ds,
            batch_sampler=batch_sampler,
            collate_fn=data_collator,
            num_workers=self._num_workers,
            return_list=True,
        )

        outputs = {}
        outputs["data_loader"] = infer_data_loader
        outputs["short_input_texts"] = short_input_texts
        return outputs

    def _reset_offset(self, pred_words):
        for i in range(0, len(pred_words)):
            if i > 0:
                pred_words[i]["offset"] = pred_words[i - 1]["offset"] + len(pred_words[i - 1]["item"])
            pred_words[i]["length"] = len(pred_words[i]["item"])
        return pred_words

    def _decode(self, batch_texts, batch_pred_tags):
        batch_results = []
        for sent_index in range(len(batch_texts)):
            sent = batch_texts[sent_index]
            indexes = batch_pred_tags[sent_index][self.summary_num : len(sent) + self.summary_num]
            tags = [self._index_to_tags[index] for index in indexes]
            if self._custom:
                self._custom.parse_customization(sent, tags, prefix=True)
            sent_out = []
            tags_out = []
            partial_word = ""
            for ind, tag in enumerate(tags):
                if partial_word == "":
                    partial_word = sent[ind]
                    tags_out.append(tag.split("-")[-1])
                    continue
                if tag.startswith("B") or tag.startswith("S") or tag.startswith("O"):
                    sent_out.append(partial_word)
                    tags_out.append(tag.split("-")[-1])
                    partial_word = sent[ind]
                    continue
                partial_word += sent[ind]

            if len(sent_out) < len(tags_out):
                sent_out.append(partial_word)

            pred_words = []
            for s, t in zip(sent_out, tags_out):
                pred_words.append({"item": s, "offset": 0, "wordtag_label": t})

            pred_words = self._reset_offset(pred_words)
            result = {"text": sent, "items": pred_words}
            batch_results.append(result)
        return batch_results

    def _term_linking(self, wordtag_res):
        for item in wordtag_res["items"]:
            flag, _ = self._termtree.find_term(item["item"])
            if flag is False:
                continue
            if item["wordtag_label"] not in LABEL_TO_SCHEMA:
                # Custom label defined by user
                if item["wordtag_label"] not in self._all_tags:
                    target_type_can = [item["wordtag_label"]]
                else:
                    continue
            else:
                target_type_can = LABEL_TO_SCHEMA[item["wordtag_label"]]
            high_priority = False
            for target_type_raw in target_type_can:
                target_type_ = target_type_raw.split("|")
                target_src = None
                if len(target_type_) == 2:
                    target_src = target_type_[1]
                target_type = target_type_[0]
                flag, term_id = self._termtree.find_term(item["item"], target_type)
                if flag is False:
                    continue
                term_id = list(filter(lambda d: self._termtree[d].node_type == "term", term_id))
                if len(term_id) == 0:
                    continue
                if target_src is not None:
                    term_id = list(filter(lambda d: self._termtree[d].base.startswith(target_src.lower()), term_id))
                    if len(term_id) == 0:
                        continue

                term_id.sort(
                    key=lambda d: (
                        self._termtree[d].termtype == target_type or target_type in self._termtree[d].subtype,
                        self._termtree[d].term == item["item"],
                    ),
                    reverse=True,
                )
                if self._termtree[term_id[0]].term == item["item"]:
                    high_priority = True
                    item["termid"] = term_id[0]
                if high_priority:
                    break

    def _construct_input_spec(self):
        """
        Construct the input spec for the convert dygraph model to static model.
        """
        self._input_spec = [
            paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"),  # input_ids
            paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"),  # token_type_ids
            paddle.static.InputSpec(shape=[None], dtype="int64", name="seq_len"),  # seq_len
        ]

    def _construct_model(self, model):
        """
        Construct the inference model for the predictor.
        """

        model_config = ErnieCtmConfig.from_pretrained(self._task_path, num_labels=len(self._tags_to_index))
        model_instance = ErnieCtmWordtagModel.from_pretrained(self._task_path, config=model_config)

        self._model = model_instance
        self._model.eval()

    def _construct_tokenizer(self, model):
        """
        Construct the tokenizer for the predictor.
        """
        tokenizer_instance = ErnieCtmTokenizer.from_pretrained(self._task_path)
        self._tokenizer = tokenizer_instance

    def _preprocess(self, inputs, padding=True, add_special_tokens=True):
        """
        Transform the raw text to the model inputs, two steps involved:
           1) Transform the raw text to token ids.
           2) Generate the other model inputs from the raw text and token ids.
        """
        inputs = self._check_input_text(inputs)
        outputs = self._preprocess_text(inputs)
        return outputs

    def _run_model(self, inputs):
        """
        Run the task model from the outputs of the `_tokenize` function.
        """
        all_pred_tags = []
        for batch in inputs["data_loader"]:
            input_ids, token_type_ids, seq_len = batch
            self.input_handles[0].copy_from_cpu(input_ids.numpy())
            self.input_handles[1].copy_from_cpu(token_type_ids.numpy())
            self.input_handles[2].copy_from_cpu(seq_len.numpy())
            self.predictor.run()
            pred_tags = self.output_handle[0].copy_to_cpu()
            all_pred_tags.extend(pred_tags.tolist())
        inputs["all_pred_tags"] = all_pred_tags
        return inputs

    def _postprocess(self, inputs):
        """
        The model output is the tag ids, this function will convert the model output to raw text.
        """
        results = self._decode(inputs["short_input_texts"], inputs["all_pred_tags"])
        results = self._auto_joiner(results, self.input_mapping, is_dict=True)
        for result in results:
            pred_words = result["items"]
            pred_words = self._reset_offset(pred_words)
            result["items"] = pred_words
        if self.linking is True:
            for res in results:
                self._term_linking(res)
        if self._with_ie:
            ie_results = []
            for result in results:
                spo_result = self._ie_extractor.extract_spo(result["items"])
                ie_results.append(spo_result)
            return [results, ie_results]
        return results

[文档]    def set_schema(self, schema):
        """User define the schema for the information extraction.
        Args:
            schema (List[ Dict[str, Any]]): Dictionary data contain all k-v data.
        """
        self._ie_extractor = WordTagRelationExtractor.from_dict(schema)


[文档]@add_docstrings(usage)
class NPTagTask(Task):
    """
    Noun phrase tagging task that convert the noun phrase to POS tag.
    Args:
        task(string): The name of task.
        model(string): The model name in the task.
        batch_size(int): Numbers of examples a batch.
        linking(bool): Returns the categories. If `linking` is True, the fine-grained label (label) will link with the coarse-grained label (category).
    """

    resource_files_names = {
        "model_state": "model_state.pdparams",
        "model_config": "config.json",
        "name_category_map": "name_category_map.json",
        "vocab_file": "vocab.txt",
        "special_tokens_map": "special_tokens_map.json",
        "tokenizer_config": "tokenizer_config.json",
    }
    resource_files_urls = {
        "nptag": {
            "model_state": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag_v1.2/model_state.pdparams",
                "34923c4d06acf936f52e1fa376b13748",
            ],
            "model_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/config.json",
                "895f0eba0819da56db709d00109c984e",
            ],
            "name_category_map": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/name_category_map.json",
                "c60810205993d307d919a26a3b96786f",
            ],
            "vocab_file": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/vocab.txt",
                "54aa6e2eeb0478c2d18a2343b008590c",
            ],
            "special_tokens_map": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/special_tokens_map.json",
                "58104269e4f141a258bdb2ed06aa599f",
            ],
            "tokenizer_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/tokenizer_config.json",
                "e3f2756e72e24e3bb298303fb9a171f7",
            ],
        }
    }

    def __init__(self, task, model, batch_size=1, max_seq_len=64, linking=False, **kwargs):
        super().__init__(task=task, model=model, **kwargs)
        self._usage = usage
        self._batch_size = batch_size
        self._max_seq_len = max_seq_len
        self._linking = linking
        self._check_task_files()
        self._construct_tokenizer(model)
        self._name_dict = None
        self._summary_num = 2
        self._max_cls_len = 5
        self._lazy_load = kwargs.get("lazy_load", False)
        self._num_workers = kwargs.get("num_workers", 0)
        self._construct_dict_map()

        self._get_inference_model()
        # Disable IR optimization for NPTag
        self._config.switch_ir_optim(False)

    @property
    def summary_num(self):
        """
        Number of model summary token
        """
        return self._summary_num

    def _construct_dict_map(self):
        """
        Construct dict map for the predictor.
        """
        name_dict_path = os.path.join(self._task_path, "name_category_map.json")
        with open(name_dict_path, encoding="utf-8") as fp:
            self._name_dict = json.load(fp)
        self._tree = BurkhardKellerTree()
        self._cls_vocabs = OrderedDict()
        for k in self._name_dict:
            self._tree.add(k)
            for c in k:
                if c not in self._cls_vocabs:
                    self._cls_vocabs[c] = len(self._cls_vocabs)
        self._cls_vocabs["[PAD]"] = len(self._cls_vocabs)
        self._id_vocabs = dict(zip(self._cls_vocabs.values(), self._cls_vocabs.keys()))
        self._vocab_ids = self._tokenizer.vocab.to_indices(list(self._cls_vocabs.keys()))

    def _decode(self, pred_ids):
        tokens = [self._id_vocabs[i] for i in pred_ids]
        valid_token = []
        for token in tokens:
            if token == "[PAD]":
                break
            valid_token.append(token)
        return "".join(valid_token)

    def _search(self, scores_can, pred_ids_can, depth, path, score):
        if depth >= 5:
            return [(path, score)]
        res = []
        for i in range(len(pred_ids_can[0])):
            tmp_res = self._search(
                scores_can, pred_ids_can, depth + 1, path + [pred_ids_can[depth][i]], score + scores_can[depth][i]
            )
            res.extend(tmp_res)
        return res

    def _find_topk(self, a, k, axis=-1, largest=True, sorted=True):
        if axis is None:
            axis_size = a.size
        else:
            axis_size = a.shape[axis]
        assert 1 <= k <= axis_size

        a = np.asanyarray(a)
        if largest:
            index_array = np.argpartition(a, axis_size - k, axis=axis)
            topk_indices = np.take(index_array, -np.arange(k) - 1, axis=axis)
        else:
            index_array = np.argpartition(a, k - 1, axis=axis)
            topk_indices = np.take(index_array, np.arange(k), axis=axis)
        topk_values = np.take_along_axis(a, topk_indices, axis=axis)
        if sorted:
            sorted_indices_in_topk = np.argsort(topk_values, axis=axis)
            if largest:
                sorted_indices_in_topk = np.flip(sorted_indices_in_topk, axis=axis)
            sorted_topk_values = np.take_along_axis(topk_values, sorted_indices_in_topk, axis=axis)
            sorted_topk_indices = np.take_along_axis(topk_indices, sorted_indices_in_topk, axis=axis)
            return sorted_topk_values, sorted_topk_indices
        return topk_values, topk_indices

    def _construct_input_spec(self):
        """
        Construct the input spec for the convert dygraph model to static model.
        """
        self._input_spec = [
            paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"),  # input_ids
            paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"),  # token_type_ids
        ]

    def _construct_model(self, model):
        """
        Construct the inference model for the predictor.
        """
        model_instance = ErnieCtmNptagModel.from_pretrained(self._task_path)
        self._model = model_instance
        self._model.eval()

    def _construct_tokenizer(self, model):
        """
        Construct the tokenizer for the predictor.
        """
        tokenizer_instance = ErnieCtmTokenizer.from_pretrained(self._task_path)
        self._tokenizer = tokenizer_instance

    def _preprocess(self, inputs):
        """
        Create the dataset and dataloader for the predict.
        """
        inputs = self._check_input_text(inputs)
        self._max_cls_len = 5

        # Prompt template: input_text + "是" + "[MASK]" * cls_seq_length
        prompt_template = ["是"] + ["[MASK]"] * self._max_cls_len

        def read(inputs):
            for text in inputs:
                if len(text) + self._max_cls_len + 1 + self._summary_num + 1 > self._max_seq_len:
                    text = text[: (self._max_seq_len - (self._max_cls_len + 1 + self._summary_num + 1))]

                tokens = list(text) + prompt_template
                tokenized_output = self._tokenizer(
                    tokens, return_length=True, is_split_into_words=True, max_length=self._max_seq_len
                )
                label_indices = list(
                    range(tokenized_output["seq_len"] - 1 - self._max_cls_len, tokenized_output["seq_len"] - 1)
                )

                yield {
                    "input_ids": tokenized_output["input_ids"],
                    "token_type_ids": tokenized_output["token_type_ids"],
                    "label_indices": label_indices,
                }

        infer_ds = load_dataset(read, inputs=inputs, lazy=self._lazy_load)

        data_collator = DataCollatorForErnieCtm(self._tokenizer, model="nptag")

        batch_sampler = paddle.io.BatchSampler(dataset=infer_ds, batch_size=self._batch_size, shuffle=False)

        infer_data_loader = paddle.io.DataLoader(
            dataset=infer_ds,
            batch_sampler=batch_sampler,
            collate_fn=data_collator,
            num_workers=self._num_workers,
            return_list=True,
        )

        outputs = {}
        outputs["data_loader"] = infer_data_loader
        outputs["texts"] = inputs
        return outputs

    def _run_model(self, inputs):
        all_scores_can = []
        all_preds_can = []
        pred_ids = []
        for batch in inputs["data_loader"]:
            input_ids, token_type_ids, label_indices = batch
            self.input_handles[0].copy_from_cpu(input_ids.numpy())
            self.input_handles[1].copy_from_cpu(token_type_ids.numpy())
            self.predictor.run()
            logits = self.output_handle[0].copy_to_cpu()
            for i, l in zip(label_indices, logits):
                score = l[i[0] : i[-1] + 1, self._vocab_ids]
                # Find topk candidates of scores and predicted indices.
                score_can, pred_id_can = self._find_topk(score, k=4, axis=-1)

                all_scores_can.extend([score_can.tolist()])
                all_preds_can.extend([pred_id_can.tolist()])
                pred_ids.extend([pred_id_can[:, 0].tolist()])
        inputs["all_scores_can"] = all_scores_can
        inputs["all_preds_can"] = all_preds_can
        inputs["pred_ids"] = pred_ids
        return inputs

    def _postprocess(self, inputs):
        results = []

        for i in range(len(inputs["texts"])):
            cls_label = self._decode(inputs["pred_ids"][i])
            result = {
                "text": inputs["texts"][i],
                "label": cls_label,
            }
            if cls_label not in self._name_dict:
                scores_can = inputs["all_scores_can"][i]
                pred_ids_can = inputs["all_preds_can"][i]
                labels_can = self._search(scores_can, pred_ids_can, 0, [], 0)
                labels_can.sort(key=lambda d: -d[1])
                for labels in labels_can:
                    cls_label_can = self._decode(labels[0])
                    if cls_label_can in self._name_dict:
                        result["label"] = cls_label_can
                        break
                else:
                    labels_can = self._tree.search_similar_word(cls_label)
                    if len(labels_can) != 0:
                        result["label"] = labels_can[0][0]
                        break
            if self._linking:
                if result["label"] in self._name_dict:
                    result["category"] = self._name_dict[result["label"]]
            results.append(result)
        return results