Source code for paddlenlp.taskflow.knowledge_mining

# coding:utf-8
# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from collections import OrderedDict

import numpy as np
import paddle

from ..datasets import load_dataset
from ..transformers import ErnieCtmNptagModel, ErnieCtmTokenizer, ErnieCtmWordtagModel
from ..transformers.ernie_ctm.configuration import ErnieCtmConfig
from .task import Task
from .utils import (
    BurkhardKellerTree,
    Customization,
    DataCollatorForErnieCtm,
    TermTree,
    WordTagRelationExtractor,
    add_docstrings,
)

LABEL_TO_SCHEMA = {
    "人物类_实体": ["人物|E", "虚拟角色|E", "演艺团体|E"],
    "人物类_概念": ["人物|C", "虚拟角色|C"],
    "作品类_实体": ["作品与出版物|E"],
    "作品类_概念": ["作品与出版物|C", "文化类"],
    "组织机构类": ["组织机构"],
    "组织机构类_企事业单位": ["企事业单位", "品牌", "组织机构"],
    "组织机构类_医疗卫生机构": ["医疗卫生机构", "组织机构"],
    "组织机构类_国家机关": ["国家机关", "组织机构"],
    "组织机构类_体育组织机构": ["体育组织机构", "组织机构"],
    "组织机构类_教育组织机构": ["教育组织机构", "组织机构"],
    "组织机构类_军事组织机构": ["军事组织机构", "组织机构"],
    "物体类": ["物体与物品", "品牌", "虚拟物品", "虚拟物品"],
    "物体类_兵器": ["兵器"],
    "物体类_化学物质": ["物体与物品", "化学术语"],
    "其他角色类": ["角色"],
    "文化类": ["文化", "作品与出版物|C", "体育运动项目", "语言文字"],
    "文化类_语言文字": ["语言学术语"],
    "文化类_奖项赛事活动": ["奖项赛事活动", "特殊日", "事件"],
    "文化类_制度政策协议": ["制度政策协议", "法律法规"],
    "文化类_姓氏与人名": ["姓氏与人名"],
    "生物类": ["生物"],
    "生物类_植物": ["植物", "生物"],
    "生物类_动物": ["动物", "生物"],
    "品牌名": ["品牌", "企事业单位"],
    "场所类": ["区域场所", "居民服务机构", "医疗卫生机构"],
    "场所类_交通场所": ["交通场所", "设施"],
    "位置方位": ["位置方位"],
    "世界地区类": ["世界地区", "区域场所", "政权朝代"],
    "饮食类": ["饮食", "生物类", "药物"],
    "饮食类_菜品": ["饮食"],
    "饮食类_饮品": ["饮食"],
    "药物类": ["药物", "生物类"],
    "药物类_中药": ["药物", "生物类"],
    "医学术语类": ["医药学术语"],
    "术语类_生物体": ["生物学术语"],
    "疾病损伤类": ["疾病损伤", "动物疾病", "医药学术语"],
    "疾病损伤类_植物病虫害": ["植物病虫害", "医药学术语"],
    "宇宙类": ["天文学术语"],
    "事件类": ["事件", "奖项赛事活动"],
    "时间类": ["时间阶段", "政权朝代"],
    "术语类": ["术语"],
    "术语类_符号指标类": ["编码符号指标", "术语"],
    "信息资料": ["生活用语"],
    "链接地址": ["生活用语"],
    "个性特征": ["个性特点", "生活用语"],
    "感官特征": ["生活用语"],
    "场景事件": ["场景事件", "情绪", "态度", "个性特点"],
    "介词": ["介词"],
    "介词_方位介词": ["介词"],
    "助词": ["助词"],
    "代词": ["代词"],
    "连词": ["连词"],
    "副词": ["副词"],
    "疑问词": ["疑问词"],
    "肯定词": ["肯定否定词"],
    "否定词": ["肯定否定词"],
    "数量词": ["数量词", "量词"],
    "叹词": ["叹词"],
    "拟声词": ["拟声词"],
    "修饰词": ["修饰词", "生活用语"],
    "外语单词": ["日文假名", "词汇用语"],
    "汉语拼音": ["汉语拼音"],
}
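
# How to read the schema values above (this mirrors the parsing done in
# WordTagTask._term_linking below): each value is a candidate TermTree type for
# the WordTag label, and an optional "|E" or "|C" suffix restricts the match to
# term ids whose base starts with "e" or "c". Compare the "_eb_" and "_cb_"
# term ids in the usage examples below, e.g. "组织机构_eb_九州出版社" vs.
# "肯定否定词_cb_是"; reading "eb"/"cb" as entity-base/concept-base is an
# inference from those examples, not something this file states.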

usage = r"""
            from paddlenlp import Taskflow

            # The WordTag word-class knowledge annotation tool is used by default
            wordtag = Taskflow("knowledge_mining", model="wordtag")
            wordtag("《孤女》是2010年九州出版社出版的小说,作者是余兼羽")
            '''
            [{'text': '《孤女》是2010年九州出版社出版的小说,作者是余兼羽', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '孤女', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 2}, {'item': '》', 'offset': 3, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 4, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '2010年', 'offset': 5, 'wordtag_label': '时间类', 'length': 5, 'termid': '时间阶段_cb_2010年'}, {'item': '九州出版社', 'offset': 10, 'wordtag_label': '组织机构类', 'length': 5, 'termid': '组织机构_eb_九州出版社'}, {'item': '出版', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_出版'}, {'item': '的', 'offset': 17, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '小说', 'offset': 18, 'wordtag_label': '作品类_概念', 'length': 2, 'termid': '小说_cb_小说'}, {'item': ',', 'offset': 20, 'wordtag_label': 'w', 'length': 1}, {'item': '作者', 'offset': 21, 'wordtag_label': '人物类_概念', 'length': 2, 'termid': '人物_cb_作者'}, {'item': '是', 'offset': 23, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '余兼羽', 'offset': 24, 'wordtag_label': '人物类_实体', 'length': 3}]}]
            '''

            wordtag = Taskflow("knowledge_mining", batch_size=2)
            wordtag(["热梅茶是一道以梅子为主要原料制作的茶饮",
                    "《孤女》是2010年九州出版社出版的小说,作者是余兼羽"])
            '''
            [{'text': '热梅茶是一道以梅子为主要原料制作的茶饮', 'items': [{'item': '热梅茶', 'offset': 0, 'wordtag_label': '饮食类_饮品', 'length': 3}, {'item': '是', 'offset': 3, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '一道', 'offset': 4, 'wordtag_label': '数量词', 'length': 2}, {'item': '以', 'offset': 6, 'wordtag_label': '介词', 'length': 1, 'termid': '介词_cb_以'}, {'item': '梅子', 'offset': 7, 'wordtag_label': '饮食类', 'length': 2, 'termid': '饮食_cb_梅'}, {'item': '为', 'offset': 9, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_为'}, {'item': '主要原料', 'offset': 10, 'wordtag_label': '物体类', 'length': 4, 'termid': '物品_cb_主要原料'}, {'item': '制作', 'offset': 14, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_制作'}, {'item': '的', 'offset': 16, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '茶饮', 'offset': 17, 'wordtag_label': '饮食类_饮品', 'length': 2, 'termid': '饮品_cb_茶饮'}]}, {'text': '《孤女》是2010年九州出版社出版的小说,作者是余兼羽', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '孤女', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 2}, {'item': '》', 'offset': 3, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 4, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '2010年', 'offset': 5, 'wordtag_label': '时间类', 'length': 5, 'termid': '时间阶段_cb_2010年'}, {'item': '九州出版社', 'offset': 10, 'wordtag_label': '组织机构类', 'length': 5, 'termid': '组织机构_eb_九州出版社'}, {'item': '出版', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_出版'}, {'item': '的', 'offset': 17, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '小说', 'offset': 18, 'wordtag_label': '作品类_概念', 'length': 2, 'termid': '小说_cb_小说'}, {'item': ',', 'offset': 20, 'wordtag_label': 'w', 'length': 1}, {'item': '作者', 'offset': 21, 'wordtag_label': '人物类_概念', 'length': 2, 'termid': '人物_cb_作者'}, {'item': '是', 'offset': 23, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '余兼羽', 'offset': 24, 'wordtag_label': '人物类_实体', 'length': 3}]}]
            '''

            # Use WordTag-IE for information extraction
            wordtag = Taskflow("knowledge_mining", model="wordtag", with_ie=True)
            wordtag("《忘了所有》是一首由王杰作词、作曲并演唱的歌曲,收录在专辑同名《忘了所有》中,由波丽佳音唱片于1996年08月31日发行。")
            '''
            [[{'text': '《忘了所有》是一首由王杰作词、作曲并演唱的歌曲,收录在专辑同名《忘了所有》中,由波丽佳音唱片于1996年08月31日发行。', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '忘了所有', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 4}, {'item': '》', 'offset': 5, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 6, 'wordtag_label': '肯定词', 'length': 1}, {'item': '一首', 'offset': 7, 'wordtag_label': '数量词_单位数量词', 'length': 2}, {'item': '由', 'offset': 9, 'wordtag_label': '介词', 'length': 1}, {'item': '王杰', 'offset': 10, 'wordtag_label': '人物类_实体', 'length': 2}, {'item': '作词', 'offset': 12, 'wordtag_label': '场景事件', 'length': 2}, {'item': '、', 'offset': 14, 'wordtag_label': 'w', 'length': 1}, {'item': '作曲', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2}, {'item': '并', 'offset': 17, 'wordtag_label': '连词', 'length': 1}, {'item': '演唱', 'offset': 18, 'wordtag_label': '场景事件', 'length': 2}, {'item': '的', 'offset': 20, 'wordtag_label': '助词', 'length': 1}, {'item': '歌曲', 'offset': 21, 'wordtag_label': '作品类_概念', 'length': 2}, {'item': ',', 'offset': 23, 'wordtag_label': 'w', 'length': 1}, {'item': '收录', 'offset': 24, 'wordtag_label': '场景事件', 'length': 2}, {'item': '在', 'offset': 26, 'wordtag_label': '介词', 'length': 1}, {'item': '专辑', 'offset': 27, 'wordtag_label': '作品类_概念', 'length': 2}, {'item': '同名', 'offset': 29, 'wordtag_label': '场景事件', 'length': 2}, {'item': '《', 'offset': 31, 'wordtag_label': 'w', 'length': 1}, {'item': '忘了所有', 'offset': 32, 'wordtag_label': '作品类_实体', 'length': 4}, {'item': '》', 'offset': 36, 'wordtag_label': 'w', 'length': 1}, {'item': '中', 'offset': 37, 'wordtag_label': '词汇用语', 'length': 1}, {'item': ',', 'offset': 38, 'wordtag_label': 'w', 'length': 1}, {'item': '由', 'offset': 39, 'wordtag_label': '介词', 'length': 1}, {'item': '波丽佳音', 'offset': 40, 'wordtag_label': '人物类_实体', 'length': 4}, {'item': '唱片', 'offset': 44, 'wordtag_label': '作品类_概念', 'length': 2}, {'item': '于', 'offset': 46, 'wordtag_label': '介词', 'length': 1}, {'item': '1996年08月31日', 'offset': 47, 'wordtag_label': '时间类_具体时间', 'length': 11}, {'item': '发行', 'offset': 58, 'wordtag_label': '场景事件', 'length': 2}, {'item': '。', 'offset': 60, 'wordtag_label': 'w', 'length': 1}]}], [[{'HEAD_ROLE': {'item': '王杰', 'offset': 10, 'type': '人物类_实体'}, 'TAIL_ROLE': [{'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}], 'GROUP': '创作', 'TRIG': [{'item': '作词', 'offset': 12}, {'item': '作曲', 'offset': 15}, {'item': '演唱', 'offset': 18}], 'SRC': 'REVERSE'}, {'HEAD_ROLE': {'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}, 'TAIL_ROLE': [{'item': '王杰', 'offset': 10, 'type': '人物类_实体'}], 'GROUP': '创作者', 'SRC': 'HTG', 'TRIG': [{'item': '作词', 'offset': 12}, {'item': '作曲', 'offset': 15}, {'item': '演唱', 'offset': 18}]}, {'HEAD_ROLE': {'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}, 'TAIL_ROLE': [{'item': '歌曲', 'offset': 21, 'type': '作品类_概念'}], 'GROUP': '类型', 'SRC': 'TAIL'}, {'HEAD_ROLE': {'item': '忘了所有', 'offset': 32, 'type': '作品类_实体'}, 'TAIL_ROLE': [{'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}], 'GROUP': '收录', 'TRIG': [{'item': '收录', 'offset': 24}], 'SRC': 'REVERSE'}, {'HEAD_ROLE': {'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}, 'TAIL_ROLE': [{'item': '忘了所有', 'offset': 32, 'type': '作品类_实体'}], 'GROUP': '收录于', 'SRC': 'HGT', 'TRIG': [{'item': '收录', 'offset': 24}]}, {'HEAD_ROLE': {'item': '忘了所有', 'offset': 32, 'type': '作品类_实体'}, 'TAIL_ROLE': [{'item': '王杰', 'type': '人物类_实体', 'offset': 10}], 'GROUP': '创作者', 'TRIG': [{'item': '专辑', 'offset': 27}], 'SRC': 'REVERSE'}, {'HEAD_ROLE': {'item': '王杰', 'type': '人物类_实体', 'offset': 10}, 'TAIL_ROLE': [{'item': '忘了所有', 'offset': 32, 'type': '作品类_实体'}], 'GROUP': '创作', 'SRC': 'HGT', 'TRIG': [{'item': '专辑', 'offset': 27}]}, {'HEAD_ROLE': {'item': '忘了所有', 'type': '作品类_实体', 'offset': 32}, 'TAIL_ROLE': [{'item': '唱片', 'offset': 44, 'type': '作品类_概念'}], 'GROUP': '类型', 'SRC': 'TAIL'}]]]
            '''

            # Switch to the NPTag noun phrase tagging tool
            nptag = Taskflow("knowledge_mining", model="nptag")
            nptag("糖醋排骨")
            '''
            [{'text': '糖醋排骨', 'label': '菜品'}]
            '''

            nptag(["糖醋排骨", "红曲霉菌"])
            '''
            [{'text': '糖醋排骨', 'label': '菜品'}, {'text': '红曲霉菌', 'label': '微生物'}]
            '''

            # Also output the coarse-grained category label `category`, i.e. the WordTag label.
            nptag = Taskflow("knowledge_mining", model="nptag", linking=True)
            nptag(["糖醋排骨", "红曲霉菌"])
            '''
            [{'text': '糖醋排骨', 'label': '菜品', 'category': '饮食类_菜品'}, {'text': '红曲霉菌', 'label': '微生物', 'category': '生物类_微生物'}]
            '''
         """


@add_docstrings(usage)
class WordTagTask(Task):
    """
    This is the NER (Named Entity Recognition) task that converts raw text to entities.
    With the `wordtag` model, the task additionally links each entity to richer information.

    Args:
        task (string): The name of task.
        model (string): The model name in the task.
        kwargs (dict, optional): Additional keyword arguments passed along to the specific task.
    """

    resource_files_names = {
        "model_state": "model_state.pdparams",
        "model_config": "config.json",
        "termtree_schema": "termtree_type.csv",
        "termtree_data": "termtree_data",
        "tags": "tags.txt",
        "spo_config": "spo_config.pkl",
        "vocab_file": "vocab.txt",
        "special_tokens_map": "special_tokens_map.json",
        "tokenizer_config": "tokenizer_config.json",
    }
    resource_files_urls = {
        "wordtag": {
            "model_state": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.5/model_state.pdparams",
                "c7c9cef72f73ee22c70c26ef11393025",
            ],
            "model_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/config.json",
                "b9f307b3fa03ad98c08ecb5249c15dfa",
            ],
            "termtree_schema": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/termtree_type.csv",
                "062cb9ac24f4135bf836e2a2fc5a1209",
            ],
            "termtree_data": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/termtree_data",
                "a0efe723f84cf90540ac727be5b62e59",
            ],
            "tags": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/tags.txt",
                "f33feedd01d478b03bac81be19b48d00",
            ],
            "spo_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/spo_config.pkl",
                "07a0b8d0422198d8c4c0f70e68963275",
            ],
            "vocab_file": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/vocab.txt",
                "54aa6e2eeb0478c2d18a2343b008590c",
            ],
            "special_tokens_map": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/special_tokens_map.json",
                "58104269e4f141a258bdb2ed06aa599f",
            ],
            "tokenizer_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/tokenizer_config.json",
                "e3f2756e72e24e3bb298303fb9a171f7",
            ],
        }
    }

    def __init__(
        self,
        model,
        task,
        tag_path=None,
        term_schema_path=None,
        term_data_path=None,
        user_dict=None,
        linking=True,
        spo_config_path=None,
        with_ie=False,
        **kwargs
    ):
        super().__init__(model=model, task=task, **kwargs)
        self._tag_path = tag_path
        self._term_schema_path = term_schema_path
        self._term_data_path = term_data_path
        self._user_dict = user_dict
        self._linking = linking
        self._spo_config_path = spo_config_path
        self._with_ie = with_ie
        self._check_task_files()
        self._load_task_resources()
        self._construct_tokenizer(model)
        self._usage = usage
        self._summary_num = 2
        self._get_inference_model()
        if self._user_dict:
            self._custom = Customization()
            self._custom.load_customization(self._user_dict)
        else:
            self._custom = None
        self._num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0
        self._batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1
        self._lazy_load = self.kwargs["lazy_load"] if "lazy_load" in self.kwargs else False
        self._max_seq_len = self.kwargs["max_seq_len"] if "max_seq_len" in self.kwargs else 512
        self._split_sentence = self.kwargs["split_sentence"] if "split_sentence" in self.kwargs else False
        if self._with_ie:
            self._ie_extractor = WordTagRelationExtractor.from_pkl(self._spo_config_path)

    @property
    def summary_num(self):
        """
        Number of model summary tokens.
        """
        return self._summary_num

    @property
    def linking(self):
        """
        Whether to do term linking.
        """
        return self._linking
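
    # A sketch of the label file consumed by `_load_labels` below (illustrative,
    # not an excerpt of the shipped tags.txt): one tag per line in a BIO-style
    # scheme, where the text after the last "-" is the word-class label. This is
    # what the `line.split("-")[-1]` parsing here and the "B"/"S"/"O" prefix
    # checks in `_decode` assume:
    #
    #     B-人物类_实体
    #     I-人物类_实体
    #     O
    #
    # Each full line is mapped to an index, and the distinct suffixes
    # ("人物类_实体", "O", ...) are collected into `all_tags`.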
""" return self._linking @staticmethod def _load_labels(tag_path): tags_to_idx = {} all_tags = [] i = 0 with open(tag_path, encoding="utf-8") as fp: for line in fp: line = line.strip() tag = line.split("-")[-1] if tag not in all_tags: all_tags.append(tag) tags_to_idx[line] = i i += 1 idx_to_tags = dict(zip(*(tags_to_idx.values(), tags_to_idx.keys()))) return tags_to_idx, idx_to_tags, all_tags def _load_task_resources(self): """ Load the resource of this task. """ if self._tag_path is None: self._tag_path = os.path.join(self._task_path, "tags.txt") self._tags_to_index, self._index_to_tags, self._all_tags = self._load_labels(self._tag_path) if self._term_schema_path is None: self._term_schema_path = os.path.join(self._task_path, "termtree_type.csv") if self._term_data_path is None: self._term_data_path = os.path.join(self._task_path, "termtree_data") if self._linking is True: self._termtree = TermTree.from_dir(self._term_schema_path, self._term_data_path, self._linking) if self._spo_config_path is None: self._spo_config_path = os.path.join(self._task_path, "spo_config.pkl") def _preprocess_text(self, input_texts): """ Create the dataset and dataloader for the predict. """ max_predict_len = self._max_seq_len - self.summary_num - 1 filter_input_texts = [] for input_text in input_texts: if not (isinstance(input_text, str) and len(input_text) > 0): continue filter_input_texts.append(input_text) input_texts = filter_input_texts short_input_texts, self.input_mapping = self._auto_splitter( input_texts, max_predict_len, split_sentence=self._split_sentence ) def read(inputs): for text in inputs: tokenized_output = self._tokenizer( list(text), return_length=True, is_split_into_words=True, max_length=self._max_seq_len ) yield { "input_ids": tokenized_output["input_ids"], "token_type_ids": tokenized_output["token_type_ids"], "seq_len": tokenized_output["seq_len"], } infer_ds = load_dataset(read, inputs=short_input_texts, lazy=self._lazy_load) data_collator = DataCollatorForErnieCtm(self._tokenizer, model="wordtag") batch_sampler = paddle.io.BatchSampler(dataset=infer_ds, batch_size=self._batch_size, shuffle=False) infer_data_loader = paddle.io.DataLoader( dataset=infer_ds, batch_sampler=batch_sampler, collate_fn=data_collator, num_workers=self._num_workers, return_list=True, ) outputs = {} outputs["data_loader"] = infer_data_loader outputs["short_input_texts"] = short_input_texts return outputs def _reset_offset(self, pred_words): for i in range(0, len(pred_words)): if i > 0: pred_words[i]["offset"] = pred_words[i - 1]["offset"] + len(pred_words[i - 1]["item"]) pred_words[i]["length"] = len(pred_words[i]["item"]) return pred_words def _decode(self, batch_texts, batch_pred_tags): batch_results = [] for sent_index in range(len(batch_texts)): sent = batch_texts[sent_index] indexes = batch_pred_tags[sent_index][self.summary_num : len(sent) + self.summary_num] tags = [self._index_to_tags[index] for index in indexes] if self._custom: self._custom.parse_customization(sent, tags, prefix=True) sent_out = [] tags_out = [] partial_word = "" for ind, tag in enumerate(tags): if partial_word == "": partial_word = sent[ind] tags_out.append(tag.split("-")[-1]) continue if tag.startswith("B") or tag.startswith("S") or tag.startswith("O"): sent_out.append(partial_word) tags_out.append(tag.split("-")[-1]) partial_word = sent[ind] continue partial_word += sent[ind] if len(sent_out) < len(tags_out): sent_out.append(partial_word) pred_words = [] for s, t in zip(sent_out, tags_out): pred_words.append({"item": s, "offset": 0, 
"wordtag_label": t}) pred_words = self._reset_offset(pred_words) result = {"text": sent, "items": pred_words} batch_results.append(result) return batch_results def _term_linking(self, wordtag_res): for item in wordtag_res["items"]: flag, _ = self._termtree.find_term(item["item"]) if flag is False: continue if item["wordtag_label"] not in LABEL_TO_SCHEMA: # Custom label defined by user if item["wordtag_label"] not in self._all_tags: target_type_can = [item["wordtag_label"]] else: continue else: target_type_can = LABEL_TO_SCHEMA[item["wordtag_label"]] high_priority = False for target_type_raw in target_type_can: target_type_ = target_type_raw.split("|") target_src = None if len(target_type_) == 2: target_src = target_type_[1] target_type = target_type_[0] flag, term_id = self._termtree.find_term(item["item"], target_type) if flag is False: continue term_id = list(filter(lambda d: self._termtree[d].node_type == "term", term_id)) if len(term_id) == 0: continue if target_src is not None: term_id = list(filter(lambda d: self._termtree[d].base.startswith(target_src.lower()), term_id)) if len(term_id) == 0: continue term_id.sort( key=lambda d: ( self._termtree[d].termtype == target_type or target_type in self._termtree[d].subtype, self._termtree[d].term == item["item"], ), reverse=True, ) if self._termtree[term_id[0]].term == item["item"]: high_priority = True item["termid"] = term_id[0] if high_priority: break def _construct_input_spec(self): """ Construct the input spec for the convert dygraph model to static model. """ self._input_spec = [ paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), # input_ids paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), # token_type_ids paddle.static.InputSpec(shape=[None], dtype="int64", name="seq_len"), # seq_len ] def _construct_model(self, model): """ Construct the inference model for the predictor. """ model_config = ErnieCtmConfig.from_pretrained(self._task_path, num_labels=len(self._tags_to_index)) model_instance = ErnieCtmWordtagModel.from_pretrained(self._task_path, config=model_config) self._model = model_instance self._model.eval() def _construct_tokenizer(self, model): """ Construct the tokenizer for the predictor. """ tokenizer_instance = ErnieCtmTokenizer.from_pretrained(self._task_path) self._tokenizer = tokenizer_instance def _preprocess(self, inputs, padding=True, add_special_tokens=True): """ Transform the raw text to the model inputs, two steps involved: 1) Transform the raw text to token ids. 2) Generate the other model inputs from the raw text and token ids. """ inputs = self._check_input_text(inputs) outputs = self._preprocess_text(inputs) return outputs def _run_model(self, inputs): """ Run the task model from the outputs of the `_tokenize` function. """ all_pred_tags = [] for batch in inputs["data_loader"]: input_ids, token_type_ids, seq_len = batch self.input_handles[0].copy_from_cpu(input_ids.numpy()) self.input_handles[1].copy_from_cpu(token_type_ids.numpy()) self.input_handles[2].copy_from_cpu(seq_len.numpy()) self.predictor.run() pred_tags = self.output_handle[0].copy_to_cpu() all_pred_tags.extend(pred_tags.tolist()) inputs["all_pred_tags"] = all_pred_tags return inputs def _postprocess(self, inputs): """ The model output is the tag ids, this function will convert the model output to raw text. 
""" results = self._decode(inputs["short_input_texts"], inputs["all_pred_tags"]) results = self._auto_joiner(results, self.input_mapping, is_dict=True) for result in results: pred_words = result["items"] pred_words = self._reset_offset(pred_words) result["items"] = pred_words if self.linking is True: for res in results: self._term_linking(res) if self._with_ie: ie_results = [] for result in results: spo_result = self._ie_extractor.extract_spo(result["items"]) ie_results.append(spo_result) return [results, ie_results] return results

    def set_schema(self, schema):
        """
        Let the user define the schema for the information extraction.

        Args:
            schema (List[Dict[str, Any]]): A list of dictionaries containing all the schema data.
        """
        self._ie_extractor = WordTagRelationExtractor.from_dict(schema)
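
# A minimal sketch of customizing the WordTag-IE schema via `set_schema`. The
# exact field layout expected by `WordTagRelationExtractor.from_dict` is not
# shown in this file; the keys below are assumptions inferred from the
# HEAD_ROLE/TAIL_ROLE/GROUP/TRIG fields of the IE output in the usage examples
# above, so treat them as illustrative only:
#
#     wordtag = Taskflow("knowledge_mining", model="wordtag", with_ie=True)
#     schema = [
#         {
#             "head_role": "作品类_实体",  # assumed key: subject entity type
#             "group": "创作者",  # assumed key: relation name
#             "tail_role": [{"main": ["人物类_实体"], "support": []}],  # assumed key: object types
#             "trig_word": ["作词"],  # assumed key: trigger words
#         }
#     ]
#     wordtag.set_schema(schema)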


@add_docstrings(usage)
class NPTagTask(Task):
    """
    Noun phrase tagging task that converts a noun phrase to its tag.

    Args:
        task (string): The name of task.
        model (string): The model name in the task.
        batch_size (int): Number of examples per batch.
        linking (bool): Whether to return the category. If `linking` is True, the fine-grained
            label (`label`) will be linked with the coarse-grained label (`category`).
    """

    resource_files_names = {
        "model_state": "model_state.pdparams",
        "model_config": "config.json",
        "name_category_map": "name_category_map.json",
        "vocab_file": "vocab.txt",
        "special_tokens_map": "special_tokens_map.json",
        "tokenizer_config": "tokenizer_config.json",
    }
    resource_files_urls = {
        "nptag": {
            "model_state": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag_v1.2/model_state.pdparams",
                "34923c4d06acf936f52e1fa376b13748",
            ],
            "model_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/config.json",
                "895f0eba0819da56db709d00109c984e",
            ],
            "name_category_map": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/name_category_map.json",
                "c60810205993d307d919a26a3b96786f",
            ],
            "vocab_file": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/vocab.txt",
                "54aa6e2eeb0478c2d18a2343b008590c",
            ],
            "special_tokens_map": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/special_tokens_map.json",
                "58104269e4f141a258bdb2ed06aa599f",
            ],
            "tokenizer_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/tokenizer_config.json",
                "e3f2756e72e24e3bb298303fb9a171f7",
            ],
        }
    }

    def __init__(self, task, model, batch_size=1, max_seq_len=64, linking=False, **kwargs):
        super().__init__(task=task, model=model, **kwargs)
        self._usage = usage
        self._batch_size = batch_size
        self._max_seq_len = max_seq_len
        self._linking = linking
        self._check_task_files()
        self._construct_tokenizer(model)
        self._name_dict = None
        self._summary_num = 2
        self._max_cls_len = 5
        self._lazy_load = kwargs.get("lazy_load", False)
        self._num_workers = kwargs.get("num_workers", 0)
        self._construct_dict_map()
        self._get_inference_model()
        # Disable IR optimization for NPTag
        self._config.switch_ir_optim(False)

    @property
    def summary_num(self):
        """
        Number of model summary tokens.
        """
        return self._summary_num
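
    # Judging from `_construct_dict_map` and `_postprocess`, the
    # name_category_map.json resource is a flat mapping from fine-grained labels
    # to coarse-grained WordTag categories, e.g. (illustrative, matching the
    # usage examples above):
    #
    #     {"菜品": "饮食类_菜品", "微生物": "生物类_微生物", ...}
    #
    # Its keys seed both the BK-tree used for fuzzy label recovery and the
    # character vocabulary used to score the "[MASK]" positions.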
""" name_dict_path = os.path.join(self._task_path, "name_category_map.json") with open(name_dict_path, encoding="utf-8") as fp: self._name_dict = json.load(fp) self._tree = BurkhardKellerTree() self._cls_vocabs = OrderedDict() for k in self._name_dict: self._tree.add(k) for c in k: if c not in self._cls_vocabs: self._cls_vocabs[c] = len(self._cls_vocabs) self._cls_vocabs["[PAD]"] = len(self._cls_vocabs) self._id_vocabs = dict(zip(self._cls_vocabs.values(), self._cls_vocabs.keys())) self._vocab_ids = self._tokenizer.vocab.to_indices(list(self._cls_vocabs.keys())) def _decode(self, pred_ids): tokens = [self._id_vocabs[i] for i in pred_ids] valid_token = [] for token in tokens: if token == "[PAD]": break valid_token.append(token) return "".join(valid_token) def _search(self, scores_can, pred_ids_can, depth, path, score): if depth >= 5: return [(path, score)] res = [] for i in range(len(pred_ids_can[0])): tmp_res = self._search( scores_can, pred_ids_can, depth + 1, path + [pred_ids_can[depth][i]], score + scores_can[depth][i] ) res.extend(tmp_res) return res def _find_topk(self, a, k, axis=-1, largest=True, sorted=True): if axis is None: axis_size = a.size else: axis_size = a.shape[axis] assert 1 <= k <= axis_size a = np.asanyarray(a) if largest: index_array = np.argpartition(a, axis_size - k, axis=axis) topk_indices = np.take(index_array, -np.arange(k) - 1, axis=axis) else: index_array = np.argpartition(a, k - 1, axis=axis) topk_indices = np.take(index_array, np.arange(k), axis=axis) topk_values = np.take_along_axis(a, topk_indices, axis=axis) if sorted: sorted_indices_in_topk = np.argsort(topk_values, axis=axis) if largest: sorted_indices_in_topk = np.flip(sorted_indices_in_topk, axis=axis) sorted_topk_values = np.take_along_axis(topk_values, sorted_indices_in_topk, axis=axis) sorted_topk_indices = np.take_along_axis(topk_indices, sorted_indices_in_topk, axis=axis) return sorted_topk_values, sorted_topk_indices return topk_values, topk_indices def _construct_input_spec(self): """ Construct the input spec for the convert dygraph model to static model. """ self._input_spec = [ paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), # input_ids paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), # token_type_ids ] def _construct_model(self, model): """ Construct the inference model for the predictor. """ model_instance = ErnieCtmNptagModel.from_pretrained(self._task_path) self._model = model_instance self._model.eval() def _construct_tokenizer(self, model): """ Construct the tokenizer for the predictor. """ tokenizer_instance = ErnieCtmTokenizer.from_pretrained(self._task_path) self._tokenizer = tokenizer_instance def _preprocess(self, inputs): """ Create the dataset and dataloader for the predict. 
""" inputs = self._check_input_text(inputs) self._max_cls_len = 5 # Prompt template: input_text + "是" + "[MASK]" * cls_seq_length prompt_template = ["是"] + ["[MASK]"] * self._max_cls_len def read(inputs): for text in inputs: if len(text) + self._max_cls_len + 1 + self._summary_num + 1 > self._max_seq_len: text = text[: (self._max_seq_len - (self._max_cls_len + 1 + self._summary_num + 1))] tokens = list(text) + prompt_template tokenized_output = self._tokenizer( tokens, return_length=True, is_split_into_words=True, max_length=self._max_seq_len ) label_indices = list( range(tokenized_output["seq_len"] - 1 - self._max_cls_len, tokenized_output["seq_len"] - 1) ) yield { "input_ids": tokenized_output["input_ids"], "token_type_ids": tokenized_output["token_type_ids"], "label_indices": label_indices, } infer_ds = load_dataset(read, inputs=inputs, lazy=self._lazy_load) data_collator = DataCollatorForErnieCtm(self._tokenizer, model="nptag") batch_sampler = paddle.io.BatchSampler(dataset=infer_ds, batch_size=self._batch_size, shuffle=False) infer_data_loader = paddle.io.DataLoader( dataset=infer_ds, batch_sampler=batch_sampler, collate_fn=data_collator, num_workers=self._num_workers, return_list=True, ) outputs = {} outputs["data_loader"] = infer_data_loader outputs["texts"] = inputs return outputs def _run_model(self, inputs): all_scores_can = [] all_preds_can = [] pred_ids = [] for batch in inputs["data_loader"]: input_ids, token_type_ids, label_indices = batch self.input_handles[0].copy_from_cpu(input_ids.numpy()) self.input_handles[1].copy_from_cpu(token_type_ids.numpy()) self.predictor.run() logits = self.output_handle[0].copy_to_cpu() for i, l in zip(label_indices, logits): score = l[i[0] : i[-1] + 1, self._vocab_ids] # Find topk candidates of scores and predicted indices. score_can, pred_id_can = self._find_topk(score, k=4, axis=-1) all_scores_can.extend([score_can.tolist()]) all_preds_can.extend([pred_id_can.tolist()]) pred_ids.extend([pred_id_can[:, 0].tolist()]) inputs["all_scores_can"] = all_scores_can inputs["all_preds_can"] = all_preds_can inputs["pred_ids"] = pred_ids return inputs def _postprocess(self, inputs): results = [] for i in range(len(inputs["texts"])): cls_label = self._decode(inputs["pred_ids"][i]) result = { "text": inputs["texts"][i], "label": cls_label, } if cls_label not in self._name_dict: scores_can = inputs["all_scores_can"][i] pred_ids_can = inputs["all_preds_can"][i] labels_can = self._search(scores_can, pred_ids_can, 0, [], 0) labels_can.sort(key=lambda d: -d[1]) for labels in labels_can: cls_label_can = self._decode(labels[0]) if cls_label_can in self._name_dict: result["label"] = cls_label_can break else: labels_can = self._tree.search_similar_word(cls_label) if len(labels_can) != 0: result["label"] = labels_can[0][0] break if self._linking: if result["label"] in self._name_dict: result["category"] = self._name_dict[result["label"]] results.append(result) return results