Source code for paddlenlp.taskflow.named_entity_recognition

# coding:utf-8
# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .knowledge_mining import WordTagTask
from .lexical_analysis import LacTask
from .utils import Customization

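# Part-of-speech and other non-entity labels produced by the WordTag model.
# When the task is created with entity_only=True, words carrying these labels
# are filtered out of the results (see NERWordTagTask._simplify_result).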
POS_LABEL_WORDTAG = [
    "介词",
    "介词_方位介词",
    "助词",
    "代词",
    "连词",
    "副词",
    "疑问词",
    "肯定词",
    "否定词",
    "数量词",
    "叹词",
    "拟声词",
    "修饰词",
    "外语单词",
    "英语单词",
    "汉语拼音",
    "词汇用语",
    "w",
]

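# Part-of-speech labels produced by the LAC model. When entity_only=True,
# words carrying these labels are filtered out of the results
# (see NERLACTask._postprocess).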
POS_LABEL_LAC = ["n", "f", "s", "t", "v", "vd", "vn", "a", "ad", "an", "d", "m", "q", "r", "p", "c", "u", "xc", "w"]

usage = r"""
          from paddlenlp import Taskflow

          # WordTag accurate mode (default)
          ner = Taskflow("ner")
          ner("《孤女》是2010年九州出版社出版的小说,作者是余兼羽")
          '''
          [('《', 'w'), ('孤女', '作品类_实体'), ('》', 'w'), ('是', '肯定词'), ('2010年', '时间类'), ('九州出版社', '组织机构类'), ('出版', '场景事件'), ('的', '助词'), ('小说', '作品类_概念'), (',', 'w'), ('作者', '人物类_概念'), ('是', '肯定词'), ('余兼羽', '人物类_实体')]
          '''

          ner(["热梅茶是一道以梅子为主要原料制作的茶饮", "《孤女》是2010年九州出版社出版的小说,作者是余兼羽"])
          '''
          [[('热梅茶', '饮食类_饮品'), ('是', '肯定词'), ('一道', '数量词'), ('以', '介词'), ('梅子', '饮食类'), ('为', '肯定词'), ('主要原料', '物体类'), ('制作', '场景事件'), ('的', '助词'), ('茶饮', '饮食类_饮品')], [('《', 'w'), ('孤女', '作品类_实体'), ('》', 'w'), ('是', '肯定词'), ('2010年', '时间类'), ('九州出版社', '组织机构类'), ('出版', '场景事件'), ('的', '助词'), ('小说', '作品类_概念'), (',', 'w'), ('作者', '人物类_概念'), ('是', '肯定词'), ('余兼羽', '人物类_实体')]]
          '''

          # Return only entity/concept words
          ner = Taskflow("ner", entity_only=True)
          ner("《孤女》是2010年九州出版社出版的小说,作者是余兼羽")
          '''
          [('孤女', '作品类_实体'), ('2010年', '时间类'), ('九州出版社', '组织机构类'), ('出版', '场景事件'), ('小说', '作品类_概念'), ('作者', '人物类_概念'), ('余兼羽', '人物类_实体')]
          '''

          # Use fast mode and return only entity words
          ner = Taskflow("ner", mode="fast", entity_only=True)
          ner("三亚是一个美丽的城市")
          '''
          [('三亚', 'LOC')]
          '''
          """


class NERWordTagTask(WordTagTask):
    """
    NER (Named Entity Recognition) task that converts raw text to entities.
    With the `wordtag` model, the task also attaches richer label information
    to each entity.

    Args:
        task (string): The name of task.
        model (string): The model name in the task.
        kwargs (dict, optional): Additional keyword arguments passed along to the specific task.
    """

    resource_files_names = {
        "model_state": "model_state.pdparams",
        "model_config": "config.json",
        "tags": "tags.txt",
        "vocab_file": "vocab.txt",
        "special_tokens_map": "special_tokens_map.json",
        "tokenizer_config": "tokenizer_config.json",
    }
    resource_files_urls = {
        "wordtag": {
            "model_state": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.5/model_state.pdparams",
                "c7c9cef72f73ee22c70c26ef11393025",
            ],
            "model_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/config.json",
                "b9f307b3fa03ad98c08ecb5249c15dfa",
            ],
            "tags": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/tags.txt",
                "f33feedd01d478b03bac81be19b48d00",
            ],
            "vocab_file": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/vocab.txt",
                "54aa6e2eeb0478c2d18a2343b008590c",
            ],
            "special_tokens_map": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/special_tokens_map.json",
                "58104269e4f141a258bdb2ed06aa599f",
            ],
            "tokenizer_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/tokenizer_config.json",
                "e3f2756e72e24e3bb298303fb9a171f7",
            ],
        }
    }

    def __init__(self, model, task, entity_only=False, **kwargs):
        super().__init__(model="wordtag", task=task, **kwargs)
        self.entity_only = entity_only
        if self._user_dict:
            self._custom = Customization()
            self._custom.load_customization(self._user_dict)
        else:
            self._custom = None

    def _decode(self, batch_texts, batch_pred_tags):
        batch_results = []
        for sent_index in range(len(batch_texts)):
            sent = batch_texts[sent_index]
            # Skip the leading summary tokens and keep one predicted tag per character.
            indexes = batch_pred_tags[sent_index][self.summary_num : len(sent) + self.summary_num]
            tags = [self._index_to_tags[index] for index in indexes]
            if self._custom:
                self._custom.parse_customization(sent, tags, prefix=True)
            sent_out = []
            tags_out = []
            partial_word = ""
            for ind, tag in enumerate(tags):
                # Merge per-character tags into words: "B", "S" and "O" tags start a new span.
                if partial_word == "":
                    partial_word = sent[ind]
                    tags_out.append(tag.split("-")[-1])
                    continue
                if tag.startswith("B") or tag.startswith("S") or tag.startswith("O"):
                    sent_out.append(partial_word)
                    tags_out.append(tag.split("-")[-1])
                    partial_word = sent[ind]
                    continue
                partial_word += sent[ind]

            if len(sent_out) < len(tags_out):
                sent_out.append(partial_word)

            pred_words = []
            for s, t in zip(sent_out, tags_out):
                pred_words.append({"item": s, "wordtag_label": t})

            result = {"text": sent, "items": pred_words}
            batch_results.append(result)
        return batch_results

    def _simplify_result(self, results):
        simple_results = []
        for result in results:
            simple_result = []
            if "items" in result:
                for item in result["items"]:
                    # Drop pure POS/non-entity labels when only entities are requested.
                    if self.entity_only and item["wordtag_label"] in POS_LABEL_WORDTAG:
                        continue
                    simple_result.append((item["item"], item["wordtag_label"]))
            simple_results.append(simple_result)
        simple_results = simple_results[0] if len(simple_results) == 1 else simple_results
        return simple_results

    def _postprocess(self, inputs):
        """
        The model output is the tag ids; this function converts the model output to raw text.
        """
        results = self._decode(inputs["short_input_texts"], inputs["all_pred_tags"])
        results = self._auto_joiner(results, self.input_mapping, is_dict=True)
        results = self._simplify_result(results)
        return results


class NERLACTask(LacTask):
    """
    NER (Named Entity Recognition) task based on the LAC model, which segments
    the raw text into words and tags each word.

    Args:
        task (string): The name of task.
        model (string): The model name in the task.
        kwargs (dict, optional): Additional keyword arguments passed along to the specific task.
    """

    def __init__(self, model, task, entity_only=False, **kwargs):
        super().__init__(task=task, model="lac", **kwargs)
        self.entity_only = entity_only

    def _postprocess(self, inputs):
        """
        The model output is the tag ids; this function converts the model output to raw text.
        """
        lengths = inputs["lens"]
        preds = inputs["result"]
        sents = inputs["text"]
        final_results = []
        for sent_index in range(len(lengths)):
            tags = [self._id2tag_dict[str(index)] for index in preds[sent_index][: lengths[sent_index]]]
            sent = sents[sent_index]
            if self._custom:
                self._custom.parse_customization(sent, tags)
            sent_out = []
            tags_out = []
            partial_word = ""
            for ind, tag in enumerate(tags):
                # Merge characters into words: a "-B" tag, or an "O" following a
                # non-"O" tag, closes the previous word and starts a new one.
                if partial_word == "":
                    partial_word = sent[ind]
                    tags_out.append(tag.split("-")[0])
                    continue
                if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
                    sent_out.append(partial_word)
                    tags_out.append(tag.split("-")[0])
                    partial_word = sent[ind]
                    continue
                partial_word += sent[ind]

            if len(sent_out) < len(tags_out):
                sent_out.append(partial_word)

            result = []
            for s, t in zip(sent_out, tags_out):
                # Drop pure POS labels when only entities are requested.
                if self.entity_only and t in POS_LABEL_LAC:
                    continue
                result.append((s, t))
            final_results.append(result)
        final_results = self._auto_joiner(final_results, self.input_mapping)
        final_results = final_results if len(final_results) > 1 else final_results[0]
        return final_results
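

# The character-to-word merging used in the two _postprocess methods above can
# be illustrated in isolation. The helper below is a minimal, hypothetical
# sketch (it is not part of the public API, and the tag sequence in the example
# is made up rather than real model output), following the LAC-style convention
# where a "-B" tag starts a new word and "-I" continues it.
def _demo_merge_tags(sent, tags):
    """Merge per-character tags into (word, label) pairs (illustration only)."""
    sent_out, tags_out = [], []
    partial_word = ""
    for ind, tag in enumerate(tags):
        if partial_word == "":
            partial_word = sent[ind]
            tags_out.append(tag.split("-")[0])
            continue
        if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
            sent_out.append(partial_word)
            tags_out.append(tag.split("-")[0])
            partial_word = sent[ind]
            continue
        partial_word += sent[ind]
    if len(sent_out) < len(tags_out):
        sent_out.append(partial_word)
    return list(zip(sent_out, tags_out))

# Example:
# _demo_merge_tags("三亚是一个美丽的城市",
#                  ["LOC-B", "LOC-I", "v-B", "m-B", "m-I", "a-B", "a-I", "u-B", "n-B", "n-I"])
# -> [('三亚', 'LOC'), ('是', 'v'), ('一个', 'm'), ('美丽', 'a'), ('的', 'u'), ('城市', 'n')]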