Source code for paddlenlp.taskflow.named_entity_recognition

# coding:utf-8
# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .knowledge_mining import WordTagTask
from .lexical_analysis import LacTask
from .utils import Customization

# WordTag labels that NERWordTagTask._simplify_result filters out of the
# result when `entity_only` is enabled. The name suggests these are
# part-of-speech style labels (function words, punctuation "w", ...) rather
# than entity/concept categories.
POS_LABEL_WORDTAG = [
    "介词",
    "介词_方位介词",
    "助词",
    "代词",
    "连词",
    "副词",
    "疑问词",
    "肯定词",
    "否定词",
    "数量词",
    "叹词",
    "拟声词",
    "修饰词",
    "外语单词",
    "英语单词",
    "汉语拼音",
    "词汇用语",
    "w",
]

# LAC tags that NERLACTask._postprocess filters out of the result when
# `entity_only` is enabled; any tag not listed here is kept.
POS_LABEL_LAC = ["n", "f", "s", "t", "v", "vd", "vn", "a", "ad", "an", "d", "m", "q", "r", "p", "c", "u", "xc", "w"]

# Example snippet demonstrating the `Taskflow("ner")` entry point in its
# default mode, with `entity_only=True`, and with `mode="fast"`.
# NOTE(review): `usage` is not referenced anywhere in this module; presumably
# the Taskflow framework surfaces it as help text -- confirm against the caller.
# The text below is a runtime string and must not be edited or translated.
usage = r"""
          from paddlenlp import Taskflow

          # WordTag精确模式
          ner = Taskflow("ner")
          ner("《孤女》是2010年九州出版社出版的小说，作者是余兼羽")
          '''
          [('《', 'w'), ('孤女', '作品类_实体'), ('》', 'w'), ('是', '肯定词'), ('2010年', '时间类'), ('九州出版社', '组织机构类'), ('出版', '场景事件'), ('的', '助词'), ('小说', '作品类_概念'), ('，', 'w'), ('作者', '人物类_概念'), ('是', '肯定词'), ('余兼羽', '人物类_实体')]
          '''

          ner(["热梅茶是一道以梅子为主要原料制作的茶饮", "《孤女》是2010年九州出版社出版的小说，作者是余兼羽"])
          '''
          [[('热梅茶', '饮食类_饮品'), ('是', '肯定词'), ('一道', '数量词'), ('以', '介词'), ('梅子', '饮食类'), ('为', '肯定词'), ('主要原料', '物体类'), ('制作', '场景事件'), ('的', '助词'), ('茶饮', '饮食类_饮品')], [('《', 'w'), ('孤女', '作品类_实体'), ('》', 'w'), ('是', '肯定词'), ('2010年', '时间类'), ('九州出版社', '组织机构类'), ('出版', '场景事件'), ('的', '助词'), ('小说', '作品类_概念'), ('，', 'w'), ('作者', '人物类_概念'), ('是', '肯定词'), ('余兼羽', '人物类_实体')]]
          '''

          # 只返回实体/概念词
          ner = Taskflow("ner", entity_only=True)
          ner("《孤女》是2010年九州出版社出版的小说，作者是余兼羽")
          '''
          [('孤女', '作品类_实体'), ('2010年', '时间类'), ('九州出版社', '组织机构类'), ('出版', '场景事件'), ('小说', '作品类_概念'), ('作者', '人物类_概念'), ('余兼羽', '人物类_实体')]
          '''

          # 使用快速模式，只返回实体词
          ner = Taskflow("ner", mode="fast", entity_only=True)
          ner("三亚是一个美丽的城市")
          '''
          [('三亚', 'LOC')]
          '''
          """


class NERWordTagTask(WordTagTask):
    """
    This is the NER (Named Entity Recognition) task that converts raw text to entities. The task backed by the
    `wordtag` model links more information (a WordTag label) to each entity.
    Args:
        task(string): The name of task.
        model(string): The model name in the task. Accepted for interface compatibility only; the base class is
            always initialised with ``"wordtag"``.
        entity_only(bool, optional): When True, items whose label is listed in ``POS_LABEL_WORDTAG`` are dropped
            from the simplified result. Defaults to False.
        kwargs (dict, optional): Additional keyword arguments passed along to the specific task.

    """

    # Logical resource name -> file name of that resource.
    resource_files_names = {
        "model_state": "model_state.pdparams",
        "model_config": "config.json",
        "tags": "tags.txt",
        "vocab_file": "vocab.txt",
        "special_tokens_map": "special_tokens_map.json",
        "tokenizer_config": "tokenizer_config.json",
    }
    # Model name -> resource name -> [url, checksum string].
    # NOTE(review): the second element looks like an MD5 digest -- confirm against the downloader.
    resource_files_urls = {
        "wordtag": {
            "model_state": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.5/model_state.pdparams",
                "c7c9cef72f73ee22c70c26ef11393025",
            ],
            "model_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/config.json",
                "b9f307b3fa03ad98c08ecb5249c15dfa",
            ],
            "tags": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/tags.txt",
                "f33feedd01d478b03bac81be19b48d00",
            ],
            "vocab_file": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/vocab.txt",
                "54aa6e2eeb0478c2d18a2343b008590c",
            ],
            "special_tokens_map": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/special_tokens_map.json",
                "58104269e4f141a258bdb2ed06aa599f",
            ],
            "tokenizer_config": [
                "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/tokenizer_config.json",
                "e3f2756e72e24e3bb298303fb9a171f7",
            ],
        }
    }

    def __init__(self, model, task, entity_only=False, **kwargs):
        # `model` is deliberately not forwarded: this task always runs on "wordtag".
        super().__init__(model="wordtag", task=task, **kwargs)
        self.entity_only = entity_only
        # `_user_dict` is read right after the base initializer, so it is expected to be set there.
        if self._user_dict:
            self._custom = Customization()
            self._custom.load_customization(self._user_dict)
        else:
            self._custom = None

    def _decode(self, batch_texts, batch_pred_tags):
        """
        Merge per-character tag predictions into words with a single label each.

        Args:
            batch_texts: Sequence of input sentences, indexed character by character.
            batch_pred_tags: Per-sentence sequences of predicted tag ids, aligned with `batch_texts`.

        Returns:
            list[dict]: One ``{"text": sentence, "items": [{"item": word, "wordtag_label": label}, ...]}``
            entry per sentence.
        """
        batch_results = []
        for sent_index, sent in enumerate(batch_texts):
            # Skip the first `summary_num` predictions and keep one tag id per character of the sentence.
            indexes = batch_pred_tags[sent_index][self.summary_num : len(sent) + self.summary_num]
            tags = [self._index_to_tags[index] for index in indexes]
            if self._custom:
                # NOTE(review): the return value is ignored, so this presumably rewrites `tags` in place -- confirm.
                self._custom.parse_customization(sent, tags, prefix=True)
            sent_out = []
            tags_out = []
            partial_word = ""
            for ind, tag in enumerate(tags):
                if partial_word == "":
                    # First character: open a word and record its label (the part after the last "-").
                    partial_word = sent[ind]
                    tags_out.append(tag.split("-")[-1])
                    continue
                if tag.startswith(("B", "S", "O")):
                    # A tag starting with B/S/O opens a new word: flush the finished one first.
                    sent_out.append(partial_word)
                    tags_out.append(tag.split("-")[-1])
                    partial_word = sent[ind]
                    continue
                partial_word += sent[ind]

            # The label of the last word was recorded when it was opened; flush the word itself.
            if len(sent_out) < len(tags_out):
                sent_out.append(partial_word)

            pred_words = [{"item": s, "wordtag_label": t} for s, t in zip(sent_out, tags_out)]
            batch_results.append({"text": sent, "items": pred_words})
        return batch_results

    def _simplify_result(self, results):
        """
        Flatten decoded results into lists of ``(word, label)`` tuples.

        When `entity_only` is set, items whose label is in ``POS_LABEL_WORDTAG`` are skipped. A single-element
        batch is unwrapped, so one sentence yields a flat list instead of a list of lists.
        """
        simple_results = []
        for result in results:
            simple_result = []
            if "items" in result:
                simple_result = [
                    (item["item"], item["wordtag_label"])
                    for item in result["items"]
                    if not (self.entity_only and item["wordtag_label"] in POS_LABEL_WORDTAG)
                ]
            simple_results.append(simple_result)
        return simple_results[0] if len(simple_results) == 1 else simple_results

    def _postprocess(self, inputs):
        """
        The model output is the tag ids, this function will convert the model output to raw text.

        Reads ``inputs["short_input_texts"]`` and ``inputs["all_pred_tags"]``, decodes them into words, joins the
        pieces back according to ``self.input_mapping`` and returns the simplified ``(word, label)`` form.
        """
        results = self._decode(inputs["short_input_texts"], inputs["all_pred_tags"])
        results = self._auto_joiner(results, self.input_mapping, is_dict=True)
        results = self._simplify_result(results)
        return results


class NERLACTask(LacTask):
    """
    Named entity recognition task for the raw text, built on top of the `lac` model.
    Args:
        task(string): The name of task.
        model(string): The model name in the task. Accepted for interface compatibility only; the base class is
            always initialised with ``"lac"``.
        entity_only(bool, optional): When True, words whose tag is listed in ``POS_LABEL_LAC`` are dropped from
            the result. Defaults to False.
        kwargs (dict, optional): Additional keyword arguments passed along to the specific task.
    """

    def __init__(self, model, task, entity_only=False, **kwargs):
        # `model` is deliberately not forwarded: this task always runs on "lac".
        super().__init__(task=task, model="lac", **kwargs)
        self.entity_only = entity_only

    def _postprocess(self, inputs):
        """
        The model output is the tag ids, this function will convert the model output to raw text.

        Reads ``inputs["lens"]``, ``inputs["result"]`` and ``inputs["text"]``, merges per-character tags into
        ``(word, tag)`` tuples, joins the pieces back according to ``self.input_mapping`` and unwraps a
        single-sentence batch into a flat list.
        """
        lengths = inputs["lens"]
        preds = inputs["result"]
        sents = inputs["text"]
        final_results = []
        for sent_index, length in enumerate(lengths):
            # Only the first `length` predictions belong to this sentence.
            tags = [self._id2tag_dict[str(index)] for index in preds[sent_index][:length]]
            sent = sents[sent_index]
            # NOTE(review): `_custom` is not assigned in this class; presumably the base class sets it -- confirm.
            if self._custom:
                self._custom.parse_customization(sent, tags)
            sent_out = []
            tags_out = []
            partial_word = ""
            for ind, tag in enumerate(tags):
                if partial_word == "":
                    # First character: open a word and record its label (the part before the first "-").
                    partial_word = sent[ind]
                    tags_out.append(tag.split("-")[0])
                    continue
                # A "-B" tag, or the first "O" after a non-"O" tag, starts a new word.
                if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"):
                    sent_out.append(partial_word)
                    tags_out.append(tag.split("-")[0])
                    partial_word = sent[ind]
                    continue
                partial_word += sent[ind]

            # The label of the last word was recorded when it was opened; flush the word itself.
            if len(sent_out) < len(tags_out):
                sent_out.append(partial_word)

            result = [(s, t) for s, t in zip(sent_out, tags_out) if not (self.entity_only and t in POS_LABEL_LAC)]
            final_results.append(result)
        final_results = self._auto_joiner(final_results, self.input_mapping)
        # Unwrap only a single-element batch. Testing `== 1` (instead of indexing whenever the length is not
        # greater than 1) avoids an IndexError on an empty result list and matches NERWordTagTask.
        return final_results[0] if len(final_results) == 1 else final_results