hummingbird
/
fastNLP

 
			
							from .pipe import Pipe
from .. import DataBundle
from .utils import iob2, iob2bioes
from ...core.const import Const
from ..loader.conll import Conll2003NERLoader, OntoNotesNERLoader
from .utils import _indexize, _add_words_field


class _NERPipe(Pipe):
    """
    NER任务的处理Pipe, 该Pipe会（1）复制raw_words列，并命名为words; (2）在words, target列建立词表
    (创建 :class:`fastNLP.Vocabulary` 对象，所以在返回的DataBundle中将有两个Vocabulary); (3）将words，target列根据相应的
    Vocabulary转换为index。

    raw_words列为List[str], 是未转换的原始数据; words列为List[int]，是转换为index的输入数据; target列是List[int]，是转换为index的
    target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target, seq_len。

    :param: str encoding_type: target列使用什么类型的encoding方式，支持bioes, bio两种。
    :param bool lower: 是否将words小写化后再建立词表，绝大多数情况都不需要设置为True。
    :param int target_pad_val: target的padding值，target这一列pad的位置值为target_pad_val。默认为-100。
    """

    def __init__(self, encoding_type: str = 'bio', lower: bool = False, target_pad_val=0):
        if encoding_type == 'bio':
            self.convert_tag = iob2
        else:
            self.convert_tag = lambda words: iob2bioes(iob2(words))
        self.lower = lower
        self.target_pad_val = int(target_pad_val)

    def process(self, data_bundle: DataBundle) -> DataBundle:
        """
        支持的DataSet的field为

        .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader
           :header: "raw_words", "target"

           "[Nadim, Ladki]", "[B-PER, I-PER]"
           "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]"
           "[...]", "[...]"


        :param DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field，且两个field的内容均为List[str]。
            在传入DataBundle基础上原位修改。
        :return: DataBundle

        Example::

            data_bundle = Conll2003Loader().load('/path/to/conll2003/')
            data_bundle = Conll2003NERPipe().process(data_bundle)

            # 获取train
            tr_data = data_bundle.get_dataset('train')

            # 获取target这个field的词表
            target_vocab = data_bundle.get_vocab('target')
            # 获取words这个field的词表
            word_vocab = data_bundle.get_vocab('words')

        """
        # 转换tag
        for name, dataset in data_bundle.datasets.items():
            dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET)

        _add_words_field(data_bundle, lower=self.lower)

        # index
        _indexize(data_bundle)

        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, dataset in data_bundle.datasets.items():
            dataset.set_pad_val(Const.TARGET, self.target_pad_val)
            dataset.add_seq_len(Const.INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle

    def process_from_file(self, paths) -> DataBundle:
        """

        :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.ConllLoader` 的load函数。
        :return: DataBundle
        """
        # 读取数据
        data_bundle = Conll2003NERLoader().load(paths)
        data_bundle = self.process(data_bundle)

        return data_bundle


class Conll2003NERPipe(_NERPipe):
    """
    Conll2003的NER任务的处理Pipe, 该Pipe会（1）复制raw_words列，并命名为words; (2）在words, target列建立词表
    (创建 :class:`fastNLP.Vocabulary` 对象，所以在返回的DataBundle中将有两个Vocabulary); (3）将words，target列根据相应的
    Vocabulary转换为index。
    经过该Pipe过后，DataSet中的内容如下所示

    .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader
       :header: "raw_words", "words", "target", "seq_len"

       "[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2
       "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 10
       "[...]", "[...]", "[...]", .

    raw_words列为List[str], 是未转换的原始数据; words列为List[int]，是转换为index的输入数据; target列是List[int]，是转换为index的
    target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。

    :param: str encoding_type: target列使用什么类型的encoding方式，支持bioes, bio两种。
    :param bool lower: 是否将words小写化后再建立词表，绝大多数情况都不需要设置为True。
    :param int target_pad_val: target的padding值，target这一列pad的位置值为target_pad_val。默认为-100。
    """

    def process_from_file(self, paths) -> DataBundle:
        """

        :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.ConllLoader` 的load函数。
        :return: DataBundle
        """
        # 读取数据
        data_bundle = Conll2003NERLoader().load(paths)
        data_bundle = self.process(data_bundle)

        return data_bundle


class OntoNotesNERPipe(_NERPipe):
    """
    处理OntoNotes的NER数据，处理之后DataSet中的field情况为

    .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader
       :header: "raw_words", "words", "target", "seq_len"

       "[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2
       "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 6
       "[...]", "[...]", "[...]", .

    :param: str encoding_type: target列使用什么类型的encoding方式，支持bioes, bio两种。
    :param bool lower: 是否将words小写化后再建立词表，绝大多数情况都不需要设置为True。
    :param int target_pad_val: target的padding值，target这一列pad的位置值为target_pad_val。默认为-100。
    """

    def process_from_file(self, paths):
        data_bundle = OntoNotesNERLoader().load(paths)
        return self.process(data_bundle)