
本地暂存 (local staging)

tags/v0.4.10
yh_cc committed 6 years ago
commit 28ece53df0
15 changed files with 267 additions and 253 deletions
  1. docs/source/fastNLP.io.rst (+12, -12)
  2. docs/source/fastNLP.models.rst (+20, -20)
  3. docs/source/fastNLP.modules.aggregator.rst (+8, -8)
  4. docs/source/fastNLP.modules.encoder.rst (+10, -10)
  5. docs/source/fastNLP.modules.rst (+2, -2)
  6. fastNLP/core/dataset.py (+169, -53)
  7. fastNLP/core/fieldarray.py (+5, -0)
  8. fastNLP/core/instance.py (+11, -0)
  9. fastNLP/core/utils.py (+23, -20)
  10. fastNLP/io/embed_loader.py (+0, -115)
  11. fastNLP/modules/decoder/utils.py (+1, -2)
  12. reproduction/Biaffine_parser/run.py (+1, -1)
  13. requirements.txt (+2, -1)
  14. test/core/test_utils.py (+3, -3)
  15. test/io/test_embed_loader.py (+0, -6)

docs/source/fastNLP.io.rst (+12, -12)

@@ -4,48 +4,48 @@ fastNLP.io package
 Submodules
 ----------
 
-fastNLP.io.base\_loader module
-------------------------------
+fastNLP.io.base_loader module
+-----------------------------
 
 .. automodule:: fastNLP.io.base_loader
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.io.config\_io module
-----------------------------
+fastNLP.io.config_io module
+---------------------------
 
 .. automodule:: fastNLP.io.config_io
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.io.dataset\_loader module
---------------------------------atel
+fastNLP.io.dataset_loader module
+--------------------------------
 
 .. automodule:: fastNLP.io.dataset_loader
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.io.embed\_loader module
-------------------------------0
+fastNLP.io.embed_loader module
+------------------------------
 
 .. automodule:: fastNLP.io.embed_loader
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.io.file\_reader module
------------------------------0
+fastNLP.io.file_reader module
+-----------------------------
 
 .. automodule:: fastNLP.io.file_reader
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.io.model\_io module
---------------------------0
+fastNLP.io.model_io module
+--------------------------
 
 .. automodule:: fastNLP.io.model_io
    :members:


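The documentation diffs in this commit (this file and the three .rst files that follow) apply one mechanical pattern: the escaped `\_` in each module heading becomes a plain underscore, and the underline beneath it is shortened by the same number of characters so that Sphinx does not warn about a too-short title underline. A throwaway helper along these lines (hypothetical, not part of the commit) would reproduce the change:

    from pathlib import Path

    def fix_rst_headings(path):
        """Replace escaped underscores in section titles and resize the underline to match."""
        lines = Path(path).read_text(encoding="utf-8").splitlines()
        out = []
        for i, line in enumerate(lines):
            next_is_underline = i + 1 < len(lines) and set(lines[i + 1]) == {"-"}
            if r"\_" in line and next_is_underline:
                title = line.replace(r"\_", "_")
                out.append(title)
                # the underline must be at least as long as the (now shorter) title
                lines[i + 1] = "-" * len(title)
            else:
                out.append(line)
        return "\n".join(out) + "\n"

    # for rst in Path("docs/source").glob("fastNLP.*.rst"):
    #     rst.write_text(fix_rst_headings(rst), encoding="utf-8")
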
docs/source/fastNLP.models.rst (+20, -20)

@@ -4,8 +4,8 @@ fastNLP.models package
 Submodules
 ----------
 
-fastNLP.models.base\_model module
---------------------------------0
+fastNLP.models.base_model module
+--------------------------------
 
 .. automodule:: fastNLP.models.base_model
    :members:
@@ -20,64 +20,64 @@ fastNLP.models.bert module
    :undoc-members:
    :show-inheritance:
 
-fastNLP.models.biaffine\_parser module
--------------------------------------0
+fastNLP.models.biaffine_parser module
+-------------------------------------
 
 .. automodule:: fastNLP.models.biaffine_parser
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.models.char\_language\_model module
------------------------------------------00
+fastNLP.models.char_language_model module
+-----------------------------------------
 
 .. automodule:: fastNLP.models.char_language_model
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.models.cnn\_text\_classification module
---------------------------------------------- 00
+fastNLP.models.cnn_text_classification module
+---------------------------------------------
 
 .. automodule:: fastNLP.models.cnn_text_classification
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.models.enas\_controller module
-------------------------------------- 0
+fastNLP.models.enas_controller module
+-------------------------------------
 
 .. automodule:: fastNLP.models.enas_controller
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.models.enas\_model module
-------------------------------- -0
+fastNLP.models.enas_model module
+--------------------------------
 
 .. automodule:: fastNLP.models.enas_model
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.models.enas\_trainer module
---------------------------------- 0
+fastNLP.models.enas_trainer module
+----------------------------------
 
 .. automodule:: fastNLP.models.enas_trainer
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.models.enas\_utils module
-------------------------------- 0
+fastNLP.models.enas_utils module
+--------------------------------
 
 .. automodule:: fastNLP.models.enas_utils
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.models.sequence\_modeling module
--------------------------------------- 0
+fastNLP.models.sequence_modeling module
+---------------------------------------
 
 .. automodule:: fastNLP.models.sequence_modeling
    :members:
@@ -92,8 +92,8 @@ fastNLP.models.snli module
    :undoc-members:
    :show-inheritance:
 
-fastNLP.models.star\_transformer module
------------------------------------- -0
+fastNLP.models.star_transformer module
+--------------------------------------
 
 .. automodule:: fastNLP.models.star_transformer
    :members:


docs/source/fastNLP.modules.aggregator.rst (+8, -8)

@@ -12,32 +12,32 @@ fastNLP.modules.aggregator.attention module
    :undoc-members:
    :show-inheritance:
 
-fastNLP.modules.aggregator.avg\_pool module
-------------------------------------------0
+fastNLP.modules.aggregator.avg_pool module
+------------------------------------------
 
 .. automodule:: fastNLP.modules.aggregator.avg_pool
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.modules.aggregator.kmax\_pool module
--------------------------------------------0
+fastNLP.modules.aggregator.kmax_pool module
+-------------------------------------------
 
 .. automodule:: fastNLP.modules.aggregator.kmax_pool
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.modules.aggregator.max\_pool module
------------------------------------------ 0
+fastNLP.modules.aggregator.max_pool module
+------------------------------------------
 
 .. automodule:: fastNLP.modules.aggregator.max_pool
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.modules.aggregator.self\_attention module
------------------------------------------------0
+fastNLP.modules.aggregator.self_attention module
+------------------------------------------------
 
 .. automodule:: fastNLP.modules.aggregator.self_attention
    :members:


docs/source/fastNLP.modules.encoder.rst (+10, -10)

@@ -4,8 +4,8 @@ fastNLP.modules.encoder package
 Submodules
 ----------
 
-fastNLP.modules.encoder.char\_embedding module
----------------------------------------------0
+fastNLP.modules.encoder.char_embedding module
+---------------------------------------------
 
 .. automodule:: fastNLP.modules.encoder.char_embedding
    :members:
@@ -20,8 +20,8 @@ fastNLP.modules.encoder.conv module
    :undoc-members:
    :show-inheritance:
 
-fastNLP.modules.encoder.conv\_maxpool module
-------------------------------------------- 0
+fastNLP.modules.encoder.conv_maxpool module
+-------------------------------------------
 
 .. automodule:: fastNLP.modules.encoder.conv_maxpool
    :members:
@@ -52,16 +52,16 @@ fastNLP.modules.encoder.lstm module
    :undoc-members:
    :show-inheritance:
 
-fastNLP.modules.encoder.masked\_rnn module
------------------------------------------0
+fastNLP.modules.encoder.masked_rnn module
+-----------------------------------------
 
 .. automodule:: fastNLP.modules.encoder.masked_rnn
    :members:
    :undoc-members:
    :show-inheritance:
 
-fastNLP.modules.encoder.star\_transformer module
------------------------------------------------ 0
+fastNLP.modules.encoder.star_transformer module
+-----------------------------------------------
 
 .. automodule:: fastNLP.modules.encoder.star_transformer
    :members:
@@ -76,8 +76,8 @@ fastNLP.modules.encoder.transformer module
    :undoc-members:
    :show-inheritance:
 
-fastNLP.modules.encoder.variational\_rnn module
----------------------------------------------- 0
+fastNLP.modules.encoder.variational_rnn module
+----------------------------------------------
 
 .. automodule:: fastNLP.modules.encoder.variational_rnn
    :members:


docs/source/fastNLP.modules.rst (+2, -2)

@@ -21,8 +21,8 @@ fastNLP.modules.dropout module
    :undoc-members:
    :show-inheritance:
 
-fastNLP.modules.other\_modules module
-------------------------------------0
+fastNLP.modules.other_modules module
+------------------------------------
 
 .. automodule:: fastNLP.modules.other_modules
    :members:


fastNLP/core/dataset.py (+169, -53)

@@ -1,14 +1,114 @@
""" """
fastNLP.core.DataSet的介绍文档
DataSet是fastNLP中用于承载数据的容器。可以将DataSet看做是一个表格,每一行是一个sample(在fastNLP中被称为Instance),每一列是一个feature(在fastNLP中称为field)。


DataSet是fastNLP中用于承载数据的容器。可以将DataSet看做是一个表格,每一行是一个instance(或sample),每一列是一个feature。
.. _DataSet:


csv-table::
:header: "Field1", "Field2", "Field3"
:widths:20, 10, 10
.. csv-table:: Following is a demo layout of DataSet
:header: "sentence", "words", "seq_len"

"This is the first instance .", "[This, is, the, first, instance, .]", 6
"Second instance .", "[Second, instance, .]", 3
"Third instance .", "[Third, instance, .]", 3
"...", "[...]", "..."

在fastNLP内部每一行是一个 Instance_ 对象; 每一列是一个 FieldArray_ 对象。

1. DataSet的创建

创建DataSet主要有以下的3种方式

1. 传入dict

Example::

from fastNLP import DataSet
data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."],
'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.'],
'seq_len': [6, 3, 3]}
dataset = DataSet(data)
# 传入的dict的每个key的value应该为具有相同长度的list

2. 通过构建Instance

Example::

from fastNLP import DataSet
from fastNLP import Instance
dataset = DataSet()
instance = Instance(sentence="This is the first instance",
words=['this', 'is', 'the', 'first', 'instance', '.'],
seq_len=6)
dataset.append(instance)
# 可以继续append更多内容,但是append的instance应该和第一个instance拥有完全相同的field

3. 通过list(Instance)

Example::

from fastNLP import DataSet
from fastNLP import Instance
instances = []
instances.append(Instance(sentence="This is the first instance",
words=['this', 'is', 'the', 'first', 'instance', '.'],
seq_len=6))
instances.append(Instance(sentence="Second instance .",
words=['Second', 'instance', '.'],
seq_len=3))
dataset = DataSet(instances)

2. DataSet的基本使用
1. 从某个文本文件读取内容 # TODO 引用DataLoader

Example::

from fastNLP import DataSet
from fastNLP import Instance
dataset = DataSet()
filepath='some/text/file'
# 假设文件中每行内容如下(sentence label):
# This is a fantastic day positive
# The bad weather negative
# .....
with open(filepath, 'r') as f:
for line in f:
sent, label = line.strip().split('\t')
dataset.append(Instance(sentence=sent, label=label))

2. index, 返回结果为对DataSet对象的浅拷贝

Example::

import numpy as np
from fastNLP import DataSet
dataset = DataSet({'a': np.arange(10), 'b': [[_] for _ in range(10)]})
d[0] # 使用一个下标获取一个instance
>>{'a': 0 type=int,'b': [2] type=list} # 得到一个instance
d[1:3] # 使用slice获取一个新的DataSet
>>DataSet({'a': 1 type=int, 'b': [2] type=list}, {'a': 2 type=int, 'b': [2] type=list})

3. 对DataSet中的内容处理

Example::

from fastNLP import DataSet
data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."]}
dataset = DataSet(data)
# 将句子分成单词形式, 详见DataSet.apply()方法
dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words')
# 或使用DataSet.apply_field()
dataset.apply(lambda sent:sent.split(), field_name='sentence', new_field_name='words')

4. 删除DataSet的内容

Example::

from fastNLP import DataSet
dataset = DataSet({'a': list(range(-5, 5))})
# 返回满足条件的instance,并放入DataSet中
dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)
# 在dataset中删除满足条件的instance
dataset.drop(lambda ins:ins['a']<0) # dataset的instance数量减少


"This is the first instance", ['This', 'is', 'the', 'first', 'instance'], 5
"Second instance", ['Second', 'instance'], 2


""" """


@@ -22,7 +122,6 @@ from fastNLP.core.fieldarray import FieldArray
 from fastNLP.core.instance import Instance
 from fastNLP.core.utils import get_func_signature
 
-
 class DataSet(object):
     """DataSet is the collection of examples.
     DataSet provides instance-level interface. You can append and access an instance of the DataSet.
@@ -87,10 +186,7 @@ class DataSet(object):
         return inner_iter_func()
 
     def __getitem__(self, idx):
-        """Fetch Instance(s) at the `idx` position(s) in the dataset.
-        Notice: This method returns a copy of the actual instance(s). Any change to the returned value would not modify
-        the origin instance(s) of the DataSet.
-        If you want to make in-place changes to all Instances, use `apply` method.
+        """给定int的index,返回一个Instance; 给定slice,返回包含这个slice内容的新的DataSet。
 
         :param idx: can be int or slice.
         :return: If `idx` is int, return an Instance object.
@@ -145,33 +241,48 @@ class DataSet(object):
     def __repr__(self):
         return "DataSet(" + self.__inner_repr__() + ")"
 
-    def append(self, ins):
+    def append(self, instance):
         """将一个instance对象append到DataSet后面。
         If the DataSet is not empty, the instance must have the same field names as the rest instances in the DataSet.
 
-        :param ins: an Instance object
+        :param instance: an Instance object
 
         """
         if len(self.field_arrays) == 0:
             # DataSet has no field yet
-            for name, field in ins.fields.items():
+            for name, field in instance.fields.items():
                 field = field.tolist() if isinstance(field, np.ndarray) else field
                 self.field_arrays[name] = FieldArray(name, [field])  # 第一个样本,必须用list包装起来
         else:
-            if len(self.field_arrays) != len(ins.fields):
+            if len(self.field_arrays) != len(instance.fields):
                 raise ValueError(
                     "DataSet object has {} fields, but attempt to append an Instance object with {} fields."
-                    .format(len(self.field_arrays), len(ins.fields)))
-            for name, field in ins.fields.items():
+                    .format(len(self.field_arrays), len(instance.fields)))
+            for name, field in instance.fields.items():
                 assert name in self.field_arrays
                 self.field_arrays[name].append(field)
 
+    def add_fieldarray(self, field_name, fieldarray):
+        """将fieldarray添加到DataSet中.
+
+        :param str field_name: 新加入的field的名称
+        :param FieldArray fieldarray: 需要加入DataSet的field的内容
+        :return:
+        """
+        if not isinstance(fieldarray, FieldArray):
+            raise TypeError("Only fastNLP.FieldArray supported.")
+        if len(self) != len(fieldarray):
+            raise RuntimeError(f"The field to add must have the same size as dataset. "
+                               f"Dataset size {len(self)} != field size {len(fieldarray)}")
+        self.field_arrays[field_name] = fieldarray
+
     def add_field(self, field_name, fields, padder=AutoPadder(), is_input=False, is_target=False, ignore_type=False):
         """新增一个field
         :param str field_name: 新增的field的名称
         :param list fields: 需要新增的field的内容
-        :param None, Padder padder: 如果为None,则不进行pad。
+        :param None,Padder padder: 如果为None,则不进行pad。
        :param bool is_input: 新加入的field是否是input
        :param bool is_target: 新加入的field是否是target
        :param bool ignore_type: 是否忽略对新加入的field的类型检查
@@ -179,18 +290,28 @@ class DataSet(object):
 
         if len(self.field_arrays) != 0:
             if len(self) != len(fields):
-                raise RuntimeError(f"The field to append must have the same size as dataset. "
+                raise RuntimeError(f"The field to add must have the same size as dataset. "
                                    f"Dataset size {len(self)} != field size {len(fields)}")
         self.field_arrays[field_name] = FieldArray(field_name, fields, is_target=is_target, is_input=is_input,
                                                    padder=padder, ignore_type=ignore_type)
 
     def delete_field(self, field_name):
-        """删除field
+        """删除名为field_name的field
 
         :param str field_name: 需要删除的field的名称.
         """
         self.field_arrays.pop(field_name)
 
+    def has_field(self, field_name):
+        """判断DataSet中是否有field_name这个field
+
+        :param str field_name: field的名称
+        :return: bool
+        """
+        if isinstance(field_name, str):
+            return field_name in self.field_arrays
+        return False
+
     def get_field(self, field_name):
         """获取field_name这个field
 
@@ -318,25 +439,21 @@ class DataSet(object):
     def apply_field(self, func, field_name, new_field_name=None, **kwargs):
         """将DataSet中的每个instance中的`field_name`这个field传给func,并获取它的返回值.
 
-        :param callable func: input是instance的`field_name`这个field.
-        :param str field_name: 传入func的是哪个field.
-        :param str, None new_field_name: 将func返回的内容放入到什么field中
-
-            1. str, 将func的返回值放入这个名为`new_field_name`的新field中,如果名称与已有的field相
-            同,则覆盖之前的field
-
-            2. None, 不创建新的field
-        :param kwargs: 合法的参数有以下三个
+        :param callable func: input是instance的`field_name`这个field的内容。
+        :param str field_name: 传入func的是哪个field。
+        :param None,str new_field_name: 将func返回的内容放入到new_field_name这个field中,如果名称与已有的field相同,则覆
+            盖之前的field。如果为None则不创建新的field。
+        :param optional kwargs: 支持输入is_input,is_target,ignore_type
 
             1. is_input: bool, 如果为True则将`new_field_name`的field设置为input
 
            2. is_target: bool, 如果为True则将`new_field_name`的field设置为target
 
            3. ignore_type: bool, 如果为True则将`new_field_name`的field的ignore_type设置为true, 忽略其类型
        :return: list(Any), 里面的元素为func的返回值,所以list长度为DataSet的长度
 
         """
-        assert len(self)!=0, "Null DataSet cannot use apply()."
+        assert len(self)!=0, "Null DataSet cannot use apply_field()."
         if field_name not in self:
             raise KeyError("DataSet has no field named `{}`.".format(field_name))
         results = []
@@ -388,23 +505,19 @@ class DataSet(object):
                            ignore_type=extra_param.get("ignore_type", False))
 
     def apply(self, func, new_field_name=None, **kwargs):
-        """将DataSet中每个instance传入到func中,并获取它的返回值.
-
-        :param callable func: 参数是DataSet中的instance
-        :param str, None new_field_name: 将func返回的内容放入到什么field中
-
-            1. str, 将func的返回值放入这个名为`new_field_name`的新field中,如果名称与已有的field相
-            同,则覆盖之前的field
-
-            2. None, 不创建新的field
-        :param kwargs: 合法的参数有以下三个
+        """ 将DataSet中每个instance传入到func中,并获取它的返回值.
+
+        :param callable func: 参数是DataSet中的Instance
+        :param None,str new_field_name: 将func返回的内容放入到new_field_name这个field中,如果名称与已有的field相同,则覆
+            盖之前的field。如果为None则不创建新的field。
+        :param optional kwargs: 支持输入is_input,is_target,ignore_type
 
            1. is_input: bool, 如果为True则将`new_field_name`的field设置为input
 
            2. is_target: bool, 如果为True则将`new_field_name`的field设置为target
 
            3. ignore_type: bool, 如果为True则将`new_field_name`的field的ignore_type设置为true, 忽略其类型
-        :return: List[], 里面的元素为func的返回值,所以list长度为DataSet的长度
+        :return: list(Any), 里面的元素为func的返回值,所以list长度为DataSet的长度
         """
         assert len(self)!=0, "Null DataSet cannot use apply()."
         idx = -1
@@ -426,10 +539,10 @@ class DataSet(object):
         return results
 
     def drop(self, func, inplace=True):
-        """func接受一个instance,返回bool值,返回值为True时,该instance会被删除
+        """func接受一个instance,返回bool值,返回值为True时,该instance会被移除或者加入到返回的DataSet中
 
         :param callable func: 接受一个instance作为参数,返回bool值。为True时删除该instance
-        :param bool inplace: 是否在当前DataSet中直接删除instance。如果为False,返回值为一个删除了相应instance的新的DataSet
+        :param bool inplace: 是否在当前DataSet中直接删除instance。如果为False,返回值被删除的instance的组成的新DataSet
 
        :return: DataSet
        """
@@ -440,10 +553,13 @@ class DataSet(object):
             return self
         else:
             results = [ins for ins in self if not func(ins)]
-            dataset = DataSet(results)
-            for field_name, field in self.field_arrays.items():
-                dataset.field_arrays[field_name].to(field)
-            return dataset
+            if len(results)!=0:
+                dataset = DataSet(results)
+                for field_name, field in self.field_arrays.items():
+                    dataset.field_arrays[field_name].to(field)
+                return dataset
+            else:
+                return DataSet()
 
     def split(self, ratio):
         """将DataSet按照ratio的比例拆分,返回两个DataSet

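The new has_field() and add_fieldarray() helpers added above round out the column-level DataSet API. A minimal sketch of how they might be used, assuming the fastNLP 0.4.x imports shown in this diff (the column values are made up for illustration):

    from fastNLP import DataSet
    from fastNLP.core.fieldarray import FieldArray

    # a small DataSet built from a dict of equal-length lists
    dataset = DataSet({'sentence': ["This is the first instance .", "Second instance ."],
                       'seq_len': [6, 3]})

    # has_field() checks whether a column already exists
    if not dataset.has_field('label'):
        # add_fieldarray() attaches a ready-made FieldArray; its length must equal len(dataset)
        dataset.add_fieldarray('label', FieldArray('label', ['positive', 'negative']))

    # drop(..., inplace=False) returns a new DataSet with the matching instances removed,
    # leaving the original untouched
    short_only = dataset.drop(lambda ins: ins['seq_len'] > 5, inplace=False)
    print(len(dataset), len(short_only))  # 2 1
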
fastNLP/core/fieldarray.py (+5, -0)

@@ -1,4 +1,9 @@
"""
FieldArray是 DataSet_ 中一列的存储方式


.. _FieldArray:

"""




import numpy as np import numpy as np


fastNLP/core/instance.py (+11, -0)

@@ -1,3 +1,14 @@
"""
Instance文档

.. _Instance:

测试

"""



class Instance(object): class Instance(object):
"""An Instance is an example of data. """An Instance is an example of data.
Example:: Example::


fastNLP/core/utils.py (+23, -20)

@@ -24,47 +24,50 @@ def _prepare_cache_filepath(filepath):
     if not os.path.exists(cache_dir):
         os.makedirs(cache_dir)
 
 # TODO 可以保存下缓存时的参数,如果load的时候发现参数不一致,发出警告。
-def cache_results(cache_filepath, refresh=False, verbose=1):
+def cache_results(_cache_fp, _refresh=False, _verbose=1):
     def wrapper_(func):
         signature = inspect.signature(func)
         for key, _ in signature.parameters.items():
             if key in ('cache_filepath', 'refresh', 'verbose'):
                 raise RuntimeError("The function decorated by cache_results cannot have keyword `{}`.".format(key))
         def wrapper(*args, **kwargs):
-            if 'cache_filepath' in kwargs:
-                _cache_filepath = kwargs.pop('cache_filepath')
-                assert isinstance(_cache_filepath, str), "cache_filepath can only be str."
+            if '_cache_fp' in kwargs:
+                cache_filepath = kwargs.pop('_cache_fp')
+                assert isinstance(cache_filepath, str), "_cache_fp can only be str."
+            else:
+                cache_filepath = _cache_fp
+            if '_refresh' in kwargs:
+                refresh = kwargs.pop('_refresh')
+                assert isinstance(refresh, bool), "_refresh can only be bool."
             else:
-                _cache_filepath = cache_filepath
-            if 'refresh' in kwargs:
-                _refresh = kwargs.pop('refresh')
-                assert isinstance(_refresh, bool), "refresh can only be bool."
+                refresh = _refresh
+            if '_verbose' in kwargs:
+                verbose = kwargs.pop('_verbose')
+                assert isinstance(verbose, int), "_verbose can only be integer."
             else:
-                _refresh = refresh
-            if 'verbose' in kwargs:
-                _verbose = kwargs.pop('verbose')
-                assert isinstance(_verbose, int), "verbose can only be integer."
+                verbose = _verbose
             refresh_flag = True
 
-            if _cache_filepath is not None and _refresh is False:
+            if cache_filepath is not None and refresh is False:
                 # load data
-                if os.path.exists(_cache_filepath):
-                    with open(_cache_filepath, 'rb') as f:
+                if os.path.exists(cache_filepath):
+                    with open(cache_filepath, 'rb') as f:
                         results = _pickle.load(f)
                         if verbose==1:
-                            print("Read cache from {}.".format(_cache_filepath))
+                            print("Read cache from {}.".format(cache_filepath))
                         refresh_flag = False
 
             if refresh_flag:
                 results = func(*args, **kwargs)
-                if _cache_filepath is not None:
+                if cache_filepath is not None:
                     if results is None:
                         raise RuntimeError("The return value is None. Delete the decorator.")
-                    _prepare_cache_filepath(_cache_filepath)
-                    with open(_cache_filepath, 'wb') as f:
+                    _prepare_cache_filepath(cache_filepath)
+                    with open(cache_filepath, 'wb') as f:
                         _pickle.dump(results, f)
-                    print("Save cache to {}.".format(_cache_filepath))
+                    print("Save cache to {}.".format(cache_filepath))
 
             return results
         return wrapper


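After this rename, the per-call keywords reserved by the decorator are _cache_fp, _refresh and _verbose instead of the old cache_filepath/refresh/verbose names. A minimal usage sketch, assuming cache_results is imported from fastNLP.core.utils as shown above (the cache path and the function body are placeholders):

    from fastNLP.core.utils import cache_results

    # the default cache file is passed to the decorator; callers can override it per call
    @cache_results('caches/preprocessed.pkl')
    def prepare_data():
        # stand-in for an expensive preprocessing step
        return {'train': list(range(1000))}

    data = prepare_data()                # runs the function and writes the cache if it is missing
    data = prepare_data(_refresh=False)  # later calls read caches/preprocessed.pkl instead
    data = prepare_data(_cache_fp='caches/other.pkl', _refresh=True)  # force a rebuild elsewhere
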
fastNLP/io/embed_loader.py (+0, -115)

@@ -1,7 +1,6 @@
 import os
 
 import numpy as np
-import torch
 
 from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.io.base_loader import BaseLoader
@@ -14,120 +13,6 @@ class EmbedLoader(BaseLoader):
     def __init__(self):
         super(EmbedLoader, self).__init__()
 
-    @staticmethod
-    def _load_glove(emb_file):
-        """Read file as a glove embedding
-
-        file format:
-            embeddings are split by line,
-            for one embedding, word and numbers split by space
-        Example::
-
-            word_1 float_1 float_2 ... float_emb_dim
-            word_2 float_1 float_2 ... float_emb_dim
-            ...
-        """
-        emb = {}
-        with open(emb_file, 'r', encoding='utf-8') as f:
-            for line in f:
-                line = list(filter(lambda w: len(w) > 0, line.strip().split(' ')))
-                if len(line) > 2:
-                    emb[line[0]] = torch.Tensor(list(map(float, line[1:])))
-        return emb
-
-    @staticmethod
-    def _load_pretrain(emb_file, emb_type):
-        """Read txt data from embedding file and convert to np.array as pre-trained embedding
-
-        :param str emb_file: the pre-trained embedding file path
-        :param str emb_type: the pre-trained embedding data format
-        :return: a dict of ``{str: np.array}``
-        """
-        if emb_type == 'glove':
-            return EmbedLoader._load_glove(emb_file)
-        else:
-            raise Exception("embedding type {} not support yet".format(emb_type))
-
-    @staticmethod
-    def load_embedding(emb_dim, emb_file, emb_type, vocab):
-        """Load the pre-trained embedding and combine with the given dictionary.
-
-        :param int emb_dim: the dimension of the embedding. Should be the same as pre-trained embedding.
-        :param str emb_file: the pre-trained embedding file path.
-        :param str emb_type: the pre-trained embedding format, support glove now
-        :param Vocabulary vocab: a mapping from word to index, can be provided by user or built from pre-trained embedding
-        :return (embedding_tensor, vocab):
-            embedding_tensor - Tensor of shape (len(word_dict), emb_dim);
-            vocab - input vocab or vocab built by pre-train
-
-        """
-        pretrain = EmbedLoader._load_pretrain(emb_file, emb_type)
-        if vocab is None:
-            # build vocabulary from pre-trained embedding
-            vocab = Vocabulary()
-            for w in pretrain.keys():
-                vocab.add(w)
-        embedding_tensor = torch.randn(len(vocab), emb_dim)
-        for w, v in pretrain.items():
-            if len(v.shape) > 1 or emb_dim != v.shape[0]:
-                raise ValueError(
-                    "Pretrained embedding dim is {}. Dimension dismatched. Required {}".format(v.shape, (emb_dim,)))
-            if vocab.has_word(w):
-                embedding_tensor[vocab[w]] = v
-        return embedding_tensor, vocab
-
-    @staticmethod
-    def parse_glove_line(line):
-        line = line.split()
-        if len(line) <= 2:
-            raise RuntimeError("something goes wrong in parsing glove embedding")
-        return line[0], line[1:]
-
-    @staticmethod
-    def str_list_2_vec(line):
-        try:
-            return torch.Tensor(list(map(float, line)))
-        except Exception:
-            raise RuntimeError("something goes wrong in parsing glove embedding")
-
-    @staticmethod
-    def fast_load_embedding(emb_dim, emb_file, vocab):
-        """Fast load the pre-trained embedding and combine with the given dictionary.
-        This loading method uses line-by-line operation.
-
-        :param int emb_dim: the dimension of the embedding. Should be the same as pre-trained embedding.
-        :param str emb_file: the pre-trained embedding file path.
-        :param Vocabulary vocab: a mapping from word to index, can be provided by user or built from pre-trained embedding
-        :return embedding_matrix: numpy.ndarray
-
-        """
-        if vocab is None:
-            raise RuntimeError("You must provide a vocabulary.")
-        embedding_matrix = np.zeros(shape=(len(vocab), emb_dim), dtype=np.float32)
-        hit_flags = np.zeros(shape=(len(vocab),), dtype=int)
-        with open(emb_file, "r", encoding="utf-8") as f:
-            startline = f.readline()
-            if len(startline.split()) > 2:
-                f.seek(0)
-            for line in f:
-                word, vector = EmbedLoader.parse_glove_line(line)
-                if word in vocab:
-                    vector = EmbedLoader.str_list_2_vec(vector)
-                    if len(vector.shape) > 1 or emb_dim != vector.shape[0]:
-                        raise ValueError("Pre-trained embedding dim is {}. Expect {}.".format(vector.shape, (emb_dim,)))
-                    embedding_matrix[vocab[word]] = vector
-                    hit_flags[vocab[word]] = 1
-
-        if np.sum(hit_flags) < len(vocab):
-            # some words from vocab are missing in pre-trained embedding
-            # we normally sample each dimension
-            vocab_embed = embedding_matrix[np.where(hit_flags)]
-            sampled_vectors = np.random.normal(vocab_embed.mean(axis=0), vocab_embed.std(axis=0),
-                                               size=(len(vocab) - np.sum(hit_flags), emb_dim))
-            embedding_matrix[np.where(1 - hit_flags)] = sampled_vectors
-        return embedding_matrix
-
     @staticmethod
     def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True, error='ignore'):
         """


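With fast_load_embedding and the other GloVe helpers removed, load_with_vocab is the remaining way to build an embedding matrix for a known vocabulary, which is exactly the substitution made in reproduction/Biaffine_parser/run.py further down. A minimal migration sketch, assuming the load_with_vocab signature kept by this commit and a GloVe-style text file (the path is a placeholder):

    import torch
    from fastNLP.core.vocabulary import Vocabulary
    from fastNLP.io.embed_loader import EmbedLoader

    vocab = Vocabulary()
    vocab.update(["the", "in", "I", "to", "of"])

    # load_with_vocab reads the pre-trained file and returns a numpy matrix aligned with vocab;
    # unlike the removed fast_load_embedding, the embedding dimension is taken from the file itself
    embed = EmbedLoader.load_with_vocab("path/to/glove.6B.50d.txt", vocab)
    embed = torch.tensor(embed, dtype=torch.float32)  # same conversion as in run.py
    print(embed.shape)  # (len(vocab), embedding_dim)
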
fastNLP/modules/decoder/utils.py (+1, -2)

@@ -36,8 +36,7 @@ def viterbi_decode(feats, transitions, mask=None, unpad=False):
     vpath = feats.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long)
     vscore = feats[0]
 
-    vscore += transitions[n_tags, :n_tags]
-    trans_score = transitions[:n_tags, :n_tags].view(1, n_tags, n_tags).data
+    trans_score = transitions.view(1, n_tags, n_tags).data
     for i in range(1, seq_len):
         prev_score = vscore.view(batch_size, n_tags, 1)
         cur_score = feats[i].view(batch_size, 1, n_tags)


reproduction/Biaffine_parser/run.py (+1, -1)

@@ -155,7 +155,7 @@ print('test len {}'.format(len(test_data)))
 def train(path):
     # test saving pipeline
     save_pipe(path)
-    embed = EmbedLoader.fast_load_embedding(model_args['word_emb_dim'], emb_file_name, word_v)
+    embed = EmbedLoader.load_with_vocab(emb_file_name, word_v)
     embed = torch.tensor(embed, dtype=torch.float32)
 
     # embed = EmbedLoader.fast_load_embedding(emb_dim=model_args['word_emb_dim'], emb_file=emb_file_name, vocab=word_v)


requirements.txt (+2, -1)

@@ -1,4 +1,5 @@
 numpy>=1.14.2
 torch>=0.4.0
 tensorboardX
-tqdm>=4.28.1
+tqdm>=4.28.1
+nltk>=3.4.1

test/core/test_utils.py (+3, -3)

@@ -89,17 +89,17 @@ class TestCache(unittest.TestCase):
     def test_duplicate_keyword(self):
         with self.assertRaises(RuntimeError):
             @cache_results(None)
-            def func_verbose(a, verbose):
+            def func_verbose(a, _verbose):
                 pass
             func_verbose(0, 1)
         with self.assertRaises(RuntimeError):
             @cache_results(None)
-            def func_cache(a, cache_filepath):
+            def func_cache(a, _cache_fp):
                 pass
             func_cache(1, 2)
         with self.assertRaises(RuntimeError):
             @cache_results(None)
-            def func_refresh(a, refresh):
+            def func_refresh(a, _refresh):
                 pass
             func_refresh(1, 2)




test/io/test_embed_loader.py (+0, -6)

@@ -6,12 +6,6 @@ from fastNLP.io.embed_loader import EmbedLoader




 class TestEmbedLoader(unittest.TestCase):
-    def test_case(self):
-        vocab = Vocabulary()
-        vocab.update(["the", "in", "I", "to", "of", "hahaha"])
-        embedding = EmbedLoader().fast_load_embedding(50, "test/data_for_tests/glove.6B.50d_test.txt", vocab)
-        self.assertEqual(tuple(embedding.shape), (len(vocab), 50))
-
     def test_load_with_vocab(self):
         vocab = Vocabulary()
         glove = "test/data_for_tests/glove.6B.50d_test.txt"

