
1. delete io/data_loader dir; 2. delete model/enas*; 3. delete legacy dir; 4. delete DataSetLoader and related code; 5. fix a test code error in core/test_dataset.py; 6. delete io.BaseLoader and related code.

tags/v0.4.10
xuyige committed 5 years ago (commit 5d8a8c98c6)
40 changed files with 14 additions and 5445 deletions
  1. fastNLP/io/__init__.py (+0, -1)
  2. fastNLP/io/data_bundle.py (+4, -193)
  3. fastNLP/io/data_loader/__init__.py (+0, -39)
  4. fastNLP/io/data_loader/conll.py (+0, -109)
  5. fastNLP/io/data_loader/imdb.py (+0, -99)
  6. fastNLP/io/data_loader/matching.py (+0, -248)
  7. fastNLP/io/data_loader/mnli.py (+0, -62)
  8. fastNLP/io/data_loader/mtl.py (+0, -68)
  9. fastNLP/io/data_loader/people_daily.py (+0, -85)
  10. fastNLP/io/data_loader/qnli.py (+0, -47)
  11. fastNLP/io/data_loader/quora.py (+0, -34)
  12. fastNLP/io/data_loader/rte.py (+0, -47)
  13. fastNLP/io/data_loader/snli.py (+0, -46)
  14. fastNLP/io/data_loader/sst.py (+0, -180)
  15. fastNLP/io/data_loader/yelp.py (+0, -132)
  16. fastNLP/io/dataset_loader.py (+0, -121)
  17. fastNLP/io/embed_loader.py (+6, -7)
  18. fastNLP/io/model_io.py (+1, -3)
  19. fastNLP/models/enas_controller.py (+0, -228)
  20. fastNLP/models/enas_model.py (+0, -393)
  21. fastNLP/models/enas_trainer.py (+0, -384)
  22. fastNLP/models/enas_utils.py (+0, -58)
  23. legacy/api/README.md (+0, -44)
  24. legacy/api/__init__.py (+0, -2)
  25. legacy/api/api.py (+0, -463)
  26. legacy/api/converter.py (+0, -181)
  27. legacy/api/examples.py (+0, -56)
  28. legacy/api/pipeline.py (+0, -33)
  29. legacy/api/processor.py (+0, -428)
  30. legacy/api/utils.py (+0, -134)
  31. legacy/automl/__init__.py (+0, -0)
  32. legacy/automl/enas_controller.py (+0, -223)
  33. legacy/automl/enas_model.py (+0, -388)
  34. legacy/automl/enas_trainer.py (+0, -383)
  35. legacy/automl/enas_utils.py (+0, -53)
  36. legacy/component/__init__.py (+0, -1)
  37. legacy/component/bert_tokenizer.py (+0, -378)
  38. test/core/test_dataset.py (+3, -2)
  39. test/io/test_data_loader.py (+0, -15)
  40. test/io/test_dataset_loader.py (+0, -77)
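
The modules deleted in this commit were already deprecated in favour of the :mod:`fastNLP.io.loader` and :mod:`fastNLP.io.pipe` modules (see the warnings in the deleted docstrings below). A minimal migration sketch under that assumption; the concrete class names (Conll2003NERLoader, Conll2003NERPipe) are illustrative and should be checked against the fastNLP.io documentation::

    # Migration sketch, not part of this commit; class names are assumptions.
    from fastNLP.io import Conll2003NERLoader, Conll2003NERPipe

    # A Loader only reads the raw files into a DataBundle of DataSets.
    data_bundle = Conll2003NERLoader().load('/path/to/conll2003')

    # A Pipe additionally tokenizes/indexes the fields and builds vocabularies.
    data_bundle = Conll2003NERPipe().process_from_file('/path/to/conll2003')
    train_ds = data_bundle.get_dataset('train')
    tag_vocab = data_bundle.get_vocab('target')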

fastNLP/io/__init__.py (+0, -1)

@@ -82,7 +82,6 @@ __all__ = [

from .embed_loader import EmbedLoader
from .data_bundle import DataBundle
from .dataset_loader import CSVLoader, JsonLoader
from .model_io import ModelLoader, ModelSaver

from .loader import *


fastNLP/io/data_bundle.py (+4, -193)

@@ -6,112 +6,10 @@ __all__ = [
'DataBundle',
]

import _pickle as pickle
import os
from typing import Union, Dict

from ..core.dataset import DataSet
from ..core.vocabulary import Vocabulary


class BaseLoader(object):
    """
    Base class for all Loaders; serves as a reference for their API.

    """

    def __init__(self):
        super(BaseLoader, self).__init__()

    @staticmethod
    def load_lines(data_path):
        """
        Read the file line by line, strip the whitespace on both sides of each line, and return a list of str.

        :param data_path: path of the data to read
        """
        with open(data_path, "r", encoding="utf-8") as f:
            text = f.readlines()
        return [line.strip() for line in text]

    @classmethod
    def load(cls, data_path):
        """
        Read line by line, strip the whitespace on both sides of each line, then split each line into characters. Returns a list of list of str.

        :param data_path:
        """
        with open(data_path, "r", encoding="utf-8") as f:
            text = f.readlines()
        return [[word for word in sent.strip()] for sent in text]

    @classmethod
    def load_with_cache(cls, data_path, cache_path):
        """Cached version of load.
        """
        if os.path.isfile(cache_path) and os.path.getmtime(data_path) < os.path.getmtime(cache_path):
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        else:
            obj = cls.load(data_path)
            with open(cache_path, 'wb') as f:
                pickle.dump(obj, f)
            return obj


def _download_from_url(url, path):
    try:
        from tqdm.auto import tqdm
    except:
        from ..core.utils import _pseudo_tqdm as tqdm
    import requests

    """Download file"""
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, stream=True)
    chunk_size = 16 * 1024
    total_size = int(r.headers.get('Content-length', 0))
    with open(path, "wb") as file, \
            tqdm(total=total_size, unit='B', unit_scale=1, desc=path.split('/')[-1]) as t:
        for chunk in r.iter_content(chunk_size):
            if chunk:
                file.write(chunk)
                t.update(len(chunk))


def _uncompress(src, dst):
    import zipfile
    import gzip
    import tarfile
    import os

    def unzip(src, dst):
        with zipfile.ZipFile(src, 'r') as f:
            f.extractall(dst)

    def ungz(src, dst):
        with gzip.open(src, 'rb') as f, open(dst, 'wb') as uf:
            length = 16 * 1024  # 16KB
            buf = f.read(length)
            while buf:
                uf.write(buf)
                buf = f.read(length)

    def untar(src, dst):
        with tarfile.open(src, 'r:gz') as f:
            f.extractall(dst)

    fn, ext = os.path.splitext(src)
    _, ext_2 = os.path.splitext(fn)
    if ext == '.zip':
        unzip(src, dst)
    elif ext == '.gz' and ext_2 != '.tar':
        ungz(src, dst)
    elif (ext == '.gz' and ext_2 == '.tar') or ext_2 == '.tgz':
        untar(src, dst)
    else:
        raise ValueError('unsupported file {}'.format(src))


class DataBundle:
    """
    Processed data, containing a collection of datasets (for example separate train, dev and test sets) together with the vocabulary for each field. Such an object is usually produced by the various
@@ -154,7 +52,7 @@ class DataBundle:
        self.datasets[name] = dataset
        return self

    def get_dataset(self, name:str)->DataSet:
    def get_dataset(self, name: str) -> DataSet:
        """
        Get the dataset named ``name``.

@@ -163,7 +61,7 @@
        """
        return self.datasets[name]

    def delete_dataset(self, name:str):
    def delete_dataset(self, name: str):
        """
        Delete the DataSet named ``name``.

@@ -173,7 +71,7 @@
        self.datasets.pop(name, None)
        return self

    def get_vocab(self, field_name:str)->Vocabulary:
    def get_vocab(self, field_name: str) -> Vocabulary:
        """
        Get the vocabulary for the field named ``field_name``.

@@ -182,7 +80,7 @@
        """
        return self.vocabs[field_name]

    def delete_vocab(self, field_name:str):
    def delete_vocab(self, field_name: str):
        """
        Delete a vocabulary.
        :param str field_name:
@@ -312,90 +210,3 @@
        return _str


class DataSetLoader:
    """
    Alias: :class:`fastNLP.io.DataSetLoader` :class:`fastNLP.io.dataset_loader.DataSetLoader`

    Defines the API that every DataSetLoader needs; developers should subclass it to implement their own DataSetLoaders.

    A developer should implement at least the following:

    - the _load function: read data from a single data file into a :class:`~fastNLP.DataSet`
    - the load function (the base-class implementation may be reused): read data from one or more data files into one or more :class:`~fastNLP.DataSet` objects
    - the process function: read data from one or more data files and process it into one or more :class:`~fastNLP.DataSet` objects that can be used for training

    **process may call load or _load**

    """
    URL = ''
    DATA_DIR = ''

    ROOT_DIR = '.fastnlp/datasets/'
    UNCOMPRESS = True

    def _download(self, url: str, pdir: str, uncompress=True) -> str:
        """

        Download data from ``url`` to ``path``; if ``uncompress`` is ``True``, uncompress it automatically.

        :param url: URL to download from
        :param pdir: directory to download into
        :param uncompress: whether to uncompress automatically
        :return: path where the data is stored
        """
        fn = os.path.basename(url)
        path = os.path.join(pdir, fn)
        """check data exists"""
        if not os.path.exists(path):
            os.makedirs(pdir, exist_ok=True)
            _download_from_url(url, path)
        if uncompress:
            dst = os.path.join(pdir, 'data')
            if not os.path.exists(dst):
                _uncompress(path, dst)
            return dst
        return path

    def download(self):
        return self._download(
            self.URL,
            os.path.join(self.ROOT_DIR, self.DATA_DIR),
            uncompress=self.UNCOMPRESS)

    def load(self, paths: Union[str, Dict[str, str]]) -> Union[DataSet, Dict[str, DataSet]]:
        """
        Read data from the file(s) at one or more given paths and return one or more :class:`~fastNLP.DataSet` objects.
        If several paths are processed, the keys of the returned dict are the same as the keys of the dict passed in.

        :param Union[str, Dict[str, str]] paths: file path(s)
        :return: a :class:`~fastNLP.DataSet` object, or a dict holding several :class:`~fastNLP.DataSet` objects
        """
        if isinstance(paths, str):
            return self._load(paths)
        return {name: self._load(path) for name, path in paths.items()}

    def _load(self, path: str) -> DataSet:
        """Read data from the file at the given path and return a :class:`~fastNLP.DataSet` object.

        :param str path: file path
        :return: a :class:`~fastNLP.DataSet` object
        """
        raise NotImplementedError

    def process(self, paths: Union[str, Dict[str, str]], **options) -> DataBundle:
        """
        For a specific task and dataset, read and process the data, returning a processed DataInfo object or dict.

        Read data from the file(s) at one or more given paths; the DataInfo object can contain one or more datasets.
        If several paths are processed, the keys of the dict passed in match the keys of the dict inside the returned DataInfo.

        The returned :class:`DataBundle` object has the following attributes:

        - vocabs: a dict made up of the vocabularies built from the datasets
        - datasets: a dict holding a collection of :class:`~fastNLP.DataSet` objects; field names follow :mod:`~fastNLP.core.const`

        :param paths: path(s) of the raw data to read
        :param options: design your own parameters for different tasks and datasets
        :return: a DataBundle
        """
        raise NotImplementedError
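
For reference, the contract that the removed DataSetLoader imposed on its subclasses (the loaders deleted further below follow it) can be illustrated with a hypothetical minimal implementation; the class name and file format here are invented for the example::

    # Hypothetical example only: shows the _load / load / process split that
    # DataSetLoader defined. It targets the pre-0.5 classes removed by this commit.
    from fastNLP import DataSet, Instance
    from fastNLP.io.data_bundle import DataBundle, DataSetLoader

    class TabSeparatedLoader(DataSetLoader):
        def _load(self, path: str) -> DataSet:
            # Read a single "label<TAB>text" file into a DataSet.
            ds = DataSet()
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    target, text = line.split('\t', 1)
                    ds.append(Instance(words=text.split(), target=target))
            return ds

        def process(self, paths, **options) -> DataBundle:
            # Reuse the base-class load for one or more files, then wrap them in
            # a DataBundle; a real loader would also build vocabularies here.
            datasets = self.load(paths)
            if isinstance(datasets, DataSet):
                datasets = {'train': datasets}
            return DataBundle(datasets=datasets)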

fastNLP/io/data_loader/__init__.py (+0, -39)

@@ -1,39 +0,0 @@
"""undocumented
.. warning::

本模块在 `0.5.0版本` 中被废弃,由 :mod:`~fastNLP.io.loader` 和 :mod:`~fastNLP.io.pipe` 模块替代。

用于读数据集的模块, 可以读取文本分类、序列标注、Matching任务的数据集

这些模块的具体介绍如下,您可以通过阅读 :doc:`教程</tutorials/tutorial_2_load_dataset>` 来进行了解。
"""
__all__ = [
'ConllLoader',
'Conll2003Loader',
'IMDBLoader',
'MatchingLoader',
'SNLILoader',
'MNLILoader',
'MTL16Loader',
'PeopleDailyCorpusLoader',
'QNLILoader',
'QuoraLoader',
'RTELoader',
'SSTLoader',
'SST2Loader',
'YelpLoader',
]


from .conll import ConllLoader, Conll2003Loader
from .imdb import IMDBLoader
from .matching import MatchingLoader
from .mnli import MNLILoader
from .mtl import MTL16Loader
from .people_daily import PeopleDailyCorpusLoader
from .qnli import QNLILoader
from .quora import QuoraLoader
from .rte import RTELoader
from .snli import SNLILoader
from .sst import SSTLoader, SST2Loader
from .yelp import YelpLoader

fastNLP/io/data_loader/conll.py (+0, -109)

@@ -1,109 +0,0 @@

from ...core.dataset import DataSet
from ...core.instance import Instance
from ..data_bundle import DataSetLoader
from ..file_reader import _read_conll
from typing import Union, Dict
from ..utils import check_loader_paths
from ..data_bundle import DataBundle

class ConllLoader(DataSetLoader):
"""
别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.data_loader.ConllLoader`

该ConllLoader支持读取的数据格式: 以空行隔开两个sample,除了分割行,每一行用空格或者制表符隔开不同的元素。如下例所示:

Example::

# 文件中的内容
Nadim NNP B-NP B-PER
Ladki NNP I-NP I-PER

AL-AIN NNP B-NP B-LOC
United NNP B-NP B-LOC
Arab NNP I-NP I-LOC
Emirates NNPS I-NP I-LOC
1996-12-06 CD I-NP O
...

# 如果用以下的参数读取,返回的DataSet将包含raw_words和pos两个field, 这两个field的值分别取自于第0列与第1列
dataset = ConllLoader(headers=['raw_words', 'pos'], indexes=[0, 1])._load('/path/to/train.conll')
# 如果用以下的参数读取,返回的DataSet将包含raw_words和ner两个field, 这两个field的值分别取自于第0列与第2列
dataset = ConllLoader(headers=['raw_words', 'ner'], indexes=[0, 3])._load('/path/to/train.conll')
# 如果用以下的参数读取,返回的DataSet将包含raw_words, pos和ner三个field
dataset = ConllLoader(headers=['raw_words', 'pos', 'ner'], indexes=[0, 1, 3])._load('/path/to/train.conll')

dataset = ConllLoader(headers=['raw_words', 'pos'], indexes=[0, 1])._load('/path/to/train.conll')中DataSet的raw_words
列与pos列的内容都是List[str]

数据中以"-DOCSTART-"开头的行将被忽略,因为该符号在conll 2003中被用为文档分割符。

:param list headers: 每一列数据的名称,需为List or Tuple of str。``header`` 与 ``indexes`` 一一对应
:param list indexes: 需要保留的数据列下标,从0开始。若为 ``None`` ,则所有列都保留。Default: ``None``
:param bool dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``True``
"""

def __init__(self, headers, indexes=None, dropna=True):
super(ConllLoader, self).__init__()
if not isinstance(headers, (list, tuple)):
raise TypeError(
'invalid headers: {}, should be list of strings'.format(headers))
self.headers = headers
self.dropna = dropna
if indexes is None:
self.indexes = list(range(len(self.headers)))
else:
if len(indexes) != len(headers):
raise ValueError
self.indexes = indexes

def _load(self, path):
"""
传入的一个文件路径,将该文件读入DataSet中,field由Loader初始化时指定的headers决定。

:param str path: 文件的路径
:return: DataSet
"""
ds = DataSet()
for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna):
ins = {h: data[i] for i, h in enumerate(self.headers)}
ds.append(Instance(**ins))
return ds

def load(self, paths: Union[str, Dict[str, str]]) -> DataBundle:
"""
从指定一个或多个路径中的文件中读取数据,返回:class:`~fastNLP.io.DataBundle` 。

读取的field根据ConllLoader初始化时传入的headers决定。

:param Union[str, Dict[str, str]] paths:
:return: :class:`~fastNLP.DataSet` 类的对象或 :class:`~fastNLP.io.DataBundle` 的字典
"""
paths = check_loader_paths(paths)
datasets = {name: self._load(path) for name, path in paths.items()}
data_bundle = DataBundle(datasets=datasets)
return data_bundle


class Conll2003Loader(ConllLoader):
"""
别名::class:`fastNLP.io.Conll2003Loader` :class:`fastNLP.io.data_loader.Conll2003Loader`

该Loader用以读取Conll2003数据,conll2003的数据可以在https://github.com/davidsbatista/NER-datasets/tree/master/CONLL2003
找到。数据中以"-DOCSTART-"开头的行将被忽略,因为该符号在conll 2003中被用为文档分割符。

返回的DataSet将具有以下['raw_words', 'pos', 'chunks', 'ner']四个field, 每个field中的内容都是List[str]。

.. csv-table:: Conll2003Loader处理之 :header: "raw_words", "words", "target", "seq_len"

"[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2
"[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 5
"[...]", "[...]", "[...]", .

"""

def __init__(self):
headers = [
'raw_words', 'pos', 'chunks', 'ner',
]
super(Conll2003Loader, self).__init__(headers=headers)

fastNLP/io/data_loader/imdb.py (+0, -99)

@@ -1,99 +0,0 @@

from typing import Union, Dict

from ..embed_loader import EmbeddingOption, EmbedLoader
from ..data_bundle import DataSetLoader, DataBundle
from ...core.vocabulary import VocabularyOption, Vocabulary
from ...core.dataset import DataSet
from ...core.instance import Instance
from ...core.const import Const

from ..utils import get_tokenizer


class IMDBLoader(DataSetLoader):
"""
别名::class:`fastNLP.io.IMDBLoader` :class:`fastNLP.io.data_loader.IMDBLoader`

读取IMDB数据集,DataSet包含以下fields:

words: list(str), 需要分类的文本

target: str, 文本的标签

"""

def __init__(self):
super(IMDBLoader, self).__init__()
self.tokenizer = get_tokenizer()

def _load(self, path):
dataset = DataSet()
with open(path, 'r', encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
parts = line.split('\t')
target = parts[0]
words = self.tokenizer(parts[1].lower())
dataset.append(Instance(words=words, target=target))

if len(dataset) == 0:
raise RuntimeError(f"{path} has no valid data.")

return dataset
def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
char_level_op=False):

datasets = {}
info = DataBundle()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset

def wordtochar(words):
chars = []
for word in words:
word = word.lower()
for char in word:
chars.append(char)
chars.append('')
chars.pop()
return chars

if char_level_op:
for dataset in datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')

src_vocab.index_dataset(*datasets.values(), field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name='target')
tgt_vocab.index_dataset(*datasets.values(), field_name='target')

info.vocabs = {
Const.INPUT: src_vocab,
Const.TARGET: tgt_vocab
}

info.datasets = datasets

for name, dataset in info.datasets.items():
dataset.set_input(Const.INPUT)
dataset.set_target(Const.TARGET)

return info




fastNLP/io/data_loader/matching.py (+0, -248)

@@ -1,248 +0,0 @@
import os

from typing import Union, Dict, List

from ...core.const import Const
from ...core.vocabulary import Vocabulary
from ..data_bundle import DataBundle, DataSetLoader
from ..file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR
from ...modules.encoder.bert import BertTokenizer


class MatchingLoader(DataSetLoader):
"""
别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.data_loader.MatchingLoader`

读取Matching任务的数据集

:param dict paths: key是数据集名称(如train、dev、test),value是对应的文件名
"""

def __init__(self, paths: dict=None):
self.paths = paths

def _load(self, path):
"""
:param str path: 待读取数据集的路径名
:return: fastNLP.DataSet ds: 返回一个DataSet对象,里面必须包含3个field:其中两个分别为两个句子
的原始字符串文本,第三个为标签
"""
raise NotImplementedError

def process(self, paths: Union[str, Dict[str, str]], dataset_name: str=None,
to_lower=False, seq_len_type: str=None, bert_tokenizer: str=None,
cut_text: int = None, get_index=True, auto_pad_length: int=None,
auto_pad_token: str='<pad>', set_input: Union[list, str, bool]=True,
set_target: Union[list, str, bool]=True, concat: Union[str, list, bool]=None,
extra_split: List[str]=None, ) -> DataBundle:
"""
:param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹,
则会从self.paths里面找对应的数据集名称与文件名。如果是Dict,则为数据集名称(如train、dev、test)和
对应的全路径文件名。
:param str dataset_name: 如果在paths里传入的是一个数据集的全路径文件名,那么可以用dataset_name来定义
这个数据集的名字,如果不定义则默认为train。
:param bool to_lower: 是否将文本自动转为小写。默认值为False。
:param str seq_len_type: 提供的seq_len类型,支持 ``seq_len`` :提供一个数字作为句子长度; ``mask`` :
提供一个0/1的mask矩阵作为句子长度; ``bert`` :提供segment_type_id(第一个句子为0,第二个句子为1)和
attention mask矩阵(0/1的mask矩阵)。默认值为None,即不提供seq_len
:param str bert_tokenizer: bert tokenizer所使用的词表所在的文件夹路径
:param int cut_text: 将长于cut_text的内容截掉。默认为None,即不截。
:param bool get_index: 是否需要根据词表将文本转为index
:param int auto_pad_length: 是否需要将文本自动pad到一定长度(超过这个长度的文本将会被截掉),默认为不会自动pad
:param str auto_pad_token: 自动pad的内容
:param set_input: 如果为True,则会自动将相关的field(名字里含有Const.INPUT的)设置为input,如果为False
则不会将任何field设置为input。如果传入str或者List[str],则会根据传入的内容将相对应的field设置为input,
于此同时其他field不会被设置为input。默认值为True。
:param set_target: set_target将控制哪些field可以被设置为target,用法与set_input一致。默认值为True。
:param concat: 是否需要将两个句子拼接起来。如果为False则不会拼接。如果为True则会在两个句子之间插入一个<sep>。
如果传入一个长度为4的list,则分别表示插在第一句开始前、第一句结束后、第二句开始前、第二句结束后的标识符。如果
传入字符串 ``bert`` ,则会采用bert的拼接方式,等价于['[CLS]', '[SEP]', '', '[SEP]'].
:param extra_split: 额外的分隔符,即除了空格之外的用于分词的字符。
:return:
"""
if isinstance(set_input, str):
set_input = [set_input]
if isinstance(set_target, str):
set_target = [set_target]
if isinstance(set_input, bool):
auto_set_input = set_input
else:
auto_set_input = False
if isinstance(set_target, bool):
auto_set_target = set_target
else:
auto_set_target = False
if isinstance(paths, str):
if os.path.isdir(paths):
path = {n: os.path.join(paths, self.paths[n]) for n in self.paths.keys()}
else:
path = {dataset_name if dataset_name is not None else 'train': paths}
else:
path = paths

data_info = DataBundle()
for data_name in path.keys():
data_info.datasets[data_name] = self._load(path[data_name])

for data_name, data_set in data_info.datasets.items():
if auto_set_input:
data_set.set_input(Const.INPUTS(0), Const.INPUTS(1))
if auto_set_target:
if Const.TARGET in data_set.get_field_names():
data_set.set_target(Const.TARGET)

if extra_split is not None:
for data_name, data_set in data_info.datasets.items():
data_set.apply(lambda x: ' '.join(x[Const.INPUTS(0)]), new_field_name=Const.INPUTS(0))
data_set.apply(lambda x: ' '.join(x[Const.INPUTS(1)]), new_field_name=Const.INPUTS(1))

for s in extra_split:
data_set.apply(lambda x: x[Const.INPUTS(0)].replace(s, ' ' + s + ' '),
new_field_name=Const.INPUTS(0))
data_set.apply(lambda x: x[Const.INPUTS(0)].replace(s, ' ' + s + ' '),
new_field_name=Const.INPUTS(0))

_filt = lambda x: x
data_set.apply(lambda x: list(filter(_filt, x[Const.INPUTS(0)].split(' '))),
new_field_name=Const.INPUTS(0), is_input=auto_set_input)
data_set.apply(lambda x: list(filter(_filt, x[Const.INPUTS(1)].split(' '))),
new_field_name=Const.INPUTS(1), is_input=auto_set_input)
_filt = None

if to_lower:
for data_name, data_set in data_info.datasets.items():
data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(0)]], new_field_name=Const.INPUTS(0),
is_input=auto_set_input)
data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(1)]], new_field_name=Const.INPUTS(1),
is_input=auto_set_input)

if bert_tokenizer is not None:
if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR:
PRETRAIN_URL = _get_base_url('bert')
model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer]
model_url = PRETRAIN_URL + model_name
model_dir = cached_path(model_url, name='embedding')
# 检查是否存在
elif os.path.isdir(bert_tokenizer):
model_dir = bert_tokenizer
else:
raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.")

words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]')
with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f:
lines = f.readlines()
lines = [line.strip() for line in lines]
words_vocab.add_word_lst(lines)
words_vocab.build_vocab()

tokenizer = BertTokenizer.from_pretrained(model_dir)

for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: tokenizer.tokenize(' '.join(x[fields])), new_field_name=fields,
is_input=auto_set_input)

if isinstance(concat, bool):
concat = 'default' if concat else None
if concat is not None:
if isinstance(concat, str):
CONCAT_MAP = {'bert': ['[CLS]', '[SEP]', '', '[SEP]'],
'default': ['', '<sep>', '', '']}
if concat.lower() in CONCAT_MAP:
concat = CONCAT_MAP[concat]
else:
concat = 4 * [concat]
assert len(concat) == 4, \
f'Please choose a list with 4 symbols which at the beginning of first sentence ' \
f'the end of first sentence, the begin of second sentence, and the end of second' \
f'sentence. Your input is {concat}'

for data_name, data_set in data_info.datasets.items():
data_set.apply(lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] + [concat[2]] +
x[Const.INPUTS(1)] + [concat[3]], new_field_name=Const.INPUT)
data_set.apply(lambda x: [w for w in x[Const.INPUT] if len(w) > 0], new_field_name=Const.INPUT,
is_input=auto_set_input)

if seq_len_type is not None:
if seq_len_type == 'seq_len': #
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: len(x[fields]),
new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
is_input=auto_set_input)
elif seq_len_type == 'mask':
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: [1] * len(x[fields]),
new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
is_input=auto_set_input)
elif seq_len_type == 'bert':
for data_name, data_set in data_info.datasets.items():
if Const.INPUT not in data_set.get_field_names():
raise KeyError(f'Field ``{Const.INPUT}`` not in {data_name} data set: '
f'got {data_set.get_field_names()}')
data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1),
new_field_name=Const.INPUT_LENS(0), is_input=auto_set_input)
data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]),
new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input)

if auto_pad_length is not None:
cut_text = min(auto_pad_length, cut_text if cut_text is not None else auto_pad_length)

if cut_text is not None:
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')):
data_set.apply(lambda x: x[fields][: cut_text], new_field_name=fields,
is_input=auto_set_input)

data_set_list = [d for n, d in data_info.datasets.items()]
assert len(data_set_list) > 0, f'There are NO data sets in data info!'

if bert_tokenizer is None:
words_vocab = Vocabulary(padding=auto_pad_token)
words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n],
field_name=[n for n in data_set_list[0].get_field_names()
if (Const.INPUT in n)],
no_create_entry_dataset=[d for n, d in data_info.datasets.items()
if 'train' not in n])
target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab = target_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n],
field_name=Const.TARGET)
data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab}

if get_index:
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]], new_field_name=fields,
is_input=auto_set_input)

if Const.TARGET in data_set.get_field_names():
data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET,
is_input=auto_set_input, is_target=auto_set_target)

if auto_pad_length is not None:
if seq_len_type == 'seq_len':
raise RuntimeError(f'the sequence will be padded with the length {auto_pad_length}, '
f'so the seq_len_type cannot be `{seq_len_type}`!')
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: x[fields] + [words_vocab.to_index(words_vocab.padding)] *
(auto_pad_length - len(x[fields])), new_field_name=fields,
is_input=auto_set_input)
elif (Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len'):
data_set.apply(lambda x: x[fields] + [0] * (auto_pad_length - len(x[fields])),
new_field_name=fields, is_input=auto_set_input)

for data_name, data_set in data_info.datasets.items():
if isinstance(set_input, list):
data_set.set_input(*[inputs for inputs in set_input if inputs in data_set.get_field_names()])
if isinstance(set_target, list):
data_set.set_target(*[target for target in set_target if target in data_set.get_field_names()])

return data_info

fastNLP/io/data_loader/mnli.py (+0, -62)

@@ -1,62 +0,0 @@

from ...core.const import Const

from .matching import MatchingLoader
from ..dataset_loader import CSVLoader


class MNLILoader(MatchingLoader, CSVLoader):
"""
别名::class:`fastNLP.io.MNLILoader` :class:`fastNLP.io.data_loader.MNLILoader`

读取MNLI数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise

words2: list(str), 第二句文本, hypothesis

target: str, 真实标签

数据来源:
"""

def __init__(self, paths: dict=None):
paths = paths if paths is not None else {
'train': 'train.tsv',
'dev_matched': 'dev_matched.tsv',
'dev_mismatched': 'dev_mismatched.tsv',
'test_matched': 'test_matched.tsv',
'test_mismatched': 'test_mismatched.tsv',
# 'test_0.9_matched': 'multinli_0.9_test_matched_unlabeled.txt',
# 'test_0.9_mismatched': 'multinli_0.9_test_mismatched_unlabeled.txt',

# test_0.9_mathed与mismatched是MNLI0.9版本的(数据来源:kaggle)
}
MatchingLoader.__init__(self, paths=paths)
CSVLoader.__init__(self, sep='\t')
self.fields = {
'sentence1_binary_parse': Const.INPUTS(0),
'sentence2_binary_parse': Const.INPUTS(1),
'gold_label': Const.TARGET,
}

def _load(self, path):
ds = CSVLoader._load(self, path)

for k, v in self.fields.items():
if k in ds.get_field_names():
ds.rename_field(k, v)

if Const.TARGET in ds.get_field_names():
if ds[0][Const.TARGET] == 'hidden':
ds.delete_field(Const.TARGET)

parentheses_table = str.maketrans({'(': None, ')': None})

ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(),
new_field_name=Const.INPUTS(0))
ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(),
new_field_name=Const.INPUTS(1))
if Const.TARGET in ds.get_field_names():
ds.drop(lambda x: x[Const.TARGET] == '-')
return ds

fastNLP/io/data_loader/mtl.py (+0, -68)

@@ -1,68 +0,0 @@

from typing import Union, Dict

from ..data_bundle import DataBundle
from ..dataset_loader import CSVLoader
from ...core.vocabulary import Vocabulary, VocabularyOption
from ...core.const import Const
from ..utils import check_loader_paths


class MTL16Loader(CSVLoader):
"""
别名::class:`fastNLP.io.MTL16Loader` :class:`fastNLP.io.data_loader.MTL16Loader`

读取MTL16数据集,DataSet包含以下fields:

words: list(str), 需要分类的文本

target: str, 文本的标签

数据来源:https://pan.baidu.com/s/1c2L6vdA

"""

def __init__(self):
super(MTL16Loader, self).__init__(headers=(Const.TARGET, Const.INPUT), sep='\t')

def _load(self, path):
dataset = super(MTL16Loader, self)._load(path)
dataset.apply(lambda x: x[Const.INPUT].lower().split(), new_field_name=Const.INPUT)
if len(dataset) == 0:
raise RuntimeError(f"{path} has no valid data.")

return dataset

def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,):

paths = check_loader_paths(paths)
datasets = {}
info = DataBundle()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name=Const.INPUT)
src_vocab.index_dataset(*datasets.values(), field_name=Const.INPUT)

tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name=Const.TARGET)
tgt_vocab.index_dataset(*datasets.values(), field_name=Const.TARGET)

info.vocabs = {
Const.INPUT: src_vocab,
Const.TARGET: tgt_vocab
}

info.datasets = datasets

for name, dataset in info.datasets.items():
dataset.set_input(Const.INPUT)
dataset.set_target(Const.TARGET)

return info

fastNLP/io/data_loader/people_daily.py (+0, -85)

@@ -1,85 +0,0 @@

from ..data_bundle import DataSetLoader
from ...core.dataset import DataSet
from ...core.instance import Instance
from ...core.const import Const


class PeopleDailyCorpusLoader(DataSetLoader):
"""
别名::class:`fastNLP.io.PeopleDailyCorpusLoader` :class:`fastNLP.io.data_loader.PeopleDailyCorpusLoader`

读取人民日报数据集
"""

def __init__(self, pos=True, ner=True):
super(PeopleDailyCorpusLoader, self).__init__()
self.pos = pos
self.ner = ner

def _load(self, data_path):
with open(data_path, "r", encoding="utf-8") as f:
sents = f.readlines()
examples = []
for sent in sents:
if len(sent) <= 2:
continue
inside_ne = False
sent_pos_tag = []
sent_words = []
sent_ner = []
words = sent.strip().split()[1:]
for word in words:
if "[" in word and "]" in word:
ner_tag = "U"
print(word)
elif "[" in word:
inside_ne = True
ner_tag = "B"
word = word[1:]
elif "]" in word:
ner_tag = "L"
word = word[:word.index("]")]
if inside_ne is True:
inside_ne = False
else:
raise RuntimeError("only ] appears!")
else:
if inside_ne is True:
ner_tag = "I"
else:
ner_tag = "O"
tmp = word.split("/")
token, pos = tmp[0], tmp[1]
sent_ner.append(ner_tag)
sent_pos_tag.append(pos)
sent_words.append(token)
example = [sent_words]
if self.pos is True:
example.append(sent_pos_tag)
if self.ner is True:
example.append(sent_ner)
examples.append(example)
return self.convert(examples)

def convert(self, data):
"""

:param data: python 内置对象
:return: 一个 :class:`~fastNLP.DataSet` 类型的对象
"""
data_set = DataSet()
for item in data:
sent_words = item[0]
if self.pos is True and self.ner is True:
instance = Instance(
words=sent_words, pos_tags=item[1], ner=item[2])
elif self.pos is True:
instance = Instance(words=sent_words, pos_tags=item[1])
elif self.ner is True:
instance = Instance(words=sent_words, ner=item[1])
else:
instance = Instance(words=sent_words)
data_set.append(instance)
data_set.apply(lambda ins: len(ins[Const.INPUT]), new_field_name=Const.INPUT_LEN)
return data_set

fastNLP/io/data_loader/qnli.py (+0, -47)

@@ -1,47 +0,0 @@

from ...core.const import Const

from .matching import MatchingLoader
from ..dataset_loader import CSVLoader


class QNLILoader(MatchingLoader, CSVLoader):
"""
别名::class:`fastNLP.io.QNLILoader` :class:`fastNLP.io.data_loader.QNLILoader`

读取QNLI数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise

words2: list(str), 第二句文本, hypothesis

target: str, 真实标签

数据来源:
"""

def __init__(self, paths: dict=None):
paths = paths if paths is not None else {
'train': 'train.tsv',
'dev': 'dev.tsv',
'test': 'test.tsv' # test set has not label
}
MatchingLoader.__init__(self, paths=paths)
self.fields = {
'question': Const.INPUTS(0),
'sentence': Const.INPUTS(1),
'label': Const.TARGET,
}
CSVLoader.__init__(self, sep='\t')

def _load(self, path):
ds = CSVLoader._load(self, path)

for k, v in self.fields.items():
if k in ds.get_field_names():
ds.rename_field(k, v)
for fields in ds.get_all_fields():
if Const.INPUT in fields:
ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields)

return ds

fastNLP/io/data_loader/quora.py (+0, -34)

@@ -1,34 +0,0 @@

from ...core.const import Const

from .matching import MatchingLoader
from ..dataset_loader import CSVLoader


class QuoraLoader(MatchingLoader, CSVLoader):
"""
别名::class:`fastNLP.io.QuoraLoader` :class:`fastNLP.io.data_loader.QuoraLoader`

读取MNLI数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise

words2: list(str), 第二句文本, hypothesis

target: str, 真实标签

数据来源:
"""

def __init__(self, paths: dict=None):
paths = paths if paths is not None else {
'train': 'train.tsv',
'dev': 'dev.tsv',
'test': 'test.tsv',
}
MatchingLoader.__init__(self, paths=paths)
CSVLoader.__init__(self, sep='\t', headers=(Const.TARGET, Const.INPUTS(0), Const.INPUTS(1), 'pairID'))

def _load(self, path):
ds = CSVLoader._load(self, path)
return ds

fastNLP/io/data_loader/rte.py (+0, -47)

@@ -1,47 +0,0 @@

from ...core.const import Const

from .matching import MatchingLoader
from ..dataset_loader import CSVLoader


class RTELoader(MatchingLoader, CSVLoader):
"""
别名::class:`fastNLP.io.RTELoader` :class:`fastNLP.io.data_loader.RTELoader`

读取RTE数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise

words2: list(str), 第二句文本, hypothesis

target: str, 真实标签

数据来源:
"""

def __init__(self, paths: dict=None):
paths = paths if paths is not None else {
'train': 'train.tsv',
'dev': 'dev.tsv',
'test': 'test.tsv' # test set has not label
}
MatchingLoader.__init__(self, paths=paths)
self.fields = {
'sentence1': Const.INPUTS(0),
'sentence2': Const.INPUTS(1),
'label': Const.TARGET,
}
CSVLoader.__init__(self, sep='\t')

def _load(self, path):
ds = CSVLoader._load(self, path)

for k, v in self.fields.items():
if k in ds.get_field_names():
ds.rename_field(k, v)
for fields in ds.get_all_fields():
if Const.INPUT in fields:
ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields)

return ds

fastNLP/io/data_loader/snli.py (+0, -46)

@@ -1,46 +0,0 @@

from ...core.const import Const

from .matching import MatchingLoader
from ..dataset_loader import JsonLoader


class SNLILoader(MatchingLoader, JsonLoader):
"""
别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.data_loader.SNLILoader`

读取SNLI数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise

words2: list(str), 第二句文本, hypothesis

target: str, 真实标签

数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip
"""

def __init__(self, paths: dict=None):
fields = {
'sentence1_binary_parse': Const.INPUTS(0),
'sentence2_binary_parse': Const.INPUTS(1),
'gold_label': Const.TARGET,
}
paths = paths if paths is not None else {
'train': 'snli_1.0_train.jsonl',
'dev': 'snli_1.0_dev.jsonl',
'test': 'snli_1.0_test.jsonl'}
MatchingLoader.__init__(self, paths=paths)
JsonLoader.__init__(self, fields=fields)

def _load(self, path):
ds = JsonLoader._load(self, path)

parentheses_table = str.maketrans({'(': None, ')': None})

ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(),
new_field_name=Const.INPUTS(0))
ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(),
new_field_name=Const.INPUTS(1))
ds.drop(lambda x: x[Const.TARGET] == '-')
return ds

fastNLP/io/data_loader/sst.py (+0, -180)

@@ -1,180 +0,0 @@

from typing import Union, Dict
from nltk import Tree

from ..data_bundle import DataBundle, DataSetLoader
from ..dataset_loader import CSVLoader
from ...core.vocabulary import VocabularyOption, Vocabulary
from ...core.dataset import DataSet
from ...core.const import Const
from ...core.instance import Instance
from ..utils import check_loader_paths, get_tokenizer


class SSTLoader(DataSetLoader):
"""
别名::class:`fastNLP.io.SSTLoader` :class:`fastNLP.io.data_loader.SSTLoader`

读取SST数据集, DataSet包含fields::

words: list(str) 需要分类的文本
target: str 文本的标签

数据来源: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip

:param subtree: 是否将数据展开为子树,扩充数据量. Default: ``False``
:param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False``
"""

URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
DATA_DIR = 'sst/'

def __init__(self, subtree=False, fine_grained=False):
self.subtree = subtree

tag_v = {'0': 'very negative', '1': 'negative', '2': 'neutral',
'3': 'positive', '4': 'very positive'}
if not fine_grained:
tag_v['0'] = tag_v['1']
tag_v['4'] = tag_v['3']
self.tag_v = tag_v
self.tokenizer = get_tokenizer()

def _load(self, path):
"""

:param str path: 存储数据的路径
:return: 一个 :class:`~fastNLP.DataSet` 类型的对象
"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
datas = []
for l in f:
datas.extend([(s, self.tag_v[t])
for s, t in self._get_one(l, self.subtree)])
ds = DataSet()
for words, tag in datas:
ds.append(Instance(words=words, target=tag))
return ds

def _get_one(self, data, subtree):
tree = Tree.fromstring(data)
if subtree:
return [(self.tokenizer(' '.join(t.leaves())), t.label()) for t in tree.subtrees() ]
return [(self.tokenizer(' '.join(tree.leaves())), tree.label())]

def process(self,
paths, train_subtree=True,
src_vocab_op: VocabularyOption = None,
tgt_vocab_op: VocabularyOption = None,):
paths = check_loader_paths(paths)
input_name, target_name = 'words', 'target'
src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

info = DataBundle()
origin_subtree = self.subtree
self.subtree = train_subtree
info.datasets['train'] = self._load(paths['train'])
self.subtree = origin_subtree
for n, p in paths.items():
if n != 'train':
info.datasets[n] = self._load(p)

src_vocab.from_dataset(
info.datasets['train'],
field_name=input_name,
no_create_entry_dataset=[ds for n, ds in info.datasets.items() if n != 'train'])
tgt_vocab.from_dataset(info.datasets['train'], field_name=target_name)

src_vocab.index_dataset(
*info.datasets.values(),
field_name=input_name, new_field_name=input_name)
tgt_vocab.index_dataset(
*info.datasets.values(),
field_name=target_name, new_field_name=target_name)
info.vocabs = {
input_name: src_vocab,
target_name: tgt_vocab
}

return info


class SST2Loader(CSVLoader):
"""
别名::class:`fastNLP.io.SST2Loader` :class:`fastNLP.io.data_loader.SST2Loader`

数据来源 SST: https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8
"""

def __init__(self):
super(SST2Loader, self).__init__(sep='\t')
self.tokenizer = get_tokenizer()
self.field = {'sentence': Const.INPUT, 'label': Const.TARGET}

def _load(self, path: str) -> DataSet:
ds = super(SST2Loader, self)._load(path)
for k, v in self.field.items():
if k in ds.get_field_names():
ds.rename_field(k, v)
ds.apply(lambda x: self.tokenizer(x[Const.INPUT]), new_field_name=Const.INPUT)
print("all count:", len(ds))
return ds

def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
char_level_op=False):

paths = check_loader_paths(paths)
datasets = {}
info = DataBundle()
for name, path in paths.items():
dataset = self.load(path)
dataset.apply_field(lambda words:words.copy(), field_name='words', new_field_name='raw_words')
datasets[name] = dataset

def wordtochar(words):
chars = []
for word in words:
word = word.lower()
for char in word:
chars.append(char)
chars.append('')
chars.pop()
return chars

input_name, target_name = Const.INPUT, Const.TARGET
info.vocabs={}

# 就分隔为char形式
if char_level_op:
for dataset in datasets.values():
dataset.apply_field(wordtochar, field_name=Const.INPUT, new_field_name=Const.CHAR_INPUT)
src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name=Const.INPUT, no_create_entry_dataset=[
dataset for name, dataset in datasets.items() if name!='train'
])
src_vocab.index_dataset(*datasets.values(), field_name=Const.INPUT)

tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name=Const.TARGET)
tgt_vocab.index_dataset(*datasets.values(), field_name=Const.TARGET)

info.vocabs = {
Const.INPUT: src_vocab,
Const.TARGET: tgt_vocab
}

info.datasets = datasets

for name, dataset in info.datasets.items():
dataset.set_input(Const.INPUT)
dataset.set_target(Const.TARGET)

return info


fastNLP/io/data_loader/yelp.py (+0, -132)

@@ -1,132 +0,0 @@

import csv
from typing import Iterable

from ...core.const import Const
from ...core.dataset import DataSet
from ...core.instance import Instance
from ...core.vocabulary import VocabularyOption, Vocabulary
from ..data_bundle import DataBundle, DataSetLoader
from typing import Union, Dict
from ..utils import check_loader_paths, get_tokenizer


class YelpLoader(DataSetLoader):
"""
别名::class:`fastNLP.io.YelpLoader` :class:`fastNLP.io.data_loader.YelpLoader`
读取Yelp_full/Yelp_polarity数据集, DataSet包含fields:

words: list(str), 需要分类的文本

target: str, 文本的标签

chars:list(str),未index的字符列表

数据集:yelp_full/yelp_polarity

:param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False``
:param lower: 是否需要自动转小写,默认为False。
"""

def __init__(self, fine_grained=False, lower=False):
super(YelpLoader, self).__init__()
tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
'4.0': 'positive', '5.0': 'very positive'}
if not fine_grained:
tag_v['1.0'] = tag_v['2.0']
tag_v['5.0'] = tag_v['4.0']
self.fine_grained = fine_grained
self.tag_v = tag_v
self.lower = lower
self.tokenizer = get_tokenizer()

def _load(self, path):
ds = DataSet()
csv_reader = csv.reader(open(path, encoding='utf-8'))
all_count = 0
real_count = 0
for row in csv_reader:
all_count += 1
if len(row) == 2:
target = self.tag_v[row[0] + ".0"]
words = clean_str(row[1], self.tokenizer, self.lower)
if len(words) != 0:
ds.append(Instance(words=words, target=target))
real_count += 1
print("all count:", all_count)
print("real count:", real_count)
return ds

def process(self, paths: Union[str, Dict[str, str]],
train_ds: Iterable[str] = None,
src_vocab_op: VocabularyOption = None,
tgt_vocab_op: VocabularyOption = None,
char_level_op=False):
paths = check_loader_paths(paths)
info = DataBundle(datasets=self.load(paths))
src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
_train_ds = [info.datasets[name]
for name in train_ds] if train_ds else info.datasets.values()

def wordtochar(words):
chars = []
for word in words:
word = word.lower()
for char in word:
chars.append(char)
chars.append('')
chars.pop()
return chars

input_name, target_name = Const.INPUT, Const.TARGET
info.vocabs = {}
# 就分隔为char形式
if char_level_op:
for dataset in info.datasets.values():
dataset.apply_field(wordtochar, field_name=Const.INPUT, new_field_name=Const.CHAR_INPUT)
else:
src_vocab.from_dataset(*_train_ds, field_name=input_name)
src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name)
info.vocabs[input_name] = src_vocab

tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
tgt_vocab.index_dataset(
*info.datasets.values(),
field_name=target_name, new_field_name=target_name)

info.vocabs[target_name] = tgt_vocab

info.datasets['train'], info.datasets['dev'] = info.datasets['train'].split(0.1, shuffle=False)

for name, dataset in info.datasets.items():
dataset.set_input(Const.INPUT)
dataset.set_target(Const.TARGET)

return info


def clean_str(sentence, tokenizer, char_lower=False):
"""
heavily borrowed from github
https://github.com/LukeZhuang/Hierarchical-Attention-Network/blob/master/yelp-preprocess.ipynb
:param sentence: is a str
:return:
"""
if char_lower:
sentence = sentence.lower()
import re
nonalpnum = re.compile('[^0-9a-zA-Z?!\']+')
words = tokenizer(sentence)
words_collection = []
for word in words:
if word in ['-lrb-', '-rrb-', '<sssss>', '-r', '-l', 'b-']:
continue
tt = nonalpnum.split(word)
t = ''.join(tt)
if t != '':
words_collection.append(t)

return words_collection


fastNLP/io/dataset_loader.py (+0, -121)

@@ -1,121 +0,0 @@
"""undocumented
.. warning::

本模块将在 `0.5.0版本` 中被废弃,由 :mod:`~fastNLP.io.loader` 和 :mod:`~fastNLP.io.pipe` 模块替代。

dataset_loader模块实现了许多 DataSetLoader, 用于读取不同格式的数据, 并返回 `DataSet` ,
得到的 :class:`~fastNLP.DataSet` 对象可以直接传入 :class:`~fastNLP.Trainer` 和 :class:`~fastNLP.Tester`, 用于模型的训练和测试。
以SNLI数据集为例::

loader = SNLILoader()
train_ds = loader.load('path/to/train')
dev_ds = loader.load('path/to/dev')
test_ds = loader.load('path/to/test')

# ... do stuff
为 fastNLP 提供 DataSetLoader 的开发者请参考 :class:`~fastNLP.io.DataSetLoader` 的介绍。

"""
__all__ = [
'CSVLoader',
'JsonLoader',
]


from .data_bundle import DataSetLoader
from .file_reader import _read_csv, _read_json
from ..core.dataset import DataSet
from ..core.instance import Instance


class JsonLoader(DataSetLoader):
"""
别名::class:`fastNLP.io.JsonLoader` :class:`fastNLP.io.dataset_loader.JsonLoader`

读取json格式数据.数据必须按行存储,每行是一个包含各类属性的json对象

:param dict fields: 需要读入的json属性名称, 和读入后在DataSet中存储的field_name
``fields`` 的 `key` 必须是json对象的属性名. ``fields`` 的 `value` 为读入后在DataSet存储的 `field_name` ,
`value` 也可为 ``None`` , 这时读入后的 `field_name` 与json对象对应属性同名
``fields`` 可为 ``None`` , 这时,json对象所有属性都保存在DataSet中. Default: ``None``
:param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` .
Default: ``False``
"""

def __init__(self, fields=None, dropna=False):
super(JsonLoader, self).__init__()
self.dropna = dropna
self.fields = None
self.fields_list = None
if fields:
self.fields = {}
for k, v in fields.items():
self.fields[k] = k if v is None else v
self.fields_list = list(self.fields.keys())

def _load(self, path):
ds = DataSet()
for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
if self.fields:
ins = {self.fields[k]: v for k, v in d.items()}
else:
ins = d
ds.append(Instance(**ins))
return ds


class CSVLoader(DataSetLoader):
"""
别名::class:`fastNLP.io.CSVLoader` :class:`fastNLP.io.dataset_loader.CSVLoader`

读取CSV格式的数据集。返回 ``DataSet``

:param List[str] headers: CSV文件的文件头.定义每一列的属性名称,即返回的DataSet中`field`的名称
若为 ``None`` ,则将读入文件的第一行视作 ``headers`` . Default: ``None``
:param str sep: CSV文件中列与列之间的分隔符. Default: ","
:param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` .
Default: ``False``
"""

def __init__(self, headers=None, sep=",", dropna=False):
self.headers = headers
self.sep = sep
self.dropna = dropna

def _load(self, path):
ds = DataSet()
for idx, data in _read_csv(path, headers=self.headers,
sep=self.sep, dropna=self.dropna):
ds.append(Instance(**data))
return ds


def _cut_long_sentence(sent, max_sample_length=200):
"""
将长于max_sample_length的sentence截成多段,只会在有空格的地方发生截断。
所以截取的句子可能长于或者短于max_sample_length

:param sent: str.
:param max_sample_length: int.
:return: list of str.
"""
sent_no_space = sent.replace(' ', '')
cutted_sentence = []
if len(sent_no_space) > max_sample_length:
parts = sent.strip().split()
new_line = ''
length = 0
for part in parts:
length += len(part)
new_line += part + ' '
if length > max_sample_length:
new_line = new_line[:-1]
cutted_sentence.append(new_line)
length = 0
new_line = ''
if new_line != '':
cutted_sentence.append(new_line[:-1])
else:
cutted_sentence.append(sent)
return cutted_sentence

fastNLP/io/embed_loader.py (+6, -7)

@@ -13,7 +13,6 @@ import warnings

import numpy as np

from .data_bundle import BaseLoader
from ..core.utils import Option
from ..core.vocabulary import Vocabulary

@@ -32,7 +31,7 @@ class EmbeddingOption(Option):
)


class EmbedLoader(BaseLoader):
class EmbedLoader:
"""
别名::class:`fastNLP.io.EmbedLoader` :class:`fastNLP.io.embed_loader.EmbedLoader`

@@ -84,9 +83,9 @@ class EmbedLoader(BaseLoader):
word = ''.join(parts[:-dim])
nums = parts[-dim:]
# 对齐unk与pad
if word==padding and vocab.padding is not None:
if word == padding and vocab.padding is not None:
word = vocab.padding
elif word==unknown and vocab.unknown is not None:
elif word == unknown and vocab.unknown is not None:
word = vocab.unknown
if word in vocab:
index = vocab.to_index(word)
@@ -171,7 +170,7 @@ class EmbedLoader(BaseLoader):
index = vocab.to_index(key)
matrix[index] = vec

if (unknown is not None and not found_unknown) or (padding is not None and not found_pad):
if ((unknown is not None) and (not found_unknown)) or ((padding is not None) and (not found_pad)):
start_idx = 0
if padding is not None:
start_idx += 1
@@ -180,9 +179,9 @@ class EmbedLoader(BaseLoader):

mean = np.mean(matrix[start_idx:], axis=0, keepdims=True)
std = np.std(matrix[start_idx:], axis=0, keepdims=True)
if (unknown is not None and not found_unknown):
if (unknown is not None) and (not found_unknown):
matrix[start_idx - 1] = np.random.randn(1, dim).astype(dtype) * std + mean
if (padding is not None and not found_pad):
if (padding is not None) and (not found_pad):
matrix[0] = np.random.randn(1, dim).astype(dtype) * std + mean
if normalize:
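
The embed_loader.py hunks above only drop the BaseLoader parent (EmbedLoader keeps its classmethod-style API) and tidy up spacing. A usage sketch, assuming this fastNLP version's load_with_vocab entry point (treat the exact signature as an assumption)::

    from fastNLP import Vocabulary
    from fastNLP.io import EmbedLoader

    # Build a vocabulary and read pre-trained vectors aligned to it.
    vocab = Vocabulary()
    vocab.add_word_lst(['the', 'quick', 'brown', 'fox'])
    vocab.build_vocab()

    # Words missing from the embedding file get vectors sampled around the
    # mean/std of the found ones, matching the logic shown in the hunk above.
    matrix = EmbedLoader.load_with_vocab('/path/to/glove.txt', vocab, normalize=True)
    print(matrix.shape)  # (len(vocab), embedding_dim)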


fastNLP/io/model_io.py (+1, -3)

@@ -8,10 +8,8 @@ __all__ = [

import torch

from .data_bundle import BaseLoader


class ModelLoader(BaseLoader):
class ModelLoader:
"""
别名::class:`fastNLP.io.ModelLoader` :class:`fastNLP.io.model_io.ModelLoader`



fastNLP/models/enas_controller.py (+0, -228)

@@ -1,228 +0,0 @@
"""undocumented
Code Modified from https://github.com/carpedm20/ENAS-pytorch
A module with NAS controller-related code.
"""

__all__ = []

import collections
import os

import torch
import torch.nn.functional as F

from . import enas_utils as utils
from .enas_utils import Node


def _construct_dags(prev_nodes, activations, func_names, num_blocks):
"""Constructs a set of DAGs based on the actions, i.e., previous nodes and
activation functions, sampled from the controller/policy pi.

Args:
prev_nodes: Previous node actions from the policy.
activations: Activations sampled from the policy.
func_names: Mapping from activation function names to functions.
num_blocks: Number of blocks in the target RNN cell.

Returns:
A list of DAGs defined by the inputs.

RNN cell DAGs are represented in the following way:

1. Each element (node) in a DAG is a list of `Node`s.

2. The `Node`s in the list dag[i] correspond to the subsequent nodes
that take the output from node i as their own input.

3. dag[-1] is the node that takes input from x^{(t)} and h^{(t - 1)}.
dag[-1] always feeds dag[0].
dag[-1] acts as if `w_xc`, `w_hc`, `w_xh` and `w_hh` are its
weights.

4. dag[N - 1] is the node that produces the hidden state passed to
the next timestep. dag[N - 1] is also always a leaf node, and therefore
is always averaged with the other leaf nodes and fed to the output
decoder.
"""
dags = []
for nodes, func_ids in zip(prev_nodes, activations):
dag = collections.defaultdict(list)

# add first node
dag[-1] = [Node(0, func_names[func_ids[0]])]
dag[-2] = [Node(0, func_names[func_ids[0]])]

# add following nodes
for jdx, (idx, func_id) in enumerate(zip(nodes, func_ids[1:])):
dag[utils.to_item(idx)].append(Node(jdx + 1, func_names[func_id]))

leaf_nodes = set(range(num_blocks)) - dag.keys()

# merge with avg
for idx in leaf_nodes:
dag[idx] = [Node(num_blocks, 'avg')]

# This is actually y^{(t)}. h^{(t)} is node N - 1 in
# the graph, where N Is the number of nodes. I.e., h^{(t)} takes
# only one other node as its input.
# last h[t] node
last_node = Node(num_blocks + 1, 'h[t]')
dag[num_blocks] = [last_node]
dags.append(dag)

return dags


class Controller(torch.nn.Module):
"""Based on
https://github.com/pytorch/examples/blob/master/word_language_model/model.py

RL controllers do not necessarily have much to do with
language models.

Base the controller RNN on the GRU from:
https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/model.py
"""
def __init__(self, num_blocks=4, controller_hid=100, cuda=False):
torch.nn.Module.__init__(self)

# `num_tokens` here is just the activation function
# for every even step,
self.shared_rnn_activations = ['tanh', 'ReLU', 'identity', 'sigmoid']
self.num_tokens = [len(self.shared_rnn_activations)]
self.controller_hid = controller_hid
self.use_cuda = cuda
self.num_blocks = num_blocks
for idx in range(num_blocks):
self.num_tokens += [idx + 1, len(self.shared_rnn_activations)]
self.func_names = self.shared_rnn_activations

num_total_tokens = sum(self.num_tokens)

self.encoder = torch.nn.Embedding(num_total_tokens,
controller_hid)
self.lstm = torch.nn.LSTMCell(controller_hid, controller_hid)

# Perhaps these weights in the decoder should be
# shared? At least for the activation functions, which all have the
# same size.
self.decoders = []
for idx, size in enumerate(self.num_tokens):
decoder = torch.nn.Linear(controller_hid, size)
self.decoders.append(decoder)

self._decoders = torch.nn.ModuleList(self.decoders)

self.reset_parameters()
self.static_init_hidden = utils.keydefaultdict(self.init_hidden)

def _get_default_hidden(key):
return utils.get_variable(
torch.zeros(key, self.controller_hid),
self.use_cuda,
requires_grad=False)

self.static_inputs = utils.keydefaultdict(_get_default_hidden)

def reset_parameters(self):
init_range = 0.1
for param in self.parameters():
param.data.uniform_(-init_range, init_range)
for decoder in self.decoders:
decoder.bias.data.fill_(0)

def forward(self, # pylint:disable=arguments-differ
inputs,
hidden,
block_idx,
is_embed):
if not is_embed:
embed = self.encoder(inputs)
else:
embed = inputs

hx, cx = self.lstm(embed, hidden)
logits = self.decoders[block_idx](hx)

logits /= 5.0

# # exploration
# if self.args.mode == 'train':
# logits = (2.5 * F.tanh(logits))

return logits, (hx, cx)

def sample(self, batch_size=1, with_details=False, save_dir=None):
"""Samples a set of `args.num_blocks` many computational nodes from the
controller, where each node is made up of an activation function, and
each node except the last also includes a previous node.
"""
if batch_size < 1:
raise Exception(f'Wrong batch_size: {batch_size} < 1')

# [B, L, H]
inputs = self.static_inputs[batch_size]
hidden = self.static_init_hidden[batch_size]

activations = []
entropies = []
log_probs = []
prev_nodes = []
# The RNN controller alternately outputs an activation,
# followed by a previous node, for each block except the last one,
# which only gets an activation function. The last node is the output
# node, and its previous node is the average of all leaf nodes.
for block_idx in range(2*(self.num_blocks - 1) + 1):
logits, hidden = self.forward(inputs,
hidden,
block_idx,
is_embed=(block_idx == 0))

probs = F.softmax(logits, dim=-1)
log_prob = F.log_softmax(logits, dim=-1)
# .mean() for entropy?
entropy = -(log_prob * probs).sum(1, keepdim=False)

action = probs.multinomial(num_samples=1).data
selected_log_prob = log_prob.gather(
1, utils.get_variable(action, requires_grad=False))

# why the [:, 0] here? Should it be .squeeze(), or
# .view()? Same below with `action`.
entropies.append(entropy)
log_probs.append(selected_log_prob[:, 0])

# 0: function, 1: previous node
mode = block_idx % 2
inputs = utils.get_variable(
action[:, 0] + sum(self.num_tokens[:mode]),
requires_grad=False)

if mode == 0:
activations.append(action[:, 0])
elif mode == 1:
prev_nodes.append(action[:, 0])

prev_nodes = torch.stack(prev_nodes).transpose(0, 1)
activations = torch.stack(activations).transpose(0, 1)

dags = _construct_dags(prev_nodes,
activations,
self.func_names,
self.num_blocks)

if save_dir is not None:
for idx, dag in enumerate(dags):
utils.draw_network(dag,
os.path.join(save_dir, f'graph{idx}.png'))

if with_details:
return dags, torch.cat(log_probs), torch.cat(entropies)

return dags

def init_hidden(self, batch_size):
zeros = torch.zeros(batch_size, self.controller_hid)
return (utils.get_variable(zeros, self.use_cuda, requires_grad=False),
utils.get_variable(zeros.clone(), self.use_cuda, requires_grad=False))

fastNLP/models/enas_model.py (+0, -393)

@@ -1,393 +0,0 @@
"""undocumented
Module containing the shared RNN model.
Code Modified from https://github.com/carpedm20/ENAS-pytorch
"""

__all__ = []

import collections

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from . import enas_utils as utils
from .base_model import BaseModel


def _get_dropped_weights(w_raw, dropout_p, is_training):
"""Drops out weights to implement DropConnect.

Args:
w_raw: Full, pre-dropout, weights to be dropped out.
dropout_p: Proportion of weights to drop out.
is_training: True iff _shared_ model is training.

Returns:
The dropped weights.

Why does torch.nn.functional.dropout() return:
1. `torch.autograd.Variable()` on the training loop
2. `torch.nn.Parameter()` on the controller or eval loop, when
training = False...

Even though the call to `_setweights` in the Smerity repo's
`weight_drop.py` does not have this behaviour, and `F.dropout` always
returns `torch.autograd.Variable` there, even when `training=False`?

The above TODO is the reason for the hacky check for `torch.nn.Parameter`.
"""
dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training)
if isinstance(dropped_w, torch.nn.Parameter):
dropped_w = dropped_w.clone()
return dropped_w
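
For reference, a minimal DropConnect sketch in the spirit of the helper above (the shapes are made up; the 0.5 rate matches the hidden-to-hidden dropout used later in this file):

```python
# Minimal DropConnect sketch: drop individual weights, not activations.
import torch
import torch.nn.functional as F

w_raw = torch.nn.Parameter(torch.randn(1000, 1000))   # e.g. raw hidden-to-hidden weights
w_dropped = F.dropout(w_raw, p=0.5, training=True)    # zero out ~50% of the entries (and rescale the rest)
h_prev = torch.randn(8, 1000)                         # [batch, hidden]
h_next = F.linear(h_prev, w_dropped)                  # use the dropped weights for this step only
print(h_next.shape)                                   # torch.Size([8, 1000])
```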


class EmbeddingDropout(torch.nn.Embedding):
"""Class for dropping out embeddings by zero'ing out parameters in the
embedding matrix.

This is equivalent to dropping out particular words, e.g., in the sentence
'the quick brown fox jumps over the lazy dog', dropping out 'the' would
lead to the sentence '### quick brown fox jumps over ### lazy dog' (in the
embedding vector space).

See 'A Theoretically Grounded Application of Dropout in Recurrent Neural
Networks', (Gal and Ghahramani, 2016).
"""
def __init__(self,
num_embeddings,
embedding_dim,
max_norm=None,
norm_type=2,
scale_grad_by_freq=False,
sparse=False,
dropout=0.1,
scale=None):
"""Embedding constructor.

Args:
dropout: Dropout probability.
scale: Used to scale parameters of embedding weight matrix that are
not dropped out. Note that this is _in addition_ to the
`1/(1 - dropout)` scaling.

See `torch.nn.Embedding` for remaining arguments.
"""
torch.nn.Embedding.__init__(self,
num_embeddings=num_embeddings,
embedding_dim=embedding_dim,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
sparse=sparse)
self.dropout = dropout
assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 '
'and < 1.0')
self.scale = scale
def forward(self, inputs): # pylint:disable=arguments-differ
"""Embeds `inputs` with the dropped out embedding weight matrix."""
if self.training:
dropout = self.dropout
else:
dropout = 0
if dropout:
mask = self.weight.data.new(self.weight.size(0), 1)
mask.bernoulli_(1 - dropout)
mask = mask.expand_as(self.weight)
mask = mask / (1 - dropout)
masked_weight = self.weight * Variable(mask)
else:
masked_weight = self.weight
if self.scale and self.scale != 1:
masked_weight = masked_weight * self.scale
return F.embedding(inputs,
masked_weight,
max_norm=self.max_norm,
norm_type=self.norm_type,
scale_grad_by_freq=self.scale_grad_by_freq,
sparse=self.sparse)
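
A hedged usage sketch of the class above (vocabulary size and dimensions are made up):

```python
import torch

embed = EmbeddingDropout(num_embeddings=100, embedding_dim=16, dropout=0.1)
embed.train()                                 # dropout is only applied in training mode
tokens = torch.randint(0, 100, (4, 7))        # [batch, seq_len] word ids
vectors = embed(tokens)                       # rows of dropped-out words come back as all zeros
print(vectors.shape)                          # torch.Size([4, 7, 16])
```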


class LockedDropout(nn.Module):
# code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py
def __init__(self):
super().__init__()
def forward(self, x, dropout=0.5):
if not self.training or not dropout:
return x
m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout)
mask = Variable(m, requires_grad=False) / (1 - dropout)
mask = mask.expand_as(x)
return mask * x
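
And a short sketch of the locked (variational) dropout above, which reuses one mask across all timesteps (the shape follows the [seq_len, batch, hidden] layout used in forward() below):

```python
import torch

lockdrop = LockedDropout()
lockdrop.train()
x = torch.randn(35, 8, 1000)                  # [seq_len, batch, hidden]
y = lockdrop(x, dropout=0.65)
# The mask has shape [1, batch, hidden], so the same hidden units are zeroed
# at every timestep of a sequence, unlike nn.Dropout, which resamples per element.
print(y.shape)                                # torch.Size([35, 8, 1000])
```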


class ENASModel(BaseModel):
"""Shared RNN model."""
def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000):
super(ENASModel, self).__init__()
self.use_cuda = cuda
self.shared_hid = shared_hid
self.num_blocks = num_blocks
self.decoder = nn.Linear(self.shared_hid, num_classes)
self.encoder = EmbeddingDropout(embed_num,
shared_embed,
dropout=0.1)
self.lockdrop = LockedDropout()
self.dag = None
# Tie weights
# self.decoder.weight = self.encoder.weight
# Since W^{x, c} and W^{h, c} are always summed, there
# is no point duplicating their bias offset parameter. Likewise for
# W^{x, h} and W^{h, h}.
self.w_xc = nn.Linear(shared_embed, self.shared_hid)
self.w_xh = nn.Linear(shared_embed, self.shared_hid)
# The raw weights are stored here because the hidden-to-hidden weights
# are weight dropped on the forward pass.
self.w_hc_raw = torch.nn.Parameter(
torch.Tensor(self.shared_hid, self.shared_hid))
self.w_hh_raw = torch.nn.Parameter(
torch.Tensor(self.shared_hid, self.shared_hid))
self.w_hc = None
self.w_hh = None
self.w_h = collections.defaultdict(dict)
self.w_c = collections.defaultdict(dict)
for idx in range(self.num_blocks):
for jdx in range(idx + 1, self.num_blocks):
self.w_h[idx][jdx] = nn.Linear(self.shared_hid,
self.shared_hid,
bias=False)
self.w_c[idx][jdx] = nn.Linear(self.shared_hid,
self.shared_hid,
bias=False)
self._w_h = nn.ModuleList([self.w_h[idx][jdx]
for idx in self.w_h
for jdx in self.w_h[idx]])
self._w_c = nn.ModuleList([self.w_c[idx][jdx]
for idx in self.w_c
for jdx in self.w_c[idx]])
self.batch_norm = None
# if args.mode == 'train':
# self.batch_norm = nn.BatchNorm1d(self.shared_hid)
# else:
# self.batch_norm = None
self.reset_parameters()
self.static_init_hidden = utils.keydefaultdict(self.init_hidden)
def setDAG(self, dag):
if self.dag is None:
self.dag = dag
def forward(self, word_seq, hidden=None):
inputs = torch.transpose(word_seq, 0, 1)
time_steps = inputs.size(0)
batch_size = inputs.size(1)
self.w_hh = _get_dropped_weights(self.w_hh_raw,
0.5,
self.training)
self.w_hc = _get_dropped_weights(self.w_hc_raw,
0.5,
self.training)
# hidden = self.static_init_hidden[batch_size] if hidden is None else hidden
hidden = self.static_init_hidden[batch_size]
embed = self.encoder(inputs)
embed = self.lockdrop(embed, 0.65 if self.training else 0)
# The norm of hidden states are clipped here because
# otherwise ENAS is especially prone to exploding activations on the
# forward pass. This could probably be fixed in a more elegant way, but
# it might be exposing a weakness in the ENAS algorithm as currently
# proposed.
#
# For more details, see
# https://github.com/carpedm20/ENAS-pytorch/issues/6
clipped_num = 0
max_clipped_norm = 0
h1tohT = []
logits = []
for step in range(time_steps):
x_t = embed[step]
logit, hidden = self.cell(x_t, hidden, self.dag)
hidden_norms = hidden.norm(dim=-1)
max_norm = 25.0
if hidden_norms.data.max() > max_norm:
# Just directly use the torch slice operations
# in PyTorch v0.4.
#
# This workaround for PyTorch v0.3.1 does everything in numpy,
# because the PyTorch slicing and slice assignment is too
# flaky.
hidden_norms = hidden_norms.data.cpu().numpy()
clipped_num += 1
if hidden_norms.max() > max_clipped_norm:
max_clipped_norm = hidden_norms.max()
clip_select = hidden_norms > max_norm
clip_norms = hidden_norms[clip_select]
mask = np.ones(hidden.size())
normalizer = max_norm / clip_norms
normalizer = normalizer[:, np.newaxis]
mask[clip_select] = normalizer
if self.use_cuda:
hidden *= torch.autograd.Variable(
torch.FloatTensor(mask).cuda(), requires_grad=False)
else:
hidden *= torch.autograd.Variable(
torch.FloatTensor(mask), requires_grad=False)
logits.append(logit)
h1tohT.append(hidden)
h1tohT = torch.stack(h1tohT)
output = torch.stack(logits)
raw_output = output
output = self.lockdrop(output, 0.4 if self.training else 0)
# Pooling
output = torch.mean(output, 0)
decoded = self.decoder(output)
extra_out = {'dropped': decoded,
'hiddens': h1tohT,
'raw': raw_output}
return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out}
def cell(self, x, h_prev, dag):
"""Computes a single pass through the discovered RNN cell."""
c = {}
h = {}
f = {}
f[0] = self.get_f(dag[-1][0].name)
c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None))
h[0] = (c[0] * f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) +
(1 - c[0]) * h_prev)
leaf_node_ids = []
q = collections.deque()
q.append(0)
# Computes connections from the parent nodes `node_id`
# to their child nodes `next_id` recursively, skipping leaf nodes. A
# leaf node is a node whose id == `self.num_blocks`.
#
# Connections between parent i and child j should be computed as
# h_j = c_j * f_{ij}(W^h_{ij} * h_i) + (1 - c_j) * h_i,
# where c_j = sigmoid(W^c_{ij} * h_i).
#
# See Training details from Section 3.1 of the paper.
#
# The following algorithm does a breadth-first (since `q.popleft()` is
# used) search over the nodes and computes all the hidden states.
while True:
if len(q) == 0:
break
node_id = q.popleft()
nodes = dag[node_id]
for next_node in nodes:
next_id = next_node.id
if next_id == self.num_blocks:
leaf_node_ids.append(node_id)
assert len(nodes) == 1, ('parent of leaf node should have '
'only one child')
continue
w_h = self.w_h[node_id][next_id]
w_c = self.w_c[node_id][next_id]
f[next_id] = self.get_f(next_node.name)
c[next_id] = torch.sigmoid(w_c(h[node_id]))
h[next_id] = (c[next_id] * f[next_id](w_h(h[node_id])) +
(1 - c[next_id]) * h[node_id])
q.append(next_id)
# Instead of averaging loose ends, perhaps there should
# be a set of separate unshared weights for each "loose" connection
# between each node in a cell and the output.
#
# As it stands, all weights W^h_{ij} are doing double duty by
# connecting both from i to j, as well as from i to the output.
# average all the loose ends
leaf_nodes = [h[node_id] for node_id in leaf_node_ids]
output = torch.mean(torch.stack(leaf_nodes, 2), -1)
# stabilizing the Updates of omega
if self.batch_norm is not None:
output = self.batch_norm(output)
return output, h[self.num_blocks - 1]
def init_hidden(self, batch_size):
zeros = torch.zeros(batch_size, self.shared_hid)
return utils.get_variable(zeros, self.use_cuda, requires_grad=False)
def get_f(self, name):
name = name.lower()
if name == 'relu':
f = torch.relu
elif name == 'tanh':
f = torch.tanh
elif name == 'identity':
f = lambda x: x
elif name == 'sigmoid':
f = torch.sigmoid
else:
raise ValueError(f'Unknown activation function name: {name}')
return f
@property
def num_parameters(self):
def size(p):
return np.prod(p.size())
return sum([size(param) for param in self.parameters()])
def reset_parameters(self):
init_range = 0.025
# init_range = 0.025 if self.args.mode == 'train' else 0.04
for param in self.parameters():
param.data.uniform_(-init_range, init_range)
self.decoder.bias.data.fill_(0)
def predict(self, word_seq):
"""

:param word_seq: torch.LongTensor, [batch_size, seq_len]
:return predict: dict of torch.LongTensor, [batch_size, seq_len]
"""
output = self(word_seq)
_, predict = output['pred'].max(dim=1)
return {'pred': predict}
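
Putting the pieces together, a hedged end-to-end sketch of the shared model's forward pass (the vocabulary size, class count and `sampled_dag` are placeholders; a real dag comes from the controller's `sample()` shown earlier in this diff):

```python
import torch

model = ENASModel(embed_num=100, num_classes=5, num_blocks=4)
model.setDAG(sampled_dag)                     # placeholder: a dag produced by Controller.sample()
word_seq = torch.randint(0, 100, (8, 20))     # [batch, seq_len] word ids
out = model(word_seq)
print(out['pred'].shape)                      # [8, 5] class scores after mean pooling over time
```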

+ 0
- 384
fastNLP/models/enas_trainer.py View File

@@ -1,384 +0,0 @@
"""undocumented
Code Modified from https://github.com/carpedm20/ENAS-pytorch
"""

__all__ = []

import math
import time
from datetime import datetime, timedelta

import numpy as np
import torch
from torch.optim import Adam

try:
from tqdm.auto import tqdm
except ImportError:
from ..core.utils import _pseudo_tqdm as tqdm

from ..core.trainer import Trainer
from ..core.batch import DataSetIter
from ..core.callback import CallbackException
from ..core.dataset import DataSet
from ..core.utils import _move_dict_value_to_device
from . import enas_utils as utils
from ..core.utils import _build_args


def _get_no_grad_ctx_mgr():
"""Returns a the `torch.no_grad` context manager for PyTorch version >=
0.4, or a no-op context manager otherwise.
"""
return torch.no_grad()


class ENASTrainer(Trainer):
"""A class to wrap training code."""
def __init__(self, train_data, model, controller, **kwargs):
"""Constructor for training algorithm.
:param DataSet train_data: the training data
:param torch.nn.modules.module model: a PyTorch model
:param torch.nn.modules.module controller: a PyTorch model
"""
self.final_epochs = kwargs.pop('final_epochs')
super(ENASTrainer, self).__init__(train_data, model, **kwargs)
self.controller_step = 0
self.shared_step = 0
self.max_length = 35
self.shared = model
self.controller = controller
self.shared_optim = Adam(
self.shared.parameters(),
lr=20.0,
weight_decay=1e-7)
self.controller_optim = Adam(
self.controller.parameters(),
lr=3.5e-4)
def train(self, load_best_model=True):
"""
:param bool load_best_model: only takes effect when dev_data was provided at initialization; if True, the trainer
reloads the model parameters that performed best on the dev set before returning.
:return results: a dict containing::

seconds: float, the wall-clock training time
The following three entries are only present when dev_data was provided.
best_eval: Dict of Dict, the evaluation results
best_epoch: int, the epoch at which the best value was obtained
best_step: int, the step (batch update) at which the best value was obtained

"""
results = {}
if self.n_epochs <= 0:
print(f"training epoch is {self.n_epochs}, nothing was done.")
results['seconds'] = 0.
return results
try:
if torch.cuda.is_available() and "cuda" in self.device:
self.model = self.model.cuda()
self._model_device = self.model.parameters().__next__().device
self._mode(self.model, is_test=False)
self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
start_time = time.time()
print("training epochs started " + self.start_time, flush=True)
try:
self.callback_manager.on_train_begin()
self._train()
self.callback_manager.on_train_end()
except (CallbackException, KeyboardInterrupt) as e:
self.callback_manager.on_exception(e)
if self.dev_data is not None:
print(
"\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) +
self.tester._format_eval_results(self.best_dev_perf), )
results['best_eval'] = self.best_dev_perf
results['best_epoch'] = self.best_dev_epoch
results['best_step'] = self.best_dev_step
if load_best_model:
model_name = "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time])
load_succeed = self._load_model(self.model, model_name)
if load_succeed:
print("Reloaded the best model.")
else:
print("Fail to reload best model.")
finally:
pass
results['seconds'] = round(time.time() - start_time, 2)
return results
def _train(self):
if not self.use_tqdm:
from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm
else:
inner_tqdm = tqdm
self.step = 0
start = time.time()
total_steps = (len(self.train_data) // self.batch_size + int(
len(self.train_data) % self.batch_size != 0)) * self.n_epochs
with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar:
avg_loss = 0
data_iterator = DataSetIter(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
prefetch=self.prefetch)
for epoch in range(1, self.n_epochs + 1):
pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs))
last_stage = (epoch > self.n_epochs + 1 - self.final_epochs)
if epoch == self.n_epochs + 1 - self.final_epochs:
print('Entering the final stage. (Only train the selected structure)')
# early stopping
self.callback_manager.on_epoch_begin()
# 1. Training the shared parameters omega of the child models
self.train_shared(pbar)
# 2. Training the controller parameters theta
if not last_stage:
self.train_controller()
if ((self.validate_every > 0 and self.step % self.validate_every == 0) or
(self.validate_every < 0 and self.step % len(data_iterator) == 0)) \
and self.dev_data is not None:
if not last_stage:
self.derive()
eval_res = self._do_validation(epoch=epoch, step=self.step)
eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step,
total_steps) + \
self.tester._format_eval_results(eval_res)
pbar.write(eval_str)
# lr decay; early stopping
self.callback_manager.on_epoch_end()
# =============== epochs end =================== #
pbar.close()
# ============ tqdm end ============== #
def get_loss(self, inputs, targets, hidden, dags):
"""Computes the loss for the same batch for M models.

This amounts to an estimate of the loss, which is turned into an
estimate for the gradients of the shared model.
"""
if not isinstance(dags, list):
dags = [dags]
loss = 0
for dag in dags:
self.shared.setDAG(dag)
inputs = _build_args(self.shared.forward, **inputs)
inputs['hidden'] = hidden
result = self.shared(**inputs)
output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out']
self.callback_manager.on_loss_begin(targets, result)
sample_loss = self._compute_loss(result, targets)
loss += sample_loss
assert len(dags) == 1, 'multiple `dags` would produce multiple `hidden` states; only a single dag is supported here'
return loss, hidden, extra_out
def train_shared(self, pbar=None, max_step=None, dag=None):
"""Train the language model for 400 steps of minibatches of 64
examples.

Args:
max_step: Used to run extra training steps as a warm-up.
dag: If not None, is used instead of calling sample().

BPTT is truncated at 35 timesteps.

For each weight update, gradients are estimated by sampling M models
from the fixed controller policy, and averaging their gradients
computed on a batch of training data.
"""
model = self.shared
model.train()
self.controller.eval()
hidden = self.shared.init_hidden(self.batch_size)
abs_max_grad = 0
abs_max_hidden_norm = 0
step = 0
raw_total_loss = 0
total_loss = 0
train_idx = 0
avg_loss = 0
start = time.time()  # needed by the timing report in the non-tqdm branch below
data_iterator = DataSetIter(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
prefetch=self.prefetch)
for batch_x, batch_y in data_iterator:
_move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
indices = data_iterator.get_batch_indices()
# negative sampling; replace unknown; re-weight batch_y
self.callback_manager.on_batch_begin(batch_x, batch_y, indices)
# prediction = self._data_forward(self.model, batch_x)
dags = self.controller.sample(1)
inputs, targets = batch_x, batch_y
# self.callback_manager.on_loss_begin(batch_y, prediction)
loss, hidden, extra_out = self.get_loss(inputs,
targets,
hidden,
dags)
hidden.detach_()
avg_loss += loss.item()
# Is loss NaN or inf? requires_grad = False
self.callback_manager.on_backward_begin(loss)
self._grad_backward(loss)
self.callback_manager.on_backward_end()
self._update()
self.callback_manager.on_step_end()
if (self.step + 1) % self.print_every == 0:
if self.use_tqdm:
print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every)
pbar.update(self.print_every)
else:
end = time.time()
diff = timedelta(seconds=round(end - start))
print_output = "[step: {:>4}] train loss: {:>4.6} time: {}".format(
self.step, avg_loss / self.print_every, diff)
pbar.set_postfix_str(print_output)
avg_loss = 0
self.step += 1
step += 1
self.shared_step += 1
self.callback_manager.on_batch_end()
# ================= mini-batch end ==================== #
def get_reward(self, dag, entropies, hidden, valid_idx=0):
"""Computes the perplexity of a single sampled model on a minibatch of
validation data.
"""
if not isinstance(entropies, np.ndarray):
entropies = entropies.data.cpu().numpy()
data_iterator = DataSetIter(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
prefetch=self.prefetch)
for inputs, targets in data_iterator:
valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag)
valid_loss = utils.to_item(valid_loss.data)
valid_ppl = math.exp(valid_loss)
R = 80 / valid_ppl
rewards = R + 1e-4 * entropies
return rewards, hidden
def train_controller(self):
"""Fixes the shared parameters and updates the controller parameters.

The controller is updated with a score function gradient estimator
(i.e., REINFORCE), with the reward being c/valid_ppl, where valid_ppl
is computed on a minibatch of validation data.

A moving average baseline is used.

In the original ENAS setup the controller is trained for 2000 steps per
epoch; the loop below runs 20 steps (each epoch consists of a first
(Train Shared) phase followed by a second (Train Controller) phase).
"""
model = self.controller
model.train()
# Why can't we call shared.eval() here? Leads to loss
# being uniformly zero for the controller.
# self.shared.eval()
avg_reward_base = None
baseline = None
adv_history = []
entropy_history = []
reward_history = []
hidden = self.shared.init_hidden(self.batch_size)
total_loss = 0
valid_idx = 0
for step in range(20):
# sample models
dags, log_probs, entropies = self.controller.sample(
with_details=True)
# calculate reward
np_entropies = entropies.data.cpu().numpy()
# No gradients should be backpropagated to the
# shared model during controller training, obviously.
with _get_no_grad_ctx_mgr():
rewards, hidden = self.get_reward(dags,
np_entropies,
hidden,
valid_idx)
reward_history.extend(rewards)
entropy_history.extend(np_entropies)
# moving average baseline
if baseline is None:
baseline = rewards
else:
decay = 0.95
baseline = decay * baseline + (1 - decay) * rewards
adv = rewards - baseline
adv_history.extend(adv)
# policy loss
loss = -log_probs * utils.get_variable(adv,
'cuda' in self.device,
requires_grad=False)
loss = loss.sum() # or loss.mean()
# update
self.controller_optim.zero_grad()
loss.backward()
self.controller_optim.step()
total_loss += utils.to_item(loss.data)
if ((step % 50) == 0) and (step > 0):
reward_history, adv_history, entropy_history = [], [], []
total_loss = 0
self.controller_step += 1
# prev_valid_idx = valid_idx
# valid_idx = ((valid_idx + self.max_length) %
# (self.valid_data.size(0) - 1))
# # Whenever we wrap around to the beginning of the
# # validation data, we reset the hidden states.
# if prev_valid_idx > valid_idx:
# hidden = self.shared.init_hidden(self.batch_size)
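
For intuition, a toy, self-contained sketch of the score-function (REINFORCE) update with the moving-average baseline described above (the numbers are made up; real rewards come from get_reward()):

```python
import torch

log_probs = torch.tensor([-1.2, -0.7, -2.1], requires_grad=True)
rewards = torch.tensor([0.8, 1.1, 0.9])             # e.g. R = 80 / valid_ppl per sampled dag

decay = 0.95
baseline = torch.tensor([0.9, 0.9, 0.9])            # moving-average baseline from earlier steps
baseline = decay * baseline + (1 - decay) * rewards
adv = rewards - baseline                            # advantage of each sample over the baseline

loss = -(log_probs * adv).sum()                     # score-function (REINFORCE) estimator
loss.backward()                                     # gradients reach only the controller's log-probs
print(log_probs.grad)                               # equals -adv
```
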
def derive(self, sample_num=10, valid_idx=0):
"""We are always deriving based on the very first batch
of validation data? This seems wrong...
"""
hidden = self.shared.init_hidden(self.batch_size)
dags, _, entropies = self.controller.sample(sample_num,
with_details=True)
max_R = 0
best_dag = None
for dag in dags:
R, _ = self.get_reward(dag, entropies, hidden, valid_idx)
if R.max() > max_R:
max_R = R.max()
best_dag = dag
self.model.setDAG(best_dag)

+ 0
- 58
fastNLP/models/enas_utils.py View File

@@ -1,58 +0,0 @@
"""undocumented
Code Modified from https://github.com/carpedm20/ENAS-pytorch
"""

__all__ = []

import collections
from collections import defaultdict

import numpy as np
import torch
from torch.autograd import Variable


def detach(h):
if type(h) == Variable:
return Variable(h.data)
else:
return tuple(detach(v) for v in h)


def get_variable(inputs, cuda=False, **kwargs):
if type(inputs) in [list, np.ndarray]:
inputs = torch.Tensor(inputs)
if cuda:
out = Variable(inputs.cuda(), **kwargs)
else:
out = Variable(inputs, **kwargs)
return out


def update_lr(optimizer, lr):
for param_group in optimizer.param_groups:
param_group['lr'] = lr


Node = collections.namedtuple('Node', ['id', 'name'])


class keydefaultdict(defaultdict):
def __missing__(self, key):
if self.default_factory is None:
raise KeyError(key)
else:
ret = self[key] = self.default_factory(key)
return ret
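
A quick sketch of what `keydefaultdict` is for: unlike `defaultdict`, the factory receives the missing key, which is how the ENAS code above caches per-batch-size hidden states (the dimensions here are made up):

```python
import torch

hidden_cache = keydefaultdict(lambda batch_size: torch.zeros(batch_size, 4))
print(hidden_cache[2].shape)   # torch.Size([2, 4]) -- built on first access
print(2 in hidden_cache)       # True -- and cached afterwards
```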


def to_item(x):
"""Converts x, possibly scalar and possibly tensor, to a Python scalar."""
if isinstance(x, (float, int)):
return x
if float(torch.__version__[0:3]) < 0.4:
assert (x.dim() == 1) and (len(x) == 1)
return x[0]
return x.item()
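
A small sanity check of `to_item` (assuming PyTorch >= 0.4, where the `.item()` branch is taken):

```python
import torch

print(to_item(3))                       # 3 (plain Python scalars pass through)
print(to_item(torch.tensor([2.5])))     # 2.5 (single-element tensors are unwrapped)
```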

+ 0
- 44
legacy/api/README.md View File

@@ -1,44 +0,0 @@
# fastNLP high-level API

### Environment and configuration
1. Operating system: linux/ubuntu (recommended)
2. Language: Python>=3.6
3. Python package dependencies
- **torch==1.0**
- numpy>=1.14.2

### Chinese word segmentation
```python
text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
from fastNLP.api import CWS
cws = CWS(device='cpu')
print(cws.predict(text))
# ['编者 按 : 7月 12日 , 英国 航空 航天 系统 公司 公布 了 该 公司 研制 的 第一 款 高 科技 隐形 无人 机雷电 之 神 。', '这 款 飞行 从 外型 上 来 看 酷似 电影 中 的 太空 飞行器 , 据 英国 方面 介绍 , 可以 实现 洲际 远程 打击 。', '那么 这 款 无人 机 到底 有 多 厉害 ?']
```

### Part-of-speech tagging
```python
# the input is a pre-tokenized sequence
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
from fastNLP.api import POS
pos = POS(device='cpu')
print(pos.predict(text))
# [['编者/NN', '按:/NN', '7月/NT', '12日/NT', ',/PU', '英国/NR', '航空/NN', '航天/NN', '系统/NN', '公司/NN', '公布/VV', '了/AS', '该/DT', '公司/NN', '研制/VV', '的/DEC', '第一款/NN', '高科技/NN', '隐形/AD', '无人机/VV', '雷电之神/NN', '。/PU'], ['那么/AD', '这/DT', '款/NN', '无人机/VV', '到底/AD', '有/VE', '多/AD', '厉害/VA', '?/PU']]
```

### Syntactic parsing
```python
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
from fastNLP.api import Parser
parser = Parser(device='cpu')
print(parser.predict(text))
# [['2/nn', '4/nn', '4/nn', '20/tmod', '11/punct', '10/nn', '10/nn', '10/nn', '10/nn', '11/nsubj', '20/dep', '11/asp', '14/det', '15/nsubj', '18/rcmod', '15/cpm', '18/nn', '11/dobj', '20/advmod', '0/root', '20/dobj', '20/punct'], ['4/advmod', '3/det', '8/xsubj', '8/dep', '8/advmod', '8/dep', '8/advmod', '0/root', '8/punct']]
```

See `examples.py` for complete examples.

+ 0
- 2
legacy/api/__init__.py View File

@@ -1,2 +0,0 @@
__all__ = ["CWS", "POS", "Parser"]
from .api import CWS, POS, Parser

+ 0
- 463
legacy/api/api.py View File

@@ -1,463 +0,0 @@
import warnings

import torch

warnings.filterwarnings('ignore')
import os

from fastNLP.core.dataset import DataSet
from .utils import load_url
from .processor import ModelProcessor
from fastNLP.io.dataset_loader import _cut_long_sentence
from fastNLP.io.data_loader import ConllLoader
from fastNLP.core.instance import Instance
from ..api.pipeline import Pipeline
from fastNLP.core.metrics import SpanFPreRecMetric
from .processor import IndexerProcessor

# TODO add pretrain urls
model_urls = {
"cws": "http://123.206.98.91:8888/download/cws_lstm_ctb9_1_20-09908656.pkl",
"pos": "http://123.206.98.91:8888/download/pos_tag_model_20190119-43f8b435.pkl",
"parser": "http://123.206.98.91:8888/download/parser_20190204-c72ca5c0.pkl"
}


class ConllCWSReader(object):
"""Deprecated. Use ConllLoader for all types of conll-format files."""
def __init__(self):
pass
def load(self, path, cut_long_sent=False):
"""
The returned DataSet contains only the field raw_sentence, whose content is a str.
The input is assumed to be in CoNLL format, with sentences separated by blank lines and 7 columns per line, i.e.
::

1 编者按 编者按 NN O 11 nmod:topic
2 : : PU O 11 punct
3 7月 7月 NT DATE 4 compound:nn
4 12日 12日 NT DATE 11 nmod:tmod
5 , , PU O 11 punct

1 这 这 DT O 3 det
2 款 款 M O 1 mark:clf
3 飞行 飞行 NN O 8 nsubj
4 从 从 P O 5 case
5 外型 外型 NN O 8 nmod:prep

"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.strip().split())
if len(sample) > 0:
datalist.append(sample)
ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_char_lst(sample)
if res is None:
continue
line = ' '.join(res)
if cut_long_sent:
sents = _cut_long_sentence(line)
else:
sents = [line]
for raw_sentence in sents:
ds.append(Instance(raw_sentence=raw_sentence))
return ds
def get_char_lst(self, sample):
if len(sample) == 0:
return None
text = []
for w in sample:
t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
if t3 == '_':
return None
text.append(t1)
return text


class ConllxDataLoader(ConllLoader):
"""返回“词级别”的标签信息,包括词、词性、(句法)头依赖、(句法)边标签。跟``ZhConllPOSReader``完全不同。

Deprecated. Use ConllLoader for all types of conll-format files.
"""
def __init__(self):
headers = [
'words', 'pos_tags', 'heads', 'labels',
]
indexs = [
1, 3, 6, 7,
]
super(ConllxDataLoader, self).__init__(headers=headers, indexes=indexs)


class API:
def __init__(self):
self.pipeline = None
self._dict = None
def predict(self, *args, **kwargs):
"""Do prediction for the given input.
"""
raise NotImplementedError
def test(self, file_path):
"""Test performance over the given data set.

:param str file_path:
:return: a dictionary of metric values
"""
raise NotImplementedError
def load(self, path, device):
if os.path.exists(os.path.expanduser(path)):
_dict = torch.load(path, map_location='cpu')
else:
_dict = load_url(path, map_location='cpu')
self._dict = _dict
self.pipeline = _dict['pipeline']
for processor in self.pipeline.pipeline:
if isinstance(processor, ModelProcessor):
processor.set_model_device(device)


class POS(API):
"""FastNLP API for Part-Of-Speech tagging.

:param str model_path: the path to the model.
:param str device: device name such as "cpu" or "cuda:0". Use the same notation as PyTorch.

"""
def __init__(self, model_path=None, device='cpu'):
super(POS, self).__init__()
if model_path is None:
model_path = model_urls['pos']
self.load(model_path, device)
def predict(self, content):
"""predict函数的介绍,
函数介绍的第二句,这句话不会换行
:param content: list of list of str. Each string is a token(word).
:return answer: list of list of str. Each string is a tag.
"""
if not hasattr(self, "pipeline"):
raise ValueError("You have to load model first.")
sentence_list = content
# 1. check the type of each sentence
for sentence in sentence_list:
if not all((type(obj) == str for obj in sentence)):
raise ValueError("Input must be list of list of string.")
# 2. build the dataset
dataset = DataSet()
dataset.add_field("words", sentence_list)
# 3. run the pipeline
self.pipeline(dataset)
def merge_tag(words_list, tags_list):
rtn = []
for words, tags in zip(words_list, tags_list):
rtn.append([w + "/" + t for w, t in zip(words, tags)])
return rtn
output = dataset.field_arrays["tag"].content
if isinstance(content, str):
return output[0]
elif isinstance(content, list):
return merge_tag(content, output)
def test(self, file_path):
test_data = ConllxDataLoader().load(file_path)
save_dict = self._dict
tag_vocab = save_dict["tag_vocab"]
pipeline = save_dict["pipeline"]
index_tag = IndexerProcessor(vocab=tag_vocab, field_name="tag", new_added_field_name="truth", is_input=False)
pipeline.pipeline = [index_tag] + pipeline.pipeline
test_data.rename_field("pos_tags", "tag")
pipeline(test_data)
test_data.set_target("truth")
prediction = test_data.field_arrays["predict"].content
truth = test_data.field_arrays["truth"].content
seq_len = test_data.field_arrays["word_seq_origin_len"].content
# padding by hand
max_length = max([len(seq) for seq in prediction])
for idx in range(len(prediction)):
prediction[idx] = list(prediction[idx]) + ([0] * (max_length - len(prediction[idx])))
truth[idx] = list(truth[idx]) + ([0] * (max_length - len(truth[idx])))
evaluator = SpanFPreRecMetric(tag_vocab=tag_vocab, pred="predict", target="truth",
seq_len="word_seq_origin_len")
evaluator({"predict": torch.Tensor(prediction), "word_seq_origin_len": torch.Tensor(seq_len)},
{"truth": torch.Tensor(truth)})
test_result = evaluator.get_metric()
f1 = round(test_result['f'] * 100, 2)
pre = round(test_result['pre'] * 100, 2)
rec = round(test_result['rec'] * 100, 2)
return {"F1": f1, "precision": pre, "recall": rec}


class CWS(API):
"""
High-level API for Chinese word segmentation.

:param model_path: if None, the model at the default location is used; if nothing is found there, the model is downloaded automatically.
:param device: str, e.g. 'cpu', 'cuda' or 'cuda:0'. The model is loaded onto this device for inference.
"""
def __init__(self, model_path=None, device='cpu'):
super(CWS, self).__init__()
if model_path is None:
model_path = model_urls['cws']
self.load(model_path, device)
def predict(self, content):
"""
Word segmentation interface.

:param content: str or List[str]. For example, "中文分词很重要!" yields "中文 分词 很 重要 !"; for a List[str] such as
["中文分词很重要!", ...] the result is ["中文 分词 很 重要 !", ...].
:return: str or List[str], depending on the type of the input.
"""
if not hasattr(self, 'pipeline'):
raise ValueError("You have to load model first.")
sentence_list = []
# 1. check the type of the input
if isinstance(content, str):
sentence_list.append(content)
elif isinstance(content, list):
sentence_list = content
# 2. build the dataset
dataset = DataSet()
dataset.add_field('raw_sentence', sentence_list)
# 3. run the pipeline
self.pipeline(dataset)
output = dataset.get_field('output').content
if isinstance(content, str):
return output[0]
elif isinstance(content, list):
return output
def test(self, filepath):
"""
Given the path to a segmentation file, returns the word-segmentation f1, precision and recall on that data set.
The segmentation file should look like::
1 编者按 编者按 NN O 11 nmod:topic
2 : : PU O 11 punct
3 7月 7月 NT DATE 4 compound:nn
4 12日 12日 NT DATE 11 nmod:tmod
5 , , PU O 11 punct
1 这 这 DT O 3 det
2 款 款 M O 1 mark:clf
3 飞行 飞行 NN O 8 nsubj
4 从 从 P O 5 case
5 外型 外型 NN O 8 nmod:prep
Sentences are separated by blank lines; every non-empty line has 7 columns.

:param filepath: str, path to the file.
:return: dict with the keys "F1", "precision" and "recall".
"""
tag_proc = self._dict['tag_proc']
cws_model = self.pipeline.pipeline[-2].model
pipeline = self.pipeline.pipeline[:-2]
pipeline.insert(1, tag_proc)
pp = Pipeline(pipeline)
reader = ConllCWSReader()
# te_filename = '/home/hyan/ctb3/test.conllx'
te_dataset = reader.load(filepath)
pp(te_dataset)
from ..core.tester import Tester
from ..core.metrics import SpanFPreRecMetric
tester = Tester(data=te_dataset, model=cws_model, metrics=SpanFPreRecMetric(tag_proc.get_vocab()), batch_size=64,
verbose=0)
eval_res = tester.test()
f1 = eval_res['SpanFPreRecMetric']['f']
pre = eval_res['SpanFPreRecMetric']['pre']
rec = eval_res['SpanFPreRecMetric']['rec']
# print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec))
return {"F1": f1, "precision": pre, "recall": rec}


class Parser(API):
def __init__(self, model_path=None, device='cpu'):
super(Parser, self).__init__()
if model_path is None:
model_path = model_urls['parser']
self.pos_tagger = POS(device=device)
self.load(model_path, device)
def predict(self, content):
if not hasattr(self, 'pipeline'):
raise ValueError("You have to load model first.")
# 1. use POS to obtain segmentation and POS-tagging results
pos_out = self.pos_tagger.predict(content)
# pos_out = ['这里/NN 是/VB 分词/NN 结果/NN'.split()]
# 2. build the dataset
dataset = DataSet()
dataset.add_field('wp', pos_out)
dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[0] for w in x['wp']], new_field_name='words')
dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[1] for w in x['wp']], new_field_name='pos')
dataset.rename_field("words", "raw_words")
# 3. run the pipeline
self.pipeline(dataset)
dataset.apply(lambda x: [str(arc) for arc in x['arc_pred']], new_field_name='arc_pred')
dataset.apply(lambda x: [arc + '/' + label for arc, label in
zip(x['arc_pred'], x['label_pred_seq'])][1:], new_field_name='output')
# output like: [['2/top', '0/root', '4/nn', '2/dep']]
return dataset.field_arrays['output'].content
def load_test_file(self, path):
def get_one(sample):
sample = list(map(list, zip(*sample)))
if len(sample) == 0:
return None
for w in sample[7]:
if w == '_':
print('Error Sample {}'.format(sample))
return None
# return word_seq, pos_seq, head_seq, head_tag_seq
return sample[1], sample[3], list(map(int, sample[6])), sample[7]
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)
data = [get_one(sample) for sample in datalist]
data_list = list(filter(lambda x: x is not None, data))
return data_list
def test(self, filepath):
data = self.load_test_file(filepath)
def convert(data):
BOS = '<BOS>'
dataset = DataSet()
for sample in data:
word_seq = [BOS] + sample[0]
pos_seq = [BOS] + sample[1]
heads = [0] + sample[2]
head_tags = [BOS] + sample[3]
dataset.append(Instance(raw_words=word_seq,
pos=pos_seq,
gold_heads=heads,
arc_true=heads,
tags=head_tags))
return dataset
ds = convert(data)
pp = self.pipeline
for p in pp:
if p.field_name == 'word_list':
p.field_name = 'gold_words'
elif p.field_name == 'pos_list':
p.field_name = 'gold_pos'
# ds.rename_field("words", "raw_words")
# ds.rename_field("tag", "pos")
pp(ds)
head_cor, label_cor, total = 0, 0, 0
for ins in ds:
head_gold = ins['gold_heads']
head_pred = ins['arc_pred']
length = len(head_gold)
total += length
for i in range(length):
head_cor += 1 if head_pred[i] == head_gold[i] else 0
uas = head_cor / total
# print('uas:{:.2f}'.format(uas))
for p in pp:
if p.field_name == 'gold_words':
p.field_name = 'word_list'
elif p.field_name == 'gold_pos':
p.field_name = 'pos_list'
return {"USA": round(uas, 5)}


class Analyzer:
def __init__(self, device='cpu'):
self.cws = CWS(device=device)
self.pos = POS(device=device)
self.parser = Parser(device=device)
def predict(self, content, seg=False, pos=False, parser=False):
if seg is False and pos is False and parser is False:
seg = True
output_dict = {}
if seg:
seg_output = self.cws.predict(content)
output_dict['seg'] = seg_output
if pos:
pos_output = self.pos.predict(content)
output_dict['pos'] = pos_output
if parser:
parser_output = self.parser.predict(content)
output_dict['parser'] = parser_output
return output_dict
def test(self, filepath):
output_dict = {}
if self.cws:
seg_output = self.cws.test(filepath)
output_dict['seg'] = seg_output
if self.pos:
pos_output = self.pos.test(filepath)
output_dict['pos'] = pos_output
if self.parser:
parser_output = self.parser.test(filepath)
output_dict['parser'] = parser_output
return output_dict

+ 0
- 181
legacy/api/converter.py View File

@@ -1,181 +0,0 @@
import re


class SpanConverter:
def __init__(self, replace_tag, pattern):
super(SpanConverter, self).__init__()

self.replace_tag = replace_tag
self.pattern = pattern

def find_certain_span_and_replace(self, sentence):
replaced_sentence = ''
prev_end = 0
for match in re.finditer(self.pattern, sentence):
start, end = match.span()
span = sentence[start:end]
replaced_sentence += sentence[prev_end:start] + self.span_to_special_tag(span)
prev_end = end
replaced_sentence += sentence[prev_end:]

return replaced_sentence

def span_to_special_tag(self, span):

return self.replace_tag

def find_certain_span(self, sentence):
spans = []
for match in re.finditer(self.pattern, sentence):
spans.append(match.span())
return spans


class AlphaSpanConverter(SpanConverter):
def __init__(self):
replace_tag = '<ALPHA>'
# ideally only spans that are purely letters are handled; <[a-zA-Z]+> is left untouched (it should be a special tag).
pattern = '[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\\-"])'

super(AlphaSpanConverter, self).__init__(replace_tag, pattern)


class DigitSpanConverter(SpanConverter):
def __init__(self):
replace_tag = '<NUM>'
pattern = '\d[\d\\.]*(?=[\u4e00-\u9fff ,%.!<-])'

super(DigitSpanConverter, self).__init__(replace_tag, pattern)

def span_to_special_tag(self, span):
# return self.special_tag
if span[0] == '0' and len(span) > 2:
return '<NUM>'
decimal_point_count = 0  # a span might contain more than one decimal point
for idx, char in enumerate(span):
if char == '.' or char == '﹒' or char == '·':
decimal_point_count += 1
if span[-1] == '.' or span[-1] == '﹒' or span[-1] == '·':
# last digit being decimal point means this is not a number
if decimal_point_count == 1:
return span
else:
return '<UNKDGT>'
if decimal_point_count == 1:
return '<DEC>'
elif decimal_point_count > 1:
return '<UNKDGT>'
else:
return '<NUM>'


class TimeConverter(SpanConverter):
def __init__(self):
replace_tag = '<TOC>'
pattern = '\d+[::∶][\d::∶]+(?=[\u4e00-\u9fff ,%.!<-])'

super().__init__(replace_tag, pattern)


class MixNumAlphaConverter(SpanConverter):
def __init__(self):
replace_tag = '<MIX>'
pattern = None

super().__init__(replace_tag, pattern)

def find_certain_span_and_replace(self, sentence):
replaced_sentence = ''
start = 0
matching_flag = False
number_flag = False
alpha_flag = False
link_flag = False
slash_flag = False
bracket_flag = False
for idx in range(len(sentence)):
if re.match('[0-9a-zA-Z/\\(\\)\'′&\\-]', sentence[idx]):
if not matching_flag:
replaced_sentence += sentence[start:idx]
start = idx
if re.match('[0-9]', sentence[idx]):
number_flag = True
elif re.match('[\'′&\\-]', sentence[idx]):
link_flag = True
elif re.match('/', sentence[idx]):
slash_flag = True
elif re.match('[\\(\\)]', sentence[idx]):
bracket_flag = True
else:
alpha_flag = True
matching_flag = True
elif re.match('[\\.]', sentence[idx]):
pass
else:
if matching_flag:
if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
or (slash_flag and alpha_flag) or (link_flag and number_flag) \
or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
span = sentence[start:idx]
start = idx
replaced_sentence += self.span_to_special_tag(span)
matching_flag = False
number_flag = False
alpha_flag = False
link_flag = False
slash_flag = False
bracket_flag = False

replaced_sentence += sentence[start:]
return replaced_sentence

def find_certain_span(self, sentence):
spans = []
start = 0
matching_flag = False
number_flag = False
alpha_flag = False
link_flag = False
slash_flag = False
bracket_flag = False
for idx in range(len(sentence)):
if re.match('[0-9a-zA-Z/\\(\\)\'′&\\-]', sentence[idx]):
if not matching_flag:
start = idx
if re.match('[0-9]', sentence[idx]):
number_flag = True
elif re.match('[\'′&\\-]', sentence[idx]):
link_flag = True
elif re.match('/', sentence[idx]):
slash_flag = True
elif re.match('[\\(\\)]', sentence[idx]):
bracket_flag = True
else:
alpha_flag = True
matching_flag = True
elif re.match('[\\.]', sentence[idx]):
pass
else:
if matching_flag:
if (number_flag and alpha_flag) or (link_flag and alpha_flag) \
or (slash_flag and alpha_flag) or (link_flag and number_flag) \
or (number_flag and bracket_flag) or (bracket_flag and alpha_flag):
spans.append((start, idx))
start = idx

matching_flag = False
number_flag = False
alpha_flag = False
link_flag = False
slash_flag = False
bracket_flag = False

return spans


class EmailConverter(SpanConverter):
def __init__(self):
replaced_tag = "<EML>"
pattern = '[0-9a-zA-Z]+[@][.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\\-"$])'

super(EmailConverter, self).__init__(replaced_tag, pattern)
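
A hedged usage sketch of the span converters above (the sentence is made up; the exact replacement tags depend on `span_to_special_tag`):

```python
conv = DigitSpanConverter()
sentence = '价格为12.5元,编号0012 有误。'
print(conv.find_certain_span(sentence))              # character spans of the matched digit runs
print(conv.find_certain_span_and_replace(sentence))  # digits replaced by <DEC>/<NUM>/<UNKDGT> tags
```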

+ 0
- 56
legacy/api/examples.py View File

@@ -1,56 +0,0 @@
"""
api/examples.py contains all API examples provided by fastNLP.
It serves both as a tutorial for the API and as a test script, since it is difficult to test the APIs on Travis.

"""
from . import CWS, POS, Parser

text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']


def chinese_word_segmentation():
cws = CWS(device='cpu')
print(cws.predict(text))


def chinese_word_segmentation_test():
cws = CWS(device='cpu')
print(cws.test("../../test/data_for_tests/zh_sample.conllx"))


def pos_tagging():
# the input is a pre-tokenized sequence
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
pos = POS(device='cpu')
print(pos.predict(text))


def pos_tagging_test():
pos = POS(device='cpu')
print(pos.test("../../test/data_for_tests/zh_sample.conllx"))


def syntactic_parsing():
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
parser = Parser(device='cpu')
print(parser.predict(text))


def syntactic_parsing_test():
parser = Parser(device='cpu')
print(parser.test("../../test/data_for_tests/zh_sample.conllx"))


if __name__ == "__main__":
# chinese_word_segmentation()
# chinese_word_segmentation_test()
# pos_tagging()
# pos_tagging_test()
syntactic_parsing()
# syntactic_parsing_test()

+ 0
- 33
legacy/api/pipeline.py View File

@@ -1,33 +0,0 @@
from ..api.processor import Processor


class Pipeline:
"""
Pipeline takes a DataSet object as input, runs multiple processors sequentially, and
outputs a DataSet object.
"""

def __init__(self, processors=None):
self.pipeline = []
if isinstance(processors, list):
for proc in processors:
assert isinstance(proc, Processor), "Must be a Processor, not {}.".format(type(proc))
self.pipeline = processors

def add_processor(self, processor):
assert isinstance(processor, Processor), "Must be a Processor, not {}.".format(type(processor))
self.pipeline.append(processor)

def process(self, dataset):
assert len(self.pipeline) != 0, "You need to add some processor first."

for proc in self.pipeline:
dataset = proc(dataset)

return dataset

def __call__(self, *args, **kwargs):
return self.process(*args, **kwargs)

def __getitem__(self, item):
return self.pipeline[item]
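
A hedged usage sketch (the processors are the ones defined in legacy/api/processor.py, shown further down in this diff; `dataset` stands for an existing fastNLP DataSet containing the named fields):

```python
pp = Pipeline([Num2TagProcessor('<NUM>', 'words'),
               SeqLenProcessor('words', 'seq_lens')])
pp.add_processor(SetInputProcessor('words', 'seq_lens'))
dataset = pp(dataset)    # same as pp.process(dataset); each processor runs in order
```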

+ 0
- 428
legacy/api/processor.py View File

@@ -1,428 +0,0 @@
import re
from collections import defaultdict

import torch

from fastNLP.core.batch import Batch
from fastNLP.core.dataset import DataSet
from fastNLP.core.sampler import SequentialSampler
from fastNLP.core.vocabulary import Vocabulary


class Processor(object):
def __init__(self, field_name, new_added_field_name):
"""

:param field_name: the field to process
:param new_added_field_name: if None, field_name is used, i.e. the original field is overwritten
"""
self.field_name = field_name
if new_added_field_name is None:
self.new_added_field_name = field_name
else:
self.new_added_field_name = new_added_field_name

def process(self, *args, **kwargs):
raise NotImplementedError

def __call__(self, *args, **kwargs):
return self.process(*args, **kwargs)


class FullSpaceToHalfSpaceProcessor(Processor):
"""全角转半角,以字符为处理单元

"""

def __init__(self, field_name, change_alpha=True, change_digit=True, change_punctuation=True,
change_space=True):
super(FullSpaceToHalfSpaceProcessor, self).__init__(field_name, None)

self.change_alpha = change_alpha
self.change_digit = change_digit
self.change_punctuation = change_punctuation
self.change_space = change_space

FH_SPACE = [(u" ", u" ")]
FH_NUM = [
(u"0", u"0"), (u"1", u"1"), (u"2", u"2"), (u"3", u"3"), (u"4", u"4"),
(u"5", u"5"), (u"6", u"6"), (u"7", u"7"), (u"8", u"8"), (u"9", u"9")]
FH_ALPHA = [
(u"a", u"a"), (u"b", u"b"), (u"c", u"c"), (u"d", u"d"), (u"e", u"e"),
(u"f", u"f"), (u"g", u"g"), (u"h", u"h"), (u"i", u"i"), (u"j", u"j"),
(u"k", u"k"), (u"l", u"l"), (u"m", u"m"), (u"n", u"n"), (u"o", u"o"),
(u"p", u"p"), (u"q", u"q"), (u"r", u"r"), (u"s", u"s"), (u"t", u"t"),
(u"u", u"u"), (u"v", u"v"), (u"w", u"w"), (u"x", u"x"), (u"y", u"y"),
(u"z", u"z"),
(u"A", u"A"), (u"B", u"B"), (u"C", u"C"), (u"D", u"D"), (u"E", u"E"),
(u"F", u"F"), (u"G", u"G"), (u"H", u"H"), (u"I", u"I"), (u"J", u"J"),
(u"K", u"K"), (u"L", u"L"), (u"M", u"M"), (u"N", u"N"), (u"O", u"O"),
(u"P", u"P"), (u"Q", u"Q"), (u"R", u"R"), (u"S", u"S"), (u"T", u"T"),
(u"U", u"U"), (u"V", u"V"), (u"W", u"W"), (u"X", u"X"), (u"Y", u"Y"),
(u"Z", u"Z")]
# use punctuation conversion with care: "5.12特大地震" might come out as "5.12特大地震" after conversion
FH_PUNCTUATION = [
(u'%', u'%'), (u'!', u'!'), (u'"', u'\"'), (u''', u'\''), (u'#', u'#'),
(u'¥', u'$'), (u'&', u'&'), (u'(', u'('), (u')', u')'), (u'*', u'*'),
(u'+', u'+'), (u',', u','), (u'-', u'-'), (u'.', u'.'), (u'/', u'/'),
(u':', u':'), (u';', u';'), (u'<', u'<'), (u'=', u'='), (u'>', u'>'),
(u'?', u'?'), (u'@', u'@'), (u'[', u'['), (u']', u']'), (u'\', u'\\'),
(u'^', u'^'), (u'_', u'_'), (u'`', u'`'), (u'~', u'~'), (u'{', u'{'),
(u'}', u'}'), (u'|', u'|')]
FHs = []
if self.change_alpha:
FHs = FH_ALPHA
if self.change_digit:
FHs += FH_NUM
if self.change_punctuation:
FHs += FH_PUNCTUATION
if self.change_space:
FHs += FH_SPACE
self.convert_map = {k: v for k, v in FHs}

def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))

def inner_proc(ins):
sentence = ins[self.field_name]
new_sentence = [""] * len(sentence)
for idx, char in enumerate(sentence):
if char in self.convert_map:
char = self.convert_map[char]
new_sentence[idx] = char
return "".join(new_sentence)

dataset.apply(inner_proc, new_field_name=self.field_name)
return dataset


class PreAppendProcessor(Processor):
"""
Prepends data (which should be a str) to the beginning of a field. The field must be a list; the new field is
[data] + instance[field_name]

"""

def __init__(self, data, field_name, new_added_field_name=None):
super(PreAppendProcessor, self).__init__(field_name, new_added_field_name)
self.data = data

def process(self, dataset):
dataset.apply(lambda ins: [self.data] + ins[self.field_name], new_field_name=self.new_added_field_name)
return dataset


class SliceProcessor(Processor):
"""
Keeps only part of a field; equivalent to instance[field_name][start:end:step]

"""

def __init__(self, start, end, step, field_name, new_added_field_name=None):
super(SliceProcessor, self).__init__(field_name, new_added_field_name)
for o in (start, end, step):
assert isinstance(o, int) or o is None
self.slice = slice(start, end, step)

def process(self, dataset):
dataset.apply(lambda ins: ins[self.field_name][self.slice], new_field_name=self.new_added_field_name)
return dataset


class Num2TagProcessor(Processor):
"""
Replaces the numbers in a sentence with a given tag.

"""

def __init__(self, tag, field_name, new_added_field_name=None):
"""

:param tag: str, the tag that numbers are replaced with
:param field_name:
:param new_added_field_name:
"""
super(Num2TagProcessor, self).__init__(field_name, new_added_field_name)
self.tag = tag
self.pattern = r'[-+]?([0-9]+[.]?[0-9]*)+[/eE]?[-+]?([0-9]+[.]?[0-9]*)'

def process(self, dataset):

def inner_proc(ins):
s = ins[self.field_name]
new_s = [None] * len(s)
for i, w in enumerate(s):
if re.search(self.pattern, w) is not None:
w = self.tag
new_s[i] = w
return new_s

dataset.apply(inner_proc, new_field_name=self.new_added_field_name)
return dataset


class IndexerProcessor(Processor):
"""
Given a vocabulary, converts the specified field into indices. The field should be a one-dimensional list, e.g.
['我', '是', xxx]
"""

def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False, is_input=True):

assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))

super(IndexerProcessor, self).__init__(field_name, new_added_field_name)
self.vocab = vocab
self.delete_old_field = delete_old_field
self.is_input = is_input

def set_vocab(self, vocab):
assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))

self.vocab = vocab

def process(self, dataset):
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]],
new_field_name=self.new_added_field_name)
if self.is_input:
dataset.set_input(self.new_added_field_name)

if self.delete_old_field:
dataset.delete_field(self.field_name)

return dataset


class VocabProcessor(Processor):
"""
Builds a vocabulary from one or more DataSets.

"""

def __init__(self, field_name, min_freq=1, max_size=None):
super(VocabProcessor, self).__init__(field_name, None)
self.vocab = Vocabulary(min_freq=min_freq, max_size=max_size)

def process(self, *datasets):
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))

def get_vocab(self):
self.vocab.build_vocab()
return self.vocab
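
A hedged sketch of the two vocabulary-related processors above working together (the DataSet construction follows fastNLP's dict constructor and is only illustrative):

```python
from fastNLP.core.dataset import DataSet

ds = DataSet({'words': [['this', 'is', 'fine'], ['so', 'is', 'this']]})

vocab_proc = VocabProcessor('words')
vocab_proc.process(ds)                       # update the vocabulary from the 'words' field
vocab = vocab_proc.get_vocab()               # build and fetch the Vocabulary

indexer = IndexerProcessor(vocab, 'words', 'word_index')
ds = indexer.process(ds)                     # add the index field and mark it as input
print(ds[0]['word_index'])
```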


class SeqLenProcessor(Processor):
"""
Adds a sequence-length field based on an existing field (the length of its first dimension).

"""

def __init__(self, field_name, new_added_field_name='seq_lens', is_input=True):
super(SeqLenProcessor, self).__init__(field_name, new_added_field_name)
self.is_input = is_input

def process(self, dataset):
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: len(ins[self.field_name]), new_field_name=self.new_added_field_name)
if self.is_input:
dataset.set_input(self.new_added_field_name)
return dataset


from fastNLP.core.utils import _build_args


class ModelProcessor(Processor):
def __init__(self, model, seq_len_field_name='seq_lens', batch_size=32):
"""
Takes a model; when process() is called with a dataset, this processor feeds the DataSet through Batch into model.predict or model.forward.
The model's outputs are added to the dataset, with field names determined by the model's output. If an output's shape is neither (batch_size,)
nor (batch_size, 1), the sequence-length field is used to unpad it.
TODO: this class should drop its dependency on seq_lens.

:param seq_len_field_name:
:param batch_size:
"""
super(ModelProcessor, self).__init__(None, None)
self.batch_size = batch_size
self.seq_len_field_name = seq_len_field_name
self.model = model

def process(self, dataset):
self.model.eval()
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
data_iterator = Batch(dataset, batch_size=self.batch_size, sampler=SequentialSampler())

batch_output = defaultdict(list)
predict_func = self.model.forward
with torch.no_grad():
for batch_x, _ in data_iterator:
refined_batch_x = _build_args(predict_func, **batch_x)
prediction = predict_func(**refined_batch_x)
seq_lens = batch_x[self.seq_len_field_name].tolist()

for key, value in prediction.items():
tmp_batch = []
value = value.cpu().numpy()
if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1):
batch_output[key].extend(value.tolist())
else:
for idx, seq_len in enumerate(seq_lens):
tmp_batch.append(value[idx, :seq_len])
batch_output[key].extend(tmp_batch)
if not self.seq_len_field_name in prediction:
batch_output[self.seq_len_field_name].extend(seq_lens)

# TODO: with the current implementation, downstream processors need to know the keys of the model's output
for field_name, fields in batch_output.items():
dataset.add_field(field_name, fields, is_input=True, is_target=False)

return dataset

def set_model(self, model):
self.model = model

def set_model_device(self, device):
device = torch.device(device)
self.model.to(device)


class Index2WordProcessor(Processor):
"""
Converts an index-valued field of a DataSet back to strings using the vocab

"""

def __init__(self, vocab, field_name, new_added_field_name):
super(Index2WordProcessor, self).__init__(field_name, new_added_field_name)
self.vocab = vocab

def process(self, dataset):
dataset.apply(lambda ins: [self.vocab.to_word(w) for w in ins[self.field_name]],
new_field_name=self.new_added_field_name)
return dataset


class SetTargetProcessor(Processor):
def __init__(self, *fields, flag=True):
super(SetTargetProcessor, self).__init__(None, None)
self.fields = fields
self.flag = flag

def process(self, dataset):
dataset.set_target(*self.fields, flag=self.flag)
return dataset


class SetInputProcessor(Processor):
def __init__(self, *fields, flag=True):
super(SetInputProcessor, self).__init__(None, None)
self.fields = fields
self.flag = flag

def process(self, dataset):
dataset.set_input(*self.fields, flag=self.flag)
return dataset


class VocabIndexerProcessor(Processor):
"""
Builds a Vocabulary from DataSets and uses it to index the field numerically. The newly generated index field is stored under
new_added_filed_name; if it is not provided, the original field_name is overwritten.

"""

def __init__(self, field_name, new_added_filed_name=None, min_freq=1, max_size=None,
verbose=0, is_input=True):
"""

:param field_name: the field from which the vocabulary is built and to which indexing is applied
:param new_added_filed_name: name of the generated index field; if not given, field_name is overwritten.
:param min_freq: minimum number of occurrences a word needs to enter the Vocabulary.
:param max_size: maximum number of words allowed in the Vocabulary
:param verbose: 0: print nothing; 1: print information
:param bool is_input:
"""
super(VocabIndexerProcessor, self).__init__(field_name, new_added_filed_name)
self.min_freq = min_freq
self.max_size = max_size

self.verbose = verbose
self.is_input = is_input

def construct_vocab(self, *datasets):
"""
Builds the vocabulary from the given DataSets.

:param datasets: DataSet objects used to build the vocabulary
:return:
"""
self.vocab = Vocabulary(min_freq=self.min_freq, max_size=self.max_size)
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))
self.vocab.build_vocab()
if self.verbose:
print("Vocabulary Constructed, has {} items.".format(len(self.vocab)))

def process(self, *datasets, only_index_dataset=None):
"""
If no Vocabulary has been built yet, one is built from the given DataSets; otherwise the existing vocabulary is used. Once the
vocabulary is available, both datasets and only_index_dataset are indexed.

:param datasets: DataSet objects
:param only_index_dataset: DataSet, or list of DataSet. These are only indexed; they are not used to build the vocabulary.
:return:
"""
if len(datasets) == 0 and not hasattr(self, 'vocab'):
raise RuntimeError("You have to construct vocabulary first. Or you have to pass datasets to construct it.")
if not hasattr(self, 'vocab'):
self.construct_vocab(*datasets)
else:
if self.verbose:
print("Using constructed vocabulary with {} items.".format(len(self.vocab)))
to_index_datasets = []
if len(datasets) != 0:
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
to_index_datasets.append(dataset)

if not (only_index_dataset is None):
if isinstance(only_index_dataset, list):
for dataset in only_index_dataset:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
to_index_datasets.append(dataset)
elif isinstance(only_index_dataset, DataSet):
to_index_datasets.append(only_index_dataset)
else:
raise TypeError('Only DataSet or list of DataSet is allowed, not {}.'.format(type(only_index_dataset)))

for dataset in to_index_datasets:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]],
new_field_name=self.new_added_field_name, is_input=self.is_input)
# return a single DataSet, for consistency with the other processors at inference time
if len(to_index_datasets) == 1:
return to_index_datasets[0]

def set_vocab(self, vocab):
assert isinstance(vocab, Vocabulary), "Only fastNLP.core.Vocabulary is allowed, not {}.".format(type(vocab))
self.vocab = vocab

def delete_vocab(self):
del self.vocab

def get_vocab_size(self):
return len(self.vocab)

def set_verbose(self, verbose):
"""
Sets the verbosity of the processor.

:param verbose: int, 0: print nothing; 1: print vocab information.
:return:
"""
self.verbose = verbose

+ 0
- 134
legacy/api/utils.py View File

@@ -1,134 +0,0 @@
import hashlib
import os
import re
import shutil
import sys
import tempfile

import torch

try:
from requests.utils import urlparse
from requests import get as urlopen
requests_available = True
except ImportError:
requests_available = False
if sys.version_info[0] == 2:
from urlparse import urlparse # noqa f811
from urllib2 import urlopen # noqa f811
else:
from urllib.request import urlopen
from urllib.parse import urlparse
try:
from tqdm.auto import tqdm
except ImportError:
from fastNLP.core.utils import _pseudo_tqdm as tqdm
# matches bfd8deac from resnet18-bfd8deac.pth
HASH_REGEX = re.compile(r'-([a-f0-9]*)\.')


def load_url(url, model_dir=None, map_location=None, progress=True):
r"""Loads the Torch serialized object at the given URL.

If the object is already present in `model_dir`, it's deserialized and
returned. The filename part of the URL should follow the naming convention
``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more
digits of the SHA256 hash of the contents of the file. The hash is used to
ensure unique names and to verify the contents of the file.

The default value of `model_dir` is ``$fastNLP_HOME/models`` where
``$fastNLP_HOME`` defaults to ``~/.fastNLP``. The default directory can be
overridden with the ``$fastNLP_MODEL_ZOO`` environment variable.

Args:
url (string): URL of the object to download
model_dir (string, optional): directory in which to save the object
map_location (optional): a function or a dict specifying how to remap storage locations (see torch.load)
progress (bool, optional): whether or not to display a progress bar to stderr

Example:
# >>> state_dict = model_zoo.load_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')

"""
if model_dir is None:
torch_home = os.path.expanduser(os.getenv('fastNLP_HOME', '~/.fastNLP'))
model_dir = os.getenv('fastNLP_MODEL_ZOO', os.path.join(torch_home, 'models'))
if not os.path.exists(model_dir):
os.makedirs(model_dir)
parts = urlparse(url)
filename = os.path.basename(parts.path)
cached_file = os.path.join(model_dir, filename)
if not os.path.exists(cached_file):
sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
# hash_prefix = HASH_REGEX.search(filename).group(1)
_download_url_to_file(url, cached_file, hash_prefix=None, progress=progress)
return torch.load(cached_file, map_location=map_location)
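
A minimal standalone sketch of the `filename-<hash>.ext` convention that HASH_REGEX above encodes (the second file name is an assumed counter-example): the hex hash prefix sits between the final '-' and the extension, and names without such a segment simply yield no match.

import re

HASH_REGEX = re.compile(r'-([a-f0-9]*)\.')

for name in ['resnet18-5c106cde.pth', 'model-final.pth']:
    match = HASH_REGEX.search(name)
    # group(1) is the hash prefix when the name follows the convention
    print(name, '->', match.group(1) if match else None)
# resnet18-5c106cde.pth -> 5c106cde
# model-final.pth -> None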


def _download_url_to_file(url, dst, hash_prefix, progress):
if requests_available:
u = urlopen(url, stream=True)
file_size = int(u.headers["Content-Length"])
u = u.raw
else:
u = urlopen(url)
meta = u.info()
if hasattr(meta, 'getheaders'):
file_size = int(meta.getheaders("Content-Length")[0])
else:
file_size = int(meta.get_all("Content-Length")[0])

f = tempfile.NamedTemporaryFile(delete=False)
try:
if hash_prefix is not None:
sha256 = hashlib.sha256()
with tqdm(total=file_size, disable=not progress) as pbar:
while True:
buffer = u.read(8192)
if len(buffer) == 0:
break
f.write(buffer)
if hash_prefix is not None:
sha256.update(buffer)
pbar.update(len(buffer))

f.close()
if hash_prefix is not None:
digest = sha256.hexdigest()
if digest[:len(hash_prefix)] != hash_prefix:
raise RuntimeError('invalid hash value (expected "{}", got "{}")'
.format(hash_prefix, digest))
shutil.move(f.name, dst)
finally:
f.close()
if os.path.exists(f.name):
os.remove(f.name)


if tqdm is None:
# fake tqdm if it's not installed
class tqdm(object):

def __init__(self, total, disable=False):
self.total = total
self.disable = disable
self.n = 0

def update(self, n):
if self.disable:
return

self.n += n
sys.stderr.write("\r{0:.1f}%".format(100 * self.n / float(self.total)))
sys.stderr.flush()

def __enter__(self):
return self

def __exit__(self, exc_type, exc_val, exc_tb):
if self.disable:
return

sys.stderr.write('\n')


+ 0
- 0
legacy/automl/__init__.py View File


+ 0
- 223
legacy/automl/enas_controller.py View File

@@ -1,223 +0,0 @@
# Code Modified from https://github.com/carpedm20/ENAS-pytorch
"""A module with NAS controller-related code."""
import collections
import os

import torch
import torch.nn.functional as F

import fastNLP.automl.enas_utils as utils
from fastNLP.automl.enas_utils import Node


def _construct_dags(prev_nodes, activations, func_names, num_blocks):
"""Constructs a set of DAGs based on the actions, i.e., previous nodes and
activation functions, sampled from the controller/policy pi.

Args:
prev_nodes: Previous node actions from the policy.
activations: Activations sampled from the policy.
func_names: Mapping from activation function names to functions.
num_blocks: Number of blocks in the target RNN cell.

Returns:
A list of DAGs defined by the inputs.

RNN cell DAGs are represented in the following way:

1. Each element (node) in a DAG is a list of `Node`s.

2. The `Node`s in the list dag[i] correspond to the subsequent nodes
that take the output from node i as their own input.

3. dag[-1] is the node that takes input from x^{(t)} and h^{(t - 1)}.
dag[-1] always feeds dag[0].
dag[-1] acts as if `w_xc`, `w_hc`, `w_xh` and `w_hh` are its
weights.

4. dag[N - 1] is the node that produces the hidden state passed to
the next timestep. dag[N - 1] is also always a leaf node, and therefore
is always averaged with the other leaf nodes and fed to the output
decoder.
"""
dags = []
for nodes, func_ids in zip(prev_nodes, activations):
dag = collections.defaultdict(list)

# add first node
dag[-1] = [Node(0, func_names[func_ids[0]])]
dag[-2] = [Node(0, func_names[func_ids[0]])]

# add following nodes
for jdx, (idx, func_id) in enumerate(zip(nodes, func_ids[1:])):
dag[utils.to_item(idx)].append(Node(jdx + 1, func_names[func_id]))

leaf_nodes = set(range(num_blocks)) - dag.keys()

# merge with avg
for idx in leaf_nodes:
dag[idx] = [Node(num_blocks, 'avg')]

# This is actually y^{(t)}. h^{(t)} is node N - 1 in
# the graph, where N is the number of nodes. I.e., h^{(t)} takes
# only one other node as its input.
# last h[t] node
last_node = Node(num_blocks + 1, 'h[t]')
dag[num_blocks] = [last_node]
dags.append(dag)

return dags
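
A hand-built toy instance of the DAG representation documented above, using assumed activations and num_blocks=2: dag[-1] and dag[-2] feed node 0, node 0 feeds node 1, the leaf node is averaged via an 'avg' node, and the final 'h[t]' node carries the hidden state to the next timestep.

import collections

Node = collections.namedtuple('Node', ['id', 'name'])

dag = collections.defaultdict(list)
dag[-1] = [Node(0, 'tanh')]    # x^{(t)} feeds node 0
dag[-2] = [Node(0, 'tanh')]    # h^{(t-1)} feeds node 0
dag[0] = [Node(1, 'ReLU')]     # node 0 feeds node 1
dag[1] = [Node(2, 'avg')]      # leaf node, averaged into the output
dag[2] = [Node(3, 'h[t]')]     # produces the hidden state for the next timestep

for parent in sorted(dag):
    print(parent, '->', dag[parent])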


class Controller(torch.nn.Module):
"""Based on
https://github.com/pytorch/examples/blob/master/word_language_model/model.py

RL controllers do not necessarily have much to do with
language models.

Base the controller RNN on the GRU from:
https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/model.py
"""
def __init__(self, num_blocks=4, controller_hid=100, cuda=False):
torch.nn.Module.__init__(self)

# `num_tokens` here is just the activation function
# for every even step,
self.shared_rnn_activations = ['tanh', 'ReLU', 'identity', 'sigmoid']
self.num_tokens = [len(self.shared_rnn_activations)]
self.controller_hid = controller_hid
self.use_cuda = cuda
self.num_blocks = num_blocks
for idx in range(num_blocks):
self.num_tokens += [idx + 1, len(self.shared_rnn_activations)]
self.func_names = self.shared_rnn_activations

num_total_tokens = sum(self.num_tokens)

self.encoder = torch.nn.Embedding(num_total_tokens,
controller_hid)
self.lstm = torch.nn.LSTMCell(controller_hid, controller_hid)

# Perhaps these weights in the decoder should be
# shared? At least for the activation functions, which all have the
# same size.
self.decoders = []
for idx, size in enumerate(self.num_tokens):
decoder = torch.nn.Linear(controller_hid, size)
self.decoders.append(decoder)

self._decoders = torch.nn.ModuleList(self.decoders)

self.reset_parameters()
self.static_init_hidden = utils.keydefaultdict(self.init_hidden)

def _get_default_hidden(key):
return utils.get_variable(
torch.zeros(key, self.controller_hid),
self.use_cuda,
requires_grad=False)

self.static_inputs = utils.keydefaultdict(_get_default_hidden)

def reset_parameters(self):
init_range = 0.1
for param in self.parameters():
param.data.uniform_(-init_range, init_range)
for decoder in self.decoders:
decoder.bias.data.fill_(0)

def forward(self, # pylint:disable=arguments-differ
inputs,
hidden,
block_idx,
is_embed):
if not is_embed:
embed = self.encoder(inputs)
else:
embed = inputs

hx, cx = self.lstm(embed, hidden)
logits = self.decoders[block_idx](hx)

logits /= 5.0

# # exploration
# if self.args.mode == 'train':
# logits = (2.5 * F.tanh(logits))

return logits, (hx, cx)

def sample(self, batch_size=1, with_details=False, save_dir=None):
"""Samples a set of `args.num_blocks` many computational nodes from the
controller, where each node is made up of an activation function, and
each node except the last also includes a previous node.
"""
if batch_size < 1:
raise Exception(f'Wrong batch_size: {batch_size} < 1')

# [B, L, H]
inputs = self.static_inputs[batch_size]
hidden = self.static_init_hidden[batch_size]

activations = []
entropies = []
log_probs = []
prev_nodes = []
# The RNN controller alternately outputs an activation,
# followed by a previous node, for each block except the last one,
# which only gets an activation function. The last node is the output
# node, and its previous node is the average of all leaf nodes.
for block_idx in range(2*(self.num_blocks - 1) + 1):
logits, hidden = self.forward(inputs,
hidden,
block_idx,
is_embed=(block_idx == 0))

probs = F.softmax(logits, dim=-1)
log_prob = F.log_softmax(logits, dim=-1)
# .mean() for entropy?
entropy = -(log_prob * probs).sum(1, keepdim=False)

action = probs.multinomial(num_samples=1).data
selected_log_prob = log_prob.gather(
1, utils.get_variable(action, requires_grad=False))

# why the [:, 0] here? Should it be .squeeze(), or
# .view()? Same below with `action`.
entropies.append(entropy)
log_probs.append(selected_log_prob[:, 0])

# 0: function, 1: previous node
mode = block_idx % 2
inputs = utils.get_variable(
action[:, 0] + sum(self.num_tokens[:mode]),
requires_grad=False)

if mode == 0:
activations.append(action[:, 0])
elif mode == 1:
prev_nodes.append(action[:, 0])

prev_nodes = torch.stack(prev_nodes).transpose(0, 1)
activations = torch.stack(activations).transpose(0, 1)

dags = _construct_dags(prev_nodes,
activations,
self.func_names,
self.num_blocks)

if save_dir is not None:
for idx, dag in enumerate(dags):
utils.draw_network(dag,
os.path.join(save_dir, f'graph{idx}.png'))

if with_details:
return dags, torch.cat(log_probs), torch.cat(entropies)

return dags

def init_hidden(self, batch_size):
zeros = torch.zeros(batch_size, self.controller_hid)
return (utils.get_variable(zeros, self.use_cuda, requires_grad=False),
utils.get_variable(zeros.clone(), self.use_cuda, requires_grad=False))
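
A usage sketch of the controller defined above, with assumed sizes and relying on the module's own enas_utils helpers: sample() makes one decision per alternating activation/previous-node slot, so with num_blocks=4 it returns the log-probabilities and entropies of 2*(4-1)+1 = 7 decisions alongside the sampled DAGs.

controller = Controller(num_blocks=4, controller_hid=100, cuda=False)
dags, log_probs, entropies = controller.sample(batch_size=1, with_details=True)
print(len(dags))          # 1 sampled DAG
print(log_probs.shape)    # torch.Size([7]) -- one log-prob per decision
print(entropies.shape)    # torch.Size([7])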

+ 0
- 388
legacy/automl/enas_model.py View File

@@ -1,388 +0,0 @@
# Code Modified from https://github.com/carpedm20/ENAS-pytorch

"""Module containing the shared RNN model."""
import collections

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.autograd import Variable

import fastNLP.automl.enas_utils as utils
from fastNLP.models.base_model import BaseModel


def _get_dropped_weights(w_raw, dropout_p, is_training):
"""Drops out weights to implement DropConnect.

Args:
w_raw: Full, pre-dropout, weights to be dropped out.
dropout_p: Proportion of weights to drop out.
is_training: True iff _shared_ model is training.

Returns:
The dropped weights.

Why does torch.nn.functional.dropout() return:
1. `torch.autograd.Variable()` on the training loop
2. `torch.nn.Parameter()` on the controller or eval loop, when
training = False...

Even though the call to `_setweights` in the Smerity repo's
`weight_drop.py` does not have this behaviour, and `F.dropout` always
returns `torch.autograd.Variable` there, even when `training=False`?

The above TODO is the reason for the hacky check for `torch.nn.Parameter`.
"""
dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training)

if isinstance(dropped_w, torch.nn.Parameter):
dropped_w = dropped_w.clone()

return dropped_w

class EmbeddingDropout(torch.nn.Embedding):
"""Class for dropping out embeddings by zero'ing out parameters in the
embedding matrix.

This is equivalent to dropping out particular words, e.g., in the sentence
'the quick brown fox jumps over the lazy dog', dropping out 'the' would
lead to the sentence '### quick brown fox jumps over ### lazy dog' (in the
embedding vector space).

See 'A Theoretically Grounded Application of Dropout in Recurrent Neural
Networks', (Gal and Ghahramani, 2016).
"""
def __init__(self,
num_embeddings,
embedding_dim,
max_norm=None,
norm_type=2,
scale_grad_by_freq=False,
sparse=False,
dropout=0.1,
scale=None):
"""Embedding constructor.

Args:
dropout: Dropout probability.
scale: Used to scale parameters of embedding weight matrix that are
not dropped out. Note that this is _in addition_ to the
`1/(1 - dropout)` scaling.

See `torch.nn.Embedding` for remaining arguments.
"""
torch.nn.Embedding.__init__(self,
num_embeddings=num_embeddings,
embedding_dim=embedding_dim,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
sparse=sparse)
self.dropout = dropout
assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 '
'and < 1.0')
self.scale = scale

def forward(self, inputs): # pylint:disable=arguments-differ
"""Embeds `inputs` with the dropped out embedding weight matrix."""
if self.training:
dropout = self.dropout
else:
dropout = 0

if dropout:
mask = self.weight.data.new(self.weight.size(0), 1)
mask.bernoulli_(1 - dropout)
mask = mask.expand_as(self.weight)
mask = mask / (1 - dropout)
masked_weight = self.weight * Variable(mask)
else:
masked_weight = self.weight
if self.scale and self.scale != 1:
masked_weight = masked_weight * self.scale

return F.embedding(inputs,
masked_weight,
max_norm=self.max_norm,
norm_type=self.norm_type,
scale_grad_by_freq=self.scale_grad_by_freq,
sparse=self.sparse)
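
A small sketch with assumed sizes of the embedding dropout above: in training mode, whole rows of the embedding matrix are zeroed (and the surviving rows rescaled), so every occurrence of a dropped word maps to the zero vector for that forward pass.

emb = EmbeddingDropout(num_embeddings=10, embedding_dim=4, dropout=0.5)
emb.train()
out = emb(torch.LongTensor([[1, 2, 3, 1]]))
print(out.shape)   # torch.Size([1, 4, 4]); token 1 is either kept or zeroed at both positions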


class LockedDropout(nn.Module):
# code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py
def __init__(self):
super().__init__()

def forward(self, x, dropout=0.5):
if not self.training or not dropout:
return x
m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout)
mask = Variable(m, requires_grad=False) / (1 - dropout)
mask = mask.expand_as(x)
return mask * x


class ENASModel(BaseModel):
"""Shared RNN model."""
def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000):
super(ENASModel, self).__init__()

self.use_cuda = cuda

self.shared_hid = shared_hid
self.num_blocks = num_blocks
self.decoder = nn.Linear(self.shared_hid, num_classes)
self.encoder = EmbeddingDropout(embed_num,
shared_embed,
dropout=0.1)
self.lockdrop = LockedDropout()
self.dag = None

# Tie weights
# self.decoder.weight = self.encoder.weight

# Since W^{x, c} and W^{h, c} are always summed, there
# is no point duplicating their bias offset parameter. Likewise for
# W^{x, h} and W^{h, h}.
self.w_xc = nn.Linear(shared_embed, self.shared_hid)
self.w_xh = nn.Linear(shared_embed, self.shared_hid)

# The raw weights are stored here because the hidden-to-hidden weights
# are weight dropped on the forward pass.
self.w_hc_raw = torch.nn.Parameter(
torch.Tensor(self.shared_hid, self.shared_hid))
self.w_hh_raw = torch.nn.Parameter(
torch.Tensor(self.shared_hid, self.shared_hid))
self.w_hc = None
self.w_hh = None

self.w_h = collections.defaultdict(dict)
self.w_c = collections.defaultdict(dict)

for idx in range(self.num_blocks):
for jdx in range(idx + 1, self.num_blocks):
self.w_h[idx][jdx] = nn.Linear(self.shared_hid,
self.shared_hid,
bias=False)
self.w_c[idx][jdx] = nn.Linear(self.shared_hid,
self.shared_hid,
bias=False)

self._w_h = nn.ModuleList([self.w_h[idx][jdx]
for idx in self.w_h
for jdx in self.w_h[idx]])
self._w_c = nn.ModuleList([self.w_c[idx][jdx]
for idx in self.w_c
for jdx in self.w_c[idx]])

self.batch_norm = None
# if args.mode == 'train':
# self.batch_norm = nn.BatchNorm1d(self.shared_hid)
# else:
# self.batch_norm = None

self.reset_parameters()
self.static_init_hidden = utils.keydefaultdict(self.init_hidden)

def setDAG(self, dag):
if self.dag is None:
self.dag = dag

def forward(self, word_seq, hidden=None):
inputs = torch.transpose(word_seq, 0, 1)

time_steps = inputs.size(0)
batch_size = inputs.size(1)


self.w_hh = _get_dropped_weights(self.w_hh_raw,
0.5,
self.training)
self.w_hc = _get_dropped_weights(self.w_hc_raw,
0.5,
self.training)

# hidden = self.static_init_hidden[batch_size] if hidden is None else hidden
hidden = self.static_init_hidden[batch_size]

embed = self.encoder(inputs)

embed = self.lockdrop(embed, 0.65 if self.training else 0)

# The norms of the hidden states are clipped here because
# otherwise ENAS is especially prone to exploding activations on the
# forward pass. This could probably be fixed in a more elegant way, but
# it might be exposing a weakness in the ENAS algorithm as currently
# proposed.
#
# For more details, see
# https://github.com/carpedm20/ENAS-pytorch/issues/6
clipped_num = 0
max_clipped_norm = 0
h1tohT = []
logits = []
for step in range(time_steps):
x_t = embed[step]
logit, hidden = self.cell(x_t, hidden, self.dag)

hidden_norms = hidden.norm(dim=-1)
max_norm = 25.0
if hidden_norms.data.max() > max_norm:
# Just directly use the torch slice operations
# in PyTorch v0.4.
#
# This workaround for PyTorch v0.3.1 does everything in numpy,
# because the PyTorch slicing and slice assignment is too
# flaky.
hidden_norms = hidden_norms.data.cpu().numpy()

clipped_num += 1
if hidden_norms.max() > max_clipped_norm:
max_clipped_norm = hidden_norms.max()

clip_select = hidden_norms > max_norm
clip_norms = hidden_norms[clip_select]

mask = np.ones(hidden.size())
normalizer = max_norm/clip_norms
normalizer = normalizer[:, np.newaxis]

mask[clip_select] = normalizer

if self.use_cuda:
hidden *= torch.autograd.Variable(
torch.FloatTensor(mask).cuda(), requires_grad=False)
else:
hidden *= torch.autograd.Variable(
torch.FloatTensor(mask), requires_grad=False)
logits.append(logit)
h1tohT.append(hidden)

h1tohT = torch.stack(h1tohT)
output = torch.stack(logits)
raw_output = output

output = self.lockdrop(output, 0.4 if self.training else 0)

#Pooling
output = torch.mean(output, 0)

decoded = self.decoder(output)

extra_out = {'dropped': decoded,
'hiddens': h1tohT,
'raw': raw_output}
return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out}

def cell(self, x, h_prev, dag):
"""Computes a single pass through the discovered RNN cell."""
c = {}
h = {}
f = {}

f[0] = self.get_f(dag[-1][0].name)
c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None))
h[0] = (c[0]*f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) +
(1 - c[0])*h_prev)

leaf_node_ids = []
q = collections.deque()
q.append(0)

# Computes connections from the parent nodes `node_id`
# to their child nodes `next_id` recursively, skipping leaf nodes. A
# leaf node is a node whose id == `self.num_blocks`.
#
# Connections between parent i and child j should be computed as
# h_j = c_j*f_{ij}{(W^h_{ij}*h_i)} + (1 - c_j)*h_i,
# where c_j = \sigmoid{(W^c_{ij}*h_i)}
#
# See Training details from Section 3.1 of the paper.
#
# The following algorithm does a breadth-first (since `q.popleft()` is
# used) search over the nodes and computes all the hidden states.
while True:
if len(q) == 0:
break

node_id = q.popleft()
nodes = dag[node_id]

for next_node in nodes:
next_id = next_node.id
if next_id == self.num_blocks:
leaf_node_ids.append(node_id)
assert len(nodes) == 1, ('parent of leaf node should have '
'only one child')
continue

w_h = self.w_h[node_id][next_id]
w_c = self.w_c[node_id][next_id]

f[next_id] = self.get_f(next_node.name)
c[next_id] = torch.sigmoid(w_c(h[node_id]))
h[next_id] = (c[next_id]*f[next_id](w_h(h[node_id])) +
(1 - c[next_id])*h[node_id])

q.append(next_id)

# Instead of averaging loose ends, perhaps there should
# be a set of separate unshared weights for each "loose" connection
# between each node in a cell and the output.
#
# As it stands, all weights W^h_{ij} are doing double duty by
# connecting both from i to j, as well as from i to the output.

# average all the loose ends
leaf_nodes = [h[node_id] for node_id in leaf_node_ids]
output = torch.mean(torch.stack(leaf_nodes, 2), -1)

# stabilizing the updates of omega
if self.batch_norm is not None:
output = self.batch_norm(output)

return output, h[self.num_blocks - 1]

def init_hidden(self, batch_size):
zeros = torch.zeros(batch_size, self.shared_hid)
return utils.get_variable(zeros, self.use_cuda, requires_grad=False)

def get_f(self, name):
name = name.lower()
if name == 'relu':
f = torch.relu
elif name == 'tanh':
f = torch.tanh
elif name == 'identity':
f = lambda x: x
elif name == 'sigmoid':
f = torch.sigmoid
return f

@property
def num_parameters(self):
def size(p):
return np.prod(p.size())
return sum([size(param) for param in self.parameters()])


def reset_parameters(self):
init_range = 0.025
# init_range = 0.025 if self.args.mode == 'train' else 0.04
for param in self.parameters():
param.data.uniform_(-init_range, init_range)
self.decoder.bias.data.fill_(0)

def predict(self, word_seq):
"""

:param word_seq: torch.LongTensor, [batch_size, seq_len]
:return predict: dict of torch.LongTensor, [batch_size, seq_len]
"""
output = self(word_seq)
_, predict = output['pred'].max(dim=1)
return {'pred': predict}
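
A minimal numeric sketch, with assumed sizes and tanh as the sampled activation, of the per-edge update documented in cell() above: h_j = c_j * f_ij(W^h_ij h_i) + (1 - c_j) * h_i, where c_j = sigmoid(W^c_ij h_i) acts as a highway-style gate between the transformed and untouched parent state.

hid = 4
h_i = torch.randn(1, hid)                 # parent node state
w_h = nn.Linear(hid, hid, bias=False)     # W^h_{ij}
w_c = nn.Linear(hid, hid, bias=False)     # W^c_{ij}

c_j = torch.sigmoid(w_c(h_i))                          # gate
h_j = c_j * torch.tanh(w_h(h_i)) + (1 - c_j) * h_i     # gated mix
print(h_j.shape)   # torch.Size([1, 4])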

+ 0
- 383
legacy/automl/enas_trainer.py View File

@@ -1,383 +0,0 @@
# Code Modified from https://github.com/carpedm20/ENAS-pytorch

import math
import time
from datetime import datetime
from datetime import timedelta

import numpy as np
import torch

try:
from tqdm.auto import tqdm
except:
from fastNLP.core.utils import _pseudo_tqdm as tqdm

from fastNLP.core.batch import Batch
from fastNLP.core.callback import CallbackException
from fastNLP.core.dataset import DataSet
from fastNLP.core.utils import _move_dict_value_to_device
import fastNLP
from . import enas_utils as utils
from fastNLP.core.utils import _build_args

from torch.optim import Adam


def _get_no_grad_ctx_mgr():
"""Returns a the `torch.no_grad` context manager for PyTorch version >=
0.4, or a no-op context manager otherwise.
"""
return torch.no_grad()


class ENASTrainer(fastNLP.Trainer):
"""A class to wrap training code."""
def __init__(self, train_data, model, controller, **kwargs):
"""Constructor for training algorithm.
:param DataSet train_data: the training data
:param torch.nn.modules.module model: a PyTorch model
:param torch.nn.modules.module controller: a PyTorch model
"""
self.final_epochs = kwargs['final_epochs']
kwargs.pop('final_epochs')
super(ENASTrainer, self).__init__(train_data, model, **kwargs)
self.controller_step = 0
self.shared_step = 0
self.max_length = 35

self.shared = model
self.controller = controller

self.shared_optim = Adam(
self.shared.parameters(),
lr=20.0,
weight_decay=1e-7)

self.controller_optim = Adam(
self.controller.parameters(),
lr=3.5e-4)

def train(self, load_best_model=True):
"""
:param bool load_best_model: only takes effect if dev_data was provided at initialization; if True, the trainer
reloads the parameters of the model that performed best on dev before returning.
:return results: a dict containing the following entries::

seconds: float, total training time
The following three entries are only present when dev_data was provided.
best_eval: Dict of Dict, the evaluation results
best_epoch: int, the epoch at which the best result was obtained
best_step: int, the step (batch update) at which the best result was obtained

"""
results = {}
if self.n_epochs <= 0:
print(f"training epoch is {self.n_epochs}, nothing was done.")
results['seconds'] = 0.
return results
try:
if torch.cuda.is_available() and self.use_cuda:
self.model = self.model.cuda()
self._model_device = self.model.parameters().__next__().device
self._mode(self.model, is_test=False)

self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
start_time = time.time()
print("training epochs started " + self.start_time, flush=True)

try:
self.callback_manager.on_train_begin()
self._train()
self.callback_manager.on_train_end(self.model)
except (CallbackException, KeyboardInterrupt) as e:
self.callback_manager.on_exception(e, self.model)

if self.dev_data is not None:
print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) +
self.tester._format_eval_results(self.best_dev_perf),)
results['best_eval'] = self.best_dev_perf
results['best_epoch'] = self.best_dev_epoch
results['best_step'] = self.best_dev_step
if load_best_model:
model_name = "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time])
load_succeed = self._load_model(self.model, model_name)
if load_succeed:
print("Reloaded the best model.")
else:
print("Fail to reload best model.")
finally:
pass
results['seconds'] = round(time.time() - start_time, 2)

return results

def _train(self):
if not self.use_tqdm:
from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm
else:
inner_tqdm = tqdm
self.step = 0
start = time.time()
total_steps = (len(self.train_data) // self.batch_size + int(
len(self.train_data) % self.batch_size != 0)) * self.n_epochs
with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar:
avg_loss = 0
data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
prefetch=self.prefetch)
for epoch in range(1, self.n_epochs+1):
pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs))
last_stage = (epoch > self.n_epochs + 1 - self.final_epochs)
if epoch == self.n_epochs + 1 - self.final_epochs:
print('Entering the final stage. (Only train the selected structure)')
# early stopping
self.callback_manager.on_epoch_begin(epoch, self.n_epochs)

# 1. Training the shared parameters omega of the child models
self.train_shared(pbar)

# 2. Training the controller parameters theta
if not last_stage:
self.train_controller()

if ((self.validate_every > 0 and self.step % self.validate_every == 0) or
(self.validate_every < 0 and self.step % len(data_iterator) == 0)) \
and self.dev_data is not None:
if not last_stage:
self.derive()
eval_res = self._do_validation(epoch=epoch, step=self.step)
eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step,
total_steps) + \
self.tester._format_eval_results(eval_res)
pbar.write(eval_str)

# lr decay; early stopping
self.callback_manager.on_epoch_end(epoch, self.n_epochs, self.optimizer)
# =============== epochs end =================== #
pbar.close()
# ============ tqdm end ============== #


def get_loss(self, inputs, targets, hidden, dags):
"""Computes the loss for the same batch for M models.

This amounts to an estimate of the loss, which is turned into an
estimate for the gradients of the shared model.
"""
if not isinstance(dags, list):
dags = [dags]

loss = 0
for dag in dags:
self.shared.setDAG(dag)
inputs = _build_args(self.shared.forward, **inputs)
inputs['hidden'] = hidden
result = self.shared(**inputs)
output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out']

self.callback_manager.on_loss_begin(targets, result)
sample_loss = self._compute_loss(result, targets)
loss += sample_loss

assert len(dags) == 1, 'there are multiple `hidden` for multiple `dags`'
return loss, hidden, extra_out

def train_shared(self, pbar=None, max_step=None, dag=None):
"""Train the language model for 400 steps of minibatches of 64
examples.

Args:
max_step: Used to run extra training steps as a warm-up.
dag: If not None, is used instead of calling sample().

BPTT is truncated at 35 timesteps.

For each weight update, gradients are estimated by sampling M models
from the fixed controller policy, and averaging their gradients
computed on a batch of training data.
"""
model = self.shared
model.train()
self.controller.eval()

hidden = self.shared.init_hidden(self.batch_size)

abs_max_grad = 0
abs_max_hidden_norm = 0
step = 0
raw_total_loss = 0
total_loss = 0
train_idx = 0
avg_loss = 0
data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
prefetch=self.prefetch)

for batch_x, batch_y in data_iterator:
_move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
indices = data_iterator.get_batch_indices()
# negative sampling; replace unknown; re-weight batch_y
self.callback_manager.on_batch_begin(batch_x, batch_y, indices)
# prediction = self._data_forward(self.model, batch_x)

dags = self.controller.sample(1)
inputs, targets = batch_x, batch_y
# self.callback_manager.on_loss_begin(batch_y, prediction)
loss, hidden, extra_out = self.get_loss(inputs,
targets,
hidden,
dags)
hidden.detach_()
avg_loss += loss.item()

# Is loss NaN or inf? requires_grad = False
self.callback_manager.on_backward_begin(loss, self.model)
self._grad_backward(loss)
self.callback_manager.on_backward_end(self.model)

self._update()
self.callback_manager.on_step_end(self.optimizer)

if (self.step+1) % self.print_every == 0:
if self.use_tqdm:
print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every)
pbar.update(self.print_every)
else:
end = time.time()
diff = timedelta(seconds=round(end - start))
print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format(
epoch, self.step, avg_loss, diff)
pbar.set_postfix_str(print_output)
avg_loss = 0
self.step += 1
step += 1
self.shared_step += 1
self.callback_manager.on_batch_end()
# ================= mini-batch end ==================== #


def get_reward(self, dag, entropies, hidden, valid_idx=0):
"""Computes the perplexity of a single sampled model on a minibatch of
validation data.
"""
if not isinstance(entropies, np.ndarray):
entropies = entropies.data.cpu().numpy()

data_iterator = Batch(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
prefetch=self.prefetch)

for inputs, targets in data_iterator:
valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag)
valid_loss = utils.to_item(valid_loss.data)

valid_ppl = math.exp(valid_loss)

R = 80 / valid_ppl

rewards = R + 1e-4 * entropies

return rewards, hidden

def train_controller(self):
"""Fixes the shared parameters and updates the controller parameters.

The controller is updated with a score function gradient estimator
(i.e., REINFORCE), with the reward being c/valid_ppl, where valid_ppl
is computed on a minibatch of validation data.

A moving average baseline is used.

The controller is trained for 2000 steps per epoch (i.e.,
first (Train Shared) phase -> second (Train Controller) phase).
"""
model = self.controller
model.train()
# Why can't we call shared.eval() here? Leads to loss
# being uniformly zero for the controller.
# self.shared.eval()

avg_reward_base = None
baseline = None
adv_history = []
entropy_history = []
reward_history = []

hidden = self.shared.init_hidden(self.batch_size)
total_loss = 0
valid_idx = 0
for step in range(20):
# sample models
dags, log_probs, entropies = self.controller.sample(
with_details=True)

# calculate reward
np_entropies = entropies.data.cpu().numpy()
# No gradients should be backpropagated to the
# shared model during controller training, obviously.
with _get_no_grad_ctx_mgr():
rewards, hidden = self.get_reward(dags,
np_entropies,
hidden,
valid_idx)


reward_history.extend(rewards)
entropy_history.extend(np_entropies)

# moving average baseline
if baseline is None:
baseline = rewards
else:
decay = 0.95
baseline = decay * baseline + (1 - decay) * rewards

adv = rewards - baseline
adv_history.extend(adv)

# policy loss
loss = -log_probs*utils.get_variable(adv,
self.use_cuda,
requires_grad=False)

loss = loss.sum() # or loss.mean()

# update
self.controller_optim.zero_grad()
loss.backward()

self.controller_optim.step()

total_loss += utils.to_item(loss.data)

if ((step % 50) == 0) and (step > 0):
reward_history, adv_history, entropy_history = [], [], []
total_loss = 0

self.controller_step += 1
# prev_valid_idx = valid_idx
# valid_idx = ((valid_idx + self.max_length) %
# (self.valid_data.size(0) - 1))
# # Whenever we wrap around to the beginning of the
# # validation data, we reset the hidden states.
# if prev_valid_idx > valid_idx:
# hidden = self.shared.init_hidden(self.batch_size)

def derive(self, sample_num=10, valid_idx=0):
"""We are always deriving based on the very first batch
of validation data? This seems wrong...
"""
hidden = self.shared.init_hidden(self.batch_size)

dags, _, entropies = self.controller.sample(sample_num,
with_details=True)

max_R = 0
best_dag = None
for dag in dags:
R, _ = self.get_reward(dag, entropies, hidden, valid_idx)
if R.max() > max_R:
max_R = R.max()
best_dag = dag

self.model.setDAG(best_dag)
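
A small numeric sketch, with assumed values, of the training signal used above: get_reward turns a validation loss into R = 80 / exp(loss) plus a 1e-4-weighted entropy bonus, and train_controller subtracts an exponential moving-average baseline from that reward before the REINFORCE update.

import math
import numpy as np

valid_loss = 4.0
valid_ppl = math.exp(valid_loss)               # ~54.6
entropies = np.array([1.2, 0.9, 1.4])
rewards = 80 / valid_ppl + 1e-4 * entropies    # ~1.465 per decision

decay, baseline = 0.95, None
for r in (rewards, 0.9 * rewards, 1.1 * rewards):
    baseline = r if baseline is None else decay * baseline + (1 - decay) * r
    adv = r - baseline                         # advantage used in -log_prob * adv
    print(baseline.mean(), adv.mean())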

+ 0
- 53
legacy/automl/enas_utils.py View File

@@ -1,53 +0,0 @@
# Code Modified from https://github.com/carpedm20/ENAS-pytorch

from __future__ import print_function

import collections
from collections import defaultdict

import numpy as np
import torch
from torch.autograd import Variable


def detach(h):
if type(h) == Variable:
return Variable(h.data)
else:
return tuple(detach(v) for v in h)

def get_variable(inputs, cuda=False, **kwargs):
if type(inputs) in [list, np.ndarray]:
inputs = torch.Tensor(inputs)
if cuda:
out = Variable(inputs.cuda(), **kwargs)
else:
out = Variable(inputs, **kwargs)
return out

def update_lr(optimizer, lr):
for param_group in optimizer.param_groups:
param_group['lr'] = lr

Node = collections.namedtuple('Node', ['id', 'name'])


class keydefaultdict(defaultdict):
def __missing__(self, key):
if self.default_factory is None:
raise KeyError(key)
else:
ret = self[key] = self.default_factory(key)
return ret
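
A tiny usage sketch of keydefaultdict: unlike a plain defaultdict, the factory receives the missing key, which is how static_init_hidden in the deleted ENAS code caches one initial hidden state per batch size.

sizes = keydefaultdict(lambda batch_size: [0.0] * batch_size)
print(sizes[3])    # [0.0, 0.0, 0.0], built from the key on first access and then cached
print(3 in sizes)  # True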


def to_item(x):
"""Converts x, possibly scalar and possibly tensor, to a Python scalar."""
if isinstance(x, (float, int)):
return x

if float(torch.__version__[0:3]) < 0.4:
assert (x.dim() == 1) and (len(x) == 1)
return x[0]

return x.item()

+ 0
- 1
legacy/component/__init__.py View File

@@ -1 +0,0 @@
from .bert_tokenizer import BertTokenizer

+ 0
- 378
legacy/component/bert_tokenizer.py View File

@@ -1,378 +0,0 @@
"""
bert_tokenizer.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0.
"""
import collections
import os
import unicodedata
from io import open


PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-uncased': 512,
'bert-large-uncased': 512,
'bert-base-cased': 512,
'bert-large-cased': 512,
'bert-base-multilingual-uncased': 512,
'bert-base-multilingual-cased': 512,
'bert-base-chinese': 512,
}
VOCAB_NAME = 'vocab.txt'


def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r", encoding="utf-8") as reader:
while True:
token = reader.readline()
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab


def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens


class BertTokenizer(object):
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""

def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
"""Constructs a BertTokenizer.
Args:
vocab_file: Path to a one-wordpiece-per-line vocabulary file
do_lower_case: Whether to lower case the input
Only has an effect when do_wordpiece_only=False
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
never_split: List of tokens which will never be split during tokenization.
Only has an effect when do_wordpiece_only=False
"""
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
never_split=never_split)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)

def tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens

def convert_tokens_to_ids(self, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(self.vocab[token])
if len(ids) > self.max_len:
print(
"WARNING!\n\""
"Token indices sequence length is longer than the specified maximum "
"sequence length for this BERT model ({} > {}). Running this"
" sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
)
return ids

def convert_ids_to_tokens(self, ids):
"""Converts a sequence of ids in wordpiece tokens using the vocab."""
tokens = []
for i in ids:
tokens.append(self.ids_to_tokens[i])
return tokens

def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary to a directory or file."""
index = 0
if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
print("Saving vocabulary to {}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!".format(vocab_file))
index = token_index
writer.write(token + u'\n')
index += 1
return vocab_file

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
print("The pre-trained model you are loading is a cased model but you have not set "
"`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
"you may want to check this behavior.")
kwargs['do_lower_case'] = False
elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
print("The pre-trained model you are loading is an uncased model but you have set "
"`do_lower_case` to False. We are setting `do_lower_case=True` for you "
"but you may want to check this behavior.")
kwargs['do_lower_case'] = True
else:
vocab_file = pretrained_model_name_or_path
if os.path.isdir(vocab_file):
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
# redirect to the cache, if necessary
resolved_vocab_file = vocab_file
print("loading vocabulary file {}".format(vocab_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
return tokenizer
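
A usage sketch of the tokenizer above; the vocabulary path is an assumed local one-wordpiece-per-line file, and the ids only resolve for tokens that actually appear in that vocabulary.

tokenizer = BertTokenizer(vocab_file='vocab.txt', do_lower_case=True)  # assumed path
tokens = tokenizer.tokenize('fastNLP makes BERT tokenization easy.')
ids = tokenizer.convert_tokens_to_ids(tokens)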


class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

def __init__(self,
do_lower_case=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split

def tokenize(self, text):
"""Tokenizes a piece of text."""
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in self.never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))

output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens

def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)

def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
if text in self.never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1

return ["".join(x) for x in output]

def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)

def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True

return False

def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)


class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""

def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word

def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""

output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue

is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end

if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
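
A usage sketch of the greedy longest-match-first algorithm above, with an assumed four-entry toy vocabulary; real vocabularies come from a vocab.txt file.

toy_vocab = {'un': 0, '##aff': 1, '##able': 2, '[UNK]': 3}
wp = WordpieceTokenizer(vocab=toy_vocab)
print(wp.tokenize('unaffable'))     # ['un', '##aff', '##able']
print(wp.tokenize('unbelievable'))  # ['[UNK]'] -- no full greedy cover in the toy vocab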


def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False


def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False


def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False


+ 3
- 2
test/core/test_dataset.py View File

@@ -182,8 +182,9 @@ class TestDataSetMethods(unittest.TestCase):
def test_apply2(self):
def split_sent(ins):
return ins['raw_sentence'].split()
csv_loader = CSVLoader(headers=['raw_sentence', 'label'],sep='\t')
dataset = csv_loader.load('test/data_for_tests/tutorial_sample_dataset.csv')
csv_loader = CSVLoader(headers=['raw_sentence', 'label'], sep='\t')
data_bundle = csv_loader.load('test/data_for_tests/tutorial_sample_dataset.csv')
dataset = data_bundle.datasets['train']
dataset.drop(lambda x: len(x['raw_sentence'].split()) == 0, inplace=True)
dataset.apply(split_sent, new_field_name='words', is_input=True)
# print(dataset)
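
For reference, a minimal sketch of the loading pattern this hunk switches to: CSVLoader.load now returns a DataBundle, and a single input file ends up under the 'train' split.

from fastNLP.io import CSVLoader

csv_loader = CSVLoader(headers=['raw_sentence', 'label'], sep='\t')
data_bundle = csv_loader.load('test/data_for_tests/tutorial_sample_dataset.csv')
train_ds = data_bundle.datasets['train']
print(len(train_ds))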


+ 0
- 15
test/io/test_data_loader.py View File

@@ -1,15 +0,0 @@
import unittest

from fastNLP.core.const import Const
from fastNLP.io.data_loader import MNLILoader


class TestDataLoader(unittest.TestCase):

def test_mnli_loader(self):
ds = MNLILoader().process('test/data_for_tests/sample_mnli.tsv',
to_lower=True, get_index=True, seq_len_type='mask')
self.assertTrue('train' in ds.datasets)
self.assertTrue(len(ds.datasets) == 1)
self.assertTrue(len(ds.datasets['train']) == 11)
self.assertTrue(isinstance(ds.datasets['train'][0][Const.INPUT_LENS(0)], list))

+ 0
- 77
test/io/test_dataset_loader.py View File

@@ -1,77 +0,0 @@
import unittest
import os
from fastNLP.io import CSVLoader, JsonLoader
from fastNLP.io.data_loader import SSTLoader, SNLILoader, Conll2003Loader, PeopleDailyCorpusLoader


class TestDatasetLoader(unittest.TestCase):
def test_Conll2003Loader(self):
"""
Test the the loader of Conll2003 dataset
"""
dataset_path = "test/data_for_tests/conll_2003_example.txt"
loader = Conll2003Loader()
dataset_2003 = loader.load(dataset_path)
def test_PeopleDailyCorpusLoader(self):
data_set = PeopleDailyCorpusLoader().load("test/data_for_tests/people_daily_raw.txt")
def test_CSVLoader(self):
ds = CSVLoader(sep='\t', headers=['words', 'label']) \
.load('test/data_for_tests/tutorial_sample_dataset.csv')
assert len(ds) > 0
def test_SNLILoader(self):
ds = SNLILoader().load('test/data_for_tests/sample_snli.jsonl')
assert len(ds) == 3
def test_JsonLoader(self):
ds = JsonLoader().load('test/data_for_tests/sample_snli.jsonl')
assert len(ds) == 3

def no_test_SST(self):
train_data = """(3 (2 (2 The) (2 Rock)) (4 (3 (2 is) (4 (2 destined) (2 (2 (2 (2 (2 to) (2 (2 be) (2 (2 the) (2 (2 21st) (2 (2 (2 Century) (2 's)) (2 (3 new) (2 (2 ``) (2 Conan)))))))) (2 '')) (2 and)) (3 (2 that) (3 (2 he) (3 (2 's) (3 (2 going) (3 (2 to) (4 (3 (2 make) (3 (3 (2 a) (3 splash)) (2 (2 even) (3 greater)))) (2 (2 than) (2 (2 (2 (2 (1 (2 Arnold) (2 Schwarzenegger)) (2 ,)) (2 (2 Jean-Claud) (2 (2 Van) (2 Damme)))) (2 or)) (2 (2 Steven) (2 Segal))))))))))))) (2 .)))
(4 (4 (4 (2 The) (4 (3 gorgeously) (3 (2 elaborate) (2 continuation)))) (2 (2 (2 of) (2 ``)) (2 (2 The) (2 (2 (2 Lord) (2 (2 of) (2 (2 the) (2 Rings)))) (2 (2 '') (2 trilogy)))))) (2 (3 (2 (2 is) (2 (2 so) (2 huge))) (2 (2 that) (3 (2 (2 (2 a) (2 column)) (2 (2 of) (2 words))) (2 (2 (2 (2 can) (1 not)) (3 adequately)) (2 (2 describe) (2 (3 (2 (2 co-writer\/director) (2 (2 Peter) (3 (2 Jackson) (2 's)))) (3 (2 expanded) (2 vision))) (2 (2 of) (2 (2 (2 J.R.R.) (2 (2 Tolkien) (2 's))) (2 Middle-earth))))))))) (2 .)))
(3 (3 (2 (2 (2 (2 (2 Singer\/composer) (2 (2 Bryan) (2 Adams))) (2 (2 contributes) (2 (2 (2 a) (2 slew)) (2 (2 of) (2 songs))))) (2 (2 --) (2 (2 (2 (2 a) (2 (2 few) (3 potential))) (2 (2 (2 hits) (2 ,)) (2 (2 (2 a) (2 few)) (1 (1 (2 more) (1 (2 simply) (2 intrusive))) (2 (2 to) (2 (2 the) (2 story))))))) (2 --)))) (2 but)) (3 (4 (2 the) (3 (2 whole) (2 package))) (2 (3 certainly) (3 (2 captures) (2 (1 (2 the) (2 (2 (2 intended) (2 (2 ,) (2 (2 er) (2 ,)))) (3 spirit))) (2 (2 of) (2 (2 the) (2 piece)))))))) (2 .))
(2 (2 (2 You) (2 (2 'd) (2 (2 think) (2 (2 by) (2 now))))) (2 (2 America) (2 (2 (2 would) (1 (2 have) (2 (2 (2 had) (1 (2 enough) (2 (2 of) (2 (2 plucky) (2 (2 British) (1 eccentrics)))))) (4 (2 with) (4 (3 hearts) (3 (2 of) (3 gold))))))) (2 .))))
"""
test_data = """(3 (2 Yet) (3 (2 (2 the) (2 act)) (3 (4 (3 (2 is) (3 (2 still) (4 charming))) (2 here)) (2 .))))
(4 (2 (2 Whether) (2 (2 (2 (2 or) (1 not)) (3 (2 you) (2 (2 're) (3 (3 enlightened) (2 (2 by) (2 (2 any) (2 (2 of) (2 (2 Derrida) (2 's))))))))) (2 (2 lectures) (2 (2 on) (2 (2 ``) (2 (2 (2 (2 (2 (2 the) (2 other)) (2 '')) (2 and)) (2 ``)) (2 (2 the) (2 self)))))))) (3 (2 ,) (3 (2 '') (3 (2 Derrida) (3 (3 (2 is) (4 (2 an) (4 (4 (2 undeniably) (3 (4 (3 fascinating) (2 and)) (4 playful))) (2 fellow)))) (2 .))))))
(4 (3 (2 (2 Just) (2 (2 the) (2 labour))) (3 (2 involved) (3 (2 in) (4 (2 creating) (3 (3 (2 the) (3 (3 layered) (2 richness))) (3 (2 of) (3 (2 (2 the) (2 imagery)) (2 (2 in) (3 (2 (2 this) (2 chiaroscuro)) (2 (2 of) (2 (2 (2 madness) (2 and)) (2 light)))))))))))) (3 (3 (2 is) (4 astonishing)) (2 .)))
(3 (3 (2 Part) (3 (2 of) (4 (2 (2 the) (3 charm)) (2 (2 of) (2 (2 Satin) (2 Rouge)))))) (3 (3 (2 is) (3 (2 that) (3 (2 it) (2 (1 (2 avoids) (2 (2 the) (1 obvious))) (3 (2 with) (3 (3 (3 humour) (2 and)) (2 lightness))))))) (2 .)))
(4 (2 (2 a) (2 (2 screenplay) (2 more))) (3 (4 ingeniously) (2 (2 constructed) (2 (2 (2 (2 than) (2 ``)) (2 Memento)) (2 '')))))
(3 (2 ``) (3 (2 (2 Extreme) (2 Ops)) (3 (2 '') (4 (4 (3 exceeds) (2 expectations)) (2 .)))))
"""
train, test = 'train--', 'test--'
with open(train, 'w', encoding='utf-8') as f:
f.write(train_data)
with open(test, 'w', encoding='utf-8') as f:
f.write(test_data)

loader = SSTLoader()
info = loader.process(
{train: train, test: test},
train_ds=[train],
src_vocab_op=dict(min_freq=2)
)
assert len(list(info.vocabs.items())) == 2
assert len(list(info.datasets.items())) == 2
print(info.vocabs)
print(info.datasets)
os.remove(train), os.remove(test)

# def test_import(self):
# import fastNLP
# from fastNLP.io import SNLILoader
# ds = SNLILoader().process('test/data_for_tests/sample_snli.jsonl', to_lower=True,
# get_index=True, seq_len_type='seq_len', extra_split=['-'])
# assert 'train' in ds.datasets
# assert len(ds.datasets) == 1
# assert len(ds.datasets['train']) == 3
#
# ds = SNLILoader().process('test/data_for_tests/sample_snli.jsonl', to_lower=True,
# get_index=True, seq_len_type='seq_len')
# assert 'train' in ds.datasets
# assert len(ds.datasets) == 1
# assert len(ds.datasets['train']) == 3
