Browse Source

Merge branch 'dev0.5.0' of https://github.com/SrWYG/fastNLP into pr

tags/v0.4.10
yunfan 5 years ago
parent
commit
e6dd7ba1a8
18 changed files with 1131 additions and 29 deletions
  1. +22
    -0
      reproduction/text_classification/README.md
  2. +7
    -5
      reproduction/text_classification/data/IMDBLoader.py
  3. +5
    -1
      reproduction/text_classification/data/MTL16Loader.py
  4. +187
    -0
      reproduction/text_classification/data/SSTLoader.py
  5. +97
    -8
      reproduction/text_classification/data/sstLoader.py
  6. +9
    -11
      reproduction/text_classification/data/yelpLoader.py
  7. +109
    -0
      reproduction/text_classification/model/HAN.py
  8. +31
    -0
      reproduction/text_classification/model/awd_lstm.py
  9. +86
    -0
      reproduction/text_classification/model/awdlstm_module.py
  10. +30
    -0
      reproduction/text_classification/model/lstm.py
  11. +35
    -0
      reproduction/text_classification/model/lstm_self_attention.py
  12. +99
    -0
      reproduction/text_classification/model/weight_drop.py
  13. BIN
      reproduction/text_classification/results_LSTM.xlsx
  14. +109
    -0
      reproduction/text_classification/train_HAN.py
  15. +102
    -0
      reproduction/text_classification/train_awdlstm.py
  16. +3
    -4
      reproduction/text_classification/train_char_cnn.py
  17. +99
    -0
      reproduction/text_classification/train_lstm.py
  18. +101
    -0
      reproduction/text_classification/train_lstm_att.py

+ 22
- 0
reproduction/text_classification/README.md View File

@@ -0,0 +1,22 @@
# text_classification任务模型复现
这里使用fastNLP复现以下模型:
char_cnn :论文链接[Character-level Convolutional Networks for Text Classification](https://arxiv.org/pdf/1509.01626v3.pdf)
dpcnn:论文链接[Deep Pyramid Convolutional Neural Networks for Text Categorization](https://ai.tencent.com/ailab/media/publications/ACL3-Brady.pdf)
HAN:论文链接[Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf)
#待补充
awd_lstm:
lstm_self_attention(BCN?):
awd-lstm:

# 数据集及复现结果汇总

使用fastNLP复现的结果vs论文汇报结果(/前为fastNLP实现,后面为论文报道,-表示论文没有在该数据集上列出结果)

model name | yelp_p | sst-2|IMDB|
:---: | :---: | :---: | :---:
char_cnn | 93.80/95.12 | - |- |
dpcnn | 95.50/97.36 | - |- |
HAN |- | - |-|
BCN| - |- |-|
awd-lstm| - |- |-|


+ 7
- 5
reproduction/text_classification/data/IMDBLoader.py View File

@@ -32,27 +32,27 @@ class IMDBLoader(DataSetLoader):
continue
parts = line.split('\t')
target = parts[0]
words = parts[1].split()
words = parts[1].lower().split()
dataset.append(Instance(words=words, target=target))

if len(dataset)==0:
raise RuntimeError(f"{path} has no valid data.")

return dataset
def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
src_embed_opt: EmbeddingOption = None,
char_level_op=False):
datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset
def wordtochar(words):
chars = []
for word in words:
@@ -69,7 +69,7 @@ class IMDBLoader(DataSetLoader):

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')
# src_vocab.from_dataset(datasets['train'], datasets["dev"], datasets["test"], field_name='words')
src_vocab.index_dataset(*datasets.values(), field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None) \
@@ -95,6 +95,7 @@ class IMDBLoader(DataSetLoader):
return info



if __name__=="__main__":
datapath = {"train": "/remote-home/ygwang/IMDB_data/train.csv",
"test": "/remote-home/ygwang/IMDB_data/test.csv"}
@@ -106,3 +107,4 @@ if __name__=="__main__":

ave_len = len_count / len(datainfo.datasets["train"])
print(ave_len)


+ 5
- 1
reproduction/text_classification/data/MTL16Loader.py View File

@@ -32,7 +32,7 @@ class MTL16Loader(DataSetLoader):
continue
parts = line.split('\t')
target = parts[0]
words = parts[1].split()
words = parts[1].lower().split()
dataset.append(Instance(words=words, target=target))
if len(dataset)==0:
raise RuntimeError(f"{path} has no valid data.")
@@ -72,4 +72,8 @@ class MTL16Loader(DataSetLoader):
embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
info.embeddings['words'] = embed

for name, dataset in info.datasets.items():
dataset.set_input("words")
dataset.set_target("target")

return info

+ 187
- 0
reproduction/text_classification/data/SSTLoader.py View File

@@ -0,0 +1,187 @@
from typing import Iterable
from nltk import Tree
from fastNLP.io.base_loader import DataInfo, DataSetLoader
from fastNLP.core.vocabulary import VocabularyOption, Vocabulary
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
import csv
from typing import Union, Dict

class SSTLoader(DataSetLoader):
    """
    Alias: :class:`fastNLP.io.SSTLoader` :class:`fastNLP.io.dataset_loader.SSTLoader`

    Read the SST dataset. The resulting DataSet contains the fields::

        words: list(str), the text to classify
        target: str, the label of the text

    Data source: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip

    :param subtree: whether to expand each tree into all of its subtrees to
        enlarge the data. Default: ``False``
    :param fine_grained: whether to keep all five SST-5 labels. If ``False``,
        'very negative' is merged into 'negative' and 'very positive' into
        'positive' (the 'neutral' label is kept). Default: ``False``
    """

    URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
    DATA_DIR = 'sst/'

    def __init__(self, subtree=False, fine_grained=False):
        super(SSTLoader, self).__init__()
        self.subtree = subtree

        # Raw tree labels '0'..'4' mapped to readable tags.
        tag_v = {'0': 'very negative', '1': 'negative', '2': 'neutral',
                 '3': 'positive', '4': 'very positive'}
        if not fine_grained:
            # Collapse the two extreme labels onto their milder neighbours.
            tag_v['0'] = tag_v['1']
            tag_v['4'] = tag_v['3']
        self.tag_v = tag_v

    def _load(self, path):
        """
        Read one file of bracketed trees, one tree per line.

        :param str path: path of the data file
        :return: a :class:`~fastNLP.DataSet` with ``words`` and ``target`` fields
        """
        ds = DataSet()
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                for words, label in self._get_one(line, self.subtree):
                    ds.append(Instance(words=words, target=self.tag_v[label]))
        return ds

    @staticmethod
    def _get_one(data, subtree):
        """
        Parse one bracketed tree string into ``(leaves, label)`` pairs.

        :param str data: one tree in bracketed notation
        :param bool subtree: if true, return one pair per subtree instead of
            only the pair for the whole tree
        :return: list of ``(list of leaf tokens, label)`` tuples
        """
        tree = Tree.fromstring(data)
        if subtree:
            return [(t.leaves(), t.label()) for t in tree.subtrees()]
        return [(tree.leaves(), tree.label())]

    def process(self,
                paths,
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                src_embed_op: EmbeddingOption = None):
        """
        Load the datasets, build and apply the vocabularies, and mark the
        input/target fields.

        :param paths: passed unchanged to ``self.load``
        :param train_ds: names of the datasets used to build the vocabularies;
            when empty or ``None`` all loaded datasets are used
        :param src_vocab_op: keyword options for the word vocabulary
        :param tgt_vocab_op: keyword options for the target vocabulary
        :param src_embed_op: options for loading pre-trained word embeddings
        :return: the populated ``DataInfo``
        """
        input_name, target_name = 'words', 'target'
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

        info = DataInfo(datasets=self.load(paths))
        _train_ds = [info.datasets[name]
                     for name in train_ds] if train_ds else info.datasets.values()
        src_vocab.from_dataset(*_train_ds, field_name=input_name)
        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        # Index every dataset in place (the indexed field overwrites the raw one).
        src_vocab.index_dataset(
            *info.datasets.values(),
            field_name=input_name, new_field_name=input_name)
        tgt_vocab.index_dataset(
            *info.datasets.values(),
            field_name=target_name, new_field_name=target_name)
        info.vocabs = {
            input_name: src_vocab,
            target_name: tgt_vocab
        }

        if src_embed_op is not None:
            src_embed_op.vocab = src_vocab
            init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
            info.embeddings[input_name] = init_emb

        for name, dataset in info.datasets.items():
            dataset.set_input(input_name)
            dataset.set_target(target_name)

        return info

class sst2Loader(DataSetLoader):
    """
    Loader for the tab-separated SST-2 files.

    Data source "SST": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
    """

    def __init__(self):
        super(sst2Loader, self).__init__()

    def _load(self, path: str) -> DataSet:
        """
        Read one TSV file: column 0 is the sentence, column 1 the label.

        :param path: path of the TSV file
        :return: a DataSet with ``words`` (whitespace-split tokens) and ``target``
        """
        ds = DataSet()
        all_count = 0
        skip_row = 0  # index of the last row to skip (row 0 is skipped)
        # Use a context manager so the file handle is closed after reading.
        with open(path, encoding='utf-8') as f:
            csv_reader = csv.reader(f, delimiter='\t')
            for idx, row in enumerate(csv_reader):
                if idx <= skip_row:
                    continue
                target = row[1]
                words = row[0].split()
                ds.append(Instance(words=words, target=target))
                all_count += 1
        print("all count:", all_count)
        return ds

    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None,
                char_level_op=False):
        """
        Load every split, build the vocabularies on ``train`` and index all splits.

        :param paths: dataset location(s), normalised by ``check_dataloader_paths``
        :param src_vocab_opt: keyword options for the word vocabulary
        :param tgt_vocab_opt: keyword options for the target vocabulary
        :param src_embed_opt: keyword options for loading word embeddings
        :param char_level_op: if true, also add a ``chars`` field holding the
            lower-cased characters of every word
        :return: the populated ``DataInfo``
        """
        # NOTE(review): `check_dataloader_paths` is not among the imports shown
        # at the top of this module - confirm it is in scope before relying on it.
        paths = check_dataloader_paths(paths)
        info = DataInfo()
        datasets = {name: self.load(path) for name, path in paths.items()}

        def wordtochar(words):
            # Flatten the words into one list of lower-cased characters.
            return [char for word in words for char in word.lower()]

        # Optionally add the character-level view of the text.
        if char_level_op:
            for dataset in datasets.values():
                dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')
        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {
            "words": src_vocab,
            "target": tgt_vocab
        }

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
            info.embeddings['words'] = embed

        return info

if __name__ == "__main__":
    # Quick sanity check: average number of characters per training sentence.
    datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv",
                "dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"}
    datainfo = sst2Loader().process(datapath, char_level_op=True)
    len_count = 0
    for instance in datainfo.datasets["train"]:
        len_count = len_count + len(instance["chars"])

    ave_len = len_count / len(datainfo.datasets["train"])
    print(ave_len)

+ 97
- 8
reproduction/text_classification/data/sstLoader.py View File

@@ -1,13 +1,102 @@
import csv
from typing import Iterable
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataInfo,DataSetLoader
from fastNLP.io.embed_loader import EmbeddingOption
from fastNLP.io.file_reader import _read_json
from nltk import Tree
from fastNLP.io.base_loader import DataInfo, DataSetLoader
from fastNLP.core.vocabulary import VocabularyOption, Vocabulary
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
import csv
from typing import Union, Dict
from reproduction.Star_transformer.datasets import EmbedLoader
from reproduction.utils import check_dataloader_paths

class SSTLoader(DataSetLoader):
    """
    Alias: :class:`fastNLP.io.SSTLoader` :class:`fastNLP.io.dataset_loader.SSTLoader`

    Read the SST dataset. The resulting DataSet contains the fields::

        words: list(str), the text to classify
        target: str, the label of the text

    Data source: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip

    :param subtree: whether to expand each tree into all of its subtrees to
        enlarge the data. Default: ``False``
    :param fine_grained: whether to keep all five SST-5 labels. If ``False``,
        'very negative' is merged into 'negative' and 'very positive' into
        'positive' (the 'neutral' label is kept). Default: ``False``
    """

    URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
    DATA_DIR = 'sst/'

    def __init__(self, subtree=False, fine_grained=False):
        super(SSTLoader, self).__init__()
        self.subtree = subtree

        # Raw tree labels '0'..'4' mapped to readable tags.
        tag_v = {'0': 'very negative', '1': 'negative', '2': 'neutral',
                 '3': 'positive', '4': 'very positive'}
        if not fine_grained:
            # Collapse the two extreme labels onto their milder neighbours.
            tag_v['0'] = tag_v['1']
            tag_v['4'] = tag_v['3']
        self.tag_v = tag_v

    def _load(self, path):
        """
        Read one file of bracketed trees, one tree per line.

        :param str path: path of the data file
        :return: a :class:`~fastNLP.DataSet` with ``words`` and ``target`` fields
        """
        ds = DataSet()
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                for words, label in self._get_one(line, self.subtree):
                    ds.append(Instance(words=words, target=self.tag_v[label]))
        return ds

    @staticmethod
    def _get_one(data, subtree):
        """
        Parse one bracketed tree string into ``(leaves, label)`` pairs.

        :param str data: one tree in bracketed notation
        :param bool subtree: if true, return one pair per subtree instead of
            only the pair for the whole tree
        :return: list of ``(list of leaf tokens, label)`` tuples
        """
        tree = Tree.fromstring(data)
        if subtree:
            return [(t.leaves(), t.label()) for t in tree.subtrees()]
        return [(tree.leaves(), tree.label())]

    def process(self,
                paths,
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                src_embed_op: EmbeddingOption = None):
        """
        Load the datasets, build and apply the vocabularies, and mark the
        input/target fields.

        :param paths: passed unchanged to ``self.load``
        :param train_ds: names of the datasets used to build the vocabularies;
            when empty or ``None`` all loaded datasets are used
        :param src_vocab_op: keyword options for the word vocabulary
        :param tgt_vocab_op: keyword options for the target vocabulary
        :param src_embed_op: options for loading pre-trained word embeddings
        :return: the populated ``DataInfo``
        """
        input_name, target_name = 'words', 'target'
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

        info = DataInfo(datasets=self.load(paths))
        _train_ds = [info.datasets[name]
                     for name in train_ds] if train_ds else info.datasets.values()
        src_vocab.from_dataset(*_train_ds, field_name=input_name)
        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        # Index every dataset in place (the indexed field overwrites the raw one).
        src_vocab.index_dataset(
            *info.datasets.values(),
            field_name=input_name, new_field_name=input_name)
        tgt_vocab.index_dataset(
            *info.datasets.values(),
            field_name=target_name, new_field_name=target_name)
        info.vocabs = {
            input_name: src_vocab,
            target_name: tgt_vocab
        }

        if src_embed_op is not None:
            src_embed_op.vocab = src_vocab
            init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
            info.embeddings[input_name] = init_emb

        for name, dataset in info.datasets.items():
            dataset.set_input(input_name)
            dataset.set_target(target_name)

        return info

class sst2Loader(DataSetLoader):
'''


+ 9
- 11
reproduction/text_classification/data/yelpLoader.py View File

@@ -34,18 +34,10 @@ def clean_str(sentence, tokenizer, char_lower=False):
return words_collection


class yelpLoader(JsonLoader):
class yelpLoader(DataSetLoader):
"""
读取Yelp数据集, DataSet包含fields:
review_id: str, 22 character unique review id
user_id: str, 22 character unique user id
business_id: str, 22 character business id
useful: int, number of useful votes received
funny: int, number of funny votes received
cool: int, number of cool votes received
date: str, date formatted YYYY-MM-DD
读取Yelp_full/Yelp_polarity数据集, DataSet包含fields:
words: list(str), 需要分类的文本
target: str, 文本的标签
chars:list(str),未index的字符列表
@@ -180,6 +172,12 @@ class yelpLoader(JsonLoader):

info.vocabs[target_name]=tgt_vocab

info.datasets['train'],info.datasets['dev']=info.datasets['train'].split(0.1, shuffle=False)

for name, dataset in info.datasets.items():
dataset.set_input("words")
dataset.set_target("target")

return info

if __name__=="__main__":
@@ -196,4 +194,4 @@ if __name__=="__main__":
len_count+=len(instance["chars"])

ave_len=len_count/len(datainfo.datasets["train"])
print(ave_len)
print(ave_len)

+ 109
- 0
reproduction/text_classification/model/HAN.py View File

@@ -0,0 +1,109 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
from fastNLP.modules.utils import get_embeddings
from fastNLP.core import Const as C


def pack_sequence(tensor_seq, padding_value=0.0):
    """Pad a list of variable-length tensors into a single batch tensor.

    Every tensor may differ in its first dimension only; shorter ones are
    right-padded with ``padding_value``.

    :param tensor_seq: sequence of tensors shaped ``(len_i, *rest)``
    :param padding_value: value written into the padded positions
    :return: tensor shaped ``(len(tensor_seq), max(len_i), *rest)`` on the same
        device as the first input, or ``None`` when ``tensor_seq`` is empty
    """
    if len(tensor_seq) <= 0:
        return None
    first = tensor_seq[0]
    lengths = [t.size(0) for t in tensor_seq]
    size = [len(tensor_seq), max(lengths)] + list(first.size()[1:])
    # Allocate directly on the input's device (not merely "the current GPU").
    ans = torch.empty(size, device=first.device).fill_(padding_value)
    for i, (t, n) in enumerate(zip(tensor_seq, lengths)):
        # Indexing only the first two dims also supports 1-D inputs.
        ans[i, :n] = t
    return ans


class HANCLS(nn.Module):
    """Document classifier: an embedding layer followed by a HAN network."""

    def __init__(self, init_embed, num_cls):
        super(HANCLS, self).__init__()

        self.embed = get_embeddings(init_embed)
        # Fixed HAN hyper-parameters; the word vectors are expected to be 300-d.
        han_kwargs = dict(
            input_size=300,
            output_size=num_cls,
            word_hidden_size=50, word_num_layers=1, word_context_size=100,
            sent_hidden_size=50, sent_num_layers=1, sent_context_size=100,
        )
        self.han = HAN(**han_kwargs)

    def forward(self, input_sents):
        """Score a batch of documents.

        :param input_sents: long tensor of shape [B, num_sents, seq_len]
        :return: dict mapping ``C.OUTPUT`` to the HAN output
        """
        batch, num_sents, seq_len = input_sents.size()
        # Flatten sentences so the embedding sees [B*num_sents, seq_len] ...
        flat_sents = input_sents.view(-1, seq_len)
        embedded = self.embed(flat_sents)
        # ... then restore [B, num_sents, seq_len, word_dim].
        embedded = embedded.view(batch, num_sents, seq_len, -1)
        return {C.OUTPUT: self.han(embedded)}

    def predict(self, input_sents):
        """Return the arg-max class index for every document."""
        scores = self.forward(input_sents)[C.OUTPUT]
        return {C.OUTPUT: torch.argmax(scores, 1)}


class HAN(nn.Module):
    """Hierarchical attention network: word-level then sentence-level attention."""

    def __init__(self, input_size, output_size,
                 word_hidden_size, word_num_layers, word_context_size,
                 sent_hidden_size, sent_num_layers, sent_context_size):
        super(HAN, self).__init__()

        # Word encoder turns every sentence into one vector of size 2*word_hidden_size.
        self.word_layer = AttentionNet(input_size,
                                       word_hidden_size,
                                       word_num_layers,
                                       word_context_size)
        # Sentence encoder turns every document into one vector of size 2*sent_hidden_size.
        self.sent_layer = AttentionNet(2 * word_hidden_size,
                                       sent_hidden_size,
                                       sent_num_layers,
                                       sent_context_size)
        self.output_layer = nn.Linear(2 * sent_hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, batch_doc):
        """Classify a batch of documents.

        :param batch_doc: iterable of documents, each (num_sent, seq_len, word_dim)
        :return: log-probabilities over the output classes
        """
        # One (num_sent, vec_dim) matrix per document.
        sentence_matrices = [self.word_layer(doc) for doc in batch_doc]
        doc_vec = self.sent_layer(pack_sequence(sentence_matrices))
        logits = self.output_layer(doc_vec)
        return self.softmax(logits)


class AttentionNet(nn.Module):
    """Bidirectional GRU encoder followed by context-vector attention pooling."""

    def __init__(self, input_size, gru_hidden_size, gru_num_layers, context_vec_size):
        super(AttentionNet, self).__init__()

        self.input_size = input_size
        self.gru_hidden_size = gru_hidden_size
        self.gru_num_layers = gru_num_layers
        self.context_vec_size = context_vec_size

        # Encoder
        self.gru = nn.GRU(input_size=input_size,
                          hidden_size=gru_hidden_size,
                          num_layers=gru_num_layers,
                          batch_first=True,
                          bidirectional=True)
        # Attention scoring: project hidden states, squash, compare to context vector
        self.fc = nn.Linear(2 * gru_hidden_size, context_vec_size)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        # Learnable context vector, initialised uniformly in [-0.1, 0.1]
        context = torch.Tensor(context_vec_size, 1)
        self.context_vec = nn.Parameter(context)
        self.context_vec.data.uniform_(-0.1, 0.1)

    def forward(self, inputs):
        """Pool a batch of sequences into one vector each.

        :param inputs: tensor of shape (batch_size, seq_len, word_dim)
        :return: tensor of shape (batch_size, 2*gru_hidden_size)
        """
        states, _ = self.gru(inputs)                 # (batch, seq_len, 2*hidden)
        projected = self.tanh(self.fc(states))       # (batch, seq_len, context_vec_size)
        scores = torch.matmul(projected, self.context_vec)
        weights = self.softmax(scores)               # (batch, seq_len, 1), normalised over seq_len
        pooled = torch.bmm(states.transpose(1, 2), weights)  # (batch, 2*hidden, 1)
        return pooled.squeeze(dim=2)

+ 31
- 0
reproduction/text_classification/model/awd_lstm.py View File

@@ -0,0 +1,31 @@
import torch
import torch.nn as nn
from fastNLP.core.const import Const as C
from .awdlstm_module import LSTM
from fastNLP.modules import encoder
from fastNLP.modules.decoder.mlp import MLP


class AWDLSTMSentiment(nn.Module):
    """Sentence classifier: embedding -> weight-dropped BiLSTM -> MLP."""

    def __init__(self, init_embed,
                 num_classes,
                 hidden_dim=256,
                 num_layers=1,
                 nfc=128,
                 wdrop=0.5):
        super(AWDLSTMSentiment, self).__init__()
        self.embed = encoder.Embedding(init_embed)
        self.lstm = LSTM(input_size=self.embed.embedding_dim,
                         hidden_size=hidden_dim,
                         num_layers=num_layers,
                         bidirectional=True,
                         wdrop=wdrop)
        # The BiLSTM concatenates both directions, hence hidden_dim * 2.
        self.mlp = MLP(size_layer=[hidden_dim * 2, nfc, num_classes])

    def forward(self, words):
        """Return ``{C.OUTPUT: class scores}`` computed from the last time step."""
        embedded = self.embed(words)
        states, _ = self.lstm(embedded)
        last_step = states[:, -1, :]
        return {C.OUTPUT: self.mlp(last_step)}

    def predict(self, words):
        """Return ``{C.OUTPUT: arg-max class index}`` for every sample."""
        scores = self(words)[C.OUTPUT]
        _, predict = scores.max(dim=1)
        return {C.OUTPUT: predict}


+ 86
- 0
reproduction/text_classification/model/awdlstm_module.py View File

@@ -0,0 +1,86 @@
"""
轻量封装的 Pytorch LSTM 模块.
可在 forward 时传入序列的长度, 自动对padding做合适的处理.
"""
__all__ = [
"LSTM"
]

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn

from fastNLP.modules.utils import initial_parameter
from torch import autograd
from .weight_drop import WeightDrop


class LSTM(nn.Module):
    """
    Alias: :class:`fastNLP.modules.LSTM` :class:`fastNLP.modules.encoder.lstm.LSTM`

    A thin wrapper around the PyTorch LSTM whose ``weight_hh_l0`` matrix is
    wrapped in ``WeightDrop``. When ``seq_len`` is supplied the input is packed
    with ``pack_padded_sequence`` automatically; the bias slice ``[n/4, n/2)``
    is initialised to 1 and the rest of the biases to 0.

    :param input_size: feature dimension of the input `x`
    :param hidden_size: feature dimension of the hidden state `h`
    :param num_layers: number of RNN layers. Default: 1
    :param dropout: dropout probability between layers. Default: 0
    :param bidirectional: if ``True``, use a bidirectional RNN. Default: ``False``
    :param batch_first: if ``True``, input and output tensors are shaped
        (batch, seq, feature). Default: ``True``
    :param bias: if ``False``, the model uses no bias. Default: ``True``
    :param wdrop: dropout probability handed to ``WeightDrop``. Default: 0.5
    """

    def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True,
                 bidirectional=False, bias=True, wdrop=0.5):
        super(LSTM, self).__init__()
        self.batch_first = batch_first
        base_lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first,
                            dropout=dropout, bidirectional=bidirectional)
        self.lstm = base_lstm
        self.lstm = WeightDrop(self.lstm, ['weight_hh_l0'], dropout=wdrop)
        self.init_param()

    def init_param(self):
        """Zero the biases (with slice [n/4, n/2) set to 1) and Xavier-init all other parameters."""
        for name, param in self.named_parameters():
            if 'bias' not in name:
                nn.init.xavier_uniform_(param)
                continue
            # based on https://github.com/pytorch/pytorch/issues/750#issuecomment-280671871
            param.data.fill_(0)
            total = param.size(0)
            param.data[total // 4:total // 2].fill_(1)

    def forward(self, x, seq_len=None, h0=None, c0=None):
        """
        Run the wrapped LSTM.

        :param x: [batch, seq_len, input_size] input sequence
        :param seq_len: [batch, ] sequence lengths; if ``None`` all inputs are
            treated as equally long. Default: ``None``
        :param h0: initial hidden state; only used when ``c0`` is given too. Default: ``None``
        :param c0: initial cell state; only used when ``h0`` is given too. Default: ``None``
        :return: ``(output, hx)`` - the output sequence and the final state
            returned by the wrapped LSTM. With ``seq_len`` the output is restored
            to the original batch order; ``hx`` is returned as produced for the
            length-sorted batch.
        """
        batch_size, max_len, _ = x.size()
        hx = (h0, c0) if (h0 is not None and c0 is not None) else None

        needs_packing = seq_len is not None and not isinstance(x, rnn.PackedSequence)
        if not needs_packing:
            output, hx = self.lstm(x, hx)
            return output, hx

        # Sort by decreasing length as required by pack_padded_sequence.
        sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True)
        x = x[sort_idx] if self.batch_first else x[:, sort_idx]
        x = rnn.pack_padded_sequence(x, sort_lens, batch_first=self.batch_first)
        output, hx = self.lstm(x, hx)  # -> [N,L,C]
        output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first, total_length=max_len)
        # Undo the sort so rows line up with the caller's batch order again.
        _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
        output = output[unsort_idx] if self.batch_first else output[:, unsort_idx]
        return output, hx

+ 30
- 0
reproduction/text_classification/model/lstm.py View File

@@ -0,0 +1,30 @@
import torch
import torch.nn as nn
from fastNLP.core.const import Const as C
from fastNLP.modules.encoder.lstm import LSTM
from fastNLP.modules import encoder
from fastNLP.modules.decoder.mlp import MLP


class BiLSTMSentiment(nn.Module):
    """Sentence classifier: embedding -> BiLSTM -> MLP."""

    def __init__(self, init_embed,
                 num_classes,
                 hidden_dim=256,
                 num_layers=1,
                 nfc=128):
        super(BiLSTMSentiment, self).__init__()
        self.embed = encoder.Embedding(init_embed)
        self.lstm = LSTM(input_size=self.embed.embedding_dim,
                         hidden_size=hidden_dim,
                         num_layers=num_layers,
                         bidirectional=True)
        # The BiLSTM concatenates both directions, hence hidden_dim * 2.
        self.mlp = MLP(size_layer=[hidden_dim * 2, nfc, num_classes])

    def forward(self, words):
        """Return ``{C.OUTPUT: class scores}`` computed from the last time step."""
        embedded = self.embed(words)
        states, _ = self.lstm(embedded)
        last_step = states[:, -1, :]
        return {C.OUTPUT: self.mlp(last_step)}

    def predict(self, words):
        """Return ``{C.OUTPUT: arg-max class index}`` for every sample."""
        scores = self(words)[C.OUTPUT]
        _, predict = scores.max(dim=1)
        return {C.OUTPUT: predict}


+ 35
- 0
reproduction/text_classification/model/lstm_self_attention.py View File

@@ -0,0 +1,35 @@
import torch
import torch.nn as nn
from fastNLP.core.const import Const as C
from fastNLP.modules.encoder.lstm import LSTM
from fastNLP.modules import encoder
from fastNLP.modules.aggregator.attention import SelfAttention
from fastNLP.modules.decoder.mlp import MLP


class BiLSTM_SELF_ATTENTION(nn.Module):
    """Sentence classifier: embedding -> BiLSTM -> self-attention -> MLP."""

    def __init__(self, init_embed,
                 num_classes,
                 hidden_dim=256,
                 num_layers=1,
                 attention_unit=256,
                 attention_hops=1,
                 nfc=128):
        super(BiLSTM_SELF_ATTENTION, self).__init__()
        self.embed = encoder.Embedding(init_embed)
        self.lstm = LSTM(input_size=self.embed.embedding_dim,
                         hidden_size=hidden_dim,
                         num_layers=num_layers,
                         bidirectional=True)
        self.attention = SelfAttention(input_size=hidden_dim * 2,
                                       attention_unit=attention_unit,
                                       attention_hops=attention_hops)
        # The flattened attention output is fed to the MLP.
        self.mlp = MLP(size_layer=[hidden_dim * 2 * attention_hops, nfc, num_classes])

    def forward(self, words):
        """Return ``{C.OUTPUT: class scores}``; the attention penalty is discarded."""
        embedded = self.embed(words)
        states, _ = self.lstm(embedded)
        attended, _penalty = self.attention(states, words)
        flattened = attended.view(attended.size(0), -1)
        return {C.OUTPUT: self.mlp(flattened)}

    def predict(self, words):
        """Return ``{C.OUTPUT: arg-max class index}`` for every sample."""
        scores = self(words)[C.OUTPUT]
        _, predict = scores.max(dim=1)
        return {C.OUTPUT: predict}

+ 99
- 0
reproduction/text_classification/model/weight_drop.py View File

@@ -0,0 +1,99 @@
import torch
from torch.nn import Parameter
from functools import wraps

class WeightDrop(torch.nn.Module):
    """Wrap a module and apply dropout to selected weights on every forward call.

    Each named weight is re-registered on the wrapped module as ``<name>_raw``;
    before every forward pass a dropped-out copy is written back under the
    original attribute name.

    :param module: the module to wrap
    :param weights: names of the parameters of ``module`` to apply dropout to
    :param dropout: dropout probability
    :param variational: if true, drop whole rows of a weight (one mask entry per
        row) instead of individual elements
    """

    def __init__(self, module, weights, dropout=0, variational=False):
        super(WeightDrop, self).__init__()
        self.module = module
        self.weights = weights
        self.dropout = dropout
        self.variational = variational
        self._setup()

    def widget_demagnetizer_y2k_edition(*args, **kwargs):
        # Deliberate no-op replacement for ``flatten_parameters``.
        # It must be a real function rather than a lambda so pickling keeps working.
        return

    def _setup(self):
        """Move every listed weight to ``<name>_raw`` on the wrapped module."""
        # Temporary workaround for an issue with compacting weights in CUDNN RNNs.
        if issubclass(type(self.module), torch.nn.RNNBase):
            self.module.flatten_parameters = self.widget_demagnetizer_y2k_edition

        for name_w in self.weights:
            print('Applying weight drop of {} to {}'.format(self.dropout, name_w))
            w = getattr(self.module, name_w)
            del self.module._parameters[name_w]
            self.module.register_parameter(name_w + '_raw', Parameter(w.data))

    def _setweights(self):
        """Write a freshly dropped-out copy of every raw weight onto the module."""
        for name_w in self.weights:
            raw_w = getattr(self.module, name_w + '_raw')
            if self.variational:
                # Build the mask next to the weight (same device and dtype) and
                # honour train/eval mode so evaluation uses the full weights.
                mask = raw_w.new_ones(raw_w.size(0), 1)
                mask = torch.nn.functional.dropout(mask, p=self.dropout, training=self.training)
                w = mask.expand_as(raw_w) * raw_w
            else:
                w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)
            if isinstance(w, Parameter):
                # NOTE(review): a no-op dropout appears to hand back the raw
                # Parameter itself; assigning a Parameter would re-register it on
                # the module and break later plain-tensor assignments. A view
                # keeps the gradient path without being a Parameter.
                w = w.view_as(w)
            setattr(self.module, name_w, w)

    def forward(self, *args, **kwargs):
        """Refresh the dropped weights, then run the wrapped module."""
        self._setweights()
        return self.module.forward(*args, **kwargs)

if __name__ == '__main__':
    # Manual smoke test; requires a CUDA device.
    import torch
    from weight_drop import WeightDrop

    # Input is (seq, batch, input)
    x = torch.autograd.Variable(torch.randn(2, 1, 10)).cuda()
    h0 = None

    print('Testing WeightDrop')
    print('=-=-=-=-=-=-=-=-=-=')

    # --- Linear: both time steps depend on the dropped weight ---
    print('Testing WeightDrop with Linear')

    lin = WeightDrop(torch.nn.Linear(10, 10), ['weight'], dropout=0.9)
    lin.cuda()
    run1 = [step.sum() for step in lin(x).data]
    run2 = [step.sum() for step in lin(x).data]

    print('All items should be different')
    print('Run 1:', run1)
    print('Run 2:', run2)

    for first, second in zip(run1[:2], run2[:2]):
        assert first != second

    print('---')

    # --- LSTM: only steps after the first use the hidden-to-hidden weight ---
    print('Testing WeightDrop with LSTM')

    wdrnn = WeightDrop(torch.nn.LSTM(10, 10), ['weight_hh_l0'], dropout=0.9)
    wdrnn.cuda()

    run1 = [step.sum() for step in wdrnn(x, h0)[0].data]
    run2 = [step.sum() for step in wdrnn(x, h0)[0].data]

    print('First timesteps should be equal, all others should differ')
    print('Run 1:', run1)
    print('Run 2:', run2)

    # First time step, not influenced by hidden to hidden weights, should be equal
    assert run1[0] == run2[0]
    # Second step should not
    assert run1[1] != run2[1]

    print('---')

BIN
reproduction/text_classification/results_LSTM.xlsx View File


+ 109
- 0
reproduction/text_classification/train_HAN.py View File

@@ -0,0 +1,109 @@
# 首先需要加入以下的路径到环境变量,因为当前只对内部测试开放,所以需要手动申明一下路径

import os
import sys
sys.path.append('../../')
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

from fastNLP.core.const import Const as C
from fastNLP.core import LRScheduler
import torch.nn as nn
from fastNLP.io.dataset_loader import SSTLoader
from reproduction.text_classification.data.yelpLoader import yelpLoader
from reproduction.text_classification.model.HAN import HANCLS
from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer
from torch.optim import SGD
import torch.cuda
from torch.optim.lr_scheduler import CosineAnnealingLR


##hyper

class Config():
    """Hyper-parameters and data locations for the HAN training script."""

    model_dir_or_name = "en-base-uncased"
    # A trailing comma here used to turn this into the truthy tuple (False,).
    embedding_grad = False
    train_epoch = 30
    batch_size = 100
    num_classes = 5
    task = "yelp"
    # alternative location: '/remote-home/lyli/fastNLP/yelp_polarity/'
    datadir = '/remote-home/ygwang/yelp_polarity/'
    datafile = {"train": "train.csv", "test": "test.csv"}
    lr = 1e-3

    def __init__(self):
        # Full path of every split: datadir joined with the file name.
        self.datapath = {k: os.path.join(self.datadir, v)
                         for k, v in self.datafile.items()}


ops = Config()

##1.task相关信息:利用dataloader载入dataInfo

datainfo = yelpLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
print(len(datainfo.datasets['train']))
print(len(datainfo.datasets['test']))


# post process
def make_sents(words):
    """Wrap a token list as a document consisting of a single sentence."""
    return [words]


for dataset in datainfo.datasets.values():
dataset.apply_field(make_sents, field_name='words', new_field_name='input_sents')

datainfo = datainfo
datainfo.datasets['train'].set_input('input_sents')
datainfo.datasets['test'].set_input('input_sents')
datainfo.datasets['train'].set_target('target')
datainfo.datasets['test'].set_target('target')

## 2.或直接复用fastNLP的模型

vocab = datainfo.vocabs['words']
# embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)])
embedding = StaticEmbedding(vocab)

print(len(vocab))
print(len(datainfo.vocabs['target']))

# model = DPCNN(init_embed=embedding, num_cls=ops.num_classes)
model = HANCLS(init_embed=embedding, num_cls=ops.num_classes)

## 3. 声明loss,metric,optimizer
loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
optimizer = SGD([param for param in model.parameters() if param.requires_grad == True],
lr=ops.lr, momentum=0.9, weight_decay=0)

callbacks = []
callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print(device)

for ds in datainfo.datasets.values():
ds.apply_field(len, C.INPUT, C.INPUT_LEN)
ds.set_input(C.INPUT, C.INPUT_LEN)
ds.set_target(C.TARGET)


## 4.定义train方法
def train(model, datainfo, loss, metrics, optimizer, num_epochs=ops.train_epoch):
    """Train ``model`` on the train split, validating on the test split, and print the result."""
    train_set = datainfo.datasets['train']
    dev_set = datainfo.datasets['test']
    trainer = Trainer(train_set, model,
                      optimizer=optimizer,
                      loss=loss,
                      metrics=[metrics],
                      dev_data=dev_set,
                      device=device,
                      check_code_level=-1,
                      batch_size=ops.batch_size,
                      callbacks=callbacks,
                      n_epochs=num_epochs)

    result = trainer.train()
    print(result)


if __name__ == "__main__":
train(model, datainfo, loss, metric, optimizer)

+ 102
- 0
reproduction/text_classification/train_awdlstm.py View File

@@ -0,0 +1,102 @@
# 这个模型需要在pytorch=0.4下运行,weight_drop不支持1.0

# 首先需要加入以下的路径到环境变量,因为当前只对内部测试开放,所以需要手动申明一下路径
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'


import torch.nn as nn

from data.SSTLoader import SSTLoader
from data.IMDBLoader import IMDBLoader
from data.yelpLoader import yelpLoader
from fastNLP.modules.encoder.embedding import StaticEmbedding
from model.awd_lstm import AWDLSTMSentiment

from fastNLP.core.const import Const as C
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP import Trainer, Tester
from torch.optim import Adam
from fastNLP.io.model_io import ModelLoader, ModelSaver

import argparse


class Config():
train_epoch= 10
lr=0.001

num_classes=2
hidden_dim=256
num_layers=1
nfc=128
wdrop=0.5

task_name = "IMDB"
datapath={"train":"IMDB_data/train.csv", "test":"IMDB_data/test.csv"}
load_model_path="./result_IMDB/best_BiLSTM_SELF_ATTENTION_acc_2019-07-07-04-16-51"
save_model_path="./result_IMDB_test/"
opt=Config


# load data
# Available data loaders, keyed by task name.
dataloaders = {
    "IMDB": IMDBLoader(),
    "YELP": yelpLoader(),
    "SST-5": SSTLoader(subtree=True, fine_grained=True),
    "SST-3": SSTLoader(subtree=True, fine_grained=False)
}

# Validate against the dict itself so the check cannot drift from the table above.
if opt.task_name not in dataloaders:
    raise ValueError("task name must in ['IMDB', 'YELP', 'SST-5', 'SST-3']")

dataloader = dataloaders[opt.task_name]
datainfo = dataloader.process(opt.datapath)
# print(datainfo.datasets["train"])
# print(datainfo)


# define model
vocab=datainfo.vocabs['words']
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True)
model=AWDLSTMSentiment(init_embed=embed, num_classes=opt.num_classes, hidden_dim=opt.hidden_dim, num_layers=opt.num_layers, nfc=opt.nfc, wdrop=opt.wdrop)


# define loss_function and metrics
loss=CrossEntropyLoss()
metrics=AccuracyMetric()
optimizer= Adam([param for param in model.parameters() if param.requires_grad==True], lr=opt.lr)


def train(datainfo, model, optimizer, loss, metrics, opt):
    """Build a fastNLP ``Trainer`` for ``model`` and run it.

    Args:
        datainfo: object whose ``datasets`` mapping provides the 'train' and
            'dev' splits.
        model: the model to optimise.
        optimizer: optimizer passed through to the trainer.
        loss: loss object passed through to the trainer.
        metrics: metric object used on the dev split.
        opt: settings namespace; ``train_epoch`` and ``save_model_path`` are read.
    """
    splits = datainfo.datasets
    train_set = splits['train']
    dev_set = splits['dev']

    # device=0 and check_code_level=-1 are fixed choices of this script.
    runner = Trainer(
        train_set,
        model,
        optimizer=optimizer,
        loss=loss,
        metrics=metrics,
        dev_data=dev_set,
        device=0,
        check_code_level=-1,
        n_epochs=opt.train_epoch,
        save_path=opt.save_model_path,
    )
    runner.train()


def test(datainfo, metrics, opt):
    """Load the checkpoint at ``opt.load_model_path`` and evaluate it.

    Args:
        datainfo: object whose ``datasets`` mapping provides the 'test' split.
        metrics: metric object handed to the fastNLP ``Tester``.
        opt: settings namespace; ``load_model_path`` is read.

    The value returned by ``Tester.test()`` is printed, not returned.
    """
    # Restore the saved model from disk.
    restored = ModelLoader.load_pytorch_model(opt.load_model_path)
    print("model loaded!")

    # Evaluate on the test split with a fixed batch size of 4 on device 0.
    test_set = datainfo.datasets['test']
    evaluator = Tester(test_set, restored, metrics, batch_size=4, device=0)
    result = evaluator.test()
    print("acc=", result)



# Command-line entry point: ``--mode`` selects between training and evaluation.
parser = argparse.ArgumentParser()
parser.add_argument('--mode', required=True, dest="mode",
                    help="run mode: 'train' or 'test'")


args = parser.parse_args()
if args.mode == 'train':
    train(datainfo, model, optimizer, loss, metrics, opt)
elif args.mode == 'test':
    test(datainfo, metrics, opt)
else:
    # --mode is required, so this branch is reached only for an unrecognised
    # value; report it and show the usage text.
    print('no mode specified for model!')
    parser.print_help()

+ 3
- 4
reproduction/text_classification/train_char_cnn.py View File

@@ -7,7 +7,6 @@ import sys
sys.path.append('../..')
from fastNLP.core.const import Const as C
import torch.nn as nn
from fastNLP.io.dataset_loader import SSTLoader
from data.yelpLoader import yelpLoader
from data.sstLoader import sst2Loader
from data.IMDBLoader import IMDBLoader
@@ -107,9 +106,9 @@ ops=Config


##1.task相关信息:利用dataloader载入dataInfo
dataloader=sst2Loader()
dataloader=IMDBLoader()
#dataloader=yelpLoader(fine_grained=True)
#dataloader=sst2Loader()
#dataloader=IMDBLoader()
dataloader=yelpLoader(fine_grained=True)
datainfo=dataloader.process(ops.datapath,char_level_op=True)
char_vocab=ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"]
ops.number_of_characters=len(char_vocab)


+ 99
- 0
reproduction/text_classification/train_lstm.py View File

@@ -0,0 +1,99 @@
# The following paths must be added to the environment first: the resources
# are currently only open for internal testing, so the locations have to be
# declared by hand. They are set ahead of the fastNLP imports below.
# ``setdefault`` keeps any value the user has already exported instead of
# overwriting it with these machine-specific defaults.
import os
os.environ.setdefault('FASTNLP_BASE_URL', 'http://10.141.222.118:8888/file/download/')
os.environ.setdefault('FASTNLP_CACHE_DIR', '/remote-home/hyan01/fastnlp_caches')


import torch.nn as nn

from data.SSTLoader import SSTLoader
from data.IMDBLoader import IMDBLoader
from data.yelpLoader import yelpLoader
from fastNLP.modules.encoder.embedding import StaticEmbedding
from model.lstm import BiLSTMSentiment

from fastNLP.core.const import Const as C
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP import Trainer, Tester
from torch.optim import Adam
from fastNLP.io.model_io import ModelLoader, ModelSaver

import argparse


class Config:
    """Static settings for the BiLSTM sentiment script.

    The class is used as a plain namespace: the module-level alias ``opt``
    points at the class itself and its attributes are read directly.
    """

    # Optimisation settings (number of epochs, Adam learning rate).
    train_epoch = 10
    lr = 0.001

    # Values forwarded to the BiLSTMSentiment constructor.
    num_classes = 2
    hidden_dim = 256
    num_layers = 1
    nfc = 128

    # Which loader to use and where its input files live.
    task_name = "IMDB"
    datapath = {"train": "IMDB_data/train.csv", "test": "IMDB_data/test.csv"}

    # Checkpoint read in test mode / directory written in train mode.
    # NOTE(review): the checkpoint file name mentions BiLSTM_SELF_ATTENTION,
    # which looks copied from another script -- confirm it is the intended one.
    load_model_path = "./result_IMDB/best_BiLSTM_SELF_ATTENTION_acc_2019-07-07-04-16-51"
    save_model_path = "./result_IMDB_test/"


opt = Config


# Load data: one loader per supported task, keyed by the value expected in
# ``opt.task_name``.
dataloaders = {
    "IMDB": IMDBLoader(),
    "YELP": yelpLoader(),
    "SST-5": SSTLoader(subtree=True, fine_grained=True),
    "SST-3": SSTLoader(subtree=True, fine_grained=False),
}

# Validate against the mapping itself so the check and the error message can
# never drift from the loaders that are actually registered.
if opt.task_name not in dataloaders:
    raise ValueError("task name must be one of {}".format(sorted(dataloaders)))

dataloader = dataloaders[opt.task_name]
datainfo = dataloader.process(opt.datapath)


# Define the model: embeddings are built over the 'words' vocabulary returned
# by the loader and handed to the classifier as its initial embedding.
vocab = datainfo.vocabs['words']
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True)
model = BiLSTMSentiment(
    init_embed=embed,
    num_classes=opt.num_classes,
    hidden_dim=opt.hidden_dim,
    num_layers=opt.num_layers,
    nfc=opt.nfc,
)


# Define the loss function, the metric and the optimizer.
loss = CrossEntropyLoss()
metrics = AccuracyMetric()
# Only parameters that still require gradients are handed to Adam.
optimizer = Adam([param for param in model.parameters() if param.requires_grad], lr=opt.lr)


def train(datainfo, model, optimizer, loss, metrics, opt):
    """Build a fastNLP ``Trainer`` for ``model`` and run it.

    Args:
        datainfo: object whose ``datasets`` mapping provides the 'train' and
            'dev' splits.
        model: the model to optimise.
        optimizer: optimizer passed through to the trainer.
        loss: loss object passed through to the trainer.
        metrics: metric object used on the dev split.
        opt: settings namespace; ``train_epoch`` and ``save_model_path`` are read.
    """
    splits = datainfo.datasets
    train_set = splits['train']
    dev_set = splits['dev']

    # device=0 and check_code_level=-1 are fixed choices of this script.
    runner = Trainer(
        train_set,
        model,
        optimizer=optimizer,
        loss=loss,
        metrics=metrics,
        dev_data=dev_set,
        device=0,
        check_code_level=-1,
        n_epochs=opt.train_epoch,
        save_path=opt.save_model_path,
    )
    runner.train()


def test(datainfo, metrics, opt):
    """Load the checkpoint at ``opt.load_model_path`` and evaluate it.

    Args:
        datainfo: object whose ``datasets`` mapping provides the 'test' split.
        metrics: metric object handed to the fastNLP ``Tester``.
        opt: settings namespace; ``load_model_path`` is read.

    The value returned by ``Tester.test()`` is printed, not returned.
    """
    # Restore the saved model from disk.
    restored = ModelLoader.load_pytorch_model(opt.load_model_path)
    print("model loaded!")

    # Evaluate on the test split with a fixed batch size of 4 on device 0.
    test_set = datainfo.datasets['test']
    evaluator = Tester(test_set, restored, metrics, batch_size=4, device=0)
    result = evaluator.test()
    print("acc=", result)



# Command-line entry point: ``--mode`` selects between training and evaluation.
parser = argparse.ArgumentParser()
parser.add_argument('--mode', required=True, dest="mode",
                    help="run mode: 'train' or 'test'")


args = parser.parse_args()
if args.mode == 'train':
    train(datainfo, model, optimizer, loss, metrics, opt)
elif args.mode == 'test':
    test(datainfo, metrics, opt)
else:
    # --mode is required, so this branch is reached only for an unrecognised
    # value; report it and show the usage text.
    print('no mode specified for model!')
    parser.print_help()

+ 101
- 0
reproduction/text_classification/train_lstm_att.py View File

@@ -0,0 +1,101 @@
# The following paths must be added to the environment first: the resources
# are currently only open for internal testing, so the locations have to be
# declared by hand. They are set ahead of the fastNLP imports below.
# ``setdefault`` keeps any value the user has already exported instead of
# overwriting it with these machine-specific defaults.
import os
os.environ.setdefault('FASTNLP_BASE_URL', 'http://10.141.222.118:8888/file/download/')
os.environ.setdefault('FASTNLP_CACHE_DIR', '/remote-home/hyan01/fastnlp_caches')


import torch.nn as nn

from data.SSTLoader import SSTLoader
from data.IMDBLoader import IMDBLoader
from data.yelpLoader import yelpLoader
from fastNLP.modules.encoder.embedding import StaticEmbedding
from model.lstm_self_attention import BiLSTM_SELF_ATTENTION

from fastNLP.core.const import Const as C
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP import Trainer, Tester
from torch.optim import Adam
from fastNLP.io.model_io import ModelLoader, ModelSaver

import argparse


class Config:
    """Static settings for the BiLSTM self-attention sentiment script.

    The class is used as a plain namespace: the module-level alias ``opt``
    points at the class itself and its attributes are read directly.
    """

    # Optimisation settings (number of epochs, Adam learning rate).
    train_epoch = 10
    lr = 0.001

    # Values forwarded to the BiLSTM_SELF_ATTENTION constructor.
    num_classes = 2
    hidden_dim = 256
    num_layers = 1
    attention_unit = 256
    attention_hops = 1
    nfc = 128

    # Which loader to use and where its input files live.
    task_name = "IMDB"
    datapath = {"train": "IMDB_data/train.csv", "test": "IMDB_data/test.csv"}

    # Checkpoint read in test mode / directory written in train mode.
    load_model_path = "./result_IMDB/best_BiLSTM_SELF_ATTENTION_acc_2019-07-07-04-16-51"
    save_model_path = "./result_IMDB_test/"


opt = Config


# Load data: one loader per supported task, keyed by the value expected in
# ``opt.task_name``.
dataloaders = {
    "IMDB": IMDBLoader(),
    "YELP": yelpLoader(),
    "SST-5": SSTLoader(subtree=True, fine_grained=True),
    "SST-3": SSTLoader(subtree=True, fine_grained=False),
}

# Validate against the mapping itself so the check and the error message can
# never drift from the loaders that are actually registered.
if opt.task_name not in dataloaders:
    raise ValueError("task name must be one of {}".format(sorted(dataloaders)))

dataloader = dataloaders[opt.task_name]
datainfo = dataloader.process(opt.datapath)


# Define the model: embeddings are built over the 'words' vocabulary returned
# by the loader and handed to the classifier as its initial embedding.
vocab = datainfo.vocabs['words']
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True)
model = BiLSTM_SELF_ATTENTION(
    init_embed=embed,
    num_classes=opt.num_classes,
    hidden_dim=opt.hidden_dim,
    num_layers=opt.num_layers,
    attention_unit=opt.attention_unit,
    attention_hops=opt.attention_hops,
    nfc=opt.nfc,
)


# Define the loss function, the metric and the optimizer.
loss = CrossEntropyLoss()
metrics = AccuracyMetric()
# Only parameters that still require gradients are handed to Adam.
optimizer = Adam([param for param in model.parameters() if param.requires_grad], lr=opt.lr)


def train(datainfo, model, optimizer, loss, metrics, opt):
    """Build a fastNLP ``Trainer`` for ``model`` and run it.

    Args:
        datainfo: object whose ``datasets`` mapping provides the 'train' and
            'dev' splits.
        model: the model to optimise.
        optimizer: optimizer passed through to the trainer.
        loss: loss object passed through to the trainer.
        metrics: metric object used on the dev split.
        opt: settings namespace; ``train_epoch`` and ``save_model_path`` are read.
    """
    splits = datainfo.datasets
    train_set = splits['train']
    dev_set = splits['dev']

    # device=0 and check_code_level=-1 are fixed choices of this script.
    runner = Trainer(
        train_set,
        model,
        optimizer=optimizer,
        loss=loss,
        metrics=metrics,
        dev_data=dev_set,
        device=0,
        check_code_level=-1,
        n_epochs=opt.train_epoch,
        save_path=opt.save_model_path,
    )
    runner.train()


def test(datainfo, metrics, opt):
    """Load the checkpoint at ``opt.load_model_path`` and evaluate it.

    Args:
        datainfo: object whose ``datasets`` mapping provides the 'test' split.
        metrics: metric object handed to the fastNLP ``Tester``.
        opt: settings namespace; ``load_model_path`` is read.

    The value returned by ``Tester.test()`` is printed, not returned.
    """
    # Restore the saved model from disk.
    restored = ModelLoader.load_pytorch_model(opt.load_model_path)
    print("model loaded!")

    # Evaluate on the test split with a fixed batch size of 4 on device 0.
    test_set = datainfo.datasets['test']
    evaluator = Tester(test_set, restored, metrics, batch_size=4, device=0)
    result = evaluator.test()
    print("acc=", result)



# Command-line entry point: ``--mode`` selects between training and evaluation.
parser = argparse.ArgumentParser()
parser.add_argument('--mode', required=True, dest="mode",
                    help="run mode: 'train' or 'test'")


args = parser.parse_args()
if args.mode == 'train':
    train(datainfo, model, optimizer, loss, metrics, opt)
elif args.mode == 'test':
    test(datainfo, metrics, opt)
else:
    # --mode is required, so this branch is reached only for an unrecognised
    # value; report it and show the usage text.
    print('no mode specified for model!')
    parser.print_help()

Loading…
Cancel
Save