
Merge branch 'dev0.5.0' into lyhuang-reproduction

tags/v0.4.10
SrWYG (via GitHub), 5 years ago
commit 368733d98c
10 changed files with 966 additions and 66 deletions
  1. fastNLP/modules/aggregator/attention.py (+4, -5)
  2. reproduction/seqence_labelling/ner/model/dilated_cnn.py (+111, -0)
  3. reproduction/text_classification/data/IMDBLoader.py (+33, -3)
  4. reproduction/text_classification/data/sstLoader.py (+98, -0)
  5. reproduction/text_classification/data/yelpLoader.py (+182, -56)
  6. reproduction/text_classification/model/char_cnn.py (+90, -1)
  7. reproduction/text_classification/model/dpcnn.py (+106, -1)
  8. reproduction/text_classification/train_char_cnn.py (+206, -0)
  9. reproduction/text_classification/train_dpcnn.py (+125, -0)
  10. reproduction/text_classification/utils/util_init.py (+11, -0)

fastNLP/modules/aggregator/attention.py (+4, -5)

@@ -19,7 +19,7 @@ class DotAttention(nn.Module):
TODO: fill in the docstring
"""
- def __init__(self, key_size, value_size, dropout=0):
+ def __init__(self, key_size, value_size, dropout=0.0):
super(DotAttention, self).__init__()
self.key_size = key_size
self.value_size = value_size
@@ -37,7 +37,7 @@ class DotAttention(nn.Module):
"""
output = torch.matmul(Q, K.transpose(1, 2)) / self.scale
if mask_out is not None:
- output.masked_fill_(mask_out, -1e8)
+ output.masked_fill_(mask_out, -1e18)
output = self.softmax(output)
output = self.drop(output)
return torch.matmul(output, V)
@@ -67,9 +67,8 @@ class MultiHeadAttention(nn.Module):
self.k_in = nn.Linear(input_size, in_size)
self.v_in = nn.Linear(input_size, in_size)
# follow the paper, do not apply dropout within dot-product
- self.attention = DotAttention(key_size=key_size, value_size=value_size, dropout=0)
+ self.attention = DotAttention(key_size=key_size, value_size=value_size, dropout=dropout)
self.out = nn.Linear(value_size * num_head, input_size)
self.drop = TimestepDropout(dropout)
self.reset_parameters()
def reset_parameters(self):
@@ -105,7 +104,7 @@ class MultiHeadAttention(nn.Module):
# concat all heads, do output linear
atte = atte.permute(1, 2, 0, 3).contiguous().view(batch, sq, -1)
- output = self.drop(self.out(atte))
+ output = self.out(atte)
return output
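For context on the two value changes above (dropout is now applied inside the attention, and masked scores are filled with a much larger negative value), the following is a minimal plain-PyTorch sketch of the masked scaled dot-product that DotAttention computes. The tensor shapes and the dropout rate are made up for illustration and are not part of the fastNLP API.

import torch
import torch.nn.functional as F

Q = torch.randn(2, 4, 8)            # batch, query_len, key_size
K = torch.randn(2, 5, 8)            # batch, key_len, key_size
V = torch.randn(2, 5, 8)            # batch, key_len, value_size
mask_out = torch.zeros(2, 4, 5, dtype=torch.bool)
mask_out[:, :, -1] = True           # pretend the last key position is padding

scores = torch.matmul(Q, K.transpose(1, 2)) / (8 ** 0.5)
scores = scores.masked_fill(mask_out, -1e18)   # very negative => ~0 weight after softmax
attn = F.softmax(scores, dim=-1)
attn = F.dropout(attn, p=0.1, training=True)   # dropout now lives inside the attention
out = torch.matmul(attn, V)
print(out.shape)                    # torch.Size([2, 4, 8])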




reproduction/seqence_labelling/ner/model/dilated_cnn.py (+111, -0)

@@ -0,0 +1,111 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastNLP.modules.decoder import ConditionalRandomField
from fastNLP.modules.encoder import Embedding
from fastNLP.core.utils import seq_len_to_mask
from fastNLP.core.const import Const as C


class IDCNN(nn.Module):
def __init__(self, init_embed, char_embed,
num_cls,
repeats, num_layers, num_filters, kernel_size,
use_crf=False, use_projection=False, block_loss=False,
input_dropout=0.3, hidden_dropout=0.2, inner_dropout=0.0):
super(IDCNN, self).__init__()
self.word_embeddings = Embedding(init_embed)
self.char_embeddings = Embedding(char_embed)
embedding_size = self.word_embeddings.embedding_dim + \
self.char_embeddings.embedding_dim

self.conv0 = nn.Sequential(
nn.Conv1d(in_channels=embedding_size,
out_channels=num_filters,
kernel_size=kernel_size,
stride=1, dilation=1,
padding=kernel_size//2,
bias=True),
nn.ReLU(),
)

block = []
for layer_i in range(num_layers):
dilated = 2 ** layer_i
block.append(nn.Conv1d(
in_channels=num_filters,
out_channels=num_filters,
kernel_size=kernel_size,
stride=1, dilation=dilated,
padding=(kernel_size//2) * dilated,
bias=True))
block.append(nn.ReLU())
self.block = nn.Sequential(*block)

if use_projection:
self.projection = nn.Sequential(
nn.Conv1d(
in_channels=num_filters,
out_channels=num_filters//2,
kernel_size=1,
bias=True),
nn.ReLU(),)
encode_dim = num_filters // 2
else:
self.projection = None
encode_dim = num_filters

self.input_drop = nn.Dropout(input_dropout)
self.hidden_drop = nn.Dropout(hidden_dropout)
self.inner_drop = nn.Dropout(inner_dropout)
self.repeats = repeats
self.out_fc = nn.Conv1d(
in_channels=encode_dim,
out_channels=num_cls,
kernel_size=1,
bias=True)
self.crf = ConditionalRandomField(
num_tags=num_cls) if use_crf else None
self.block_loss = block_loss

def forward(self, words, chars, seq_len, target=None):
e1 = self.word_embeddings(words)
e2 = self.char_embeddings(chars)
x = torch.cat((e1, e2), dim=-1) # b,l,h
mask = seq_len_to_mask(seq_len)

x = x.transpose(1, 2) # b,h,l
last_output = self.conv0(x)
output = []
for repeat in range(self.repeats):
last_output = self.block(last_output)
hidden = self.projection(last_output) if self.projection is not None else last_output
output.append(self.out_fc(hidden))

def compute_loss(y, t, mask):
if self.crf is not None and target is not None:
loss = self.crf(y, t, mask)
else:
t.masked_fill_(mask == 0, -100)
loss = F.cross_entropy(y, t, ignore_index=-100)
return loss

if self.block_loss:
losses = [compute_loss(o, target, mask) for o in output]
loss = sum(losses)
else:
loss = compute_loss(output[-1], target, mask)

scores = output[-1]
if self.crf is not None:
pred = self.crf.viterbi_decode(scores, target, mask)
else:
pred = scores.max(1)[1] * mask.long()

return {
C.LOSS: loss,
C.OUTPUT: pred,
}

def predict(self, words, chars, seq_len):
return self.forward(words, chars, seq_len)[C.OUTPUT]
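A small self-contained sketch (plain PyTorch, with toy sizes) of the dilated-convolution block built in IDCNN above: with padding = (kernel_size // 2) * dilation, each nn.Conv1d keeps the sequence length unchanged while the receptive field grows as 2**layer.

import torch
import torch.nn as nn

num_filters, kernel_size, num_layers = 16, 3, 3
layers = []
for layer_i in range(num_layers):
    dilation = 2 ** layer_i        # 1, 2, 4, ...
    layers += [nn.Conv1d(num_filters, num_filters, kernel_size,
                         dilation=dilation,
                         padding=(kernel_size // 2) * dilation),
               nn.ReLU()]
block = nn.Sequential(*layers)

x = torch.randn(4, num_filters, 50)    # batch, channels, sequence length
print(block(x).shape)                  # torch.Size([4, 16, 50]) -- length preserved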

reproduction/text_classification/data/IMDBLoader.py (+33, -3)

@@ -9,6 +9,7 @@ from fastNLP import Const
# from reproduction.utils import check_dataloader_paths
from functools import partial


class IMDBLoader(DataSetLoader):
"""
Reads the IMDB dataset. The DataSet contains the following fields:
@@ -33,6 +34,7 @@ class IMDBLoader(DataSetLoader):
target = parts[0]
words = parts[1].lower().split()
dataset.append(Instance(words=words, target=target))

if len(dataset)==0:
raise RuntimeError(f"{path} has no valid data.")

@@ -42,19 +44,32 @@ class IMDBLoader(DataSetLoader):
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
- src_embed_opt: EmbeddingOption = None):
- # paths = check_dataloader_paths(paths)
+ src_embed_opt: EmbeddingOption = None,
+ char_level_op=False):
datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset
def wordtochar(words):
chars = []
for word in words:
word = word.lower()
for char in word:
chars.append(char)
return chars

if char_level_op:
for dataset in datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')

src_vocab.index_dataset(*datasets.values(), field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None) \
@@ -78,3 +93,18 @@ class IMDBLoader(DataSetLoader):
dataset.set_target("target")

return info



if __name__=="__main__":
datapath = {"train": "/remote-home/ygwang/IMDB_data/train.csv",
"test": "/remote-home/ygwang/IMDB_data/test.csv"}
datainfo=IMDBLoader().process(datapath,char_level_op=True)
#print(datainfo.datasets["train"])
len_count = 0
for instance in datainfo.datasets["train"]:
len_count += len(instance["chars"])

ave_len = len_count / len(datainfo.datasets["train"])
print(ave_len)
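A quick illustration (not part of the loader itself) of what the wordtochar helper produces when char_level_op=True: every word is lower-cased and split into its characters, and the result is stored in the new 'chars' field.

def wordtochar(words):
    chars = []
    for word in words:
        for char in word.lower():
            chars.append(char)
    return chars

print(wordtochar(["This", "movie", "rocks"]))
# ['t', 'h', 'i', 's', 'm', 'o', 'v', 'i', 'e', 'r', 'o', 'c', 'k', 's']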


reproduction/text_classification/data/sstLoader.py (+98, -0)

@@ -0,0 +1,98 @@
import csv
from typing import Iterable
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataInfo,DataSetLoader
from fastNLP.io.embed_loader import EmbeddingOption
from fastNLP.io.file_reader import _read_json
from typing import Union, Dict
from reproduction.Star_transformer.datasets import EmbedLoader
from reproduction.utils import check_dataloader_paths

class sst2Loader(DataSetLoader):
'''
Data source "SST": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
'''
def __init__(self):
super(sst2Loader, self).__init__()

def _load(self, path: str) -> DataSet:
ds = DataSet()
all_count=0
csv_reader = csv.reader(open(path, encoding='utf-8'),delimiter='\t')
skip_row = 0
for idx,row in enumerate(csv_reader):
if idx<=skip_row:
continue
target = row[1]
words = row[0].split()
ds.append(Instance(words=words,target=target))
all_count+=1
print("all count:", all_count)
return ds

def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
src_embed_opt: EmbeddingOption = None,
char_level_op=False):

paths = check_dataloader_paths(paths)
datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset

def wordtochar(words):
chars=[]
for word in words:
word=word.lower()
for char in word:
chars.append(char)
return chars

input_name, target_name = 'words', 'target'
info.vocabs={}

# split the words into characters
if char_level_op:
for dataset in datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')
src_vocab.index_dataset(*datasets.values(), field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name='target')
tgt_vocab.index_dataset(*datasets.values(), field_name='target')


info.vocabs = {
"words": src_vocab,
"target": tgt_vocab
}

info.datasets = datasets


if src_embed_opt is not None:
embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
info.embeddings['words'] = embed

return info

if __name__=="__main__":
datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv",
"dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"}
datainfo=sst2Loader().process(datapath,char_level_op=True)
#print(datainfo.datasets["train"])
len_count = 0
for instance in datainfo.datasets["train"]:
len_count += len(instance["chars"])

ave_len = len_count / len(datainfo.datasets["train"])
print(ave_len)

reproduction/text_classification/data/yelpLoader.py (+182, -56)

@@ -1,77 +1,203 @@
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
import ast
import csv
from typing import Iterable
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict, List, Iterator
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP import Const
# from reproduction.utils import check_dataloader_paths
from functools import partial
import pandas as pd
from fastNLP.io import JsonLoader
from fastNLP.io.base_loader import DataInfo,DataSetLoader
from fastNLP.io.embed_loader import EmbeddingOption
from fastNLP.io.file_reader import _read_json
from typing import Union, Dict
from reproduction.utils import check_dataloader_paths

class yelpLoader(DataSetLoader):


def get_tokenizer():
try:
import spacy
en = spacy.load('en')
print('use spacy tokenizer')
return lambda x: [w.text for w in en.tokenizer(x)]
except Exception as e:
print('use raw tokenizer')
return lambda x: x.split()

def clean_str(sentence, tokenizer, char_lower=False):
"""
Reads the IMDB dataset. The DataSet contains the following fields:
heavily borrowed from github
https://github.com/LukeZhuang/Hierarchical-Attention-Network/blob/master/yelp-preprocess.ipynb
:param sentence: is a str
:return:
"""
if char_lower:
sentence = sentence.lower()
import re
nonalpnum = re.compile('[^0-9a-zA-Z?!\']+')
words = tokenizer(sentence)
words_collection = []
for word in words:
if word in ['-lrb-', '-rrb-', '<sssss>', '-r', '-l', 'b-']:
continue
tt = nonalpnum.split(word)
t = ''.join(tt)
if t != '':
words_collection.append(t)

words: list(str), the text to be classified
target: str, the label of the text
return words_collection


class yelpLoader(DataSetLoader):
"""
Reads the Yelp_full / Yelp_polarity dataset. The DataSet contains the fields:
words: list(str), the text to be classified
target: str, the label of the text
chars: list(str), the character list before indexing

def __init__(self):
Datasets: yelp_full / yelp_polarity
:param fine_grained: whether to use 5-class (SST-5 style) labels; if ``False``, use 2-class (SST-2 style) labels. Default: ``False``
"""
def __init__(self, fine_grained=False,lower=False):
super(yelpLoader, self).__init__()
tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
'4.0': 'positive', '5.0': 'very positive'}
if not fine_grained:
tag_v['1.0'] = tag_v['2.0']
tag_v['5.0'] = tag_v['4.0']
self.fine_grained = fine_grained
self.tag_v = tag_v
self.lower = lower
self.tokenizer = get_tokenizer()

def _load(self, path):
dataset = DataSet()
data = pd.read_csv(path, header=None, sep=",").values
for line in data:
target = str(line[0])
words = str(line[1]).lower().split()
dataset.append(Instance(words=words, target=target))
if len(dataset)==0:
raise RuntimeError(f"{path} has no valid data.")

return dataset
'''
Reads the Yelp dataset. The DataSet contains the fields:
def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
src_embed_opt: EmbeddingOption = None):
# paths = check_dataloader_paths(paths)
datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset
review_id: str, 22 character unique review id
user_id: str, 22 character unique user id
business_id: str, 22 character business id
useful: int, number of useful votes received
funny: int, number of funny votes received
cool: int, number of cool votes received
date: str, date formatted YYYY-MM-DD
words: list(str), the text to be classified
target: str, the label of the text
Data source: https://www.yelp.com/dataset/download

def _load_json(self, path):
ds = DataSet()
for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
d = ast.literal_eval(d)
d["words"] = d.pop("text").split()
d["target"] = self.tag_v[str(d.pop("stars"))]
ds.append(Instance(**d))
return ds
def _load_yelp2015_broken(self,path):
ds = DataSet()
with open (path,encoding='ISO 8859-1') as f:
row=f.readline()
all_count=0
exp_count=0
while row:
row=row.split("\t\t")
all_count+=1
if len(row)>=3:
words=row[-1].split()
try:
target=self.tag_v[str(row[-2])+".0"]
ds.append(Instance(words=words, target=target))
except KeyError:
exp_count+=1
else:
exp_count+=1
row = f.readline()
print("error sample count:",exp_count)
print("all count:",all_count)
return ds
'''
def _load(self, path):
ds = DataSet()
csv_reader=csv.reader(open(path,encoding='utf-8'))
all_count=0
real_count=0
for row in csv_reader:
all_count+=1
if len(row)==2:
target=self.tag_v[row[0]+".0"]
words=clean_str(row[1],self.tokenizer,self.lower)
if len(words)!=0:
ds.append(Instance(words=words,target=target))
real_count += 1
print("all count:", all_count)
print("real count:", real_count)
return ds

datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')
src_vocab.index_dataset(*datasets.values(), field_name='words')

def process(self, paths: Union[str, Dict[str, str]],
train_ds: Iterable[str] = None,
src_vocab_op: VocabularyOption = None,
tgt_vocab_op: VocabularyOption = None,
embed_opt: EmbeddingOption = None,
char_level_op=False):
paths = check_dataloader_paths(paths)
datasets = {}
info = DataInfo(datasets=self.load(paths))
src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name='target')
tgt_vocab.index_dataset(*datasets.values(), field_name='target')
if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
_train_ds = [info.datasets[name]
for name in train_ds] if train_ds else info.datasets.values()


def wordtochar(words):

info.vocabs = {
"words": src_vocab,
"target": tgt_vocab
}
chars=[]
for word in words:
word=word.lower()
for char in word:
chars.append(char)
return chars

info.datasets = datasets
input_name, target_name = 'words', 'target'
info.vocabs={}
# split the words into characters
if char_level_op:
for dataset in info.datasets.values():
dataset.apply_field(wordtochar, field_name="words",new_field_name='chars')
# if embed_opt is not None:
# embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
# info.embeddings['words'] = embed
else:
src_vocab.from_dataset(*_train_ds, field_name=input_name)
src_vocab.index_dataset(*info.datasets.values(),field_name=input_name, new_field_name=input_name)
info.vocabs[input_name]=src_vocab

if src_embed_opt is not None:
embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
info.embeddings['words'] = embed
tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
tgt_vocab.index_dataset(
*info.datasets.values(),
field_name=target_name, new_field_name=target_name)

for name, dataset in info.datasets.items():
dataset.set_input("words")
dataset.set_target("target")
info.vocabs[target_name]=tgt_vocab

return info

if __name__=="__main__":
testloader=yelpLoader()
# datapath = {"train": "/remote-home/ygwang/yelp_full/train.csv",
# "test": "/remote-home/ygwang/yelp_full/test.csv"}
#datapath={"train": "/remote-home/ygwang/yelp_full/test.csv"}
datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
"test": "/remote-home/ygwang/yelp_polarity/test.csv"}
datainfo=testloader.process(datapath,char_level_op=True)

len_count=0
for instance in datainfo.datasets["train"]:
len_count+=len(instance["chars"])

ave_len=len_count/len(datainfo.datasets["train"])
print(ave_len)
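A short usage sketch of the preprocessing helpers defined in this file (assuming the reproduction package is on sys.path); when spacy is unavailable, get_tokenizer() falls back to plain whitespace splitting.

from reproduction.text_classification.data.yelpLoader import clean_str, get_tokenizer

tokenizer = get_tokenizer()
print(clean_str("Great food -lrb- really ! -rrb- , 5/5 stars.", tokenizer, char_lower=True))
# with the whitespace fallback tokenizer this prints:
# ['great', 'food', 'really', '!', '55', 'stars']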

reproduction/text_classification/model/char_cnn.py (+90, -1)

@@ -1 +1,90 @@
# TODO
'''
@author: https://github.com/ahmedbesbes/character-based-cnn
The char-cnn model code here is adapted from the link above; the main change is adapting it to the fastNLP pipeline.
'''
import torch
import torch.nn as nn
from fastNLP.core.const import Const as C

class CharacterLevelCNN(nn.Module):
def __init__(self, args,embedding):
super(CharacterLevelCNN, self).__init__()

self.config=args.char_cnn_config
self.embedding=embedding

conv_layers = []
for i, conv_layer_parameter in enumerate(self.config['model_parameters'][args.model_size]['conv']):
if i == 0:
#in_channels = args.number_of_characters + len(args.extra_characters)
in_channels = args.embedding_dim
out_channels = conv_layer_parameter[0]
else:
in_channels, out_channels = conv_layer_parameter[0], conv_layer_parameter[0]

if conv_layer_parameter[2] != -1:
conv_layer = nn.Sequential(nn.Conv1d(in_channels,
out_channels,
kernel_size=conv_layer_parameter[1], padding=0),
nn.ReLU(),
nn.MaxPool1d(conv_layer_parameter[2]))
else:
conv_layer = nn.Sequential(nn.Conv1d(in_channels,
out_channels,
kernel_size=conv_layer_parameter[1], padding=0),
nn.ReLU())
conv_layers.append(conv_layer)
self.conv_layers = nn.ModuleList(conv_layers)

input_shape = (args.batch_size, args.max_length,
args.number_of_characters + len(args.extra_characters))
dimension = self._get_conv_output(input_shape)

print('dimension :', dimension)

fc_layer_parameter = self.config['model_parameters'][args.model_size]['fc'][0]
fc_layers = nn.ModuleList([
nn.Sequential(
nn.Linear(dimension, fc_layer_parameter), nn.Dropout(0.5)),
nn.Sequential(nn.Linear(fc_layer_parameter,
fc_layer_parameter), nn.Dropout(0.5)),
nn.Linear(fc_layer_parameter, args.num_classes),
])

self.fc_layers = fc_layers

if args.model_size == 'small':
self._create_weights(mean=0.0, std=0.05)
elif args.model_size == 'large':
self._create_weights(mean=0.0, std=0.02)

def _create_weights(self, mean=0.0, std=0.05):
for module in self.modules():
if isinstance(module, nn.Conv1d) or isinstance(module, nn.Linear):
module.weight.data.normal_(mean, std)

def _get_conv_output(self, shape):
input = torch.rand(shape)
output = input.transpose(1, 2)
# forward pass through conv layers
for i in range(len(self.conv_layers)):
output = self.conv_layers[i](output)

output = output.view(output.size(0), -1)
n_size = output.size(1)
return n_size

def forward(self, chars):
input=self.embedding(chars)
output = input.transpose(1, 2)
# forward pass through conv layers
for i in range(len(self.conv_layers)):
output = self.conv_layers[i](output)

output = output.view(output.size(0), -1)

# forward pass through fc layers
for i in range(len(self.fc_layers)):
output = self.fc_layers[i](output)

return {C.OUTPUT: output}
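For reference, a minimal sketch of the trick _get_conv_output uses above: instead of computing the flattened feature size analytically, a dummy batch is pushed through the convolution stack once and the resulting dimension is read off for the first fully connected layer. The layer sizes below are illustrative, not the full config.

import torch
import torch.nn as nn

conv_layers = nn.Sequential(
    nn.Conv1d(69, 256, kernel_size=7), nn.ReLU(), nn.MaxPool1d(3),
    nn.Conv1d(256, 256, kernel_size=3), nn.ReLU(),
)
dummy = torch.rand(2, 1014, 69)          # batch, max_length, embedding_dim
out = conv_layers(dummy.transpose(1, 2))
dimension = out.view(out.size(0), -1).size(1)
print(dimension)                          # input size of the first nn.Linear layer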

reproduction/text_classification/model/dpcnn.py (+106, -1)

@@ -1 +1,106 @@
# TODO

import torch
import torch.nn as nn
from fastNLP.modules.utils import get_embeddings
from fastNLP.core import Const as C


class DPCNN(nn.Module):
def __init__(self, init_embed, num_cls, n_filters=256,
kernel_size=3, n_layers=7, embed_dropout=0.1, cls_dropout=0.1):
super().__init__()
self.region_embed = RegionEmbedding(
init_embed, out_dim=n_filters, kernel_sizes=[1, 3, 5])

embed_dim = self.region_embed.embedding_dim
self.conv_list = nn.ModuleList()
for i in range(n_layers):
self.conv_list.append(nn.Sequential(
nn.ReLU(),

nn.Conv1d(n_filters, n_filters, kernel_size,
padding=kernel_size//2),
nn.Conv1d(n_filters, n_filters, kernel_size,
padding=kernel_size//2),
))
self.pool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
self.embed_drop = nn.Dropout(embed_dropout)
self.classfier = nn.Sequential(
nn.Dropout(cls_dropout),

nn.Linear(n_filters, num_cls),
)
self.reset_parameters()


def reset_parameters(self):
for m in self.modules():
if isinstance(m, (nn.Conv1d, nn.Conv2d, nn.Linear)):
nn.init.normal_(m.weight, mean=0, std=0.01)
if m.bias is not None:
nn.init.normal_(m.bias, mean=0, std=0.01)


def forward(self, words, seq_len=None):
words = words.long()
# get region embeddings
x = self.region_embed(words)
x = self.embed_drop(x)

# not pooling on first conv
x = self.conv_list[0](x) + x
for conv in self.conv_list[1:]:
x = self.pool(x)
x = conv(x) + x

# B, C, L => B, C
x, _ = torch.max(x, dim=2)
x = self.classfier(x)
return {C.OUTPUT: x}


def predict(self, words, seq_len=None):
x = self.forward(words, seq_len)[C.OUTPUT]
return {C.OUTPUT: torch.argmax(x, 1)}

class RegionEmbedding(nn.Module):
def __init__(self, init_embed, out_dim=300, kernel_sizes=None):
super().__init__()
if kernel_sizes is None:
kernel_sizes = [5, 9]

assert isinstance(
kernel_sizes, list), 'kernel_sizes should be List(int)'

self.embed = get_embeddings(init_embed)
try:
embed_dim = self.embed.embedding_dim
except Exception:
embed_dim = self.embed.embed_size
self.region_embeds = nn.ModuleList()
for ksz in kernel_sizes:
self.region_embeds.append(nn.Sequential(
nn.Conv1d(embed_dim, embed_dim, ksz, padding=ksz // 2),
))
self.linears = nn.ModuleList([nn.Conv1d(embed_dim, out_dim, 1)
for _ in range(len(kernel_sizes))])
self.embedding_dim = embed_dim

def forward(self, x):
x = self.embed(x)
x = x.transpose(1, 2)
# B, C, L
out = 0
for conv, fc in zip(self.region_embeds, self.linears[1:]):
conv_i = conv(x)
out = out + fc(conv_i)
# B, C, L
return out


if __name__ == '__main__':
x = torch.randint(0, 10000, size=(5, 15), dtype=torch.long)
model = DPCNN((10000, 300), 20)
y = model(x)[C.OUTPUT]
print(y.size(), y.mean(1), y.std(1))
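A short sketch (plain PyTorch, toy shapes) of the repeated DPCNN block used in forward() above: a stride-2 max pool halves the sequence length, then two convolutions with a residual connection refine the downsampled features.

import torch
import torch.nn as nn

n_filters, kernel_size = 256, 3
pool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
conv = nn.Sequential(
    nn.ReLU(),
    nn.Conv1d(n_filters, n_filters, kernel_size, padding=kernel_size // 2),
    nn.Conv1d(n_filters, n_filters, kernel_size, padding=kernel_size // 2),
)

x = torch.randn(2, n_filters, 64)
x = pool(x)              # length 64 -> 32
x = conv(x) + x          # residual connection keeps shapes aligned
print(x.shape)           # torch.Size([2, 256, 32])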


reproduction/text_classification/train_char_cnn.py (+206, -0)

@@ -0,0 +1,206 @@
# First, the following paths need to be added as environment variables; since this is currently only open to internal testing, they have to be declared manually.
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'

import sys
sys.path.append('../..')
from fastNLP.core.const import Const as C
import torch.nn as nn
from fastNLP.io.dataset_loader import SSTLoader
from data.yelpLoader import yelpLoader
from data.sstLoader import sst2Loader
from data.IMDBLoader import IMDBLoader
from model.char_cnn import CharacterLevelCNN
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.modules.encoder.embedding import CNNCharEmbedding,StaticEmbedding,StackEmbedding,LSTMCharEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer
from torch.optim import SGD
from torch.autograd import Variable
import torch
from fastNLP import BucketSampler

##hyper
# TODO: add fastNLP logging here
class Config():
model_dir_or_name="en-base-uncased"
embedding_grad = False
bert_embedding_larers= '4,-2,-1'
train_epoch= 50
num_classes=2
task= "IMDB"
#yelp_p
datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
"test": "/remote-home/ygwang/yelp_polarity/test.csv"}
#IMDB
#datapath = {"train": "/remote-home/ygwang/IMDB_data/train.csv",
# "test": "/remote-home/ygwang/IMDB_data/test.csv"}
# sst
# datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv",
# "dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"}

lr=0.01
batch_size=128
model_size="large"
number_of_characters=69
extra_characters=''
max_length=1014

char_cnn_config={
"alphabet": {
"en": {
"lower": {
"alphabet": "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
"number_of_characters": 69
},
"both": {
"alphabet": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
"number_of_characters": 95
}
}
},
"model_parameters": {
"small": {
"conv": [
# in order: channels, kernel_size, max-pooling size
[256,7,3],
[256,7,3],
[256,3,-1],
[256,3,-1],
[256,3,-1],
[256,3,3]
],
"fc": [1024,1024]
},
"large":{
"conv":[
[1024, 7, 3],
[1024, 7, 3],
[1024, 3, -1],
[1024, 3, -1],
[1024, 3, -1],
[1024, 3, 3]
],
"fc": [2048,2048]
}
},
"data": {
"text_column": "SentimentText",
"label_column": "Sentiment",
"max_length": 1014,
"num_of_classes": 2,
"encoding": None,
"chunksize": 50000,
"max_rows": 100000,
"preprocessing_steps": ["lower", "remove_hashtags", "remove_urls", "remove_user_mentions"]
},
"training": {
"batch_size": 128,
"learning_rate": 0.01,
"epochs": 10,
"optimizer": "sgd"
}
}
ops=Config


## 1. Task-related setup: load the dataInfo with a dataloader
dataloader=sst2Loader()
dataloader=IMDBLoader()
#dataloader=yelpLoader(fine_grained=True)
datainfo=dataloader.process(ops.datapath,char_level_op=True)
char_vocab=ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"]
ops.number_of_characters=len(char_vocab)
ops.embedding_dim=ops.number_of_characters

#chartoindex
def chartoindex(chars):
max_seq_len=ops.max_length
zero_index=len(char_vocab)
char_index_list=[]
for char in chars:
if char in char_vocab:
char_index_list.append(char_vocab.index(char))
else:
# both <unk> and <pad> use the last index for their embedding
char_index_list.append(zero_index)
if len(char_index_list) > max_seq_len:
char_index_list = char_index_list[:max_seq_len]
elif 0 < len(char_index_list) < max_seq_len:
char_index_list = char_index_list+[zero_index]*(max_seq_len-len(char_index_list))
elif len(char_index_list) == 0:
char_index_list=[zero_index]*max_seq_len
return char_index_list

for dataset in datainfo.datasets.values():
dataset.apply_field(chartoindex,field_name='chars',new_field_name='chars')

datainfo.datasets['train'].set_input('chars')
datainfo.datasets['test'].set_input('chars')
datainfo.datasets['train'].set_target('target')
datainfo.datasets['test'].set_target('target')

## 2. Define/assemble the model. This part is flexible: for a model already packaged by fastNLP, such as CNNText, simply instantiate and call it. ModelFactory below is only a pseudo-framework placeholder for building a model that follows fastNLP's input/output conventions.
class ModelFactory(nn.Module):
"""
Assembles the embedding, encoder and decoder, and defines the forward pass.

:param embedding: embedding model
:param encoder: encoder model
:param decoder: decoder model

"""
def __init__(self, embedding, encoder, decoder, **kwargs):
super(ModelFactory,self).__init__()
self.embedding=embedding
self.encoder=encoder
self.decoder=decoder

def forward(self,x):
return {C.OUTPUT:None}

## 2. Or directly reuse a model from fastNLP
#vocab=datainfo.vocabs['words']
vocab_label=datainfo.vocabs['target']
'''
# emded_char=CNNCharEmbedding(vocab)
# embed_word = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True)
# embedding=StackEmbedding([emded_char, embed_word])
# cnn_char_embed = CNNCharEmbedding(vocab)
# lstm_char_embed = LSTMCharEmbedding(vocab)
# embedding = StackEmbedding([cnn_char_embed, lstm_char_embed])
'''
#one-hot embedding
embedding_weight= Variable(torch.zeros(len(char_vocab)+1, len(char_vocab)))

for i in range(len(char_vocab)):
embedding_weight[i][i]=1
embedding=nn.Embedding(num_embeddings=len(char_vocab)+1,embedding_dim=len(char_vocab),padding_idx=len(char_vocab),_weight=embedding_weight)
for para in embedding.parameters():
para.requires_grad=False
# CNNText is far too simple for this task
#model=CNNText(init_embed=embedding, num_classes=ops.num_classes)
model=CharacterLevelCNN(ops,embedding)

## 3. Declare the loss, metric and optimizer
loss=CrossEntropyLoss
metric=AccuracyMetric
optimizer= SGD([param for param in model.parameters() if param.requires_grad==True], lr=ops.lr)

## 4. Define the train routine
def train(model,datainfo,loss,metrics,optimizer,num_epochs=100):
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss(target='target'),
metrics=[metrics(target='target')], dev_data=datainfo.datasets['test'], device=0, check_code_level=-1,
n_epochs=num_epochs)
print(trainer.train())



if __name__=="__main__":
#print(vocab_label)

#print(datainfo.datasets["train"])
train(model,datainfo,loss,metric,optimizer,num_epochs=ops.train_epoch)
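A tiny illustration (not part of the training script, and with a toy max_seq_len) of what chartoindex does: characters are mapped to their position in the alphabet string, unknown characters map to the extra last index, and sequences are truncated or padded to max_length with that same index.

char_vocab = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
zero_index = len(char_vocab)      # 69, shared by <unk> and <pad>
max_seq_len = 8                   # toy value; the script uses 1014

chars = list("good?!")
indices = [char_vocab.index(c) if c in char_vocab else zero_index for c in chars]
indices = (indices + [zero_index] * max_seq_len)[:max_seq_len]
print(indices)   # [6, 14, 14, 3, 41, 40, 69, 69]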

reproduction/text_classification/train_dpcnn.py (+125, -0)

@@ -0,0 +1,125 @@
# First, the following paths need to be added as environment variables; since this is currently only open to internal testing, they have to be declared manually.

import torch.cuda
from fastNLP.core.utils import cache_results
from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
from fastNLP.core.trainer import Trainer
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from reproduction.text_classification.model.dpcnn import DPCNN
from data.yelpLoader import yelpLoader
import torch.nn as nn
from fastNLP.core import LRScheduler
from fastNLP.core.const import Const as C
from fastNLP.core.vocabulary import VocabularyOption
from utils.util_init import set_rng_seeds
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"



# hyper

class Config():
seed = 12345
model_dir_or_name = "dpcnn-yelp-p"
embedding_grad = True
train_epoch = 30
batch_size = 100
num_classes = 2
task = "yelp_p"
#datadir = '/remote-home/yfshao/workdir/datasets/SST'
datadir = '/remote-home/yfshao/workdir/datasets/yelp_polarity'
#datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
datafile = {"train": "train.csv", "test": "test.csv"}
lr = 1e-3
src_vocab_op = VocabularyOption()
embed_dropout = 0.3
cls_dropout = 0.1
weight_decay = 1e-4

def __init__(self):
self.datapath = {k: os.path.join(self.datadir, v)
for k, v in self.datafile.items()}


ops = Config()

set_rng_seeds(ops.seed)
print('RNG SEED: {}'.format(ops.seed))

# 1. Task-related setup: load the dataInfo with a dataloader

#datainfo=SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
@cache_results(ops.model_dir_or_name+'-data-cache')
def load_data():
datainfo = yelpLoader(fine_grained=True, lower=True).process(
paths=ops.datapath, train_ds=['train'], src_vocab_op=ops.src_vocab_op)
for ds in datainfo.datasets.values():
ds.apply_field(len, C.INPUT, C.INPUT_LEN)
ds.set_input(C.INPUT, C.INPUT_LEN)
ds.set_target(C.TARGET)
return datainfo

datainfo = load_data()

# 2. Or directly reuse a model from fastNLP

vocab = datainfo.vocabs['words']
# embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)])
#embedding = StaticEmbedding(vocab)

embedding = StaticEmbedding(
vocab, model_dir_or_name='en-word2vec-300', requires_grad=ops.embedding_grad,
normalize=False
)

print(len(datainfo.datasets['train']))
print(len(datainfo.datasets['test']))
print(datainfo.datasets['train'][0])


print(len(vocab))
print(len(datainfo.vocabs['target']))


model = DPCNN(init_embed=embedding, num_cls=ops.num_classes,
embed_dropout=ops.embed_dropout, cls_dropout=ops.cls_dropout)
print(model)

# 3. Declare the loss, metric and optimizer
loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
optimizer = SGD([param for param in model.parameters() if param.requires_grad == True],
lr=ops.lr, momentum=0.9, weight_decay=ops.weight_decay)

callbacks = []
callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))
# callbacks.append
# LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch <
# ops.train_epoch * 0.8 else ops.lr * 0.1))
# )

# callbacks.append(
# FitlogCallback(data=datainfo.datasets, verbose=1)
# )

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print(device)

# 4. Define the train routine
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
metrics=[metric],
dev_data=datainfo.datasets['test'], device=device,
check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks,
n_epochs=ops.train_epoch, num_workers=4)



if __name__ == "__main__":
print(trainer.train())
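A small sketch (plain PyTorch, outside fastNLP's LRScheduler callback wrapper) of the cosine-annealing schedule attached above: with T_max=5 the learning rate decays from the initial value toward zero over 5 scheduler steps and then rises back again. The toy parameter and step count are for illustration only.

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR

param = torch.nn.Parameter(torch.zeros(1))
optimizer = SGD([param], lr=1e-3, momentum=0.9)
scheduler = CosineAnnealingLR(optimizer, T_max=5)

for step in range(10):
    optimizer.step()
    scheduler.step()
    print(step, optimizer.param_groups[0]['lr'])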


reproduction/text_classification/utils/util_init.py (+11, -0)

@@ -0,0 +1,11 @@
import numpy
import torch
import random


def set_rng_seeds(seed):
random.seed(seed)
numpy.random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# print('RNG_SEED {}'.format(seed))
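Usage sketch (assuming the reproduction package is importable): seeding all RNG sources before building the model makes runs repeatable on the same hardware and library versions.

from reproduction.text_classification.utils.util_init import set_rng_seeds
import torch

set_rng_seeds(12345)
print(torch.randn(2))   # same values on every run with the same seed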
