
[add] dataloader: yelp/sst2/IMDB/MTL16

[add] model: char_cnn dpcnn
[test] train_char_cnn, train_dpcnn; each dataloader is tested in its own main method
tags/v0.4.10
wyg 5 years ago
commit b02a91ea01
7 changed files with 868 additions and 31 deletions
  1. +107  -0   reproduction/text_classification/data/IMDBLoader.py
  2. +98   -0   reproduction/text_classification/data/sstLoader.py
  3. +155  -29  reproduction/text_classification/data/yelpLoader.py
  4. +90   -1   reproduction/text_classification/model/char_cnn.py
  5. +111  -1   reproduction/text_classification/model/dpcnn.py
  6. +206  -0   reproduction/text_classification/train_char_cnn.py
  7. +101  -0   reproduction/text_classification/train_dpcnn.py

+107 -0  reproduction/text_classification/data/IMDBLoader.py

@@ -0,0 +1,107 @@
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict, List, Iterator
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP import Const
# from reproduction.utils import check_dataloader_paths
from functools import partial

class IMDBLoader(DataSetLoader):
"""
Load the IMDB dataset. The resulting DataSet contains the following fields:

words: list(str), the text to be classified
target: str, the label of the text


"""

def __init__(self):
super(IMDBLoader, self).__init__()

def _load(self, path):
dataset = DataSet()
with open(path, 'r', encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
parts = line.split('\t')
target = parts[0]
words = parts[1].split()
dataset.append(Instance(words=words, target=target))
if len(dataset)==0:
raise RuntimeError(f"{path} has no valid data.")

return dataset
def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
src_embed_opt: EmbeddingOption = None,
char_level_op=False):
# paths = check_dataloader_paths(paths)

datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset

def wordtochar(words):
chars = []
for word in words:
word = word.lower()
for char in word:
chars.append(char)
return chars

if char_level_op:
for dataset in datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')
# src_vocab.from_dataset(datasets['train'], datasets["dev"], datasets["test"], field_name='words')
src_vocab.index_dataset(*datasets.values(), field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name='target')
tgt_vocab.index_dataset(*datasets.values(), field_name='target')

info.vocabs = {
"words": src_vocab,
"target": tgt_vocab
}

info.datasets = datasets

if src_embed_opt is not None:
embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
info.embeddings['words'] = embed

for name, dataset in info.datasets.items():
dataset.set_input("words")
dataset.set_target("target")

return info

if __name__=="__main__":
datapath = {"train": "/remote-home/ygwang/IMDB_data/train.csv",
"test": "/remote-home/ygwang/IMDB_data/test.csv"}
datainfo=IMDBLoader().process(datapath,char_level_op=True)
#print(datainfo.datasets["train"])
len_count = 0
for instance in datainfo.datasets["train"]:
len_count += len(instance["chars"])

ave_len = len_count / len(datainfo.datasets["train"])
print(ave_len)
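The loader above never states its on-disk format, so here is a minimal sketch (not part of the commit) of what _load expects: one example per line, with the label and the text separated by a tab. The toy file path, the labels, and the assumption that the inherited load() dispatches to _load (as process() above does) are all illustrative.

# Illustrative sketch only; file path and labels are made up.
from data.IMDBLoader import IMDBLoader   # assumes reproduction/text_classification as working directory

with open("/tmp/imdb_toy.tsv", "w", encoding="utf-8") as f:
    f.write("pos\tthis movie was surprisingly good\n")
    f.write("neg\ta complete waste of two hours\n")

ds = IMDBLoader().load("/tmp/imdb_toy.tsv")   # DataSet with 'words' and 'target' fields
print(ds[0]["words"], ds[0]["target"])        # ['this', 'movie', ...] pos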

+98 -0  reproduction/text_classification/data/sstLoader.py

@@ -0,0 +1,98 @@
import csv
from typing import Iterable
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataInfo,DataSetLoader
from fastNLP.io.embed_loader import EmbeddingOption
from fastNLP.io.file_reader import _read_json
from typing import Union, Dict
from reproduction.Star_transformer.datasets import EmbedLoader
from reproduction.utils import check_dataloader_paths

class sst2Loader(DataSetLoader):
'''
数据来源"SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
'''
def __init__(self):
super(sst2Loader, self).__init__()

def _load(self, path: str) -> DataSet:
ds = DataSet()
all_count=0
csv_reader = csv.reader(open(path, encoding='utf-8'),delimiter='\t')
skip_row = 0
for idx,row in enumerate(csv_reader):
if idx<=skip_row:
continue
target = row[1]
words = row[0].split()
ds.append(Instance(words=words,target=target))
all_count+=1
print("all count:", all_count)
return ds

def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
src_embed_opt: EmbeddingOption = None,
char_level_op=False):

paths = check_dataloader_paths(paths)
datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset

def wordtochar(words):
chars=[]
for word in words:
word=word.lower()
for char in word:
chars.append(char)
return chars

input_name, target_name = 'words', 'target'
info.vocabs={}

# split the words into characters
if char_level_op:
for dataset in datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')
src_vocab.index_dataset(*datasets.values(), field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name='target')
tgt_vocab.index_dataset(*datasets.values(), field_name='target')


info.vocabs = {
"words": src_vocab,
"target": tgt_vocab
}

info.datasets = datasets


if src_embed_opt is not None:
embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
info.embeddings['words'] = embed

return info

if __name__=="__main__":
datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv",
"dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"}
datainfo=sst2Loader().process(datapath,char_level_op=True)
#print(datainfo.datasets["train"])
len_count = 0
for instance in datainfo.datasets["train"]:
len_count += len(instance["chars"])

ave_len = len_count / len(datainfo.datasets["train"])
print(ave_len)
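For the same reason, a minimal sketch (not part of the commit) of the TSV layout sst2Loader._load assumes: GLUE-style SST-2 files with one "sentence<TAB>label" row per example, where row 0 is a header and is skipped because skip_row is 0 and the check is idx <= skip_row. The toy path and the use of the inherited load() are assumptions.

# Illustrative sketch only; the file content mimics GLUE SST-2 formatting.
from data.sstLoader import sst2Loader

with open("/tmp/sst2_toy.tsv", "w", encoding="utf-8") as f:
    f.write("sentence\tlabel\n")                 # header row, skipped by the reader
    f.write("a gorgeous , witty film\t1\n")
    f.write("flat and uninvolving\t0\n")

ds = sst2Loader().load("/tmp/sst2_toy.tsv")      # prints "all count: 2"
print(len(ds), ds[0]["target"])                  # 2 1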

+155 -29  reproduction/text_classification/data/yelpLoader.py

@@ -1,8 +1,10 @@
 import ast
+import csv
+from typing import Iterable
 from fastNLP import DataSet, Instance, Vocabulary
 from fastNLP.core.vocabulary import VocabularyOption
 from fastNLP.io import JsonLoader
-from fastNLP.io.base_loader import DataInfo
+from fastNLP.io.base_loader import DataInfo,DataSetLoader
 from fastNLP.io.embed_loader import EmbeddingOption
 from fastNLP.io.file_reader import _read_json
 from typing import Union, Dict
@@ -10,27 +12,44 @@ from reproduction.Star_transformer.datasets import EmbedLoader
 from reproduction.utils import check_dataloader_paths


-class yelpLoader(JsonLoader):
-    """
-    Load the Yelp dataset. The resulting DataSet contains the fields:
-
-        review_id: str, 22 character unique review id
-        user_id: str, 22 character unique user id
-        business_id: str, 22 character business id
-        useful: int, number of useful votes received
-        funny: int, number of funny votes received
-        cool: int, number of cool votes received
-        date: str, date formatted YYYY-MM-DD
-        words: list(str), the text to be classified
-        target: str, the label of the text
-    Data source: https://www.yelp.com/dataset/download
-    :param fine_grained: whether to use the SST-5 label standard; if ``False``, SST-2 is used. Default: ``False``
-    """
-    def __init__(self, fine_grained=False):
+def clean_str(sentence,char_lower=False):
+    """
+    heavily borrowed from github
+    https://github.com/LukeZhuang/Hierarchical-Attention-Network/blob/master/yelp-preprocess.ipynb
+    :param sentence: is a str
+    :return:
+    """
+    if char_lower:
+        sentence=sentence.lower()
+    import re
+    nonalpnum = re.compile('[^0-9a-zA-Z?!\']+')
+    words = sentence.split()
+    words_collection = []
+    for word in words:
+        if word in ['-lrb-', '-rrb-', '<sssss>', '-r', '-l', 'b-']:
+            continue
+        tt = nonalpnum.split(word)
+        t = ''.join(tt)
+        if t != '':
+            words_collection.append(t)
+
+    return words_collection
+
+
+class yelpLoader(DataSetLoader):
+    """
+    Load the Yelp_full / Yelp_polarity dataset. The resulting DataSet contains the fields:
+
+        words: list(str), the text to be classified
+        target: str, the label of the text
+        chars: list(str), the un-indexed character sequence
+
+    Data source: https://www.yelp.com/dataset/download
+    Datasets: yelp_full / yelp_polarity
+    :param fine_grained: whether to use the SST-5 label standard; if ``False``, SST-2 is used. Default: ``False``
+    """
+    def __init__(self, fine_grained=False,lower=False):
         super(yelpLoader, self).__init__()
         tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
                  '4.0': 'positive', '5.0': 'very positive'}
@@ -39,8 +58,24 @@ class yelpLoader(JsonLoader):
             tag_v['5.0'] = tag_v['4.0']
         self.fine_grained = fine_grained
         self.tag_v = tag_v
+        self.lower=lower

-    def _load(self, path):
+    '''
+    Load the Yelp dataset. The resulting DataSet contains the fields:
+        review_id: str, 22 character unique review id
+        user_id: str, 22 character unique user id
+        business_id: str, 22 character business id
+        useful: int, number of useful votes received
+        funny: int, number of funny votes received
+        cool: int, number of cool votes received
+        date: str, date formatted YYYY-MM-DD
+        words: list(str), the text to be classified
+        target: str, the label of the text
+    Data source: https://www.yelp.com/dataset/download
+    def _load_json(self, path):
         ds = DataSet()
         for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
             d = ast.literal_eval(d)
@@ -48,21 +83,112 @@ class yelpLoader(JsonLoader):
d["target"] = self.tag_v[str(d.pop("stars"))] d["target"] = self.tag_v[str(d.pop("stars"))]
ds.append(Instance(**d)) ds.append(Instance(**d))
return ds return ds
def _load_yelp2015_broken(self,path):
ds = DataSet()
with open (path,encoding='ISO 8859-1') as f:
row=f.readline()
all_count=0
exp_count=0
while row:
row=row.split("\t\t")
all_count+=1
if len(row)>=3:
words=row[-1].split()
try:
target=self.tag_v[str(row[-2])+".0"]
ds.append(Instance(words=words, target=target))
except KeyError:
exp_count+=1
else:
exp_count+=1
row = f.readline()
print("error sample count:",exp_count)
print("all count:",all_count)
return ds
'''
def _load(self, path):
ds = DataSet()
csv_reader=csv.reader(open(path,encoding='utf-8'))
all_count=0
real_count=0
for row in csv_reader:
all_count+=1
if len(row)==2:
target=self.tag_v[row[0]+".0"]
words=clean_str(row[1],self.lower)
if len(words)!=0:
ds.append(Instance(words=words,target=target))
real_count += 1
print("all count:", all_count)
print("real count:", real_count)
return ds



def process(self, paths: Union[str, Dict[str, str]], vocab_opt: VocabularyOption = None,
embed_opt: EmbeddingOption = None):
def process(self, paths: Union[str, Dict[str, str]],
train_ds: Iterable[str] = None,
src_vocab_op: VocabularyOption = None,
tgt_vocab_op: VocabularyOption = None,
embed_opt: EmbeddingOption = None,
char_level_op=False):
paths = check_dataloader_paths(paths) paths = check_dataloader_paths(paths)
datasets = {} datasets = {}
info = DataInfo()
vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset
vocab.from_dataset(dataset, field_name="words")
info.vocabs = vocab
info.datasets = datasets
if embed_opt is not None:
embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
info.embeddings['words'] = embed
info = DataInfo(datasets=self.load(paths))
src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
_train_ds = [info.datasets[name]
for name in train_ds] if train_ds else info.datasets.values()
#vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
# for name, path in paths.items():
# dataset = self.load(path)
# datasets[name] = dataset
# vocab.from_dataset(dataset, field_name="words")
# info.vocabs = vocab
# info.datasets = datasets

def wordtochar(words):
chars=[]
for word in words:
word=word.lower()
for char in word:
chars.append(char)
return chars

input_name, target_name = 'words', 'target'
info.vocabs={}
#就分隔为char形式
if char_level_op:
for dataset in info.datasets.values():
dataset.apply_field(wordtochar, field_name="words",new_field_name='chars')
# if embed_opt is not None:
# embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
# info.embeddings['words'] = embed
else:
src_vocab.from_dataset(*_train_ds, field_name=input_name)
src_vocab.index_dataset(*info.datasets.values(),field_name=input_name, new_field_name=input_name)
info.vocabs[input_name]=src_vocab

tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
tgt_vocab.index_dataset(
*info.datasets.values(),
field_name=target_name, new_field_name=target_name)
info.vocabs[target_name]=tgt_vocab

return info return info


+if __name__=="__main__":
+    testloader=yelpLoader()
+    # datapath = {"train": "/remote-home/ygwang/yelp_full/train.csv",
+    #             "test": "/remote-home/ygwang/yelp_full/test.csv"}
+    #datapath={"train": "/remote-home/ygwang/yelp_full/test.csv"}
+    datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
+                "test": "/remote-home/ygwang/yelp_polarity/test.csv"}
+    datainfo=testloader.process(datapath,char_level_op=True)
+
+    len_count=0
+    for instance in datainfo.datasets["train"]:
+        len_count+=len(instance["chars"])
+
+    ave_len=len_count/len(datainfo.datasets["train"])
+    print(ave_len)
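A quick illustration (not from the commit) of what clean_str does to raw review text: with char_lower=True it lowercases, it drops tokens like -lrb-, -rrb- and <sssss>, and it strips every character outside 0-9, a-z, A-Z, ?, ! and ' from the remaining tokens.

# Expected outputs are inferred from the clean_str definition above.
from data.yelpLoader import clean_str

print(clean_str("The food -lrb- mostly -rrb- was great!!", char_lower=True))
# -> ['the', 'food', 'mostly', 'was', 'great!!']
print(clean_str("Worst. Service. Ever... 0/10"))
# -> ['Worst', 'Service', 'Ever', '010']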

+90 -1  reproduction/text_classification/model/char_cnn.py

@@ -1 +1,90 @@
# TODO
'''
@author: https://github.com/ahmedbesbes/character-based-cnn
The char-cnn model code here is adapted from the repository above; the main change is adapting it to the fastNLP pipeline.
'''
import torch
import torch.nn as nn
from fastNLP.core.const import Const as C

class CharacterLevelCNN(nn.Module):
def __init__(self, args,embedding):
super(CharacterLevelCNN, self).__init__()

self.config=args.char_cnn_config
self.embedding=embedding

conv_layers = []
for i, conv_layer_parameter in enumerate(self.config['model_parameters'][args.model_size]['conv']):
if i == 0:
#in_channels = args.number_of_characters + len(args.extra_characters)
in_channels = args.embedding_dim
out_channels = conv_layer_parameter[0]
else:
in_channels, out_channels = conv_layer_parameter[0], conv_layer_parameter[0]

if conv_layer_parameter[2] != -1:
conv_layer = nn.Sequential(nn.Conv1d(in_channels,
out_channels,
kernel_size=conv_layer_parameter[1], padding=0),
nn.ReLU(),
nn.MaxPool1d(conv_layer_parameter[2]))
else:
conv_layer = nn.Sequential(nn.Conv1d(in_channels,
out_channels,
kernel_size=conv_layer_parameter[1], padding=0),
nn.ReLU())
conv_layers.append(conv_layer)
self.conv_layers = nn.ModuleList(conv_layers)

input_shape = (args.batch_size, args.max_length,
args.number_of_characters + len(args.extra_characters))
dimension = self._get_conv_output(input_shape)

print('dimension :', dimension)

fc_layer_parameter = self.config['model_parameters'][args.model_size]['fc'][0]
fc_layers = nn.ModuleList([
nn.Sequential(
nn.Linear(dimension, fc_layer_parameter), nn.Dropout(0.5)),
nn.Sequential(nn.Linear(fc_layer_parameter,
fc_layer_parameter), nn.Dropout(0.5)),
nn.Linear(fc_layer_parameter, args.num_classes),
])

self.fc_layers = fc_layers

if args.model_size == 'small':
self._create_weights(mean=0.0, std=0.05)
elif args.model_size == 'large':
self._create_weights(mean=0.0, std=0.02)

def _create_weights(self, mean=0.0, std=0.05):
for module in self.modules():
if isinstance(module, nn.Conv1d) or isinstance(module, nn.Linear):
module.weight.data.normal_(mean, std)

def _get_conv_output(self, shape):
input = torch.rand(shape)
output = input.transpose(1, 2)
# forward pass through conv layers
for i in range(len(self.conv_layers)):
output = self.conv_layers[i](output)

output = output.view(output.size(0), -1)
n_size = output.size(1)
return n_size

def forward(self, chars):
input=self.embedding(chars)
output = input.transpose(1, 2)
# forward pass through conv layers
for i in range(len(self.conv_layers)):
output = self.conv_layers[i](output)

output = output.view(output.size(0), -1)

# forward pass through fc layers
for i in range(len(self.fc_layers)):
output = self.fc_layers[i](output)

return {C.OUTPUT: output}
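CharacterLevelCNN is configured entirely through the args object. The sketch below (not part of the commit) mirrors the Config class in train_char_cnn.py further down and shows the attributes the constructor actually reads: char_cnn_config, model_size, embedding_dim, batch_size, max_length, number_of_characters, extra_characters and num_classes, plus a character embedding whose output dimension must equal embedding_dim. The SimpleNamespace and the toy values are assumptions for illustration.

# Minimal sketch; the "small" conv/fc settings are copied from train_char_cnn.py.
import torch
import torch.nn as nn
from types import SimpleNamespace
from fastNLP.core.const import Const as C
from model.char_cnn import CharacterLevelCNN   # assumes reproduction/text_classification as cwd

n_chars = 69
args = SimpleNamespace(
    char_cnn_config={"model_parameters": {"small": {
        "conv": [[256, 7, 3], [256, 7, 3], [256, 3, -1],
                 [256, 3, -1], [256, 3, -1], [256, 3, 3]],
        "fc": [1024, 1024]}}},
    model_size="small", embedding_dim=n_chars, batch_size=4, max_length=1014,
    number_of_characters=n_chars, extra_characters='', num_classes=2)

embedding = nn.Embedding(n_chars + 1, n_chars, padding_idx=n_chars)  # one row per character + a shared <pad>/<unk> row
model = CharacterLevelCNN(args, embedding)
chars = torch.randint(0, n_chars + 1, (4, 1014))   # (batch, max_length) character indices
print(model(chars)[C.OUTPUT].shape)                # torch.Size([4, 2])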

+111 -1  reproduction/text_classification/model/dpcnn.py

@@ -1 +1,111 @@
# TODO


import torch
import torch.nn as nn
from fastNLP.modules.utils import get_embeddings
from fastNLP.core import Const as C



class DPCNN(nn.Module):

def __init__(self, init_embed, num_cls, n_filters=256, kernel_size=3, n_layers=7, embed_dropout=0.1, dropout=0.1):
super().__init__()
self.region_embed = RegionEmbedding(init_embed, out_dim=n_filters, kernel_sizes=[3, 5, 9])
embed_dim = self.region_embed.embedding_dim
self.conv_list = nn.ModuleList()
for i in range(n_layers):
self.conv_list.append(nn.Sequential(
nn.ReLU(),
nn.Conv1d(n_filters, n_filters, kernel_size, padding=kernel_size//2),
nn.Conv1d(n_filters, n_filters, kernel_size, padding=kernel_size//2),
))

self.pool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
self.embed_drop = nn.Dropout(embed_dropout)
self.classfier = nn.Sequential(
nn.Dropout(dropout),
nn.Linear(n_filters, num_cls),
)
self.reset_parameters()



def reset_parameters(self):
for m in self.modules():
if isinstance(m, (nn.Conv1d, nn.Conv2d, nn.Linear)):
nn.init.normal_(m.weight, mean=0, std=0.01)
if m.bias is not None:
nn.init.normal_(m.bias, mean=0, std=0.01)



def forward(self, words, seq_len=None):
words = words.long()
# get region embeddings
x = self.region_embed(words)
x = self.embed_drop(x)

# not pooling on first conv
x = self.conv_list[0](x) + x
for conv in self.conv_list[1:]:
x = self.pool(x)
x = conv(x) + x

# B, C, L => B, C
x, _ = torch.max(x, dim=2)
x = self.classfier(x)
return {C.OUTPUT: x}



def predict(self, words, seq_len=None):
x = self.forward(words, seq_len)[C.OUTPUT]
return {C.OUTPUT: torch.argmax(x, 1)}





class RegionEmbedding(nn.Module):
def __init__(self, init_embed, out_dim=300, kernel_sizes=None):
super().__init__()
if kernel_sizes is None:
kernel_sizes = [5, 9]
assert isinstance(kernel_sizes, list), 'kernel_sizes should be List(int)'
self.embed = get_embeddings(init_embed)
try:
embed_dim = self.embed.embedding_dim
except Exception:
embed_dim = self.embed.embed_size
self.region_embeds = nn.ModuleList()
for ksz in kernel_sizes:
self.region_embeds.append(nn.Sequential(
nn.Conv1d(embed_dim, embed_dim, ksz, padding=ksz // 2),
))
self.linears = nn.ModuleList([nn.Conv1d(embed_dim, out_dim, 1)
for _ in range(len(kernel_sizes) + 1)])
self.embedding_dim = embed_dim


def forward(self, x):
x = self.embed(x)
x = x.transpose(1, 2)
# B, C, L
out = self.linears[0](x)
for conv, fc in zip(self.region_embeds, self.linears[1:]):
conv_i = conv(x)
out = out + fc(conv_i)
# B, C, L

return out





if __name__ == '__main__':
x = torch.randint(0, 10000, size=(5, 15), dtype=torch.long)
model = DPCNN((10000, 300), 20)
y = model(x)
print(y.size(), y.mean(1), y.std(1))
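A small sanity check (not part of the commit) of how DPCNN shrinks the time dimension: MaxPool1d(kernel_size=3, stride=2, padding=1) maps a length L to (L - 1) // 2 + 1, and it is applied before every residual block except the first, i.e. n_layers - 1 = 6 times, after which torch.max over dim 2 collapses whatever length remains.

# Illustrative arithmetic only; the formula is the standard 1d pooling output size.
def pooled_len(L, kernel=3, stride=2, pad=1):
    return (L + 2 * pad - kernel) // stride + 1

L = 15                        # the toy sequence length used in the __main__ above
for i in range(6):            # pooling happens before blocks 2..7
    L = pooled_len(L)
    print("after pooling", i + 1, "-> length", L)
# 15 -> 8 -> 4 -> 2 -> 1 -> 1 -> 1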

+206 -0  reproduction/text_classification/train_char_cnn.py

@@ -0,0 +1,206 @@
# The following paths must first be added to the environment variables; since this is currently only open for internal testing, the paths have to be declared manually.
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'

import sys
sys.path.append('../..')
from fastNLP.core.const import Const as C
import torch.nn as nn
from fastNLP.io.dataset_loader import SSTLoader
from data.yelpLoader import yelpLoader
from data.sstLoader import sst2Loader
from data.IMDBLoader import IMDBLoader
from model.char_cnn import CharacterLevelCNN
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.modules.encoder.embedding import CNNCharEmbedding,StaticEmbedding,StackEmbedding,LSTMCharEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer
from torch.optim import SGD
from torch.autograd import Variable
import torch
from fastNLP import BucketSampler

##hyper
#todo add fastNLP logging here
class Config():
model_dir_or_name="en-base-uncased"
embedding_grad= False,
bert_embedding_larers= '4,-2,-1'
train_epoch= 50
num_classes=2
task= "IMDB"
#yelp_p
datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
"test": "/remote-home/ygwang/yelp_polarity/test.csv"}
#IMDB
#datapath = {"train": "/remote-home/ygwang/IMDB_data/train.csv",
# "test": "/remote-home/ygwang/IMDB_data/test.csv"}
# sst
# datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv",
# "dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"}

lr=0.01
batch_size=128
model_size="large"
number_of_characters=69
extra_characters=''
max_length=1014

char_cnn_config={
"alphabet": {
"en": {
"lower": {
"alphabet": "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
"number_of_characters": 69
},
"both": {
"alphabet": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
"number_of_characters": 95
}
}
},
"model_parameters": {
"small": {
"conv": [
#in order: channels, kernel_size, max-pooling size
[256,7,3],
[256,7,3],
[256,3,-1],
[256,3,-1],
[256,3,-1],
[256,3,3]
],
"fc": [1024,1024]
},
"large":{
"conv":[
[1024, 7, 3],
[1024, 7, 3],
[1024, 3, -1],
[1024, 3, -1],
[1024, 3, -1],
[1024, 3, 3]
],
"fc": [2048,2048]
}
},
"data": {
"text_column": "SentimentText",
"label_column": "Sentiment",
"max_length": 1014,
"num_of_classes": 2,
"encoding": None,
"chunksize": 50000,
"max_rows": 100000,
"preprocessing_steps": ["lower", "remove_hashtags", "remove_urls", "remove_user_mentions"]
},
"training": {
"batch_size": 128,
"learning_rate": 0.01,
"epochs": 10,
"optimizer": "sgd"
}
}
ops=Config


##1. Task-specific setup: load the DataInfo with a dataloader
dataloader=sst2Loader()
dataloader=IMDBLoader()
#dataloader=yelpLoader(fine_grained=True)
datainfo=dataloader.process(ops.datapath,char_level_op=True)
char_vocab=ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"]
ops.number_of_characters=len(char_vocab)
ops.embedding_dim=ops.number_of_characters

#chartoindex
def chartoindex(chars):
max_seq_len=ops.max_length
zero_index=len(char_vocab)
char_index_list=[]
for char in chars:
if char in char_vocab:
char_index_list.append(char_vocab.index(char))
else:
#both <unk> and <pad> use the last index for their embedding
char_index_list.append(zero_index)
if len(char_index_list) > max_seq_len:
char_index_list = char_index_list[:max_seq_len]
elif 0 < len(char_index_list) < max_seq_len:
char_index_list = char_index_list+[zero_index]*(max_seq_len-len(char_index_list))
elif len(char_index_list) == 0:
char_index_list=[zero_index]*max_seq_len
return char_index_list

for dataset in datainfo.datasets.values():
dataset.apply_field(chartoindex,field_name='chars',new_field_name='chars')

datainfo.datasets['train'].set_input('chars')
datainfo.datasets['test'].set_input('chars')
datainfo.datasets['train'].set_target('target')
datainfo.datasets['test'].set_target('target')

##2. Define/assemble the model. This part is flexible: for a model already provided by fastNLP, such as CNNText, simply instantiate it. ModelFactory below is only a pseudo-framework placeholder showing how to build a model that follows fastNLP's input/output conventions.
class ModelFactory(nn.Module):
"""
Used to assemble the embedding, encoder and decoder, and to define the forward pass.

:param embedding: embedding model
:param encoder: encoder model
:param decoder: decoder model

"""
def __init__(self,embedding,encoder,decoder,**kwargs):
super(ModelFactory,self).__init__()
self.embedding=embedding
self.encoder=encoder
self.decoder=decoder

def forward(self,x):
return {C.OUTPUT:None}

## 2. Or directly reuse a model from fastNLP
#vocab=datainfo.vocabs['words']
vocab_label=datainfo.vocabs['target']
'''
# emded_char=CNNCharEmbedding(vocab)
# embed_word = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True)
# embedding=StackEmbedding([emded_char, embed_word])
# cnn_char_embed = CNNCharEmbedding(vocab)
# lstm_char_embed = LSTMCharEmbedding(vocab)
# embedding = StackEmbedding([cnn_char_embed, lstm_char_embed])
'''
#one-hot embedding
embedding_weight= Variable(torch.zeros(len(char_vocab)+1, len(char_vocab)))

for i in range(len(char_vocab)):
embedding_weight[i][i]=1
embedding=nn.Embedding(num_embeddings=len(char_vocab)+1,embedding_dim=len(char_vocab),padding_idx=len(char_vocab),_weight=embedding_weight)
for para in embedding.parameters():
para.requires_grad=False
#CNNText is too simple for this task
#model=CNNText(init_embed=embedding, num_classes=ops.num_classes)
model=CharacterLevelCNN(ops,embedding)

## 3. Declare the loss, metric and optimizer
loss=CrossEntropyLoss
metric=AccuracyMetric
optimizer= SGD([param for param in model.parameters() if param.requires_grad==True], lr=ops.lr)

## 4. Define the train method
def train(model,datainfo,loss,metrics,optimizer,num_epochs=100):
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss(target='target'),
metrics=[metrics(target='target')], dev_data=datainfo.datasets['test'], device=0, check_code_level=-1,
n_epochs=num_epochs)
print(trainer.train())



if __name__=="__main__":
#print(vocab_label)

#print(datainfo.datasets["train"])
train(model,datainfo,loss,metric,optimizer,num_epochs=ops.train_epoch)
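A small check (not part of the commit) of the frozen one-hot character embedding built above: rows 0..68 of embedding_weight are one-hot, while the extra last row, shared by <unk> and <pad> and used as padding_idx, stays all zeros. torch.autograd.Variable is legacy here; a plain tensor behaves the same.

# Illustrative only; mirrors the embedding construction in this script.
import torch
import torch.nn as nn

n = 69                                   # len(char_vocab) for the "lower" alphabet
weight = torch.zeros(n + 1, n)
for i in range(n):
    weight[i][i] = 1
embed = nn.Embedding(n + 1, n, padding_idx=n, _weight=weight)
for p in embed.parameters():
    p.requires_grad = False

print(embed(torch.tensor([0])).sum().item())    # 1.0 -> one-hot row for char_vocab[0]
print(embed(torch.tensor([n])).sum().item())    # 0.0 -> shared <unk>/<pad> row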

+101 -0  reproduction/text_classification/train_dpcnn.py

@@ -0,0 +1,101 @@
# The following paths must first be added to the environment variables; since this is currently only open for internal testing, the paths have to be declared manually.

import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

import sys
sys.path.append('../..')
from fastNLP.core.const import Const as C
from fastNLP.core import LRScheduler
import torch.nn as nn
from fastNLP.io.dataset_loader import SSTLoader
from data.yelpLoader import yelpLoader
from reproduction.text_classification.model.dpcnn import DPCNN
from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer
from torch.optim import SGD
import torch.cuda
from torch.optim.lr_scheduler import CosineAnnealingLR


##hyper

class Config():
model_dir_or_name="en-base-uncased"
embedding_grad= False,
train_epoch= 30
batch_size = 100
num_classes=2
task= "yelp_p"
#datadir = '/remote-home/yfshao/workdir/datasets/SST'
datadir = '/remote-home/ygwang/yelp_polarity'
#datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
datafile = {"train": "train.csv", "test": "test.csv"}
lr=1e-3

def __init__(self):
self.datapath = {k:os.path.join(self.datadir, v)
for k, v in self.datafile.items()}

ops=Config()



##1. Task-specific setup: load the DataInfo with a dataloader

#datainfo=SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
datainfo=yelpLoader(fine_grained=True,lower=True).process(paths=ops.datapath, train_ds=['train'])
print(len(datainfo.datasets['train']))
print(len(datainfo.datasets['test']))


## 2. Or directly reuse a model from fastNLP

vocab = datainfo.vocabs['words']
# embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)])
#embedding = StaticEmbedding(vocab)
embedding = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True)

print(len(vocab))
print(len(datainfo.vocabs['target']))

model = DPCNN(init_embed=embedding, num_cls=ops.num_classes)



## 3. Declare the loss, metric and optimizer
loss=CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
metric=AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
optimizer= SGD([param for param in model.parameters() if param.requires_grad==True],
lr=ops.lr, momentum=0.9, weight_decay=0)

callbacks = []
callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))


device = 'cuda:3' if torch.cuda.is_available() else 'cpu'

print(device)

for ds in datainfo.datasets.values():
ds.apply_field(len, C.INPUT, C.INPUT_LEN)
ds.set_input(C.INPUT, C.INPUT_LEN)
ds.set_target(C.TARGET)


## 4. Define the train method
def train(model,datainfo,loss,metrics,optimizer,num_epochs=ops.train_epoch):
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
metrics=[metrics], dev_data=datainfo.datasets['test'], device=3,
check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks,
n_epochs=num_epochs)

print(trainer.train())



if __name__=="__main__":
train(model,datainfo,loss,metric,optimizer)
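The Const aliases used throughout this script resolve to plain field names; the values in the comment below are an assumption about the fastNLP version this commit targets, so printing them is a quick way to confirm which columns set_input/set_target, the loss and the metric will look for.

# Assumed mapping for fastNLP 0.4.x: INPUT='words', INPUT_LEN='seq_len',
# TARGET='target', OUTPUT='pred'. Print to verify against the installed version.
from fastNLP.core.const import Const as C

print(C.INPUT, C.INPUT_LEN, C.TARGET, C.OUTPUT)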
