
update model & dataloader in text_classification

tags/v0.4.10
yunfan 5 years ago
commit 372496ca32
3 changed files with 283 additions and 60 deletions
  1. reproduction/text_classification/data/IMDBLoader.py (+82, -0)
  2. reproduction/text_classification/data/yelpLoader.py (+142, -22)
  3. reproduction/text_classification/train_dpcnn.py (+59, -38)

reproduction/text_classification/data/IMDBLoader.py (+82, -0)

@@ -0,0 +1,82 @@
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict, List, Iterator
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP import Const
# from reproduction.utils import check_dataloader_paths
from functools import partial


class IMDBLoader(DataSetLoader):
"""
读取IMDB数据集,DataSet包含以下fields:

words: list(str), 需要分类的文本
target: str, 文本的标签


"""

def __init__(self):
super(IMDBLoader, self).__init__()

def _load(self, path):
dataset = DataSet()
with open(path, 'r', encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
parts = line.split('\t')
target = parts[0]
words = parts[1].split()
dataset.append(Instance(words=words, target=target))
if len(dataset) == 0:
raise RuntimeError(f"{path} has no valid data.")

return dataset

def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
src_embed_opt: EmbeddingOption = None):

# paths = check_dataloader_paths(paths)
datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset

datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')
# src_vocab.from_dataset(datasets['train'], datasets["dev"], datasets["test"], field_name='words')
src_vocab.index_dataset(*datasets.values(), field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name='target')
tgt_vocab.index_dataset(*datasets.values(), field_name='target')

info.vocabs = {
"words": src_vocab,
"target": tgt_vocab
}

info.datasets = datasets

if src_embed_opt is not None:
embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
info.embeddings['words'] = embed

for name, dataset in info.datasets.items():
dataset.set_input("words")
dataset.set_target("target")

return info
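
For reference, a minimal usage sketch of the new loader (the paths below are hypothetical; each line of the input files is expected to be tab-separated as "<label>\t<tokenized text>", which is what _load parses):

# hypothetical paths; a "dev" split is carved out of "train" inside process()
from reproduction.text_classification.data.IMDBLoader import IMDBLoader

loader = IMDBLoader()
datainfo = loader.process({"train": "/path/to/IMDB/train.txt",
                           "test": "/path/to/IMDB/test.txt"})
print(datainfo.datasets.keys())        # contains 'train', 'dev' and 'test'
print(len(datainfo.vocabs["words"]))   # source vocabulary size
print(len(datainfo.vocabs["target"]))  # number of labels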

reproduction/text_classification/data/yelpLoader.py (+142, -22)

@@ -1,4 +1,6 @@
import ast
import csv
from typing import Iterable
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io import JsonLoader
@@ -10,11 +12,34 @@ from reproduction.Star_transformer.datasets import EmbedLoader
from reproduction.utils import check_dataloader_paths




def clean_str(sentence, char_lower=False):
"""
heavily borrowed from github
https://github.com/LukeZhuang/Hierarchical-Attention-Network/blob/master/yelp-preprocess.ipynb
    :param sentence: str, the raw sentence
    :return: list(str), the cleaned tokens
"""
if char_lower:
sentence = sentence.lower()
import re
nonalpnum = re.compile('[^0-9a-zA-Z?!\']+')
words = sentence.split()
words_collection = []
for word in words:
if word in ['-lrb-', '-rrb-', '<sssss>', '-r', '-l', 'b-']:
continue
tt = nonalpnum.split(word)
t = ''.join(tt)
if t != '':
words_collection.append(t)

return words_collection


class yelpLoader(JsonLoader):
    """
    Reads the Yelp dataset. The DataSet contains the following fields:
        review_id: str, 22 character unique review id
        user_id: str, 22 character unique user id
        business_id: str, 22 character business id
@@ -24,23 +49,25 @@ class yelpLoader(JsonLoader):
        date: str, date formatted YYYY-MM-DD
        words: list(str), the text to classify
        target: str, the label of the text
    Data source: https://www.yelp.com/dataset/download
    :param fine_grained: whether to use the SST-5 label scheme; if ``False``, SST-2 is used. Default: ``False``
    """

-    def __init__(self, fine_grained=False):
    def __init__(self, fine_grained=False, lower=False):
        super(yelpLoader, self).__init__()
        tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
                 '4.0': 'positive', '5.0': 'very positive'}
        if not fine_grained:
            tag_v['1.0'] = tag_v['2.0']
            tag_v['5.0'] = tag_v['4.0']
        self.fine_grained = fine_grained
        self.tag_v = tag_v
-    def _load(self, path):
        self.lower = lower

    '''
    def _load_json(self, path):
        ds = DataSet()
        for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
            d = ast.literal_eval(d)
@@ -49,20 +76,113 @@ class yelpLoader(JsonLoader):
            ds.append(Instance(**d))
        return ds


-    def process(self, paths: Union[str, Dict[str, str]], vocab_opt: VocabularyOption = None,
-                embed_opt: EmbeddingOption = None):
def _load_yelp2015_broken(self,path):
ds = DataSet()
with open (path,encoding='ISO 8859-1') as f:
row=f.readline()
all_count=0
exp_count=0
while row:
row=row.split("\t\t")
all_count+=1
if len(row)>=3:
words=row[-1].split()
try:
target=self.tag_v[str(row[-2])+".0"]
ds.append(Instance(words=words, target=target))
except KeyError:
exp_count+=1
else:
exp_count+=1
row = f.readline()
print("error sample count:",exp_count)
print("all count:",all_count)
return ds
'''

def _load(self, path):
ds = DataSet()
csv_reader = csv.reader(open(path, encoding='utf-8'))
all_count = 0
real_count = 0
for row in csv_reader:
all_count += 1
if len(row) == 2:
target = self.tag_v[row[0] + ".0"]
words = clean_str(row[1], self.lower)
if len(words) != 0:
ds.append(Instance(words=words, target=target))
real_count += 1
print("all count:", all_count)
print("real count:", real_count)
return ds

def process(self, paths: Union[str, Dict[str, str]],
train_ds: Iterable[str] = None,
src_vocab_op: VocabularyOption = None,
tgt_vocab_op: VocabularyOption = None,
embed_opt: EmbeddingOption = None,
char_level_op=False):
        paths = check_dataloader_paths(paths)
        datasets = {}
-        info = DataInfo()
-        vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
-        for name, path in paths.items():
-            dataset = self.load(path)
-            datasets[name] = dataset
-            vocab.from_dataset(dataset, field_name="words")
-        info.vocabs = vocab
-        info.datasets = datasets
-        if embed_opt is not None:
-            embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
-            info.embeddings['words'] = embed
info = DataInfo(datasets=self.load(paths))
src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
_train_ds = [info.datasets[name]
for name in train_ds] if train_ds else info.datasets.values()

# vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
# for name, path in paths.items():
# dataset = self.load(path)
# datasets[name] = dataset
# vocab.from_dataset(dataset, field_name="words")
# info.vocabs = vocab
# info.datasets = datasets

def wordtochar(words):
chars = []
for word in words:
word = word.lower()
for char in word:
chars.append(char)
return chars

input_name, target_name = 'words', 'target'
info.vocabs = {}
        # split the words into characters
if char_level_op:
for dataset in info.datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')
# if embed_opt is not None:
# embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
# info.embeddings['words'] = embed
else:
src_vocab.from_dataset(*_train_ds, field_name=input_name)
src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name)
info.vocabs[input_name] = src_vocab

tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
tgt_vocab.index_dataset(
*info.datasets.values(),
field_name=target_name, new_field_name=target_name)
info.vocabs[target_name] = tgt_vocab

        return info



if __name__ == "__main__":
testloader = yelpLoader()
# datapath = {"train": "/remote-home/ygwang/yelp_full/train.csv",
# "test": "/remote-home/ygwang/yelp_full/test.csv"}
# datapath={"train": "/remote-home/ygwang/yelp_full/test.csv"}
datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
"test": "/remote-home/ygwang/yelp_polarity/test.csv"}
datainfo = testloader.process(datapath, char_level_op=True)

len_count = 0
for instance in datainfo.datasets["train"]:
len_count += len(instance["chars"])

ave_len = len_count / len(datainfo.datasets["train"])
print(ave_len)
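
As a quick, hypothetical illustration of what clean_str keeps and drops (only alphanumerics plus ? ! ' survive inside a token, and SST-style markers such as -lrb- / -rrb- are skipped):

# made-up sample sentence, shown only to illustrate clean_str
from reproduction.text_classification.data.yelpLoader import clean_str

print(clean_str("The movie -lrb- 2012 -rrb- wasn't bad, honestly!", char_lower=True))
# -> ['the', 'movie', '2012', "wasn't", 'bad', 'honestly!']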

reproduction/text_classification/train_dpcnn.py (+59, -38)

@@ -1,65 +1,83 @@
# First, the following paths need to be added to the environment variables; this is currently only open for internal testing, so the paths have to be declared manually

from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.cuda
from torch.optim import SGD
from fastNLP.core.trainer import Trainer
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from reproduction.text_classification.model.dpcnn import DPCNN
from .data.yelpLoader import yelpLoader
from fastNLP.io.dataset_loader import SSTLoader
import torch.nn as nn
from fastNLP.core import LRScheduler
from fastNLP.core.const import Const as C
import sys
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"


-from fastNLP.core.const import Const as C
-from fastNLP.core import LRScheduler
-import torch.nn as nn
-from fastNLP.io.dataset_loader import SSTLoader
-from reproduction.text_classification.model.dpcnn import DPCNN
-from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
-from fastNLP import CrossEntropyLoss, AccuracyMetric
-from fastNLP.core.trainer import Trainer
-from torch.optim import SGD
-import torch.cuda
-from torch.optim.lr_scheduler import CosineAnnealingLR
sys.path.append('../..')


# hyper


-##hyper
class Config():
-    model_dir_or_name="en-base-uncased"
-    embedding_grad= False,
-    train_epoch= 30
    model_dir_or_name = "en-base-uncased"
    embedding_grad = False,
    train_epoch = 30
    batch_size = 100
-    num_classes=5
-    task= "SST"
-    datadir = '/remote-home/yfshao/workdir/datasets/SST'
-    datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
-    lr=1e-3
    num_classes = 2
    task = "yelp_p"
    #datadir = '/remote-home/yfshao/workdir/datasets/SST'
    datadir = '/remote-home/ygwang/yelp_polarity'
    #datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
    datafile = {"train": "train.csv", "test": "test.csv"}
    lr = 1e-3

    def __init__(self):
-        self.datapath = {k:os.path.join(self.datadir, v)
        self.datapath = {k: os.path.join(self.datadir, v)
                         for k, v in self.datafile.items()}


-ops=Config()

ops = Config()




-## 1. Task-related info: use the dataloader to load dataInfo
-datainfo=SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds='train')
# 1. Task-related info: use the dataloader to load dataInfo


#datainfo=SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
datainfo = yelpLoader(fine_grained=True, lower=True).process(
    paths=ops.datapath, train_ds=['train'])
print(len(datainfo.datasets['train']))
-print(len(datainfo.datasets['dev']))
print(len(datainfo.datasets['test']))




-## 2. Or directly reuse one of fastNLP's models
# 2. Or directly reuse one of fastNLP's models


vocab = datainfo.vocabs['words']
# embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)])
-embedding = StaticEmbedding(vocab)
#embedding = StaticEmbedding(vocab)
embedding = StaticEmbedding(
    vocab, model_dir_or_name='en-word2vec-300', requires_grad=True)

print(len(vocab))
print(len(datainfo.vocabs['target']))

model = DPCNN(init_embed=embedding, num_cls=ops.num_classes)


-## 3. Declare loss, metric, optimizer
-loss=CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
-metric=AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
-optimizer= SGD([param for param in model.parameters() if param.requires_grad==True],
-               lr=ops.lr, momentum=0.9, weight_decay=0)

# 3. Declare loss, metric, optimizer
loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
optimizer = SGD([param for param in model.parameters() if param.requires_grad == True],
                lr=ops.lr, momentum=0.9, weight_decay=0)


callbacks = []
callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))


device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print(device)


for ds in datainfo.datasets.values():
@@ -67,14 +85,17 @@ for ds in datainfo.datasets.values():
    ds.set_input(C.INPUT, C.INPUT_LEN)
    ds.set_target(C.TARGET)


-## 4. Define the train routine
-def train(model,datainfo,loss,metrics,optimizer,num_epochs=ops.train_epoch):

# 4. Define the train routine
def train(model, datainfo, loss, metrics, optimizer, num_epochs=ops.train_epoch):
    trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
-                     metrics=[metrics], dev_data=datainfo.datasets['dev'], device=device,
                      metrics=[metrics],
                      dev_data=datainfo.datasets['test'], device=device,
                      check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks,
                      n_epochs=num_epochs)

    print(trainer.train())




if __name__=="__main__":
train(model,datainfo,loss,metric,optimizer)
if __name__ == "__main__":
train(model, datainfo, loss, metric, optimizer)
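
The script only runs training, with the test split doubling as dev data for early evaluation. A separate evaluation pass could look like the sketch below; it is a sketch only, assuming fastNLP v0.4.x's Tester API (data / model / metrics / batch_size / device arguments) and reusing the objects defined above:

# hedged sketch: evaluate the trained DPCNN on the test split with fastNLP's Tester
# (assumes the v0.4.x Tester signature; adjust if the installed fastNLP differs)
from fastNLP import Tester

def evaluate(model, datainfo):
    tester = Tester(data=datainfo.datasets['test'], model=model,
                    metrics=AccuracyMetric(pred=C.OUTPUT, target=C.TARGET),
                    batch_size=ops.batch_size, device=device)
    print(tester.test())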
