
update model & dataloader in text_classification

tags/v0.4.10
yunfan 5 years ago
commit 372496ca32
3 changed files with 283 additions and 60 deletions
  1. +82  -0   reproduction/text_classification/data/IMDBLoader.py
  2. +142 -22  reproduction/text_classification/data/yelpLoader.py
  3. +59  -38  reproduction/text_classification/train_dpcnn.py

+82 -0  reproduction/text_classification/data/IMDBLoader.py

@@ -0,0 +1,82 @@
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict, List, Iterator
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP import Const
# from reproduction.utils import check_dataloader_paths
from functools import partial


class IMDBLoader(DataSetLoader):
    """
    Load the IMDB dataset. The resulting DataSet contains the following fields:

        words: list(str), the text to be classified
        target: str, the label of the text

    """

    def __init__(self):
        super(IMDBLoader, self).__init__()

    def _load(self, path):
        dataset = DataSet()
        with open(path, 'r', encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split('\t')
                target = parts[0]
                words = parts[1].split()
                dataset.append(Instance(words=words, target=target))
        if len(dataset) == 0:
            raise RuntimeError(f"{path} has no valid data.")

        return dataset

    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None):

        # paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataInfo()
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')
        # src_vocab.from_dataset(datasets['train'], datasets["dev"], datasets["test"], field_name='words')
        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {
            "words": src_vocab,
            "target": tgt_vocab
        }

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
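For context, a minimal usage sketch of the new loader (not part of the commit; the file paths and variable names below are hypothetical). Input files are expected to hold one "label<TAB>text" sample per line, and a dev set is split off from train inside process():

# Hypothetical example: build a DataInfo from tab-separated IMDB files.
from reproduction.text_classification.data.IMDBLoader import IMDBLoader

loader = IMDBLoader()
data_info = loader.process({"train": "/path/to/imdb/train.txt",
                            "test": "/path/to/imdb/test.txt"})
print(len(data_info.datasets["train"]), len(data_info.datasets["dev"]))
print(len(data_info.vocabs["words"]), len(data_info.vocabs["target"]))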

+142 -22  reproduction/text_classification/data/yelpLoader.py

@@ -1,4 +1,6 @@
import ast
import csv
from typing import Iterable
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io import JsonLoader
@@ -10,11 +12,34 @@ from reproduction.Star_transformer.datasets import EmbedLoader
from reproduction.utils import check_dataloader_paths


def clean_str(sentence, char_lower=False):
    """
    heavily borrowed from github
    https://github.com/LukeZhuang/Hierarchical-Attention-Network/blob/master/yelp-preprocess.ipynb
    :param sentence: is a str
    :return:
    """
    if char_lower:
        sentence = sentence.lower()
    import re
    nonalpnum = re.compile('[^0-9a-zA-Z?!\']+')
    words = sentence.split()
    words_collection = []
    for word in words:
        if word in ['-lrb-', '-rrb-', '<sssss>', '-r', '-l', 'b-']:
            continue
        tt = nonalpnum.split(word)
        t = ''.join(tt)
        if t != '':
            words_collection.append(t)

    return words_collection
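As a quick illustration (illustrative values only, not from the commit), clean_str drops tokenizer artifacts such as -lrb-/-rrb- and strips characters outside [0-9a-zA-Z?!']:

# Illustrative example of clean_str (not part of the commit).
print(clean_str("I -lrb- really -rrb- loved it !!!", char_lower=True))
# ['i', 'really', 'loved', 'it', '!!!']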


class yelpLoader(JsonLoader):
    """
    Load the Yelp dataset. The resulting DataSet contains the following fields:
        review_id: str, 22 character unique review id
        user_id: str, 22 character unique user id
        business_id: str, 22 character business id
@@ -24,23 +49,25 @@ class yelpLoader(JsonLoader):
        date: str, date formatted YYYY-MM-DD
        words: list(str), the text to be classified
        target: str, the label of the text
    Data source: https://www.yelp.com/dataset/download
    :param fine_grained: whether to use the SST-5 label set; if ``False``, SST-2 is used. Default: ``False``
    """
    def __init__(self, fine_grained=False):
    def __init__(self, fine_grained=False, lower=False):
        super(yelpLoader, self).__init__()
        tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
            '4.0': 'positive', '5.0': 'very positive'}
                 '4.0': 'positive', '5.0': 'very positive'}
        if not fine_grained:
            tag_v['1.0'] = tag_v['2.0']
            tag_v['5.0'] = tag_v['4.0']
        self.fine_grained = fine_grained
        self.tag_v = tag_v
    def _load(self, path):
        self.lower = lower

    '''
    def _load_json(self, path):
        ds = DataSet()
        for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
            d = ast.literal_eval(d)
@@ -49,20 +76,113 @@ class yelpLoader(JsonLoader):
            ds.append(Instance(**d))
        return ds

    def process(self, paths: Union[str, Dict[str, str]], vocab_opt: VocabularyOption = None,
                embed_opt: EmbeddingOption = None):
    def _load_yelp2015_broken(self, path):
        ds = DataSet()
        with open(path, encoding='ISO 8859-1') as f:
            row = f.readline()
            all_count = 0
            exp_count = 0
            while row:
                row = row.split("\t\t")
                all_count += 1
                if len(row) >= 3:
                    words = row[-1].split()
                    try:
                        target = self.tag_v[str(row[-2]) + ".0"]
                        ds.append(Instance(words=words, target=target))
                    except KeyError:
                        exp_count += 1
                else:
                    exp_count += 1
                row = f.readline()
        print("error sample count:", exp_count)
        print("all count:", all_count)
        return ds
    '''

    def _load(self, path):
        ds = DataSet()
        csv_reader = csv.reader(open(path, encoding='utf-8'))
        all_count = 0
        real_count = 0
        for row in csv_reader:
            all_count += 1
            if len(row) == 2:
                target = self.tag_v[row[0] + ".0"]
                words = clean_str(row[1], self.lower)
                if len(words) != 0:
                    ds.append(Instance(words=words, target=target))
                    real_count += 1
        print("all count:", all_count)
        print("real count:", real_count)
        return ds

    def process(self, paths: Union[str, Dict[str, str]],
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                embed_opt: EmbeddingOption = None,
                char_level_op=False):
        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataInfo()
        vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset
            vocab.from_dataset(dataset, field_name="words")
        info.vocabs = vocab
        info.datasets = datasets
        if embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
            info.embeddings['words'] = embed
        info = DataInfo(datasets=self.load(paths))
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
        _train_ds = [info.datasets[name]
                     for name in train_ds] if train_ds else info.datasets.values()

        # vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
        # for name, path in paths.items():
        #     dataset = self.load(path)
        #     datasets[name] = dataset
        #     vocab.from_dataset(dataset, field_name="words")
        # info.vocabs = vocab
        # info.datasets = datasets

        def wordtochar(words):
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
            return chars

        input_name, target_name = 'words', 'target'
        info.vocabs = {}
        # split the words into characters
        if char_level_op:
            for dataset in info.datasets.values():
                dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')
            # if embed_opt is not None:
            #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
            #     info.embeddings['words'] = embed
        else:
            src_vocab.from_dataset(*_train_ds, field_name=input_name)
            src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name)
            info.vocabs[input_name] = src_vocab

        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        tgt_vocab.index_dataset(
            *info.datasets.values(),
            field_name=target_name, new_field_name=target_name)
        info.vocabs[target_name] = tgt_vocab

        return info


if __name__ == "__main__":
    testloader = yelpLoader()
    # datapath = {"train": "/remote-home/ygwang/yelp_full/train.csv",
    #             "test": "/remote-home/ygwang/yelp_full/test.csv"}
    # datapath={"train": "/remote-home/ygwang/yelp_full/test.csv"}
    datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
                "test": "/remote-home/ygwang/yelp_polarity/test.csv"}
    datainfo = testloader.process(datapath, char_level_op=True)

    len_count = 0
    for instance in datainfo.datasets["train"]:
        len_count += len(instance["chars"])

    ave_len = len_count / len(datainfo.datasets["train"])
    print(ave_len)

+59 -38  reproduction/text_classification/train_dpcnn.py

@@ -1,65 +1,83 @@
# First, the following paths need to be added to the environment variables; since this is currently only open for internal testing, the paths have to be declared manually.

from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.cuda
from torch.optim import SGD
from fastNLP.core.trainer import Trainer
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from reproduction.text_classification.model.dpcnn import DPCNN
from .data.yelpLoader import yelpLoader
from fastNLP.io.dataset_loader import SSTLoader
import torch.nn as nn
from fastNLP.core import LRScheduler
from fastNLP.core.const import Const as C
import sys
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

from fastNLP.core.const import Const as C
from fastNLP.core import LRScheduler
import torch.nn as nn
from fastNLP.io.dataset_loader import SSTLoader
from reproduction.text_classification.model.dpcnn import DPCNN
from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer
from torch.optim import SGD
import torch.cuda
from torch.optim.lr_scheduler import CosineAnnealingLR
sys.path.append('../..')


# hyper

##hyper
class Config():
    model_dir_or_name="en-base-uncased"
    embedding_grad= False,
    train_epoch= 30
    model_dir_or_name = "en-base-uncased"
    embedding_grad = False
    train_epoch = 30
    batch_size = 100
    num_classes=5
    task= "SST"
    datadir = '/remote-home/yfshao/workdir/datasets/SST'
    datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
    lr=1e-3
    num_classes = 2
    task = "yelp_p"
    #datadir = '/remote-home/yfshao/workdir/datasets/SST'
    datadir = '/remote-home/ygwang/yelp_polarity'
    #datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
    datafile = {"train": "train.csv", "test": "test.csv"}
    lr = 1e-3

    def __init__(self):
        self.datapath = {k:os.path.join(self.datadir, v)
        self.datapath = {k: os.path.join(self.datadir, v)
                         for k, v in self.datafile.items()}

ops=Config()

ops = Config()


##1. Task-related info: use the dataloader to load dataInfo
datainfo=SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds='train')
# 1. Task-related info: use the dataloader to load dataInfo

#datainfo=SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
datainfo = yelpLoader(fine_grained=True, lower=True).process(
    paths=ops.datapath, train_ds=['train'])
print(len(datainfo.datasets['train']))
print(len(datainfo.datasets['dev']))
print(len(datainfo.datasets['test']))


## 2. Or directly reuse a model from fastNLP
vocab = datainfo.vocabs['words']
# 2. Or directly reuse a model from fastNLP

vocab = datainfo.vocabs['words']
# embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)])
embedding = StaticEmbedding(vocab)
#embedding = StaticEmbedding(vocab)
embedding = StaticEmbedding(
    vocab, model_dir_or_name='en-word2vec-300', requires_grad=True)

print(len(vocab))
print(len(datainfo.vocabs['target']))

model = DPCNN(init_embed=embedding, num_cls=ops.num_classes)

## 3. Declare loss, metric, optimizer
loss=CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
metric=AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
optimizer= SGD([param for param in model.parameters() if param.requires_grad==True],
               lr=ops.lr, momentum=0.9, weight_decay=0)

# 3. Declare loss, metric, optimizer
loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
optimizer = SGD([param for param in model.parameters() if param.requires_grad == True],
                lr=ops.lr, momentum=0.9, weight_decay=0)

callbacks = []
callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print(device)

for ds in datainfo.datasets.values():
@@ -67,14 +85,17 @@ for ds in datainfo.datasets.values():
    ds.set_input(C.INPUT, C.INPUT_LEN)
    ds.set_target(C.TARGET)

## 4. Define the train procedure
def train(model,datainfo,loss,metrics,optimizer,num_epochs=ops.train_epoch):

# 4. Define the train procedure
def train(model, datainfo, loss, metrics, optimizer, num_epochs=ops.train_epoch):
    trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
                      metrics=[metrics], dev_data=datainfo.datasets['dev'], device=device,
                      metrics=[metrics],
                      dev_data=datainfo.datasets['test'], device=device,
                      check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks,
                      n_epochs=num_epochs)

    print(trainer.train())


if __name__=="__main__":
    train(model,datainfo,loss,metric,optimizer)
if __name__ == "__main__":
    train(model, datainfo, loss, metric, optimizer)
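Because the script imports the loader through a package-relative path (from .data.yelpLoader import yelpLoader), it presumably has to be launched as a module rather than as a plain file; a hedged invocation sketch (the working directory is an assumption, not stated in the commit):

# Assumed invocation, from the fastNLP repository root, so the relative import resolves:
#   python -m reproduction.text_classification.train_dpcnn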
