From 372496ca32a5015c19563c82ca1b29d6071a6e81 Mon Sep 17 00:00:00 2001
From: yunfan
Date: Thu, 4 Jul 2019 14:03:53 +0800
Subject: [PATCH] update model & dataloader in text_classification

---
 .../text_classification/data/IMDBLoader.py |  82 +++++++++
 .../text_classification/data/yelpLoader.py | 164 +++++++++++++++---
 .../text_classification/train_dpcnn.py     |  97 +++++++----
 3 files changed, 283 insertions(+), 60 deletions(-)
 create mode 100644 reproduction/text_classification/data/IMDBLoader.py

diff --git a/reproduction/text_classification/data/IMDBLoader.py b/reproduction/text_classification/data/IMDBLoader.py
new file mode 100644
index 00000000..2df87e26
--- /dev/null
+++ b/reproduction/text_classification/data/IMDBLoader.py
@@ -0,0 +1,82 @@
+from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
+from fastNLP.core.vocabulary import VocabularyOption
+from fastNLP.io.base_loader import DataSetLoader, DataInfo
+from typing import Union, Dict, List, Iterator
+from fastNLP import DataSet
+from fastNLP import Instance
+from fastNLP import Vocabulary
+from fastNLP import Const
+# from reproduction.utils import check_dataloader_paths
+from functools import partial
+
+
+class IMDBLoader(DataSetLoader):
+    """
+    Loads the IMDB dataset. The resulting DataSet contains the following fields:
+
+        words: list(str), the text to classify
+        target: str, the label of the text
+
+    """
+
+    def __init__(self):
+        super(IMDBLoader, self).__init__()
+
+    def _load(self, path):
+        dataset = DataSet()
+        with open(path, 'r', encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                parts = line.split('\t')
+                target = parts[0]
+                words = parts[1].split()
+                dataset.append(Instance(words=words, target=target))
+        if len(dataset) == 0:
+            raise RuntimeError(f"{path} has no valid data.")
+
+        return dataset
+
+    def process(self,
+                paths: Union[str, Dict[str, str]],
+                src_vocab_opt: VocabularyOption = None,
+                tgt_vocab_opt: VocabularyOption = None,
+                src_embed_opt: EmbeddingOption = None):
+
+        # paths = check_dataloader_paths(paths)
+        datasets = {}
+        info = DataInfo()
+        for name, path in paths.items():
+            dataset = self.load(path)
+            datasets[name] = dataset
+
+        datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)
+
+        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
+        src_vocab.from_dataset(datasets['train'], field_name='words')
+        # src_vocab.from_dataset(datasets['train'], datasets["dev"], datasets["test"], field_name='words')
+        src_vocab.index_dataset(*datasets.values(), field_name='words')
+
+        tgt_vocab = Vocabulary(unknown=None, padding=None) \
+            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
+        tgt_vocab.from_dataset(datasets['train'], field_name='target')
+        tgt_vocab.index_dataset(*datasets.values(), field_name='target')
+
+        info.vocabs = {
+            "words": src_vocab,
+            "target": tgt_vocab
+        }
+
+        info.datasets = datasets
+
+        if src_embed_opt is not None:
+            embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
+            info.embeddings['words'] = embed
+
+        for name, dataset in info.datasets.items():
+            dataset.set_input("words")
+            dataset.set_target("target")
+
+        return info
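
For context, a minimal sketch of how the new loader is meant to be driven. The file paths below are hypothetical placeholders, and each input line is assumed to have the form "<label>\t<space-tokenized text>", which is what _load expects:

    from reproduction.text_classification.data.IMDBLoader import IMDBLoader

    # Hypothetical paths; a "train" entry is required, because process()
    # carves a 10% "dev" split off it via DataSet.split(0.1, shuffle=False).
    paths = {"train": "/path/to/imdb/train.txt", "test": "/path/to/imdb/test.txt"}
    info = IMDBLoader().process(paths)

    # The word and target vocabularies are built on the training split only,
    # then used to index every split.
    print(sorted(info.datasets))        # ['dev', 'test', 'train']
    print(len(info.vocabs["words"]))    # source vocabulary size
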
diff --git a/reproduction/text_classification/data/yelpLoader.py b/reproduction/text_classification/data/yelpLoader.py
index c47d48fd..63605ecf 100644
--- a/reproduction/text_classification/data/yelpLoader.py
+++ b/reproduction/text_classification/data/yelpLoader.py
@@ -1,4 +1,6 @@
 import ast
+import csv
+from typing import Iterable
 from fastNLP import DataSet, Instance, Vocabulary
 from fastNLP.core.vocabulary import VocabularyOption
 from fastNLP.io import JsonLoader
@@ -10,11 +12,34 @@ from reproduction.Star_transformer.datasets import EmbedLoader
 from reproduction.utils import check_dataloader_paths
 
 
+def clean_str(sentence, char_lower=False):
+    """
+    Tokenization and string cleaning, heavily borrowed from
+    https://github.com/LukeZhuang/Hierarchical-Attention-Network/blob/master/yelp-preprocess.ipynb
+    :param sentence: a str
+    :return: the cleaned tokens, as a list of str
+    """
+    if char_lower:
+        sentence = sentence.lower()
+    import re
+    nonalpnum = re.compile('[^0-9a-zA-Z?!\']+')
+    words = sentence.split()
+    words_collection = []
+    for word in words:
+        if word in ['-lrb-', '-rrb-', '<sssss>', '-r', '-l', 'b-']:
+            continue
+        tt = nonalpnum.split(word)
+        t = ''.join(tt)
+        if t != '':
+            words_collection.append(t)
+    return words_collection
+
+
 class yelpLoader(JsonLoader):
-    
     """
     Loads the Yelp dataset. The resulting DataSet contains the following fields:
-    
+
         review_id: str, 22 character unique review id
         user_id: str, 22 character unique user id
         business_id: str, 22 character business id
@@ -24,23 +49,25 @@ class yelpLoader(JsonLoader):
         date: str, date formatted YYYY-MM-DD
         words: list(str), the text to classify
         target: str, the label of the text
-    
+
     Data source: https://www.yelp.com/dataset/download
-    
+
     :param fine_grained: whether to use the fine-grained SST-5 label set; if ``False``, the SST-2 label set is used. Default: ``False``
     """
-    
-    def __init__(self, fine_grained=False):
+
+    def __init__(self, fine_grained=False, lower=False):
         super(yelpLoader, self).__init__()
         tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
-                '4.0': 'positive', '5.0': 'very positive'}
+                 '4.0': 'positive', '5.0': 'very positive'}
         if not fine_grained:
             tag_v['1.0'] = tag_v['2.0']
             tag_v['5.0'] = tag_v['4.0']
         self.fine_grained = fine_grained
         self.tag_v = tag_v
-    
-    def _load(self, path):
+        self.lower = lower
+
+    '''
+    def _load_json(self, path):
         ds = DataSet()
         for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
             d = ast.literal_eval(d)
@@ -49,20 +76,113 @@ class yelpLoader(JsonLoader):
             ds.append(Instance(**d))
         return ds
 
-    def process(self, paths: Union[str, Dict[str, str]], vocab_opt: VocabularyOption = None,
-                embed_opt: EmbeddingOption = None):
+    def _load_yelp2015_broken(self, path):
+        ds = DataSet()
+        with open(path, encoding='ISO 8859-1') as f:
+            row = f.readline()
+            all_count = 0
+            exp_count = 0
+            while row:
+                row = row.split("\t\t")
+                all_count += 1
+                if len(row) >= 3:
+                    words = row[-1].split()
+                    try:
+                        target = self.tag_v[str(row[-2]) + ".0"]
+                        ds.append(Instance(words=words, target=target))
+                    except KeyError:
+                        exp_count += 1
+                else:
+                    exp_count += 1
+                row = f.readline()
+        print("error sample count:", exp_count)
+        print("all count:", all_count)
+        return ds
+    '''
+
+    def _load(self, path):
+        ds = DataSet()
+        csv_reader = csv.reader(open(path, encoding='utf-8'))
+        all_count = 0
+        real_count = 0
+        for row in csv_reader:
+            all_count += 1
+            if len(row) == 2:
+                target = self.tag_v[row[0] + ".0"]
+                words = clean_str(row[1], self.lower)
+                if len(words) != 0:
+                    ds.append(Instance(words=words, target=target))
+                    real_count += 1
+        print("all count:", all_count)
+        print("real count:", real_count)
+        return ds
+
+    def process(self, paths: Union[str, Dict[str, str]],
+                train_ds: Iterable[str] = None,
+                src_vocab_op: VocabularyOption = None,
+                tgt_vocab_op: VocabularyOption = None,
+                embed_opt: EmbeddingOption = None,
+                char_level_op=False):
         paths = check_dataloader_paths(paths)
         datasets = {}
-        info = DataInfo()
-        vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
-        for name, path in paths.items():
-            dataset = self.load(path)
-            datasets[name] = dataset
-            vocab.from_dataset(dataset, field_name="words")
-        info.vocabs = vocab
-        info.datasets = datasets
-        if embed_opt is not None:
-            embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
-            info.embeddings['words'] = embed
+        info = DataInfo(datasets=self.load(paths))
+        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
+        tgt_vocab = Vocabulary(unknown=None, padding=None) \
+            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
+        _train_ds = [info.datasets[name]
+                     for name in train_ds] if train_ds else info.datasets.values()
+
+        # vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
+        # for name, path in paths.items():
+        #     dataset = self.load(path)
+        #     datasets[name] = dataset
+        #     vocab.from_dataset(dataset, field_name="words")
+        # info.vocabs = vocab
+        # info.datasets = datasets
+
+        def wordtochar(words):
+            chars = []
+            for word in words:
+                word = word.lower()
+                for char in word:
+                    chars.append(char)
+            return chars
+
+        input_name, target_name = 'words', 'target'
+        info.vocabs = {}
+        # split the words into characters instead
+        if char_level_op:
+            for dataset in info.datasets.values():
+                dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')
+        # if embed_opt is not None:
+        #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
+        #     info.embeddings['words'] = embed
+        else:
+            src_vocab.from_dataset(*_train_ds, field_name=input_name)
+            src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name)
+            info.vocabs[input_name] = src_vocab
+
+        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
+        tgt_vocab.index_dataset(
+            *info.datasets.values(),
+            field_name=target_name, new_field_name=target_name)
+        info.vocabs[target_name] = tgt_vocab
+
+        return info
+
+
+if __name__ == "__main__":
+    testloader = yelpLoader()
+    # datapath = {"train": "/remote-home/ygwang/yelp_full/train.csv",
+    #             "test": "/remote-home/ygwang/yelp_full/test.csv"}
+    # datapath = {"train": "/remote-home/ygwang/yelp_full/test.csv"}
+    datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
+                "test": "/remote-home/ygwang/yelp_polarity/test.csv"}
+    datainfo = testloader.process(datapath, char_level_op=True)
+
+    len_count = 0
+    for instance in datainfo.datasets["train"]:
+        len_count += len(instance["chars"])
+
+    ave_len = len_count / len(datainfo.datasets["train"])
+    print(ave_len)
\ No newline at end of file
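
To make the new Yelp preprocessing concrete, here is a small illustration of clean_str; the sample sentence is invented, and the expected output follows from the regex and skip list above:

    from reproduction.text_classification.data.yelpLoader import clean_str

    # '-lrb-'/'-rrb-' bracket markers are dropped, each remaining token is
    # stripped of characters outside [0-9a-zA-Z?!'], and empty tokens are discarded.
    print(clean_str("The movie -lrb- 2012 -rrb- wasn't bad!", char_lower=True))
    # expected: ['the', 'movie', '2012', "wasn't", 'bad!']
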
diff --git a/reproduction/text_classification/train_dpcnn.py b/reproduction/text_classification/train_dpcnn.py
index 13ff4fc1..bf243ffb 100644
--- a/reproduction/text_classification/train_dpcnn.py
+++ b/reproduction/text_classification/train_dpcnn.py
@@ -1,65 +1,83 @@
 # First, the following paths need to be added to the environment variables: this is currently open for internal testing only, so the paths have to be declared manually
+
+from torch.optim.lr_scheduler import CosineAnnealingLR
+import torch.cuda
+from torch.optim import SGD
+from fastNLP.core.trainer import Trainer
+from fastNLP import CrossEntropyLoss, AccuracyMetric
+from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
+from reproduction.text_classification.model.dpcnn import DPCNN
+from .data.yelpLoader import yelpLoader
+from fastNLP.io.dataset_loader import SSTLoader
+import torch.nn as nn
+from fastNLP.core import LRScheduler
+from fastNLP.core.const import Const as C
+import sys
 import os
 os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
 os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-from fastNLP.core.const import Const as C
-from fastNLP.core import LRScheduler
-import torch.nn as nn
-from fastNLP.io.dataset_loader import SSTLoader
-from reproduction.text_classification.model.dpcnn import DPCNN
-from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
-from fastNLP import CrossEntropyLoss, AccuracyMetric
-from fastNLP.core.trainer import Trainer
-from torch.optim import SGD
-import torch.cuda
-from torch.optim.lr_scheduler import CosineAnnealingLR
+sys.path.append('../..')
+
+
+# hyper
 
-##hyper
 class Config():
-    model_dir_or_name="en-base-uncased"
-    embedding_grad= False,
-    train_epoch= 30
+    model_dir_or_name = "en-base-uncased"
+    embedding_grad = False
+    train_epoch = 30
     batch_size = 100
-    num_classes=5
-    task= "SST"
-    datadir = '/remote-home/yfshao/workdir/datasets/SST'
-    datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
-    lr=1e-3
+    num_classes = 2
+    task = "yelp_p"
+    # datadir = '/remote-home/yfshao/workdir/datasets/SST'
+    datadir = '/remote-home/ygwang/yelp_polarity'
+    # datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
+    datafile = {"train": "train.csv", "test": "test.csv"}
+    lr = 1e-3
+
     def __init__(self):
-        self.datapath = {k:os.path.join(self.datadir, v)
+        self.datapath = {k: os.path.join(self.datadir, v)
                          for k, v in self.datafile.items()}
 
-ops=Config()
+
+ops = Config()
 
-##1.task相关信息:利用dataloader载入dataInfo
-datainfo=SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds='train')
+# 1. task-specific info: load the dataInfo through the dataloader
+# datainfo = SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
+datainfo = yelpLoader(fine_grained=True, lower=True).process(
+    paths=ops.datapath, train_ds=['train'])
 
 print(len(datainfo.datasets['train']))
-print(len(datainfo.datasets['dev']))
+print(len(datainfo.datasets['test']))
 
-## 2.或直接复用fastNLP的模型
-vocab = datainfo.vocabs['words']
+# 2. or directly reuse one of fastNLP's models
+vocab = datainfo.vocabs['words']
 # embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)])
-embedding = StaticEmbedding(vocab)
+# embedding = StaticEmbedding(vocab)
+embedding = StaticEmbedding(
+    vocab, model_dir_or_name='en-word2vec-300', requires_grad=True)
+
 print(len(vocab))
 print(len(datainfo.vocabs['target']))
+
 model = DPCNN(init_embed=embedding, num_cls=ops.num_classes)
 
-## 3. 声明loss,metric,optimizer
-loss=CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
-metric=AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
-optimizer= SGD([param for param in model.parameters() if param.requires_grad==True],
-               lr=ops.lr, momentum=0.9, weight_decay=0)
+
+# 3. declare the loss, metric, and optimizer
+loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
+metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
+optimizer = SGD([param for param in model.parameters() if param.requires_grad],
+                lr=ops.lr, momentum=0.9, weight_decay=0)
 
 callbacks = []
 callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))
 device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+
 print(device)
 
 for ds in datainfo.datasets.values():
@@ -67,14 +85,17 @@ for ds in datainfo.datasets.values():
     ds.set_input(C.INPUT, C.INPUT_LEN)
     ds.set_target(C.TARGET)
 
-## 4.定义train方法
-def train(model,datainfo,loss,metrics,optimizer,num_epochs=ops.train_epoch):
+
+# 4. define the train routine
+def train(model, datainfo, loss, metrics, optimizer, num_epochs=ops.train_epoch):
     trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
-                      metrics=[metrics], dev_data=datainfo.datasets['dev'], device=device,
+                      metrics=[metrics],
+                      dev_data=datainfo.datasets['test'], device=device,
                       check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks,
                       n_epochs=num_epochs)
+
     print(trainer.train())
 
 
-if __name__=="__main__":
-    train(model,datainfo,loss,metric,optimizer)
\ No newline at end of file
+if __name__ == "__main__":
+    train(model, datainfo, loss, metric, optimizer)
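
A note on train_dpcnn.py: the Yelp polarity CSVs ship with no dev split, so the script validates directly on the test set. A held-out dev set could instead be carved from the training data with the same DataSet.split call that IMDBLoader.process uses; a minimal, untested sketch, to be placed after datainfo is built and before the set_input/set_target loop:

    # Hold out 10% of the training data for validation, keeping the test
    # set untouched until the final evaluation.
    train_set, dev_set = datainfo.datasets['train'].split(0.1, shuffle=False)
    datainfo.datasets['train'] = train_set
    datainfo.datasets['dev'] = dev_set
    # ...then pass dev_data=datainfo.datasets['dev'] to the Trainer.
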