
update model & dataloader in text_classification

tags/v0.4.10
yunfan 5 years ago
commit 372496ca32
3 changed files with 283 additions and 60 deletions
  1. +82  -0   reproduction/text_classification/data/IMDBLoader.py
  2. +142 -22  reproduction/text_classification/data/yelpLoader.py
  3. +59  -38  reproduction/text_classification/train_dpcnn.py

+82 -0  reproduction/text_classification/data/IMDBLoader.py

@@ -0,0 +1,82 @@
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict, List, Iterator
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP import Const
# from reproduction.utils import check_dataloader_paths
from functools import partial


class IMDBLoader(DataSetLoader):
    """
    Load the IMDB dataset. The resulting DataSet contains the following fields:

        words: list(str), the text to be classified
        target: str, the label of the text

    """

    def __init__(self):
        super(IMDBLoader, self).__init__()

    def _load(self, path):
        dataset = DataSet()
        with open(path, 'r', encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split('\t')
                target = parts[0]
                words = parts[1].split()
                dataset.append(Instance(words=words, target=target))
        if len(dataset) == 0:
            raise RuntimeError(f"{path} has no valid data.")

        return dataset

    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None):

        # paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataInfo()
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')
        # src_vocab.from_dataset(datasets['train'], datasets["dev"], datasets["test"], field_name='words')
        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {
            "words": src_vocab,
            "target": tgt_vocab
        }

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
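For context, a minimal usage sketch of the new loader (not part of the commit; the file paths and variable names below are hypothetical). Input files are expected to hold one "label<TAB>text" sample per line, and a dev set is split off from train inside process():

# Hypothetical example: build a DataInfo from tab-separated IMDB files.
from reproduction.text_classification.data.IMDBLoader import IMDBLoader

loader = IMDBLoader()
data_info = loader.process({"train": "/path/to/imdb/train.txt",
                            "test": "/path/to/imdb/test.txt"})
print(len(data_info.datasets["train"]), len(data_info.datasets["dev"]))
print(len(data_info.vocabs["words"]), len(data_info.vocabs["target"]))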

+142 -22  reproduction/text_classification/data/yelpLoader.py

@@ -1,4 +1,6 @@
import ast
import csv
from typing import Iterable
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io import JsonLoader
@@ -10,11 +12,34 @@ from reproduction.Star_transformer.datasets import EmbedLoader
from reproduction.utils import check_dataloader_paths


def clean_str(sentence, char_lower=False):
    """
    heavily borrowed from github
    https://github.com/LukeZhuang/Hierarchical-Attention-Network/blob/master/yelp-preprocess.ipynb
    :param sentence: is a str
    :return:
    """
    if char_lower:
        sentence = sentence.lower()
    import re
    nonalpnum = re.compile('[^0-9a-zA-Z?!\']+')
    words = sentence.split()
    words_collection = []
    for word in words:
        if word in ['-lrb-', '-rrb-', '<sssss>', '-r', '-l', 'b-']:
            continue
        tt = nonalpnum.split(word)
        t = ''.join(tt)
        if t != '':
            words_collection.append(t)

    return words_collection
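As a quick illustration (illustrative values only, not from the commit), clean_str drops tokenizer artifacts such as -lrb-/-rrb- and strips characters outside [0-9a-zA-Z?!']:

# Illustrative example of clean_str (not part of the commit).
print(clean_str("I -lrb- really -rrb- loved it !!!", char_lower=True))
# ['i', 'really', 'loved', 'it', '!!!']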


class yelpLoader(JsonLoader):
    """
    Load the Yelp dataset. The resulting DataSet contains the following fields:
        review_id: str, 22 character unique review id
        user_id: str, 22 character unique user id
        business_id: str, 22 character business id
@@ -24,23 +49,25 @@ class yelpLoader(JsonLoader):
        date: str, date formatted YYYY-MM-DD
        words: list(str), the text to be classified
        target: str, the label of the text
    Data source: https://www.yelp.com/dataset/download
    :param fine_grained: whether to use the SST-5 label set; if ``False``, SST-2 is used. Default: ``False``
    """
    def __init__(self, fine_grained=False):
    def __init__(self, fine_grained=False, lower=False):
        super(yelpLoader, self).__init__()
        tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
            '4.0': 'positive', '5.0': 'very positive'}
                 '4.0': 'positive', '5.0': 'very positive'}
        if not fine_grained:
            tag_v['1.0'] = tag_v['2.0']
            tag_v['5.0'] = tag_v['4.0']
        self.fine_grained = fine_grained
        self.tag_v = tag_v
    def _load(self, path):
        self.lower = lower

    '''
    def _load_json(self, path):
        ds = DataSet()
        for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
            d = ast.literal_eval(d)
@@ -49,20 +76,113 @@ class yelpLoader(JsonLoader):
            ds.append(Instance(**d))
        return ds

    def process(self, paths: Union[str, Dict[str, str]], vocab_opt: VocabularyOption = None,
                embed_opt: EmbeddingOption = None):
    def _load_yelp2015_broken(self, path):
        ds = DataSet()
        with open(path, encoding='ISO 8859-1') as f:
            row = f.readline()
            all_count = 0
            exp_count = 0
            while row:
                row = row.split("\t\t")
                all_count += 1
                if len(row) >= 3:
                    words = row[-1].split()
                    try:
                        target = self.tag_v[str(row[-2]) + ".0"]
                        ds.append(Instance(words=words, target=target))
                    except KeyError:
                        exp_count += 1
                else:
                    exp_count += 1
                row = f.readline()
        print("error sample count:", exp_count)
        print("all count:", all_count)
        return ds
    '''

    def _load(self, path):
        ds = DataSet()
        csv_reader = csv.reader(open(path, encoding='utf-8'))
        all_count = 0
        real_count = 0
        for row in csv_reader:
            all_count += 1
            if len(row) == 2:
                target = self.tag_v[row[0] + ".0"]
                words = clean_str(row[1], self.lower)
                if len(words) != 0:
                    ds.append(Instance(words=words, target=target))
                    real_count += 1
        print("all count:", all_count)
        print("real count:", real_count)
        return ds

    def process(self, paths: Union[str, Dict[str, str]],
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                embed_opt: EmbeddingOption = None,
                char_level_op=False):
        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataInfo()
        vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset
            vocab.from_dataset(dataset, field_name="words")
        info.vocabs = vocab
        info.datasets = datasets
        if embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
            info.embeddings['words'] = embed
        info = DataInfo(datasets=self.load(paths))
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
        _train_ds = [info.datasets[name]
                     for name in train_ds] if train_ds else info.datasets.values()

        # vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
        # for name, path in paths.items():
        #     dataset = self.load(path)
        #     datasets[name] = dataset
        #     vocab.from_dataset(dataset, field_name="words")
        # info.vocabs = vocab
        # info.datasets = datasets

        def wordtochar(words):
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
            return chars

        input_name, target_name = 'words', 'target'
        info.vocabs = {}
        # split the words into characters
        if char_level_op:
            for dataset in info.datasets.values():
                dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')
            # if embed_opt is not None:
            #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
            #     info.embeddings['words'] = embed
        else:
            src_vocab.from_dataset(*_train_ds, field_name=input_name)
            src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name)
            info.vocabs[input_name] = src_vocab

        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        tgt_vocab.index_dataset(
            *info.datasets.values(),
            field_name=target_name, new_field_name=target_name)
        info.vocabs[target_name] = tgt_vocab

        return info


if __name__ == "__main__":
    testloader = yelpLoader()
    # datapath = {"train": "/remote-home/ygwang/yelp_full/train.csv",
    #             "test": "/remote-home/ygwang/yelp_full/test.csv"}
    # datapath={"train": "/remote-home/ygwang/yelp_full/test.csv"}
    datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
                "test": "/remote-home/ygwang/yelp_polarity/test.csv"}
    datainfo = testloader.process(datapath, char_level_op=True)

    len_count = 0
    for instance in datainfo.datasets["train"]:
        len_count += len(instance["chars"])

    ave_len = len_count / len(datainfo.datasets["train"])
    print(ave_len)

+59 -38  reproduction/text_classification/train_dpcnn.py

@@ -1,65 +1,83 @@
# First, the following paths need to be added to the environment variables; since this is currently only open for internal testing, the paths have to be declared manually.

from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.cuda
from torch.optim import SGD
from fastNLP.core.trainer import Trainer
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from reproduction.text_classification.model.dpcnn import DPCNN
from .data.yelpLoader import yelpLoader
from fastNLP.io.dataset_loader import SSTLoader
import torch.nn as nn
from fastNLP.core import LRScheduler
from fastNLP.core.const import Const as C
import sys
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

from fastNLP.core.const import Const as C
from fastNLP.core import LRScheduler
import torch.nn as nn
from fastNLP.io.dataset_loader import SSTLoader
from reproduction.text_classification.model.dpcnn import DPCNN
from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer
from torch.optim import SGD
import torch.cuda
from torch.optim.lr_scheduler import CosineAnnealingLR
sys.path.append('../..')


# hyper

##hyper
class Config():
    model_dir_or_name="en-base-uncased"
    embedding_grad= False,
    train_epoch= 30
    model_dir_or_name = "en-base-uncased"
    embedding_grad = False
    train_epoch = 30
    batch_size = 100
    num_classes=5
    task= "SST"
    datadir = '/remote-home/yfshao/workdir/datasets/SST'
    datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
    lr=1e-3
    num_classes = 2
    task = "yelp_p"
    #datadir = '/remote-home/yfshao/workdir/datasets/SST'
    datadir = '/remote-home/ygwang/yelp_polarity'
    #datafile = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
    datafile = {"train": "train.csv", "test": "test.csv"}
    lr = 1e-3

    def __init__(self):
        self.datapath = {k:os.path.join(self.datadir, v)
        self.datapath = {k: os.path.join(self.datadir, v)
                         for k, v in self.datafile.items()}

ops=Config()

ops = Config()


##1. Task-related info: use the dataloader to load dataInfo
datainfo=SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds='train')
# 1. Task-related info: use the dataloader to load dataInfo

#datainfo=SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
datainfo = yelpLoader(fine_grained=True, lower=True).process(
    paths=ops.datapath, train_ds=['train'])
print(len(datainfo.datasets['train']))
print(len(datainfo.datasets['dev']))
print(len(datainfo.datasets['test']))


## 2. Or directly reuse a model from fastNLP
vocab = datainfo.vocabs['words']
# 2. Or directly reuse a model from fastNLP

vocab = datainfo.vocabs['words']
# embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)])
embedding = StaticEmbedding(vocab)
#embedding = StaticEmbedding(vocab)
embedding = StaticEmbedding(
    vocab, model_dir_or_name='en-word2vec-300', requires_grad=True)

print(len(vocab))
print(len(datainfo.vocabs['target']))

model = DPCNN(init_embed=embedding, num_cls=ops.num_classes)

## 3. Declare loss, metric, optimizer
loss=CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
metric=AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
optimizer= SGD([param for param in model.parameters() if param.requires_grad==True],
               lr=ops.lr, momentum=0.9, weight_decay=0)

# 3. Declare loss, metric, optimizer
loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
optimizer = SGD([param for param in model.parameters() if param.requires_grad == True],
                lr=ops.lr, momentum=0.9, weight_decay=0)

callbacks = []
callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5)))

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print(device)

for ds in datainfo.datasets.values():
@@ -67,14 +85,17 @@ for ds in datainfo.datasets.values():
    ds.set_input(C.INPUT, C.INPUT_LEN)
    ds.set_target(C.TARGET)

## 4. Define the train procedure
def train(model,datainfo,loss,metrics,optimizer,num_epochs=ops.train_epoch):

# 4. Define the train procedure
def train(model, datainfo, loss, metrics, optimizer, num_epochs=ops.train_epoch):
    trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss,
                      metrics=[metrics], dev_data=datainfo.datasets['dev'], device=device,
                      metrics=[metrics],
                      dev_data=datainfo.datasets['test'], device=device,
                      check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks,
                      n_epochs=num_epochs)

    print(trainer.train())


if __name__=="__main__":
    train(model,datainfo,loss,metric,optimizer)
if __name__ == "__main__":
    train(model, datainfo, loss, metric, optimizer)
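Because the script imports the loader through a package-relative path (from .data.yelpLoader import yelpLoader), it presumably has to be launched as a module rather than as a plain file; a hedged invocation sketch (the working directory is an assumption, not stated in the commit):

# Assumed invocation, from the fastNLP repository root, so the relative import resolves:
#   python -m reproduction.text_classification.train_dpcnn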
