
Merge branch 'dev0.5.0' into master

tags/v0.4.10
SrWYG (GitHub) committed 5 years ago
commit dcb8746c7a
7 changed files with 494 additions and 56 deletions
  1. +31 -5    reproduction/text_classification/data/IMDBLoader.py
  2. +98 -0    reproduction/text_classification/data/sstLoader.py
  3. +54 -49   reproduction/text_classification/data/yelpLoader.py
  4. +90 -1    reproduction/text_classification/model/char_cnn.py
  5. +10 -1    reproduction/text_classification/model/dpcnn.py
  6. +206 -0   reproduction/text_classification/train_char_cnn.py
  7. +5 -0     reproduction/text_classification/train_dpcnn.py

+31 -5   reproduction/text_classification/data/IMDBLoader.py

@@ -34,23 +34,36 @@ class IMDBLoader(DataSetLoader):
                target = parts[0]
                words = parts[1].split()
                dataset.append(Instance(words=words, target=target))
-        if len(dataset) == 0:
+        if len(dataset)==0:
            raise RuntimeError(f"{path} has no valid data.")

        return dataset
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
-               src_embed_opt: EmbeddingOption = None):
-        # paths = check_dataloader_paths(paths)
+               src_embed_opt: EmbeddingOption = None,
+               char_level_op=False):
        datasets = {}
        info = DataInfo()
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset
+        def wordtochar(words):
+            chars = []
+            for word in words:
+                word = word.lower()
+                for char in word:
+                    chars.append(char)
+            return chars
+
+        if char_level_op:
+            for dataset in datasets.values():
+                dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

        datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)


@@ -80,3 +93,16 @@ class IMDBLoader(DataSetLoader):
            dataset.set_target("target")

        return info

+
+if __name__=="__main__":
+    datapath = {"train": "/remote-home/ygwang/IMDB_data/train.csv",
+                "test": "/remote-home/ygwang/IMDB_data/test.csv"}
+    datainfo=IMDBLoader().process(datapath,char_level_op=True)
+    #print(datainfo.datasets["train"])
+    len_count = 0
+    for instance in datainfo.datasets["train"]:
+        len_count += len(instance["chars"])
+
+    ave_len = len_count / len(datainfo.datasets["train"])
+    print(ave_len)
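Note: the char_level_op path added above only lower-cases each token and flattens it into a list of characters before any indexing. A minimal standalone sketch of the same transformation (the function name and sample input are illustrative, not part of the commit):

def word_to_char(words):
    # lower-case every token and flatten it into a character list,
    # mirroring the wordtochar helper added in this commit
    return [ch for word in words for ch in word.lower()]

print(word_to_char(["This", "movie"]))  # ['t', 'h', 'i', 's', 'm', 'o', 'v', 'i', 'e']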

+98 -0   reproduction/text_classification/data/sstLoader.py

@@ -0,0 +1,98 @@
import csv
from typing import Iterable
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataInfo,DataSetLoader
from fastNLP.io.embed_loader import EmbeddingOption
from fastNLP.io.file_reader import _read_json
from typing import Union, Dict
from reproduction.Star_transformer.datasets import EmbedLoader
from reproduction.utils import check_dataloader_paths

class sst2Loader(DataSetLoader):
'''
Data source "SST": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
'''
def __init__(self):
super(sst2Loader, self).__init__()

def _load(self, path: str) -> DataSet:
ds = DataSet()
all_count=0
csv_reader = csv.reader(open(path, encoding='utf-8'),delimiter='\t')
skip_row = 0
for idx,row in enumerate(csv_reader):
if idx<=skip_row:
continue
target = row[1]
words = row[0].split()
ds.append(Instance(words=words,target=target))
all_count+=1
print("all count:", all_count)
return ds

def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
src_embed_opt: EmbeddingOption = None,
char_level_op=False):

paths = check_dataloader_paths(paths)
datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset

def wordtochar(words):
chars=[]
for word in words:
word=word.lower()
for char in word:
chars.append(char)
return chars

input_name, target_name = 'words', 'target'
info.vocabs={}

# split the words into characters
if char_level_op:
for dataset in datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')
src_vocab.index_dataset(*datasets.values(), field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name='target')
tgt_vocab.index_dataset(*datasets.values(), field_name='target')


info.vocabs = {
"words": src_vocab,
"target": tgt_vocab
}

info.datasets = datasets


if src_embed_opt is not None:
embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
info.embeddings['words'] = embed

return info

if __name__=="__main__":
datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv",
"dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"}
datainfo=sst2Loader().process(datapath,char_level_op=True)
#print(datainfo.datasets["train"])
len_count = 0
for instance in datainfo.datasets["train"]:
len_count += len(instance["chars"])

ave_len = len_count / len(datainfo.datasets["train"])
print(ave_len)
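For reference, _load above assumes the GLUE SST-2 TSV layout: a header row, the sentence in column 0 and the label in column 1, tab-separated. A tiny self-contained sketch of that parsing (toy rows, not the real files):

rows = [["sentence", "label"],
        ["a gripping movie", "1"],
        ["flat and lifeless", "0"]]
for idx, row in enumerate(rows):
    if idx == 0:        # skip the header, as the skip_row check does in _load
        continue
    words, target = row[0].split(), row[1]
    print(words, target)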

+54 -49   reproduction/text_classification/data/yelpLoader.py

@@ -4,13 +4,14 @@ from typing import Iterable
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io import JsonLoader
-from fastNLP.io.base_loader import DataInfo
+from fastNLP.io.base_loader import DataInfo,DataSetLoader
from fastNLP.io.embed_loader import EmbeddingOption
from fastNLP.io.file_reader import _read_json
from typing import Union, Dict
from reproduction.utils import check_dataloader_paths


def get_tokenizer():
    try:
        import spacy
@@ -45,26 +46,19 @@ def clean_str(sentence, tokenizer, char_lower=False):
    return words_collection


-class yelpLoader(JsonLoader):
+class yelpLoader(DataSetLoader):
    """
-    Reads the Yelp dataset. The DataSet contains the fields:
-
-        review_id: str, 22 character unique review id
-        user_id: str, 22 character unique user id
-        business_id: str, 22 character business id
-        useful: int, number of useful votes received
-        funny: int, number of funny votes received
-        cool: int, number of cool votes received
-        date: str, date formatted YYYY-MM-DD
+    Reads the Yelp_full/Yelp_polarity dataset. The DataSet contains the fields:
        words: list(str), the text to be classified
        target: str, the label of the text
+        chars: list(str), the not-yet-indexed list of characters

-    Data source: https://www.yelp.com/dataset/download
+    Datasets: yelp_full/yelp_polarity
    :param fine_grained: whether to use the SST-5 standard; if ``False``, SST-2 is used. Default: ``False``
    """
-    def __init__(self, fine_grained=False, lower=False):
+    def __init__(self, fine_grained=False,lower=False):
        super(yelpLoader, self).__init__()
        tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
                 '4.0': 'positive', '5.0': 'very positive'}
@@ -77,6 +71,21 @@ class yelpLoader(JsonLoader):
        self.tokenizer = get_tokenizer()

    '''
+    Reads the Yelp dataset. The DataSet contains the fields:
+        review_id: str, 22 character unique review id
+        user_id: str, 22 character unique user id
+        business_id: str, 22 character business id
+        useful: int, number of useful votes received
+        funny: int, number of funny votes received
+        cool: int, number of cool votes received
+        date: str, date formatted YYYY-MM-DD
+        words: list(str), the text to be classified
+        target: str, the label of the text
+    Data source: https://www.yelp.com/dataset/download
+
    def _load_json(self, path):
        ds = DataSet()
        for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
@@ -85,7 +94,7 @@ class yelpLoader(JsonLoader):
d["target"] = self.tag_v[str(d.pop("stars"))] d["target"] = self.tag_v[str(d.pop("stars"))]
ds.append(Instance(**d)) ds.append(Instance(**d))
return ds return ds
def _load_yelp2015_broken(self,path): def _load_yelp2015_broken(self,path):
ds = DataSet() ds = DataSet()
with open (path,encoding='ISO 8859-1') as f: with open (path,encoding='ISO 8859-1') as f:
@@ -109,24 +118,26 @@ class yelpLoader(JsonLoader):
print("all count:",all_count) print("all count:",all_count)
return ds return ds
''' '''
def _load(self, path): def _load(self, path):
ds = DataSet() ds = DataSet()
csv_reader = csv.reader(open(path, encoding='utf-8'))
all_count = 0
real_count = 0
csv_reader=csv.reader(open(path,encoding='utf-8'))
all_count=0
real_count=0
for row in csv_reader: for row in csv_reader:
all_count += 1
if len(row) == 2:
target = self.tag_v[row[0] + ".0"]
words = clean_str(row[1], self.tokenizer, self.lower)
if len(words) != 0:
ds.append(Instance(words=words, target=target))
all_count+=1
if len(row)==2:
target=self.tag_v[row[0]+".0"]
words=clean_str(row[1],self.lower)
if len(words)!=0:
ds.append(Instance(words=words,target=target))
real_count += 1 real_count += 1
print("all count:", all_count) print("all count:", all_count)
print("real count:", real_count) print("real count:", real_count)
return ds return ds




def process(self, paths: Union[str, Dict[str, str]], def process(self, paths: Union[str, Dict[str, str]],
train_ds: Iterable[str] = None, train_ds: Iterable[str] = None,
src_vocab_op: VocabularyOption = None, src_vocab_op: VocabularyOption = None,
@@ -142,57 +153,51 @@ class yelpLoader(JsonLoader):
        _train_ds = [info.datasets[name]
                     for name in train_ds] if train_ds else info.datasets.values()

-        # vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
-        # for name, path in paths.items():
-        #     dataset = self.load(path)
-        #     datasets[name] = dataset
-        #     vocab.from_dataset(dataset, field_name="words")
-        # info.vocabs = vocab
-        # info.datasets = datasets

        def wordtochar(words):
-            chars = []
+            chars=[]
            for word in words:
-                word = word.lower()
+                word=word.lower()
                for char in word:
                    chars.append(char)
            return chars

        input_name, target_name = 'words', 'target'
-        info.vocabs = {}
-        # split the words into characters
+        info.vocabs={}
+        #split the words into characters
        if char_level_op:
            for dataset in info.datasets.values():
-                dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')
+                dataset.apply_field(wordtochar, field_name="words",new_field_name='chars')
        # if embed_opt is not None:
        #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
        #     info.embeddings['words'] = embed
        else:
            src_vocab.from_dataset(*_train_ds, field_name=input_name)
-            src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name)
-            info.vocabs[input_name] = src_vocab
+            src_vocab.index_dataset(*info.datasets.values(),field_name=input_name, new_field_name=input_name)
+            info.vocabs[input_name]=src_vocab

        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        tgt_vocab.index_dataset(
            *info.datasets.values(),
            field_name=target_name, new_field_name=target_name)
-        info.vocabs[target_name] = tgt_vocab
-
-        return info
+        info.vocabs[target_name]=tgt_vocab
+
+        return info


-if __name__ == "__main__":
-    testloader = yelpLoader()
+if __name__=="__main__":
+    testloader=yelpLoader()
    # datapath = {"train": "/remote-home/ygwang/yelp_full/train.csv",
    #             "test": "/remote-home/ygwang/yelp_full/test.csv"}
-    # datapath={"train": "/remote-home/ygwang/yelp_full/test.csv"}
+    #datapath={"train": "/remote-home/ygwang/yelp_full/test.csv"}
    datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
                "test": "/remote-home/ygwang/yelp_polarity/test.csv"}
-    datainfo = testloader.process(datapath, char_level_op=True)
+    datainfo=testloader.process(datapath,char_level_op=True)

-    len_count = 0
+    len_count=0
    for instance in datainfo.datasets["train"]:
-        len_count += len(instance["chars"])
+        len_count+=len(instance["chars"])

-    ave_len = len_count / len(datainfo.datasets["train"])
+    ave_len=len_count/len(datainfo.datasets["train"])
    print(ave_len)

+90 -1   reproduction/text_classification/model/char_cnn.py

@@ -1 +1,90 @@
-# TODO
'''
@author: https://github.com/ahmedbesbes/character-based-cnn
The char-cnn model code here is adapted from the link above; the main changes adapt it to the fastNLP pipeline
'''
import torch
import torch.nn as nn
from fastNLP.core.const import Const as C

class CharacterLevelCNN(nn.Module):
def __init__(self, args,embedding):
super(CharacterLevelCNN, self).__init__()

self.config=args.char_cnn_config
self.embedding=embedding

conv_layers = []
for i, conv_layer_parameter in enumerate(self.config['model_parameters'][args.model_size]['conv']):
if i == 0:
#in_channels = args.number_of_characters + len(args.extra_characters)
in_channels = args.embedding_dim
out_channels = conv_layer_parameter[0]
else:
in_channels, out_channels = conv_layer_parameter[0], conv_layer_parameter[0]

if conv_layer_parameter[2] != -1:
conv_layer = nn.Sequential(nn.Conv1d(in_channels,
out_channels,
kernel_size=conv_layer_parameter[1], padding=0),
nn.ReLU(),
nn.MaxPool1d(conv_layer_parameter[2]))
else:
conv_layer = nn.Sequential(nn.Conv1d(in_channels,
out_channels,
kernel_size=conv_layer_parameter[1], padding=0),
nn.ReLU())
conv_layers.append(conv_layer)
self.conv_layers = nn.ModuleList(conv_layers)

input_shape = (args.batch_size, args.max_length,
args.number_of_characters + len(args.extra_characters))
dimension = self._get_conv_output(input_shape)

print('dimension :', dimension)

fc_layer_parameter = self.config['model_parameters'][args.model_size]['fc'][0]
fc_layers = nn.ModuleList([
nn.Sequential(
nn.Linear(dimension, fc_layer_parameter), nn.Dropout(0.5)),
nn.Sequential(nn.Linear(fc_layer_parameter,
fc_layer_parameter), nn.Dropout(0.5)),
nn.Linear(fc_layer_parameter, args.num_classes),
])

self.fc_layers = fc_layers

if args.model_size == 'small':
self._create_weights(mean=0.0, std=0.05)
elif args.model_size == 'large':
self._create_weights(mean=0.0, std=0.02)

def _create_weights(self, mean=0.0, std=0.05):
for module in self.modules():
if isinstance(module, nn.Conv1d) or isinstance(module, nn.Linear):
module.weight.data.normal_(mean, std)

def _get_conv_output(self, shape):
input = torch.rand(shape)
output = input.transpose(1, 2)
# forward pass through conv layers
for i in range(len(self.conv_layers)):
output = self.conv_layers[i](output)

output = output.view(output.size(0), -1)
n_size = output.size(1)
return n_size

def forward(self, chars):
input=self.embedding(chars)
output = input.transpose(1, 2)
# forward pass through conv layers
for i in range(len(self.conv_layers)):
output = self.conv_layers[i](output)

output = output.view(output.size(0), -1)

# forward pass through fc layers
for i in range(len(self.fc_layers)):
output = self.fc_layers[i](output)

return {C.OUTPUT: output}
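For reference, _get_conv_output infers the input size of the first fully connected layer by pushing a dummy batch through the convolution stack and reading the flattened size. A minimal standalone sketch of that trick, assuming the "large" conv configuration from train_char_cnn.py (1014 input characters, 69-dimensional one-hot embeddings); it is not the committed model, just the same shape computation:

import torch
import torch.nn as nn

# conv stack mirroring the "large" config: [channels, kernel_size, maxpool_size]
convs = nn.Sequential(
    nn.Conv1d(69, 1024, kernel_size=7), nn.ReLU(), nn.MaxPool1d(3),
    nn.Conv1d(1024, 1024, kernel_size=7), nn.ReLU(), nn.MaxPool1d(3),
    nn.Conv1d(1024, 1024, kernel_size=3), nn.ReLU(),
    nn.Conv1d(1024, 1024, kernel_size=3), nn.ReLU(),
    nn.Conv1d(1024, 1024, kernel_size=3), nn.ReLU(),
    nn.Conv1d(1024, 1024, kernel_size=3), nn.ReLU(), nn.MaxPool1d(3),
)
dummy = torch.rand(2, 69, 1014)   # (batch, embedding_dim, max_length), already transposed
flat = convs(dummy).view(2, -1)
print(flat.size(1))               # 34816 = 1024 channels * 34 remaining positions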

+10 -1   reproduction/text_classification/model/dpcnn.py

@@ -1,3 +1,4 @@

import torch
import torch.nn as nn
from fastNLP.modules.utils import get_embeddings
@@ -10,11 +11,13 @@ class DPCNN(nn.Module):
        super().__init__()
        self.region_embed = RegionEmbedding(
            init_embed, out_dim=n_filters, kernel_sizes=[1, 3, 5])

        embed_dim = self.region_embed.embedding_dim
        self.conv_list = nn.ModuleList()
        for i in range(n_layers):
            self.conv_list.append(nn.Sequential(
                nn.ReLU(),

                nn.Conv1d(n_filters, n_filters, kernel_size,
                          padding=kernel_size//2),
                nn.Conv1d(n_filters, n_filters, kernel_size,
@@ -24,10 +27,12 @@ class DPCNN(nn.Module):
        self.embed_drop = nn.Dropout(embed_dropout)
        self.classfier = nn.Sequential(
            nn.Dropout(cls_dropout),

            nn.Linear(n_filters, num_cls),
        )
        self.reset_parameters()

    def reset_parameters(self):
        for m in self.modules():
            if isinstance(m, (nn.Conv1d, nn.Conv2d, nn.Linear)):
@@ -35,6 +40,7 @@ class DPCNN(nn.Module):
            if m.bias is not None:
                nn.init.normal_(m.bias, mean=0, std=0.01)

    def forward(self, words, seq_len=None):
        words = words.long()
        # get region embeddings
@@ -52,18 +58,20 @@ class DPCNN(nn.Module):
        x = self.classfier(x)
        return {C.OUTPUT: x}

    def predict(self, words, seq_len=None):
        x = self.forward(words, seq_len)[C.OUTPUT]
        return {C.OUTPUT: torch.argmax(x, 1)}


class RegionEmbedding(nn.Module):
    def __init__(self, init_embed, out_dim=300, kernel_sizes=None):
        super().__init__()
        if kernel_sizes is None:
            kernel_sizes = [5, 9]

        assert isinstance(
            kernel_sizes, list), 'kernel_sizes should be List(int)'

        self.embed = get_embeddings(init_embed)
        try:
            embed_dim = self.embed.embedding_dim
@@ -95,3 +103,4 @@ if __name__ == '__main__':
    model = DPCNN((10000, 300), 20)
    y = model(x)
    print(y.size(), y.mean(1), y.std(1))


+206 -0   reproduction/text_classification/train_char_cnn.py

@@ -0,0 +1,206 @@
# First, the following paths need to be added to the environment variables; since this is currently only open for internal testing, the paths have to be declared manually
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'

import sys
sys.path.append('../..')
from fastNLP.core.const import Const as C
import torch.nn as nn
from fastNLP.io.dataset_loader import SSTLoader
from data.yelpLoader import yelpLoader
from data.sstLoader import sst2Loader
from data.IMDBLoader import IMDBLoader
from model.char_cnn import CharacterLevelCNN
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.modules.encoder.embedding import CNNCharEmbedding,StaticEmbedding,StackEmbedding,LSTMCharEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer
from torch.optim import SGD
from torch.autograd import Variable
import torch
from fastNLP import BucketSampler

##hyper
#todo add fastNLP logging here
class Config():
model_dir_or_name="en-base-uncased"
embedding_grad= False,
bert_embedding_larers= '4,-2,-1'
train_epoch= 50
num_classes=2
task= "IMDB"
#yelp_p
datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
"test": "/remote-home/ygwang/yelp_polarity/test.csv"}
#IMDB
#datapath = {"train": "/remote-home/ygwang/IMDB_data/train.csv",
# "test": "/remote-home/ygwang/IMDB_data/test.csv"}
# sst
# datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv",
# "dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"}

lr=0.01
batch_size=128
model_size="large"
number_of_characters=69
extra_characters=''
max_length=1014

char_cnn_config={
"alphabet": {
"en": {
"lower": {
"alphabet": "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
"number_of_characters": 69
},
"both": {
"alphabet": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
"number_of_characters": 95
}
}
},
"model_parameters": {
"small": {
"conv": [
#in order: channel, kernel_size, maxpooling_size
[256,7,3],
[256,7,3],
[256,3,-1],
[256,3,-1],
[256,3,-1],
[256,3,3]
],
"fc": [1024,1024]
},
"large":{
"conv":[
[1024, 7, 3],
[1024, 7, 3],
[1024, 3, -1],
[1024, 3, -1],
[1024, 3, -1],
[1024, 3, 3]
],
"fc": [2048,2048]
}
},
"data": {
"text_column": "SentimentText",
"label_column": "Sentiment",
"max_length": 1014,
"num_of_classes": 2,
"encoding": None,
"chunksize": 50000,
"max_rows": 100000,
"preprocessing_steps": ["lower", "remove_hashtags", "remove_urls", "remove_user_mentions"]
},
"training": {
"batch_size": 128,
"learning_rate": 0.01,
"epochs": 10,
"optimizer": "sgd"
}
}
ops=Config


##1. Task-related information: load dataInfo with the dataloader
dataloader=sst2Loader()
dataloader=IMDBLoader()
#dataloader=yelpLoader(fine_grained=True)
datainfo=dataloader.process(ops.datapath,char_level_op=True)
char_vocab=ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"]
ops.number_of_characters=len(char_vocab)
ops.embedding_dim=ops.number_of_characters

#chartoindex
def chartoindex(chars):
max_seq_len=ops.max_length
zero_index=len(char_vocab)
char_index_list=[]
for char in chars:
if char in char_vocab:
char_index_list.append(char_vocab.index(char))
else:
#<unk> and <pad> both use the last index as their embedding
char_index_list.append(zero_index)
if len(char_index_list) > max_seq_len:
char_index_list = char_index_list[:max_seq_len]
elif 0 < len(char_index_list) < max_seq_len:
char_index_list = char_index_list+[zero_index]*(max_seq_len-len(char_index_list))
elif len(char_index_list) == 0:
char_index_list=[zero_index]*max_seq_len
return char_index_list

for dataset in datainfo.datasets.values():
dataset.apply_field(chartoindex,field_name='chars',new_field_name='chars')

datainfo.datasets['train'].set_input('chars')
datainfo.datasets['test'].set_input('chars')
datainfo.datasets['train'].set_target('target')
datainfo.datasets['test'].set_target('target')

##2. Define/assemble the model. This part is flexible: if the model is one already wrapped by fastNLP, such as CNNText, just instantiate and call it directly. The pseudo-framework class below is only a placeholder; build a model here that follows fastNLP's input/output conventions
class ModelFactory(nn.Module):
"""
Used to assemble the embedding, encoder and decoder, and to design the forward pass

:param embedding: embbeding model
:param encoder: encoder model
:param decoder: decoder model

"""
def __int__(self,embedding,encoder,decoder,**kwargs):
super(ModelFactory,self).__init__()
self.embedding=embedding
self.encoder=encoder
self.decoder=decoder

def forward(self,x):
return {C.OUTPUT:None}

## 2. Or directly reuse a fastNLP model
#vocab=datainfo.vocabs['words']
vocab_label=datainfo.vocabs['target']
'''
# emded_char=CNNCharEmbedding(vocab)
# embed_word = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True)
# embedding=StackEmbedding([emded_char, embed_word])
# cnn_char_embed = CNNCharEmbedding(vocab)
# lstm_char_embed = LSTMCharEmbedding(vocab)
# embedding = StackEmbedding([cnn_char_embed, lstm_char_embed])
'''
#one-hot embedding
embedding_weight= Variable(torch.zeros(len(char_vocab)+1, len(char_vocab)))

for i in range(len(char_vocab)):
embedding_weight[i][i]=1
embedding=nn.Embedding(num_embeddings=len(char_vocab)+1,embedding_dim=len(char_vocab),padding_idx=len(char_vocab),_weight=embedding_weight)
for para in embedding.parameters():
para.requires_grad=False
#CNNText is too simplistic
#model=CNNText(init_embed=embedding, num_classes=ops.num_classes)
model=CharacterLevelCNN(ops,embedding)

## 3. Declare the loss, metric and optimizer
loss=CrossEntropyLoss
metric=AccuracyMetric
optimizer= SGD([param for param in model.parameters() if param.requires_grad==True], lr=ops.lr)

## 4. Define the train method
def train(model,datainfo,loss,metrics,optimizer,num_epochs=100):
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss(target='target'),
metrics=[metrics(target='target')], dev_data=datainfo.datasets['test'], device=0, check_code_level=-1,
n_epochs=num_epochs)
print(trainer.train())



if __name__=="__main__":
#print(vocab_label)

#print(datainfo.datasets["train"])
train(model,datainfo,loss,metric,optimizer,num_epochs=ops.train_epoch)
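For clarity, the chartoindex helper in this script maps each character to its position in the configured alphabet, uses one extra index (the last, all-zero embedding row) for both unknown characters and padding, and then truncates or pads every sequence to max_length. A small standalone sketch of the same logic (the function name and the tiny max_length are illustrative only):

alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
pad_unk = len(alphabet)    # 69, one past the last real character
max_length = 8             # tiny value just for the demo (the script uses 1014)

def char_to_index(chars):
    # map known characters to their alphabet index, everything else to pad_unk
    ids = [alphabet.index(c) if c in alphabet else pad_unk for c in chars]
    ids = ids[:max_length]                              # truncate long sequences
    return ids + [pad_unk] * (max_length - len(ids))    # pad short ones

print(char_to_index(list("great!")))   # [6, 17, 4, 0, 19, 40, 69, 69]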

+5 -0   reproduction/text_classification/train_dpcnn.py

@@ -20,6 +20,7 @@ os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"





# hyper # hyper


class Config(): class Config():
@@ -70,6 +71,7 @@ datainfo = load_data()
vocab = datainfo.vocabs['words'] vocab = datainfo.vocabs['words']
# embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)]) # embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)])
#embedding = StaticEmbedding(vocab) #embedding = StaticEmbedding(vocab)

embedding = StaticEmbedding( embedding = StaticEmbedding(
vocab, model_dir_or_name='en-word2vec-300', requires_grad=ops.embedding_grad, vocab, model_dir_or_name='en-word2vec-300', requires_grad=ops.embedding_grad,
normalize=False normalize=False
@@ -79,9 +81,11 @@ print(len(datainfo.datasets['train']))
print(len(datainfo.datasets['test'])) print(len(datainfo.datasets['test']))
print(datainfo.datasets['train'][0]) print(datainfo.datasets['train'][0])



print(len(vocab)) print(len(vocab))
print(len(datainfo.vocabs['target'])) print(len(datainfo.vocabs['target']))



model = DPCNN(init_embed=embedding, num_cls=ops.num_classes, model = DPCNN(init_embed=embedding, num_cls=ops.num_classes,
embed_dropout=ops.embed_dropout, cls_dropout=ops.cls_dropout) embed_dropout=ops.embed_dropout, cls_dropout=ops.cls_dropout)
print(model) print(model)
@@ -118,3 +122,4 @@ trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=l


if __name__ == "__main__": if __name__ == "__main__":
print(trainer.train()) print(trainer.train())

