
Merge branch 'dev0.5.0' into master

tags/v0.4.10
SrWYG GitHub 5 years ago
parent commit dcb8746c7a
7 changed files with 494 additions and 56 deletions
1. reproduction/text_classification/data/IMDBLoader.py (+31 -5)
2. reproduction/text_classification/data/sstLoader.py (+98 -0)
3. reproduction/text_classification/data/yelpLoader.py (+54 -49)
4. reproduction/text_classification/model/char_cnn.py (+90 -1)
5. reproduction/text_classification/model/dpcnn.py (+10 -1)
6. reproduction/text_classification/train_char_cnn.py (+206 -0)
7. reproduction/text_classification/train_dpcnn.py (+5 -0)

reproduction/text_classification/data/IMDBLoader.py (+31 -5)

@@ -34,23 +34,36 @@ class IMDBLoader(DataSetLoader):
target = parts[0]
words = parts[1].split()
dataset.append(Instance(words=words, target=target))

if len(dataset) == 0:
raise RuntimeError(f"{path} has no valid data.")

return dataset
def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
src_embed_opt: EmbeddingOption = None,
char_level_op=False):
# paths = check_dataloader_paths(paths)
datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset
def wordtochar(words):
chars = []
for word in words:
word = word.lower()
for char in word:
chars.append(char)
return chars

if char_level_op:
for dataset in datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

@@ -80,3 +93,16 @@ class IMDBLoader(DataSetLoader):
dataset.set_target("target")

return info


if __name__ == "__main__":
datapath = {"train": "/remote-home/ygwang/IMDB_data/train.csv",
"test": "/remote-home/ygwang/IMDB_data/test.csv"}
datainfo = IMDBLoader().process(datapath, char_level_op=True)
# print(datainfo.datasets["train"])
len_count = 0
for instance in datainfo.datasets["train"]:
len_count += len(instance["chars"])

ave_len = len_count / len(datainfo.datasets["train"])
print(ave_len)
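
A quick standalone sketch of the char-level transform added above (the helper is re-declared here purely for illustration; the input words are hypothetical): wordtochar lowercases each word and flattens the result into one character list.

# Standalone illustration of the wordtochar transform above.
def wordtochar(words):
    return [char for word in words for char in word.lower()]

print(wordtochar(["This", "movie", "rocks!"]))
# ['t', 'h', 'i', 's', 'm', 'o', 'v', 'i', 'e', 'r', 'o', 'c', 'k', 's', '!']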

reproduction/text_classification/data/sstLoader.py (+98 -0)

@@ -0,0 +1,98 @@
import csv
from typing import Iterable
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataInfo, DataSetLoader
from fastNLP.io.embed_loader import EmbeddingOption
from fastNLP.io.file_reader import _read_json
from typing import Union, Dict
from reproduction.Star_transformer.datasets import EmbedLoader
from reproduction.utils import check_dataloader_paths

class sst2Loader(DataSetLoader):
'''
Data source "SST": https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8
'''
def __init__(self):
super(sst2Loader, self).__init__()

def _load(self, path: str) -> DataSet:
ds = DataSet()
all_count = 0
csv_reader = csv.reader(open(path, encoding='utf-8'), delimiter='\t')
skip_row = 0
for idx, row in enumerate(csv_reader):
if idx <= skip_row:
continue
target = row[1]
words = row[0].split()
ds.append(Instance(words=words, target=target))
all_count += 1
print("all count:", all_count)
return ds

def process(self,
paths: Union[str, Dict[str, str]],
src_vocab_opt: VocabularyOption = None,
tgt_vocab_opt: VocabularyOption = None,
src_embed_opt: EmbeddingOption = None,
char_level_op=False):

paths = check_dataloader_paths(paths)
datasets = {}
info = DataInfo()
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset

def wordtochar(words):
chars = []
for word in words:
word = word.lower()
for char in word:
chars.append(char)
return chars

input_name, target_name = 'words', 'target'
info.vocabs = {}

# split the words into characters
if char_level_op:
for dataset in datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
src_vocab.from_dataset(datasets['train'], field_name='words')
src_vocab.index_dataset(*datasets.values(), field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None) \
if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
tgt_vocab.from_dataset(datasets['train'], field_name='target')
tgt_vocab.index_dataset(*datasets.values(), field_name='target')


info.vocabs = {
"words": src_vocab,
"target": tgt_vocab
}

info.datasets = datasets


if src_embed_opt is not None:
embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
info.embeddings['words'] = embed

return info

if __name__ == "__main__":
datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv",
"dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"}
datainfo = sst2Loader().process(datapath, char_level_op=True)
# print(datainfo.datasets["train"])
len_count = 0
for instance in datainfo.datasets["train"]:
len_count += len(instance["chars"])

ave_len = len_count / len(datainfo.datasets["train"])
print(ave_len)
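
For context, _load above assumes the GLUE SST-2 TSV layout: one header row (skipped via skip_row), then sentence<TAB>label per line. A minimal sketch of that parsing loop over an in-memory sample (the sample rows are hypothetical):

import csv
import io

sample = "sentence\tlabel\na charming film\t1\npainfully dull\t0\n"
for idx, row in enumerate(csv.reader(io.StringIO(sample), delimiter='\t')):
    if idx == 0:  # skip the header row, as _load does
        continue
    words, target = row[0].split(), row[1]
    print(words, target)
# ['a', 'charming', 'film'] 1
# ['painfully', 'dull'] 0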

reproduction/text_classification/data/yelpLoader.py (+54 -49)

@@ -4,13 +4,14 @@ from typing import Iterable
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io import JsonLoader
from fastNLP.io.base_loader import DataInfo, DataSetLoader
from fastNLP.io.embed_loader import EmbeddingOption
from fastNLP.io.file_reader import _read_json
from typing import Union, Dict
from reproduction.utils import check_dataloader_paths



def get_tokenizer():
try:
import spacy
@@ -45,26 +46,19 @@ def clean_str(sentence, tokenizer, char_lower=False):
return words_collection


class yelpLoader(DataSetLoader):
"""
读取Yelp数据集, DataSet包含fields:

review_id: str, 22 character unique review id
user_id: str, 22 character unique user id
business_id: str, 22 character business id
useful: int, number of useful votes received
funny: int, number of funny votes received
cool: int, number of cool votes received
date: str, date formatted YYYY-MM-DD
读取Yelp_full/Yelp_polarity数据集, DataSet包含fields:
words: list(str), 需要分类的文本
target: str, 文本的标签
chars:list(str),未index的字符列表

数据来源: https://www.yelp.com/dataset/download

数据集:yelp_full/yelp_polarity
:param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False``
"""
def __init__(self, fine_grained=False, lower=False):
def __init__(self, fine_grained=False,lower=False):
super(yelpLoader, self).__init__()
tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
'4.0': 'positive', '5.0': 'very positive'}
@@ -77,6 +71,21 @@ class yelpLoader(JsonLoader):
self.tokenizer = get_tokenizer()

'''
Reads the Yelp dataset. The DataSet contains fields:
review_id: str, 22 character unique review id
user_id: str, 22 character unique user id
business_id: str, 22 character business id
useful: int, number of useful votes received
funny: int, number of funny votes received
cool: int, number of cool votes received
date: str, date formatted YYYY-MM-DD
words: list(str), the text to classify
target: str, the label of the text
Data source: https://www.yelp.com/dataset/download

def _load_json(self, path):
ds = DataSet()
for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
@@ -85,7 +94,7 @@ class yelpLoader(JsonLoader):
d["target"] = self.tag_v[str(d.pop("stars"))]
ds.append(Instance(**d))
return ds
def _load_yelp2015_broken(self, path):
ds = DataSet()
with open(path, encoding='ISO-8859-1') as f:
@@ -109,24 +118,26 @@ class yelpLoader(JsonLoader):
print("all count:",all_count)
return ds
'''
def _load(self, path):
ds = DataSet()
csv_reader = csv.reader(open(path, encoding='utf-8'))
all_count = 0
real_count = 0
for row in csv_reader:
all_count += 1
if len(row) == 2:
target = self.tag_v[row[0] + ".0"]
words = clean_str(row[1], self.tokenizer, self.lower)
if len(words) != 0:
ds.append(Instance(words=words, target=target))
real_count += 1
print("all count:", all_count)
print("real count:", real_count)
return ds



def process(self, paths: Union[str, Dict[str, str]],
train_ds: Iterable[str] = None,
src_vocab_op: VocabularyOption = None,
@@ -142,57 +153,51 @@ class yelpLoader(JsonLoader):
_train_ds = [info.datasets[name]
for name in train_ds] if train_ds else info.datasets.values()

# vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt)
# for name, path in paths.items():
# dataset = self.load(path)
# datasets[name] = dataset
# vocab.from_dataset(dataset, field_name="words")
# info.vocabs = vocab
# info.datasets = datasets

def wordtochar(words):
chars = []
for word in words:
word = word.lower()
for char in word:
chars.append(char)
return chars

input_name, target_name = 'words', 'target'
info.vocabs = {}
# split the words into characters
if char_level_op:
for dataset in info.datasets.values():
dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')
# if embed_opt is not None:
# embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
# info.embeddings['words'] = embed
else:
src_vocab.from_dataset(*_train_ds, field_name=input_name)
src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name)
info.vocabs[input_name] = src_vocab

tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
tgt_vocab.index_dataset(
*info.datasets.values(),
field_name=target_name, new_field_name=target_name)
info.vocabs[target_name] = tgt_vocab

return info

if __name__ == "__main__":
testloader = yelpLoader()
if __name__=="__main__":
testloader=yelpLoader()
# datapath = {"train": "/remote-home/ygwang/yelp_full/train.csv",
# "test": "/remote-home/ygwang/yelp_full/test.csv"}
# datapath={"train": "/remote-home/ygwang/yelp_full/test.csv"}
#datapath={"train": "/remote-home/ygwang/yelp_full/test.csv"}
datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
"test": "/remote-home/ygwang/yelp_polarity/test.csv"}
datainfo = testloader.process(datapath, char_level_op=True)
datainfo=testloader.process(datapath,char_level_op=True)

len_count = 0
len_count=0
for instance in datainfo.datasets["train"]:
len_count += len(instance["chars"])
len_count+=len(instance["chars"])

ave_len = len_count / len(datainfo.datasets["train"])
ave_len=len_count/len(datainfo.datasets["train"])
print(ave_len)
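
As a quick illustration of the label handling in _load above: each Yelp CSV row carries a star rating in its first column, which tag_v maps to a sentiment string. A standalone sketch with a hypothetical row:

tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral',
         '4.0': 'positive', '5.0': 'very positive'}
row = ['2', 'the service was slow and the food was cold']  # hypothetical CSV row
print(tag_v[row[0] + '.0'])  # negative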

reproduction/text_classification/model/char_cnn.py (+90 -1)

@@ -1 +1,90 @@
'''
@author: https://github.com/ahmedbesbes/character-based-cnn
Adapted from the char-cnn model at the link above; the main change is reworking it to fit the fastNLP pipeline.
'''
import torch
import torch.nn as nn
from fastNLP.core.const import Const as C

class CharacterLevelCNN(nn.Module):
def __init__(self, args, embedding):
super(CharacterLevelCNN, self).__init__()

self.config = args.char_cnn_config
self.embedding = embedding

conv_layers = []
for i, conv_layer_parameter in enumerate(self.config['model_parameters'][args.model_size]['conv']):
if i == 0:
#in_channels = args.number_of_characters + len(args.extra_characters)
in_channels = args.embedding_dim
out_channels = conv_layer_parameter[0]
else:
in_channels, out_channels = conv_layer_parameter[0], conv_layer_parameter[0]

if conv_layer_parameter[2] != -1:
conv_layer = nn.Sequential(nn.Conv1d(in_channels,
out_channels,
kernel_size=conv_layer_parameter[1], padding=0),
nn.ReLU(),
nn.MaxPool1d(conv_layer_parameter[2]))
else:
conv_layer = nn.Sequential(nn.Conv1d(in_channels,
out_channels,
kernel_size=conv_layer_parameter[1], padding=0),
nn.ReLU())
conv_layers.append(conv_layer)
self.conv_layers = nn.ModuleList(conv_layers)

input_shape = (args.batch_size, args.max_length,
args.number_of_characters + len(args.extra_characters))
dimension = self._get_conv_output(input_shape)

print('dimension :', dimension)

fc_layer_parameter = self.config['model_parameters'][args.model_size]['fc'][0]
fc_layers = nn.ModuleList([
nn.Sequential(
nn.Linear(dimension, fc_layer_parameter), nn.Dropout(0.5)),
nn.Sequential(nn.Linear(fc_layer_parameter,
fc_layer_parameter), nn.Dropout(0.5)),
nn.Linear(fc_layer_parameter, args.num_classes),
])

self.fc_layers = fc_layers

if args.model_size == 'small':
self._create_weights(mean=0.0, std=0.05)
elif args.model_size == 'large':
self._create_weights(mean=0.0, std=0.02)

def _create_weights(self, mean=0.0, std=0.05):
for module in self.modules():
if isinstance(module, nn.Conv1d) or isinstance(module, nn.Linear):
module.weight.data.normal_(mean, std)

def _get_conv_output(self, shape):
input = torch.rand(shape)
output = input.transpose(1, 2)
# forward pass through conv layers
for i in range(len(self.conv_layers)):
output = self.conv_layers[i](output)

output = output.view(output.size(0), -1)
n_size = output.size(1)
return n_size

def forward(self, chars):
input = self.embedding(chars)
output = input.transpose(1, 2)
# forward pass through conv layers
for i in range(len(self.conv_layers)):
output = self.conv_layers[i](output)

output = output.view(output.size(0), -1)

# forward pass through fc layers
for i in range(len(self.fc_layers)):
output = self.fc_layers[i](output)

return {C.OUTPUT: output}
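
The dimension printed by _get_conv_output can also be derived by hand: a Conv1d with kernel size k and padding=0 shortens the sequence by k - 1, and MaxPool1d(p) floor-divides the length by p. A sketch of that arithmetic, using the 'large' conv parameters and max_length=1014 that appear in train_char_cnn.py below:

# Hand-derived flattened size, mirroring what _get_conv_output computes.
def conv_output_length(seq_len, conv_params):
    for out_channels, kernel, pool in conv_params:
        seq_len -= kernel - 1   # Conv1d with padding=0
        if pool != -1:
            seq_len //= pool    # MaxPool1d
    return seq_len

large = [[1024, 7, 3], [1024, 7, 3], [1024, 3, -1],
         [1024, 3, -1], [1024, 3, -1], [1024, 3, 3]]
length = conv_output_length(1014, large)
print(length, length * 1024)  # 34 34816 -> input size of the first fc layer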

reproduction/text_classification/model/dpcnn.py (+10 -1)

@@ -1,3 +1,4 @@

import torch
import torch.nn as nn
from fastNLP.modules.utils import get_embeddings
@@ -10,11 +11,13 @@ class DPCNN(nn.Module):
super().__init__()
self.region_embed = RegionEmbedding(
init_embed, out_dim=n_filters, kernel_sizes=[1, 3, 5])

embed_dim = self.region_embed.embedding_dim
self.conv_list = nn.ModuleList()
for i in range(n_layers):
self.conv_list.append(nn.Sequential(
nn.ReLU(),

nn.Conv1d(n_filters, n_filters, kernel_size,
padding=kernel_size//2),
nn.Conv1d(n_filters, n_filters, kernel_size,
@@ -24,10 +27,12 @@ class DPCNN(nn.Module):
self.embed_drop = nn.Dropout(embed_dropout)
self.classfier = nn.Sequential(
nn.Dropout(cls_dropout),

nn.Linear(n_filters, num_cls),
)
self.reset_parameters()


def reset_parameters(self):
for m in self.modules():
if isinstance(m, (nn.Conv1d, nn.Conv2d, nn.Linear)):
@@ -35,6 +40,7 @@ class DPCNN(nn.Module):
if m.bias is not None:
nn.init.normal_(m.bias, mean=0, std=0.01)


def forward(self, words, seq_len=None):
words = words.long()
# get region embeddings
@@ -52,18 +58,20 @@ class DPCNN(nn.Module):
x = self.classfier(x)
return {C.OUTPUT: x}


def predict(self, words, seq_len=None):
x = self.forward(words, seq_len)[C.OUTPUT]
return {C.OUTPUT: torch.argmax(x, 1)}


class RegionEmbedding(nn.Module):
def __init__(self, init_embed, out_dim=300, kernel_sizes=None):
super().__init__()
if kernel_sizes is None:
kernel_sizes = [5, 9]

assert isinstance(
kernel_sizes, list), 'kernel_sizes should be List(int)'

self.embed = get_embeddings(init_embed)
try:
embed_dim = self.embed.embedding_dim
@@ -95,3 +103,4 @@ if __name__ == '__main__':
model = DPCNN((10000, 300), 20)
y = model(x)
print(y.size(), y.mean(1), y.std(1))


reproduction/text_classification/train_char_cnn.py (+206 -0)

@@ -0,0 +1,206 @@
# The following paths must be added as environment variables first; the downloads are currently open only for internal testing, so the paths are declared manually.
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'

import sys
sys.path.append('../..')
from fastNLP.core.const import Const as C
import torch.nn as nn
from fastNLP.io.dataset_loader import SSTLoader
from data.yelpLoader import yelpLoader
from data.sstLoader import sst2Loader
from data.IMDBLoader import IMDBLoader
from model.char_cnn import CharacterLevelCNN
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, StackEmbedding, LSTMCharEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer
from torch.optim import SGD
from torch.autograd import Variable
import torch
from fastNLP import BucketSampler

## hyper-parameters
# TODO: hook fastNLP's logging in here
class Config():
model_dir_or_name = "en-base-uncased"
embedding_grad = False
bert_embedding_layers = '4,-2,-1'
train_epoch = 50
num_classes = 2
task = "IMDB"
#yelp_p
datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv",
"test": "/remote-home/ygwang/yelp_polarity/test.csv"}
#IMDB
#datapath = {"train": "/remote-home/ygwang/IMDB_data/train.csv",
# "test": "/remote-home/ygwang/IMDB_data/test.csv"}
# sst
# datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv",
# "dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"}

lr = 0.01
batch_size = 128
model_size = "large"
number_of_characters = 69
extra_characters = ''
max_length = 1014

char_cnn_config={
"alphabet": {
"en": {
"lower": {
"alphabet": "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
"number_of_characters": 69
},
"both": {
"alphabet": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}",
"number_of_characters": 95
}
}
},
"model_parameters": {
"small": {
"conv": [
# each entry: [out_channels, kernel_size, max_pooling_size]; -1 means no pooling
[256,7,3],
[256,7,3],
[256,3,-1],
[256,3,-1],
[256,3,-1],
[256,3,3]
],
"fc": [1024,1024]
},
"large":{
"conv":[
[1024, 7, 3],
[1024, 7, 3],
[1024, 3, -1],
[1024, 3, -1],
[1024, 3, -1],
[1024, 3, 3]
],
"fc": [2048,2048]
}
},
"data": {
"text_column": "SentimentText",
"label_column": "Sentiment",
"max_length": 1014,
"num_of_classes": 2,
"encoding": None,
"chunksize": 50000,
"max_rows": 100000,
"preprocessing_steps": ["lower", "remove_hashtags", "remove_urls", "remove_user_mentions"]
},
"training": {
"batch_size": 128,
"learning_rate": 0.01,
"epochs": 10,
"optimizer": "sgd"
}
}
ops = Config()


## 1. Task setup: load the dataInfo through a data loader
# dataloader = sst2Loader()
dataloader = IMDBLoader()
# dataloader = yelpLoader(fine_grained=True)
datainfo = dataloader.process(ops.datapath, char_level_op=True)
char_vocab = ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"]
ops.number_of_characters = len(char_vocab)
ops.embedding_dim = ops.number_of_characters

# map characters to indices
def chartoindex(chars):
max_seq_len = ops.max_length
zero_index = len(char_vocab)
char_index_list = []
for char in chars:
if char in char_vocab:
char_index_list.append(char_vocab.index(char))
else:
# both <unk> and <pad> use the last index for their embedding
char_index_list.append(zero_index)
if len(char_index_list) > max_seq_len:
char_index_list = char_index_list[:max_seq_len]
elif 0 < len(char_index_list) < max_seq_len:
char_index_list = char_index_list + [zero_index] * (max_seq_len - len(char_index_list))
elif len(char_index_list) == 0:
char_index_list = [zero_index] * max_seq_len
return char_index_list
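
The padding/truncation behavior of chartoindex is easiest to see on a toy alphabet (a standalone sketch; the three-letter vocabulary and max length of 5 are hypothetical):

# Toy illustration of chartoindex: unknown chars and padding share the last index.
toy_vocab = "abc"
zero_index = len(toy_vocab)  # index 3 doubles as <unk> and <pad>

def toy_chartoindex(chars, max_seq_len=5):
    idxs = [toy_vocab.index(c) if c in toy_vocab else zero_index for c in chars]
    idxs = idxs[:max_seq_len]
    return idxs + [zero_index] * (max_seq_len - len(idxs))

print(toy_chartoindex(list("ab!")))  # [0, 1, 3, 3, 3]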

for dataset in datainfo.datasets.values():
dataset.apply_field(chartoindex, field_name='chars', new_field_name='chars')

datainfo.datasets['train'].set_input('chars')
datainfo.datasets['test'].set_input('chars')
datainfo.datasets['train'].set_target('target')
datainfo.datasets['test'].set_target('target')

## 2. Define/assemble the model. Any model that follows fastNLP's input/output conventions will do; a packaged fastNLP model such as CNNText can simply be instantiated. The ModelFactory below is only a placeholder skeleton showing where such a model is built.
class ModelFactory(nn.Module):
"""
Assembles the embedding, encoder and decoder, and defines the forward pass.

:param embedding: embedding model
:param encoder: encoder model
:param decoder: decoder model

"""
def __init__(self, embedding, encoder, decoder, **kwargs):
super(ModelFactory, self).__init__()
self.embedding = embedding
self.encoder = encoder
self.decoder = decoder

def forward(self, x):
return {C.OUTPUT: None}

## 2. Or reuse a fastNLP model directly
#vocab=datainfo.vocabs['words']
vocab_label=datainfo.vocabs['target']
'''
# emded_char=CNNCharEmbedding(vocab)
# embed_word = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True)
# embedding=StackEmbedding([emded_char, embed_word])
# cnn_char_embed = CNNCharEmbedding(vocab)
# lstm_char_embed = LSTMCharEmbedding(vocab)
# embedding = StackEmbedding([cnn_char_embed, lstm_char_embed])
'''
# one-hot embedding
embedding_weight = Variable(torch.zeros(len(char_vocab) + 1, len(char_vocab)))

for i in range(len(char_vocab)):
embedding_weight[i][i] = 1
embedding = nn.Embedding(num_embeddings=len(char_vocab) + 1, embedding_dim=len(char_vocab), padding_idx=len(char_vocab), _weight=embedding_weight)
for para in embedding.parameters():
para.requires_grad = False
# CNNText is too simple for this task
# model = CNNText(init_embed=embedding, num_classes=ops.num_classes)
model = CharacterLevelCNN(ops, embedding)
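
A small sanity check on the frozen one-hot embedding built above (a standalone sketch; the alphabet size of 4 is hypothetical): every in-vocabulary index maps to a unit vector, while the extra last row, used as padding_idx, stays all-zero.

import torch
import torch.nn as nn

n = 4  # hypothetical alphabet size
weight = torch.eye(n + 1, n)  # rectangular identity; the last row is all zeros
emb = nn.Embedding(n + 1, n, padding_idx=n, _weight=weight)
emb.weight.requires_grad = False

print(emb(torch.tensor([0, n])))
# tensor([[1., 0., 0., 0.],
#         [0., 0., 0., 0.]])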

## 3. Declare the loss, metric and optimizer
loss = CrossEntropyLoss
metric = AccuracyMetric
optimizer = SGD([param for param in model.parameters() if param.requires_grad], lr=ops.lr)

## 4. Define the train routine
def train(model, datainfo, loss, metrics, optimizer, num_epochs=100):
trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss(target='target'),
metrics=[metrics(target='target')], dev_data=datainfo.datasets['test'], device=0, check_code_level=-1,
n_epochs=num_epochs)
print(trainer.train())



if __name__ == "__main__":
# print(vocab_label)

# print(datainfo.datasets["train"])
train(model, datainfo, loss, metric, optimizer, num_epochs=ops.train_epoch)

reproduction/text_classification/train_dpcnn.py (+5 -0)

@@ -20,6 +20,7 @@ os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"



# hyper

class Config():
@@ -70,6 +71,7 @@ datainfo = load_data()
vocab = datainfo.vocabs['words']
# embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)])
#embedding = StaticEmbedding(vocab)

embedding = StaticEmbedding(
vocab, model_dir_or_name='en-word2vec-300', requires_grad=ops.embedding_grad,
normalize=False
@@ -79,9 +81,11 @@ print(len(datainfo.datasets['train']))
print(len(datainfo.datasets['test']))
print(datainfo.datasets['train'][0])


print(len(vocab))
print(len(datainfo.vocabs['target']))


model = DPCNN(init_embed=embedding, num_cls=ops.num_classes,
embed_dropout=ops.embed_dropout, cls_dropout=ops.cls_dropout)
print(model)
@@ -118,3 +122,4 @@ trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=l

if __name__ == "__main__":
print(trainer.train())

