@@ -0,0 +1,201 @@ | |||
Apache License | |||
Version 2.0, January 2004 | |||
http://www.apache.org/licenses/ | |||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION | |||
1. Definitions. | |||
"License" shall mean the terms and conditions for use, reproduction, | |||
and distribution as defined by Sections 1 through 9 of this document. | |||
"Licensor" shall mean the copyright owner or entity authorized by | |||
the copyright owner that is granting the License. | |||
"Legal Entity" shall mean the union of the acting entity and all | |||
other entities that control, are controlled by, or are under common | |||
control with that entity. For the purposes of this definition, | |||
"control" means (i) the power, direct or indirect, to cause the | |||
direction or management of such entity, whether by contract or | |||
otherwise, or (ii) ownership of fifty percent (50%) or more of the | |||
outstanding shares, or (iii) beneficial ownership of such entity. | |||
"You" (or "Your") shall mean an individual or Legal Entity | |||
exercising permissions granted by this License. | |||
"Source" form shall mean the preferred form for making modifications, | |||
including but not limited to software source code, documentation | |||
source, and configuration files. | |||
"Object" form shall mean any form resulting from mechanical | |||
transformation or translation of a Source form, including but | |||
not limited to compiled object code, generated documentation, | |||
and conversions to other media types. | |||
"Work" shall mean the work of authorship, whether in Source or | |||
Object form, made available under the License, as indicated by a | |||
copyright notice that is included in or attached to the work | |||
(an example is provided in the Appendix below). | |||
"Derivative Works" shall mean any work, whether in Source or Object | |||
form, that is based on (or derived from) the Work and for which the | |||
editorial revisions, annotations, elaborations, or other modifications | |||
represent, as a whole, an original work of authorship. For the purposes | |||
of this License, Derivative Works shall not include works that remain | |||
separable from, or merely link (or bind by name) to the interfaces of, | |||
the Work and Derivative Works thereof. | |||
"Contribution" shall mean any work of authorship, including | |||
the original version of the Work and any modifications or additions | |||
to that Work or Derivative Works thereof, that is intentionally | |||
submitted to Licensor for inclusion in the Work by the copyright owner | |||
or by an individual or Legal Entity authorized to submit on behalf of | |||
the copyright owner. For the purposes of this definition, "submitted" | |||
means any form of electronic, verbal, or written communication sent | |||
to the Licensor or its representatives, including but not limited to | |||
communication on electronic mailing lists, source code control systems, | |||
and issue tracking systems that are managed by, or on behalf of, the | |||
Licensor for the purpose of discussing and improving the Work, but | |||
excluding communication that is conspicuously marked or otherwise | |||
designated in writing by the copyright owner as "Not a Contribution." | |||
"Contributor" shall mean Licensor and any individual or Legal Entity | |||
on behalf of whom a Contribution has been received by Licensor and | |||
subsequently incorporated within the Work. | |||
2. Grant of Copyright License. Subject to the terms and conditions of | |||
this License, each Contributor hereby grants to You a perpetual, | |||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable | |||
copyright license to reproduce, prepare Derivative Works of, | |||
publicly display, publicly perform, sublicense, and distribute the | |||
Work and such Derivative Works in Source or Object form. | |||
3. Grant of Patent License. Subject to the terms and conditions of | |||
this License, each Contributor hereby grants to You a perpetual, | |||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable | |||
(except as stated in this section) patent license to make, have made, | |||
use, offer to sell, sell, import, and otherwise transfer the Work, | |||
where such license applies only to those patent claims licensable | |||
by such Contributor that are necessarily infringed by their | |||
Contribution(s) alone or by combination of their Contribution(s) | |||
with the Work to which such Contribution(s) was submitted. If You | |||
institute patent litigation against any entity (including a | |||
cross-claim or counterclaim in a lawsuit) alleging that the Work | |||
or a Contribution incorporated within the Work constitutes direct | |||
or contributory patent infringement, then any patent licenses | |||
granted to You under this License for that Work shall terminate | |||
as of the date such litigation is filed. | |||
4. Redistribution. You may reproduce and distribute copies of the | |||
Work or Derivative Works thereof in any medium, with or without | |||
modifications, and in Source or Object form, provided that You | |||
meet the following conditions: | |||
(a) You must give any other recipients of the Work or | |||
Derivative Works a copy of this License; and | |||
(b) You must cause any modified files to carry prominent notices | |||
stating that You changed the files; and | |||
(c) You must retain, in the Source form of any Derivative Works | |||
that You distribute, all copyright, patent, trademark, and | |||
attribution notices from the Source form of the Work, | |||
excluding those notices that do not pertain to any part of | |||
the Derivative Works; and | |||
(d) If the Work includes a "NOTICE" text file as part of its | |||
distribution, then any Derivative Works that You distribute must | |||
include a readable copy of the attribution notices contained | |||
within such NOTICE file, excluding those notices that do not | |||
pertain to any part of the Derivative Works, in at least one | |||
of the following places: within a NOTICE text file distributed | |||
as part of the Derivative Works; within the Source form or | |||
documentation, if provided along with the Derivative Works; or, | |||
within a display generated by the Derivative Works, if and | |||
wherever such third-party notices normally appear. The contents | |||
of the NOTICE file are for informational purposes only and | |||
do not modify the License. You may add Your own attribution | |||
notices within Derivative Works that You distribute, alongside | |||
or as an addendum to the NOTICE text from the Work, provided | |||
that such additional attribution notices cannot be construed | |||
as modifying the License. | |||
You may add Your own copyright statement to Your modifications and | |||
may provide additional or different license terms and conditions | |||
for use, reproduction, or distribution of Your modifications, or | |||
for any such Derivative Works as a whole, provided Your use, | |||
reproduction, and distribution of the Work otherwise complies with | |||
the conditions stated in this License. | |||
5. Submission of Contributions. Unless You explicitly state otherwise, | |||
any Contribution intentionally submitted for inclusion in the Work | |||
by You to the Licensor shall be under the terms and conditions of | |||
this License, without any additional terms or conditions. | |||
Notwithstanding the above, nothing herein shall supersede or modify | |||
the terms of any separate license agreement you may have executed | |||
with Licensor regarding such Contributions. | |||
6. Trademarks. This License does not grant permission to use the trade | |||
names, trademarks, service marks, or product names of the Licensor, | |||
except as required for reasonable and customary use in describing the | |||
origin of the Work and reproducing the content of the NOTICE file. | |||
7. Disclaimer of Warranty. Unless required by applicable law or | |||
agreed to in writing, Licensor provides the Work (and each | |||
Contributor provides its Contributions) on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
implied, including, without limitation, any warranties or conditions | |||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A | |||
PARTICULAR PURPOSE. You are solely responsible for determining the | |||
appropriateness of using or redistributing the Work and assume any | |||
risks associated with Your exercise of permissions under this License. | |||
8. Limitation of Liability. In no event and under no legal theory, | |||
whether in tort (including negligence), contract, or otherwise, | |||
unless required by applicable law (such as deliberate and grossly | |||
negligent acts) or agreed to in writing, shall any Contributor be | |||
liable to You for damages, including any direct, indirect, special, | |||
incidental, or consequential damages of any character arising as a | |||
result of this License or out of the use or inability to use the | |||
Work (including but not limited to damages for loss of goodwill, | |||
work stoppage, computer failure or malfunction, or any and all | |||
other commercial damages or losses), even if such Contributor | |||
has been advised of the possibility of such damages. | |||
9. Accepting Warranty or Additional Liability. While redistributing | |||
the Work or Derivative Works thereof, You may choose to offer, | |||
and charge a fee for, acceptance of support, warranty, indemnity, | |||
or other liability obligations and/or rights consistent with this | |||
License. However, in accepting such obligations, You may act only | |||
on Your own behalf and on Your sole responsibility, not on behalf | |||
of any other Contributor, and only if You agree to indemnify, | |||
defend, and hold each Contributor harmless for any liability | |||
incurred by, or claims asserted against, such Contributor by reason | |||
of your accepting any such warranty or additional liability. | |||
END OF TERMS AND CONDITIONS | |||
APPENDIX: How to apply the Apache License to your work. | |||
To apply the Apache License to your work, attach the following | |||
boilerplate notice, with the fields enclosed by brackets "[]" | |||
replaced with your own identifying information. (Don't include | |||
the brackets!) The text should be enclosed in the appropriate | |||
comment syntax for the file format. We also recommend that a | |||
file or class name and description of purpose be included on the | |||
same "printed page" as the copyright notice for easier | |||
identification within third-party archives. | |||
Copyright [yyyy] [name of copyright owner] | |||
Licensed under the Apache License, Version 2.0 (the "License"); | |||
you may not use this file except in compliance with the License. | |||
You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. |
@@ -1,2 +1,58 @@ | |||
# FastNLP | |||
``` | |||
FastNLP | |||
│ LICENSE | |||
│ README.md | |||
│ requirements.txt | |||
│ setup.py | |||
| | |||
├─docs (documentation) | |||
| | |||
└─tests (unit tests, integration tests, system tests)
| │ test_charlm.py | |||
| │ test_loader.py | |||
| │ test_trainer.py | |||
| │ test_word_seg.py | |||
| │ | |||
| └─data_for_tests (test data used by models) | |||
| charlm.txt | |||
| cws_test | |||
| cws_train | |||
| | |||
└─fastNLP | |||
├─action (model independent process) | |||
│ │ action.py (base class) | |||
│ │ README.md | |||
│ │ tester.py (model testing, for deployment and validation) | |||
│ │ trainer.py (main logic for model training) | |||
│ │ __init__.py | |||
│ │ | |||
| | |||
│ | |||
├─loader (file loader for all loading operations) | |||
│ | base_loader.py (base class) | |||
│ | config_loader.py (model-specific configuration/parameter loader) | |||
│ | dataset_loader.py (data set loader, base class) | |||
│ | embed_loader.py (embedding loader, base class) | |||
│ | __init__.py | |||
│ | |||
├─model (definitions of PyTorch models) | |||
│ │ base_model.py (base class, abstract) | |||
│ │ char_language_model.py (derived class, to implement abstract methods) | |||
│ │ word_seg_model.py | |||
│ │ __init__.py | |||
│ │ | |||
│ | |||
├─reproduction (code library for paper reproduction) | |||
│ ├─Char-aware_NLM | |||
│ │ | |||
│ ├─CNN-sentence_classification | |||
│ │ | |||
│ └─HAN-document_classification | |||
│ | |||
├─saver (file saver for all saving operations) | |||
│ base_saver.py | |||
│ logger.py | |||
│ model_saver.py | |||
│ | |||
``` |
@@ -0,0 +1 @@ | |||
# FastNLP Quick Tutorial |
@@ -0,0 +1,8 @@ | |||
SpaCy "Doc" | |||
https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/doc.pyx#L80 | |||
SpaCy "Vocab" | |||
https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/vocab.pyx#L25 | |||
SpaCy "Token" | |||
https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/token.pyx#L27 |
@@ -0,0 +1,35 @@ | |||
class Action(object): | |||
""" | |||
base class for Trainer and Tester | |||
""" | |||
def __init__(self): | |||
super(Action, self).__init__() | |||
def batchify(self, batch_size, X, Y=None): | |||
""" | |||
:param batch_size: int | |||
:param X: feature matrix of size [n_sample, m_feature] | |||
:param Y: label vector of size [n_sample, 1] (optional) | |||
        :return num_iter: int, the number of steps (batches) in each epoch
                generator: a generator that yields batch inputs
""" | |||
n_samples = X.shape[0] | |||
num_iter = n_samples // batch_size | |||
if Y is None: | |||
generator = self._batch_generate(batch_size, num_iter, X) | |||
else: | |||
generator = self._batch_generate(batch_size, num_iter, X, Y) | |||
return num_iter, generator | |||
@staticmethod | |||
def _batch_generate(batch_size, num_iter, *data): | |||
for step in range(num_iter): | |||
start = batch_size * step | |||
end = batch_size * (step + 1) | |||
yield tuple([x[start:end] for x in data]) | |||
def make_log(self, *args): | |||
return "log" |
@@ -0,0 +1,87 @@ | |||
from collections import namedtuple | |||
import numpy as np | |||
from fastNLP.action.action import Action | |||
class Tester(Action): | |||
"""docstring for Tester""" | |||
TestConfig = namedtuple("config", ["validate_in_training", "save_dev_input", "save_output", | |||
"save_loss", "batch_size"]) | |||
def __init__(self, test_args): | |||
""" | |||
:param test_args: named tuple | |||
""" | |||
super(Tester, self).__init__() | |||
self.validate_in_training = test_args.validate_in_training | |||
self.save_dev_input = test_args.save_dev_input | |||
self.valid_x = None | |||
self.valid_y = None | |||
self.save_output = test_args.save_output | |||
self.output = None | |||
self.save_loss = test_args.save_loss | |||
self.mean_loss = None | |||
self.batch_size = test_args.batch_size | |||
def test(self, network, data): | |||
print("testing") | |||
network.mode(test=True) # turn on the testing mode | |||
if self.save_dev_input: | |||
if self.valid_x is None: | |||
valid_x, valid_y = network.prepare_input(data) | |||
self.valid_x = valid_x | |||
self.valid_y = valid_y | |||
else: | |||
valid_x = self.valid_x | |||
valid_y = self.valid_y | |||
else: | |||
valid_x, valid_y = network.prepare_input(data) | |||
# split into batches by self.batch_size | |||
iterations, test_batch_generator = self.batchify(self.batch_size, valid_x, valid_y) | |||
batch_output = list() | |||
loss_history = list() | |||
for step in range(iterations): | |||
batch_x, batch_y = test_batch_generator.__next__() | |||
# forward pass from test input to predicted output | |||
prediction = network.data_forward(batch_x) | |||
loss = network.get_loss(prediction, batch_y) | |||
if self.save_output: | |||
batch_output.append(prediction.data) | |||
if self.save_loss: | |||
loss_history.append(loss) | |||
            print(self.make_log(step, loss))  # Action defines make_log() but no log(); print the log string
if self.save_loss: | |||
self.mean_loss = np.mean(np.array(loss_history)) | |||
if self.save_output: | |||
self.output = self.make_output(batch_output) | |||
@property | |||
def loss(self): | |||
return self.mean_loss | |||
@property | |||
def result(self): | |||
return self.output | |||
@staticmethod | |||
def make_output(batch_outputs): | |||
# construct full prediction with batch outputs | |||
return np.concatenate(batch_outputs, axis=0) | |||
def load_config(self, args): | |||
raise NotImplementedError | |||
def load_dataset(self, args): | |||
raise NotImplementedError |
@@ -0,0 +1,268 @@ | |||
from collections import namedtuple | |||
import numpy as np | |||
import torch | |||
from fastNLP.action.action import Action | |||
from fastNLP.action.tester import Tester | |||
class BaseTrainer(Action): | |||
"""Base trainer for all trainers. | |||
Trainer receives a model and data, and then performs training. | |||
Subclasses must implement the following abstract methods: | |||
- prepare_input | |||
- mode | |||
- define_optimizer | |||
- data_forward | |||
- grad_backward | |||
- get_loss | |||
""" | |||
TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better", | |||
"log_per_step", "log_validation", "batch_size"]) | |||
def __init__(self, train_args): | |||
""" | |||
        :param train_args: named tuple of training parameters (epochs, validate, batch_size, ...)
""" | |||
super(BaseTrainer, self).__init__() | |||
self.n_epochs = train_args.epochs | |||
self.validate = train_args.validate | |||
self.batch_size = train_args.batch_size | |||
self.model = None | |||
def train(self, network, train_data, dev_data=None): | |||
"""General training loop. | |||
:param network: a model | |||
:param train_data: raw data for training | |||
:param dev_data: raw data for validation | |||
        This method is framework independent.
        It works by calling the following methods:
- prepare_input | |||
- mode | |||
- define_optimizer | |||
- data_forward | |||
- get_loss | |||
- grad_backward | |||
- update | |||
Subclasses must implement these methods with a specific framework. | |||
""" | |||
self.model = network | |||
train_x, train_y = self.prepare_input(train_data) | |||
        test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
                                      save_dev_input=True, save_loss=True, batch_size=self.batch_size)
        evaluator = Tester(test_args)
        best_loss = 1e10
        for epoch in range(self.n_epochs):
            self.mode(test=False)  # turn on the train mode
            self.define_optimizer()
            # re-create the batch generator every epoch; otherwise it is exhausted after the first epoch
            iterations, train_batch_generator = self.batchify(self.batch_size, train_x, train_y)
            for step in range(iterations):
batch_x, batch_y = train_batch_generator.__next__() | |||
prediction = self.data_forward(network, batch_x) | |||
loss = self.get_loss(prediction, batch_y) | |||
self.grad_backward(loss) | |||
self.update() | |||
if self.validate: | |||
if dev_data is None: | |||
raise RuntimeError("No validation data provided.") | |||
evaluator.test(network, dev_data) | |||
if evaluator.loss < best_loss: | |||
best_loss = evaluator.loss | |||
# finish training | |||
def prepare_input(self, data): | |||
""" | |||
Perform data transformation from raw input to vector/matrix inputs. | |||
:param data: raw inputs | |||
:return (X, Y): tuple, input features and labels | |||
""" | |||
raise NotImplementedError | |||
def mode(self, test=False): | |||
""" | |||
        Switch the network between training and test (evaluation) mode.
:param test: bool | |||
""" | |||
raise NotImplementedError | |||
def define_optimizer(self): | |||
""" | |||
Define framework-specific optimizer specified by the models. | |||
""" | |||
raise NotImplementedError | |||
def update(self): | |||
""" | |||
Perform weight update on a model. | |||
For PyTorch, just call optimizer to update. | |||
""" | |||
raise NotImplementedError | |||
def data_forward(self, network, x): | |||
""" | |||
Forward pass of the data. | |||
:param network: a model | |||
:param x: input feature matrix and label vector | |||
:return: output by the models | |||
For PyTorch, just do "network(*x)" | |||
""" | |||
raise NotImplementedError | |||
def grad_backward(self, loss): | |||
""" | |||
        Compute gradients via back-propagation (chain rule).
:param loss: a scalar where back-prop starts | |||
For PyTorch, just do "loss.backward()" | |||
""" | |||
raise NotImplementedError | |||
def get_loss(self, predict, truth): | |||
""" | |||
Compute loss given prediction and ground truth. | |||
:param predict: prediction label vector | |||
:param truth: ground truth label vector | |||
:return: a scalar | |||
""" | |||
raise NotImplementedError | |||
class ToyTrainer(BaseTrainer): | |||
"""A simple trainer for a PyTorch model.""" | |||
def __init__(self, train_args): | |||
super(ToyTrainer, self).__init__(train_args) | |||
self.test_mode = False | |||
self.weight = np.random.rand(5, 1) | |||
self.bias = np.random.rand() | |||
self._loss = 0 | |||
self._optimizer = None | |||
def prepare_input(self, data): | |||
return data[:, :-1], data[:, -1] | |||
def mode(self, test=False): | |||
self.model.mode(test) | |||
def data_forward(self, network, x): | |||
return np.matmul(x, self.weight) + self.bias | |||
def grad_backward(self, loss): | |||
loss.backward() | |||
def get_loss(self, pred, truth): | |||
self._loss = np.mean(np.square(pred - truth)) | |||
return self._loss | |||
def define_optimizer(self): | |||
self._optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01) | |||
def update(self): | |||
self._optimizer.step() | |||
class WordSegTrainer(BaseTrainer): | |||
""" | |||
    Reserved for future changes.
""" | |||
def __init__(self, train_args): | |||
super(WordSegTrainer, self).__init__(train_args) | |||
self.id2word = None | |||
self.word2id = None | |||
self.id2tag = None | |||
self.tag2id = None | |||
self.lstm_batch_size = 8 | |||
self.lstm_seq_len = 32 # Trainer batch_size == lstm_batch_size * lstm_seq_len | |||
self.hidden_dim = 100 | |||
self.lstm_num_layers = 2 | |||
self.vocab_size = 100 | |||
self.word_emb_dim = 100 | |||
self.hidden = (self.to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)), | |||
self.to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim))) | |||
self.optimizer = None | |||
self._loss = None | |||
self.USE_GPU = False | |||
def to_var(self, x): | |||
if torch.cuda.is_available() and self.USE_GPU: | |||
x = x.cuda() | |||
return torch.autograd.Variable(x) | |||
def prepare_input(self, data): | |||
""" | |||
perform word indices lookup to convert strings into indices | |||
        :param data: list of strings; each line is "#"-separated, with the word in field 0 and a [B, M, E, S] tag at the start of field 2
:return | |||
""" | |||
word_list = [] | |||
tag_list = [] | |||
for line in data: | |||
if len(line) > 2: | |||
tokens = line.split("#") | |||
word_list.append(tokens[0]) | |||
tag_list.append(tokens[2][0]) | |||
self.id2word = list(set(word_list)) | |||
self.word2id = {word: idx for idx, word in enumerate(self.id2word)} | |||
self.id2tag = list(set(tag_list)) | |||
self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)} | |||
words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1) | |||
tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1) | |||
return words, tags | |||
def mode(self, test=False): | |||
if test: | |||
self.model.eval() | |||
else: | |||
self.model.train() | |||
def data_forward(self, network, x): | |||
""" | |||
:param network: a PyTorch model | |||
:param x: sequence of length [batch_size], word indices | |||
:return: | |||
""" | |||
x = x.reshape(self.lstm_batch_size, self.lstm_seq_len) | |||
output, self.hidden = network(x, self.hidden) | |||
return output | |||
def define_optimizer(self): | |||
self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85) | |||
def get_loss(self, predict, truth): | |||
        self._loss = torch.nn.functional.cross_entropy(predict, truth)
return self._loss | |||
    def grad_backward(self, loss):
self.model.zero_grad() | |||
self._loss.backward() | |||
torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2) | |||
def update(self): | |||
self.optimizer.step() | |||
if __name__ == "__name__": | |||
Config = namedtuple("config", ["epochs", "validate", "save_when_better", "log_per_step", | |||
"log_validation", "batch_size"]) | |||
train_config = Config(epochs=5, validate=True, save_when_better=True, log_per_step=10, log_validation=True, | |||
batch_size=32) | |||
trainer = ToyTrainer(train_config) |
@@ -0,0 +1,36 @@ | |||
class BaseLoader(object): | |||
"""docstring for BaseLoader""" | |||
def __init__(self, data_name, data_path): | |||
super(BaseLoader, self).__init__() | |||
self.data_name = data_name | |||
self.data_path = data_path | |||
def load(self): | |||
""" | |||
:return: string | |||
""" | |||
with open(self.data_path, "r", encoding="utf-8") as f: | |||
text = f.read() | |||
return text | |||
def load_lines(self): | |||
with open(self.data_path, "r", encoding="utf=8") as f: | |||
text = f.readlines() | |||
return text | |||
class ToyLoader0(BaseLoader): | |||
""" | |||
For charLM | |||
""" | |||
def __init__(self, name, path): | |||
super(ToyLoader0, self).__init__(name, path) | |||
def load(self): | |||
with open(self.data_path, 'r') as f: | |||
corpus = f.read().lower() | |||
import re | |||
corpus = re.sub(r"<unk>", "unk", corpus) | |||
return corpus.split() |
@@ -0,0 +1,42 @@ | |||
from fastNLP.loader.base_loader import BaseLoader | |||
import configparser | |||
import traceback | |||
import json | |||
class ConfigLoader(BaseLoader): | |||
"""loader for configuration files""" | |||
    def __init__(self, data_name, data_path):
super(ConfigLoader, self).__init__(data_name, data_path) | |||
self.config = self.parse(super(ConfigLoader, self).load()) | |||
@staticmethod | |||
def parse(string): | |||
raise NotImplementedError | |||
@staticmethod | |||
def loadConfig(filePath, sections): | |||
""" | |||
:param filePath: the path of config file | |||
:param sections: the dict of sections | |||
:return: | |||
""" | |||
cfg = configparser.ConfigParser() | |||
cfg.read(filePath) | |||
for s in sections: | |||
attr_list = [i for i in type(sections[s]).__dict__.keys() if | |||
not callable(getattr(sections[s], i)) and not i.startswith("__")] | |||
gen_sec = cfg[s] | |||
for attr in attr_list: | |||
try: | |||
val = json.loads(gen_sec[attr]) | |||
print(s, attr, val, type(val)) | |||
assert type(val) == type(getattr(sections[s], attr)), \ | |||
                        'type does not match, expected %s but got %s' % \
(type(getattr(sections[s], attr)), type(val)) | |||
setattr(sections[s], attr, val) | |||
                except Exception:
                    # attribute attr in section s has not been set; the default value will be used
                    pass
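# Illustrative usage sketch (assumption, not part of the original file): the config file name,
# section name, and attributes below are hypothetical.
if __name__ == "__main__":
    # write a tiny config file so the sketch is self-contained
    with open("example.cfg", "w", encoding="utf-8") as f:
        f.write("[model]\nbatch_size = 64\nlearning_rate = 0.001\n")

    class ModelSection(object):
        batch_size = 32        # defaults, overridden by the values parsed from the file
        learning_rate = 0.01

    section = ModelSection()
    ConfigLoader.loadConfig("example.cfg", {"model": section})
    print(section.batch_size, section.learning_rate)  # 64 0.001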
@@ -0,0 +1,111 @@ | |||
import os | |||
from fastNLP.loader.base_loader import BaseLoader | |||
class DatasetLoader(BaseLoader): | |||
""""loader for data sets""" | |||
def __init__(self, data_name, data_path): | |||
super(DatasetLoader, self).__init__(data_name, data_path) | |||
class POSDatasetLoader(DatasetLoader): | |||
"""loader for pos data sets""" | |||
def __init__(self, data_name, data_path): | |||
super(POSDatasetLoader, self).__init__(data_name, data_path) | |||
#self.data_set = self.load() | |||
def load(self): | |||
assert os.path.exists(self.data_path) | |||
with open(self.data_path, "r", encoding="utf-8") as f: | |||
lines = f.readlines() | |||
return self.parse(lines) | |||
@staticmethod | |||
def parse(lines): | |||
""" | |||
:param lines: lines from dataset | |||
        :return: list(list(list())): the three levels of lists are
token, sentence, and dataset | |||
""" | |||
dataset = list() | |||
for line in lines: | |||
sentence = list() | |||
words = line.split(" ") | |||
for w in words: | |||
tokens = list() | |||
tokens.append(w.split('/')[0]) | |||
tokens.append(w.split('/')[1]) | |||
sentence.append(tokens) | |||
dataset.append(sentence) | |||
return dataset | |||
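# Illustrative example (assumption, not part of the original file): one line in the
# "word/tag" format is parsed into a sentence of [word, tag] pairs, e.g.
#   POSDatasetLoader.parse(["I/PRP love/VBP it/PRP"])
#   -> [[["I", "PRP"], ["love", "VBP"], ["it", "PRP"]]]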
class ClassificationDatasetLoader(DatasetLoader): | |||
"""loader for classfication data sets""" | |||
def __init__(self, data_name, data_path): | |||
        super(ClassificationDatasetLoader, self).__init__(data_name, data_path)
def load(self): | |||
assert os.path.exists(self.data_path) | |||
with open(self.data_path, "r", encoding="utf-8") as f: | |||
lines = f.readlines() | |||
return self.parse(lines) | |||
@staticmethod | |||
def parse(lines): | |||
""" | |||
:param lines: lines from dataset | |||
        :return: list(list(list())): the three levels of lists are
words, sentence, and dataset | |||
""" | |||
dataset = list() | |||
for line in lines: | |||
label = line.split(" ")[0] | |||
words = line.split(" ")[1:] | |||
word = list([w for w in words]) | |||
sentence = list([word, label]) | |||
dataset.append(sentence) | |||
return dataset | |||
class ConllLoader(DatasetLoader): | |||
"""loader for conll format files""" | |||
    def __init__(self, data_name, data_path):
""" | |||
:param str data_name: the name of the conll data set | |||
:param str data_path: the path to the conll data set | |||
""" | |||
super(ConllLoader, self).__init__(data_name, data_path) | |||
self.data_set = self.parse(self.load()) | |||
def load(self): | |||
""" | |||
:return: list lines: all lines in a conll file | |||
""" | |||
with open(self.data_path, "r", encoding="utf-8") as f: | |||
lines = f.readlines() | |||
return lines | |||
@staticmethod | |||
def parse(lines): | |||
""" | |||
:param list lines:a list containing all lines in a conll file. | |||
:return: a 3D list | |||
""" | |||
sentences = list() | |||
tokens = list() | |||
for line in lines: | |||
if line[0] == "#": | |||
# skip the comments | |||
continue | |||
if line == "\n": | |||
sentences.append(tokens) | |||
tokens = [] | |||
continue | |||
tokens.append(line.split()) | |||
return sentences |
@@ -0,0 +1,8 @@ | |||
from fastNLP.loader.base_loader import BaseLoader
class EmbedLoader(BaseLoader): | |||
"""docstring for EmbedLoader""" | |||
def __init__(self, data_name, data_path): | |||
super(EmbedLoader, self).__init__(data_name, data_path) |
@@ -0,0 +1,95 @@ | |||
import torch | |||
class BaseModel(torch.nn.Module): | |||
"""Base PyTorch model for all models. | |||
    Three network modules are presented:
- embedding module | |||
- aggregation module | |||
- output module | |||
Subclasses must implement these three modules with "components". | |||
""" | |||
def __init__(self): | |||
super(BaseModel, self).__init__() | |||
def forward(self, *inputs): | |||
x = self.encode(*inputs) | |||
x = self.aggregation(x) | |||
x = self.output(x) | |||
return x | |||
def encode(self, x): | |||
raise NotImplementedError | |||
def aggregation(self, x): | |||
raise NotImplementedError | |||
def output(self, x): | |||
raise NotImplementedError | |||
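# Illustrative sketch (assumption, not part of the original file): a minimal subclass that
# wires the three modules together; all layer sizes below are hypothetical examples.
class ToyClassifier(BaseModel):
    def __init__(self, vocab_size=100, emb_dim=16, num_classes=2):
        super(ToyClassifier, self).__init__()
        self.embed = torch.nn.Embedding(vocab_size, emb_dim)
        self.fc = torch.nn.Linear(emb_dim, num_classes)

    def encode(self, x):
        # embedding module: word indices [batch, seq_len] -> [batch, seq_len, emb_dim]
        return self.embed(x)

    def aggregation(self, x):
        # aggregation module: mean-pool over the sequence -> [batch, emb_dim]
        return x.mean(dim=1)

    def output(self, x):
        # output module: class scores [batch, num_classes]
        return self.fc(x)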
class Vocabulary(object): | |||
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab` | |||
instance also provides access to the `StringStore`, and owns underlying | |||
data that is shared between `Doc` objects. | |||
""" | |||
def __init__(self): | |||
"""Create the vocabulary. | |||
RETURNS (Vocab): The newly constructed object. | |||
""" | |||
self.data_frame = None | |||
class Document(object): | |||
"""A sequence of Token objects. Access sentences and named entities, export | |||
annotations to numpy arrays, losslessly serialize to compressed binary | |||
strings. The `Doc` object holds an array of `Token` objects. The | |||
Python-level `Token` and `Span` objects are views of this array, i.e. | |||
they don't own the data themselves. -- spacy | |||
""" | |||
def __init__(self, vocab, words=None, spaces=None): | |||
"""Create a Doc object. | |||
vocab (Vocab): A vocabulary object, which must match any models you | |||
want to use (e.g. tokenizer, parser, entity recognizer). | |||
words (list or None): A list of unicode strings, to add to the document | |||
as words. If `None`, defaults to empty list. | |||
spaces (list or None): A list of boolean values, of the same length as | |||
words. True means that the word is followed by a space, False means | |||
it is not. If `None`, defaults to `[True]*len(words)` | |||
user_data (dict or None): Optional extra data to attach to the Doc. | |||
RETURNS (Doc): The newly constructed object. | |||
""" | |||
self.vocab = vocab | |||
self.spaces = spaces | |||
self.words = words | |||
if spaces is None: | |||
self.spaces = [True] * len(self.words) | |||
elif len(spaces) != len(self.words): | |||
raise ValueError("dismatch spaces and words") | |||
def get_chunker(self, vocab): | |||
return None | |||
def push_back(self, vocab): | |||
pass | |||
class Token(object): | |||
"""An individual token – i.e. a word, punctuation symbol, whitespace, | |||
etc. | |||
""" | |||
def __init__(self, vocab, doc, offset): | |||
"""Construct a `Token` object. | |||
vocab (Vocabulary): A storage container for lexical types. | |||
doc (Document): The parent document. | |||
offset (int): The index of the token within the document. | |||
""" | |||
self.vocab = vocab | |||
self.doc = doc | |||
self.token = doc[offset] | |||
self.i = offset |
@@ -0,0 +1,359 @@ | |||
import os | |||
from collections import namedtuple | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import torch.optim as optim | |||
from torch.autograd import Variable | |||
from fastNLP.models.base_model import BaseModel | |||
USE_GPU = True | |||
""" | |||
To be deprecated. | |||
""" | |||
class CharLM(BaseModel): | |||
""" | |||
Controller of the Character-level Neural Language Model | |||
To do: | |||
- where the data goes, call data savers. | |||
""" | |||
DataTuple = namedtuple("DataTuple", ["feature", "label"]) | |||
def __init__(self, lstm_batch_size, lstm_seq_len): | |||
super(CharLM, self).__init__() | |||
""" | |||
Settings: should come from config loader or pre-processing | |||
""" | |||
self.word_embed_dim = 300 | |||
self.char_embedding_dim = 15 | |||
self.cnn_batch_size = lstm_batch_size * lstm_seq_len | |||
self.lstm_seq_len = lstm_seq_len | |||
self.lstm_batch_size = lstm_batch_size | |||
self.num_epoch = 10 | |||
self.old_PPL = 100000 | |||
self.best_PPL = 100000 | |||
""" | |||
These parameters are set by pre-processing. | |||
""" | |||
self.max_word_len = None | |||
self.num_char = None | |||
self.vocab_size = None | |||
self.preprocess("./data_for_tests/charlm.txt") | |||
self.data = None # named tuple to store all data set | |||
self.data_ready = False | |||
self.criterion = nn.CrossEntropyLoss() | |||
self._loss = None | |||
self.use_gpu = USE_GPU | |||
# word_emb_dim == hidden_size / num of hidden units | |||
self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)), | |||
to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim))) | |||
self.model = charLM(self.char_embedding_dim, | |||
self.word_embed_dim, | |||
self.vocab_size, | |||
self.num_char, | |||
use_gpu=self.use_gpu) | |||
for param in self.model.parameters(): | |||
nn.init.uniform(param.data, -0.05, 0.05) | |||
self.learning_rate = 0.1 | |||
self.optimizer = None | |||
def prepare_input(self, raw_text): | |||
""" | |||
:param raw_text: raw input text consisting of words | |||
:return: torch.Tensor, torch.Tensor | |||
feature matrix, label vector | |||
        This function is only called once in Trainer.train, but may be called multiple times in Tester.test,
        so Tester saves the test input for repeated calls.
""" | |||
if os.path.exists("cache/prep.pt") is False: | |||
self.preprocess("./data_for_tests/charlm.txt") # To do: This is not good. Need to fix.. | |||
objects = torch.load("cache/prep.pt") | |||
word_dict = objects["word_dict"] | |||
char_dict = objects["char_dict"] | |||
max_word_len = self.max_word_len | |||
print("word/char dictionary built. Start making inputs.") | |||
words = raw_text | |||
input_vec = np.array(text2vec(words, char_dict, max_word_len)) | |||
# Labels are next-word index in word_dict with the same length as inputs | |||
input_label = np.array([word_dict[w] for w in words[1:]] + [word_dict[words[-1]]]) | |||
feature_input = torch.from_numpy(input_vec) | |||
label_input = torch.from_numpy(input_label) | |||
return feature_input, label_input | |||
def mode(self, test=False): | |||
if test: | |||
self.model.eval() | |||
else: | |||
self.model.train() | |||
def data_forward(self, x): | |||
""" | |||
:param x: Tensor of size [lstm_batch_size, lstm_seq_len, max_word_len+2] | |||
:return: Tensor of size [num_words, ?] | |||
""" | |||
# additional processing of inputs after batching | |||
num_seq = x.size()[0] // self.lstm_seq_len | |||
x = x[:num_seq * self.lstm_seq_len, :] | |||
x = x.view(-1, self.lstm_seq_len, self.max_word_len + 2) | |||
# detach hidden state of LSTM from last batch | |||
hidden = [state.detach() for state in self.hidden] | |||
output, self.hidden = self.model(to_var(x), hidden) | |||
return output | |||
def grad_backward(self): | |||
self.model.zero_grad() | |||
self._loss.backward() | |||
torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2) | |||
self.optimizer.step() | |||
def get_loss(self, predict, truth): | |||
self._loss = self.criterion(predict, to_var(truth)) | |||
        return self._loss.data  # no PyTorch data structure exposed outside
def define_optimizer(self): | |||
# redefine optimizer for every new epoch | |||
self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.85) | |||
def save(self): | |||
print("network saved") | |||
# torch.save(self.models, "cache/models.pkl") | |||
def preprocess(self, all_text_files): | |||
word_dict, char_dict = create_word_char_dict(all_text_files) | |||
num_char = len(char_dict) | |||
self.vocab_size = len(word_dict) | |||
char_dict["BOW"] = num_char + 1 | |||
char_dict["EOW"] = num_char + 2 | |||
char_dict["PAD"] = 0 | |||
self.num_char = num_char + 3 | |||
        # char_dict maps each character to an integer id (with special BOW/EOW/PAD ids added above)
reverse_word_dict = {value: key for key, value in word_dict.items()} | |||
self.max_word_len = max([len(word) for word in word_dict]) | |||
objects = { | |||
"word_dict": word_dict, | |||
"char_dict": char_dict, | |||
"reverse_word_dict": reverse_word_dict, | |||
} | |||
torch.save(objects, "cache/prep.pt") | |||
print("Preprocess done.") | |||
""" | |||
Global Functions | |||
""" | |||
def batch_generator(x, batch_size): | |||
# x: [num_words, in_channel, height, width] | |||
# partitions x into batches | |||
num_step = x.size()[0] // batch_size | |||
for t in range(num_step): | |||
yield x[t * batch_size:(t + 1) * batch_size] | |||
def text2vec(words, char_dict, max_word_len): | |||
""" Return list of list of int """ | |||
word_vec = [] | |||
for word in words: | |||
vec = [char_dict[ch] for ch in word] | |||
if len(vec) < max_word_len: | |||
vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))] | |||
vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]] | |||
word_vec.append(vec) | |||
return word_vec | |||
def read_data(file_name): | |||
with open(file_name, 'r') as f: | |||
corpus = f.read().lower() | |||
import re | |||
corpus = re.sub(r"<unk>", "unk", corpus) | |||
return corpus.split() | |||
def get_char_dict(vocabulary): | |||
char_dict = dict() | |||
count = 1 | |||
for word in vocabulary: | |||
for ch in word: | |||
if ch not in char_dict: | |||
char_dict[ch] = count | |||
count += 1 | |||
return char_dict | |||
def create_word_char_dict(*file_name): | |||
text = [] | |||
for file in file_name: | |||
text += read_data(file) | |||
word_dict = {word: ix for ix, word in enumerate(set(text))} | |||
char_dict = get_char_dict(word_dict) | |||
return word_dict, char_dict | |||
def to_var(x): | |||
if torch.cuda.is_available() and USE_GPU: | |||
x = x.cuda() | |||
return Variable(x) | |||
""" | |||
Neural Network | |||
""" | |||
class Highway(nn.Module): | |||
"""Highway network""" | |||
def __init__(self, input_size): | |||
super(Highway, self).__init__() | |||
self.fc1 = nn.Linear(input_size, input_size, bias=True) | |||
self.fc2 = nn.Linear(input_size, input_size, bias=True) | |||
def forward(self, x): | |||
t = F.sigmoid(self.fc1(x)) | |||
return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x) | |||
class charLM(nn.Module): | |||
"""Character-level Neural Language Model | |||
CNN + highway network + LSTM | |||
# Input: | |||
4D tensor with shape [batch_size, in_channel, height, width] | |||
# Output: | |||
2D Tensor with shape [batch_size, vocab_size] | |||
# Arguments: | |||
        char_emb_dim: the size of each character embedding
        word_emb_dim: the size of each word embedding
vocab_size: num of unique words | |||
num_char: num of characters | |||
use_gpu: True or False | |||
""" | |||
def __init__(self, char_emb_dim, word_emb_dim, | |||
vocab_size, num_char, use_gpu): | |||
super(charLM, self).__init__() | |||
self.char_emb_dim = char_emb_dim | |||
self.word_emb_dim = word_emb_dim | |||
self.vocab_size = vocab_size | |||
# char attention layer | |||
self.char_embed = nn.Embedding(num_char, char_emb_dim) | |||
# convolutions of filters with different sizes | |||
        self.convolutions = nn.ModuleList()  # ModuleList so the conv parameters are registered with the module
# list of tuples: (the number of filter, width) | |||
# self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)] | |||
self.filter_num_width = [(25, 1), (50, 2), (75, 3)] | |||
for out_channel, filter_width in self.filter_num_width: | |||
self.convolutions.append( | |||
nn.Conv2d( | |||
1, # in_channel | |||
out_channel, # out_channel | |||
kernel_size=(char_emb_dim, filter_width), # (height, width) | |||
bias=True | |||
) | |||
) | |||
self.highway_input_dim = sum([x for x, y in self.filter_num_width]) | |||
self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False) | |||
# highway net | |||
self.highway1 = Highway(self.highway_input_dim) | |||
self.highway2 = Highway(self.highway_input_dim) | |||
# LSTM | |||
self.lstm_num_layers = 2 | |||
self.lstm = nn.LSTM(input_size=self.highway_input_dim, | |||
hidden_size=self.word_emb_dim, | |||
num_layers=self.lstm_num_layers, | |||
bias=True, | |||
dropout=0.5, | |||
batch_first=True) | |||
# output layer | |||
self.dropout = nn.Dropout(p=0.5) | |||
self.linear = nn.Linear(self.word_emb_dim, self.vocab_size) | |||
if use_gpu is True: | |||
for x in range(len(self.convolutions)): | |||
self.convolutions[x] = self.convolutions[x].cuda() | |||
self.highway1 = self.highway1.cuda() | |||
self.highway2 = self.highway2.cuda() | |||
self.lstm = self.lstm.cuda() | |||
self.dropout = self.dropout.cuda() | |||
self.char_embed = self.char_embed.cuda() | |||
self.linear = self.linear.cuda() | |||
self.batch_norm = self.batch_norm.cuda() | |||
def forward(self, x, hidden): | |||
# Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2] | |||
# Return: Variable of Tensor with shape [num_words, len(word_dict)] | |||
lstm_batch_size = x.size()[0] | |||
lstm_seq_len = x.size()[1] | |||
x = x.contiguous().view(-1, x.size()[2]) | |||
# [num_seq*seq_len, max_word_len+2] | |||
x = self.char_embed(x) | |||
# [num_seq*seq_len, max_word_len+2, char_emb_dim] | |||
x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3) | |||
# [num_seq*seq_len, 1, char_emb_dim, max_word_len+2] | |||
x = self.conv_layers(x) | |||
# [num_seq*seq_len, total_num_filters] | |||
x = self.batch_norm(x) | |||
# [num_seq*seq_len, total_num_filters] | |||
x = self.highway1(x) | |||
x = self.highway2(x) | |||
# [num_seq*seq_len, total_num_filters] | |||
x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1) | |||
# [num_seq, seq_len, total_num_filters] | |||
x, hidden = self.lstm(x, hidden) | |||
# [seq_len, num_seq, hidden_size] | |||
x = self.dropout(x) | |||
# [seq_len, num_seq, hidden_size] | |||
x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1) | |||
# [num_seq*seq_len, hidden_size] | |||
x = self.linear(x) | |||
# [num_seq*seq_len, vocab_size] | |||
return x, hidden | |||
def conv_layers(self, x): | |||
chosen_list = list() | |||
for conv in self.convolutions: | |||
feature_map = F.tanh(conv(x)) | |||
# (batch_size, out_channel, 1, max_word_len-width+1) | |||
chosen = torch.max(feature_map, 3)[0] | |||
# (batch_size, out_channel, 1) | |||
chosen = chosen.squeeze() | |||
# (batch_size, out_channel) | |||
chosen_list.append(chosen) | |||
# (batch_size, total_num_filers) | |||
return torch.cat(chosen_list, 1) |
@@ -0,0 +1,46 @@ | |||
import torch.nn as nn | |||
from fastNLP.models.base_model import BaseModel | |||
class WordSeg(BaseModel): | |||
""" | |||
PyTorch Network for word segmentation | |||
""" | |||
def __init__(self, hidden_dim, lstm_num_layers, vocab_size, word_emb_dim=100): | |||
super(WordSeg, self).__init__() | |||
self.vocab_size = vocab_size | |||
self.word_emb_dim = word_emb_dim | |||
self.lstm_num_layers = lstm_num_layers | |||
self.hidden_dim = hidden_dim | |||
self.word_emb = nn.Embedding(self.vocab_size, self.word_emb_dim) | |||
self.lstm = nn.LSTM(input_size=self.word_emb_dim, | |||
hidden_size=self.word_emb_dim, | |||
num_layers=self.lstm_num_layers, | |||
bias=True, | |||
dropout=0.5, | |||
batch_first=True) | |||
self.linear = nn.Linear(self.word_emb_dim, self.vocab_size) | |||
def forward(self, x, hidden): | |||
""" | |||
:param x: tensor of shape [batch_size, seq_len], vocabulary index | |||
:param hidden: | |||
:return x: probability of vocabulary entries | |||
hidden: (memory cell, hidden state) from LSTM | |||
""" | |||
# [batch_size, seq_len] | |||
x = self.word_emb(x) | |||
# [batch_size, seq_len, word_emb_size] | |||
x, hidden = self.lstm(x, hidden) | |||
# [batch_size, seq_len, word_emb_size] | |||
x = x.contiguous().view(x.shape[0] * x.shape[1], -1) | |||
# [batch_size*seq_len, word_emb_size] | |||
x = self.linear(x) | |||
# [batch_size*seq_len, vocab_size] | |||
return x, hidden |
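# Illustrative usage sketch (assumption, not part of the original file): hypothetical sizes,
# only to show the expected input/output shapes of forward().
if __name__ == "__main__":
    import torch

    model = WordSeg(hidden_dim=100, lstm_num_layers=2, vocab_size=100)
    x = torch.randint(0, 100, (8, 32), dtype=torch.long)       # [batch_size, seq_len] word indices
    hidden = (torch.zeros(2, 8, 100), torch.zeros(2, 8, 100))  # (h_0, c_0) for a 2-layer LSTM
    out, hidden = model(x, hidden)
    print(out.size())  # [batch_size * seq_len, vocab_size] = [256, 100]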
@@ -0,0 +1,174 @@ | |||
import torch | |||
from torch import nn | |||
def log_sum_exp(x, dim=-1): | |||
max_value, _ = x.max(dim=dim, keepdim=True) | |||
res = torch.log(torch.sum(torch.exp(x - max_value), dim=dim, keepdim=True)) + max_value | |||
return res.squeeze(dim) | |||
def seq_len_to_byte_mask(seq_lens): | |||
# usually seq_lens: LongTensor, batch_size | |||
# return value: ByteTensor, batch_size x max_len | |||
batch_size = seq_lens.size(0) | |||
max_len = seq_lens.max() | |||
broadcast_arange = torch.arange(max_len).view(1, -1).repeat(batch_size, 1) | |||
mask = broadcast_arange.lt(seq_lens.float().view(-1, 1)) | |||
return mask | |||
class ContionalRandomField(nn.Module): | |||
def __init__(self, tag_size, include_start_end_trans=True): | |||
""" | |||
:param tag_size: int, num of tags | |||
:param include_start_end_trans: bool, whether to include start/end tag | |||
""" | |||
super(ContionalRandomField, self).__init__() | |||
self.include_start_end_trans = include_start_end_trans | |||
self.tag_size = tag_size | |||
# the meaning of entry in this matrix is (from_tag_id, to_tag_id) score | |||
self.transition_m = nn.Parameter(torch.randn(tag_size, tag_size)) | |||
if self.include_start_end_trans: | |||
self.start_scores = nn.Parameter(torch.randn(tag_size)) | |||
self.end_scores = nn.Parameter(torch.randn(tag_size)) | |||
self.reset_parameter() | |||
def reset_parameter(self): | |||
nn.init.xavier_normal_(self.transition_m) | |||
if self.include_start_end_trans: | |||
nn.init.normal_(self.start_scores) | |||
nn.init.normal_(self.end_scores) | |||
def _normalizer_likelihood(self, feats, masks): | |||
""" | |||
Computes the (batch_size,) denominator term for the log-likelihood, which is the | |||
sum of the likelihoods across all possible state sequences. | |||
:param feats:FloatTensor, batch_size x max_len x tag_size | |||
:param masks:ByteTensor, batch_size x max_len | |||
:return:FloatTensor, batch_size | |||
""" | |||
batch_size, max_len, _ = feats.size() | |||
# alpha, batch_size x tag_size | |||
if self.include_start_end_trans: | |||
alpha = self.start_scores.view(1, -1) + feats[:, 0] | |||
else: | |||
alpha = feats[:, 0] | |||
# broadcast_trans_m, the meaning of entry in this matrix is [batch_idx, to_tag_id, from_tag_id] | |||
broadcast_trans_m = self.transition_m.permute( | |||
1, 0).unsqueeze(0).repeat(batch_size, 1, 1) | |||
# loop | |||
for i in range(1, max_len): | |||
emit_score = feats[:, i].unsqueeze(2) | |||
new_alpha = broadcast_trans_m + alpha.unsqueeze(1) + emit_score | |||
new_alpha = log_sum_exp(new_alpha, dim=2) | |||
alpha = new_alpha * \ | |||
masks[:, i:i + 1].float() + alpha * \ | |||
(1 - masks[:, i:i + 1].float()) | |||
if self.include_start_end_trans: | |||
alpha = alpha + self.end_scores.view(1, -1) | |||
return log_sum_exp(alpha) | |||
    def _gold_score(self, feats, tags, masks):
""" | |||
Compute the score for the gold path. | |||
        :param feats: FloatTensor, batch_size x max_len x tag_size
:param tags: LongTensor, batch_size x max_len | |||
:param masks: ByteTensor, batch_size x max_len | |||
:return:FloatTensor, batch_size | |||
""" | |||
batch_size, max_len, _ = feats.size() | |||
# alpha, B x 1 | |||
if self.include_start_end_trans: | |||
alpha = self.start_scores.view(1, -1).repeat(batch_size, 1).gather(dim=1, index=tags[:, :1]) + \ | |||
feats[:, 0].gather(dim=1, index=tags[:, :1]) | |||
else: | |||
alpha = feats[:, 0].gather(dim=1, index=tags[:, :1]) | |||
for i in range(1, max_len): | |||
trans_score = self.transition_m[( | |||
tags[:, i - 1], tags[:, i])].unsqueeze(1) | |||
emit_score = feats[:, i].gather(dim=1, index=tags[:, i:i + 1]) | |||
new_alpha = alpha + trans_score + emit_score | |||
alpha = new_alpha * \ | |||
masks[:, i:i + 1].float() + alpha * \ | |||
(1 - masks[:, i:i + 1].float()) | |||
if self.include_start_end_trans: | |||
last_tag_index = masks.cumsum(dim=1, dtype=torch.long)[:, -1:] - 1 | |||
last_from_tag_id = tags.gather(dim=1, index=last_tag_index) | |||
trans_score = self.end_scores.view( | |||
1, -1).repeat(batch_size, 1).gather(dim=1, index=last_from_tag_id) | |||
alpha = alpha + trans_score | |||
return alpha.squeeze(1) | |||
def forward(self, feats, tags, masks): | |||
""" | |||
Calculate the neg log likelihood | |||
        :param feats: FloatTensor, batch_size x max_len x tag_size
:param tags:LongTensor, batch_size x max_len | |||
:param masks:ByteTensor batch_size x max_len | |||
:return:FloatTensor, batch_size | |||
""" | |||
all_path_score = self._normalizer_likelihood(feats, masks) | |||
        gold_path_score = self._gold_score(feats, tags, masks)
return all_path_score - gold_path_score | |||
def viterbi_decode(self, feats, masks): | |||
""" | |||
Given a feats matrix, return best decode path and best score. | |||
:param feats: | |||
:param masks: | |||
:return:List[Tuple(List, float)], | |||
""" | |||
batch_size, max_len, tag_size = feats.size() | |||
paths = torch.zeros(batch_size, max_len - 1, self.tag_size) | |||
if self.include_start_end_trans: | |||
alpha = self.start_scores.repeat(batch_size, 1) + feats[:, 0] | |||
else: | |||
alpha = feats[:, 0] | |||
for i in range(1, max_len): | |||
new_alpha = alpha.clone() | |||
for t in range(self.tag_size): | |||
pre_scores = self.transition_m[:, t].view( | |||
1, self.tag_size) + alpha | |||
max_scroe, indice = pre_scores.max(dim=1) | |||
new_alpha[:, t] = max_scroe + feats[:, i, t] | |||
paths[:, i - 1, t] = indice | |||
alpha = new_alpha * \ | |||
masks[:, i:i + 1].float() + alpha * \ | |||
(1 - masks[:, i:i + 1].float()) | |||
if self.include_start_end_trans: | |||
alpha += self.end_scores.view(1, -1) | |||
max_scroes, indice = alpha.max(dim=1) | |||
indice = indice.cpu().numpy() | |||
final_paths = [] | |||
paths = paths.cpu().numpy().astype(int) | |||
seq_lens = masks.cumsum(dim=1, dtype=torch.long)[:, -1] | |||
for b in range(batch_size): | |||
path = [indice[b]] | |||
for i in range(seq_lens[b] - 2, -1, -1): | |||
index = paths[b, i, path[-1]] | |||
path.append(index) | |||
final_paths.append(path[::-1]) | |||
return list(zip(final_paths, max_scroes.detach().cpu().numpy())) |
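# Illustrative usage sketch (assumption, not part of the original file): random inputs with
# hypothetical sizes, only to show the expected shapes of forward() and viterbi_decode().
if __name__ == "__main__":
    batch_size, max_len, tag_size = 2, 5, 4
    crf = ContionalRandomField(tag_size)
    feats = torch.randn(batch_size, max_len, tag_size)
    tags = torch.randint(0, tag_size, (batch_size, max_len))
    # masks for sequence lengths 5 and 3 (what seq_len_to_byte_mask would produce)
    masks = torch.ByteTensor([[1, 1, 1, 1, 1],
                              [1, 1, 1, 0, 0]])
    neg_log_likelihood = crf(feats, tags, masks)   # [batch_size]
    best_paths = crf.viterbi_decode(feats, masks)  # list of (tag id path, score) per sequence
    print(neg_log_likelihood, best_paths)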
@@ -0,0 +1,19 @@ | |||
import torch | |||
from fastNLP.modules.utils import mask_softmax | |||
class Attention(torch.nn.Module): | |||
def __init__(self, normalize=False): | |||
super(Attention, self).__init__() | |||
self.normalize = normalize | |||
def forward(self, query, memory, mask): | |||
similarities = self._atten_forward(query, memory) | |||
if self.normalize: | |||
return mask_softmax(similarities, mask) | |||
return similarities | |||
def _atten_forward(self, query, memory): | |||
raise NotImplementedError |
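# Illustrative sketch (assumption, not part of the original file): a subclass that scores
# query/memory similarity with a plain dot product.
class DotAttention(Attention):
    def __init__(self, normalize=False):
        super(DotAttention, self).__init__(normalize)

    def _atten_forward(self, query, memory):
        # query: [batch, query_len, dim], memory: [batch, mem_len, dim]
        # returns similarity scores of shape [batch, query_len, mem_len]
        return torch.bmm(query, memory.transpose(1, 2))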
@@ -0,0 +1,9 @@ | |||
from fastNLP.modules.attention.attention import Attention | |||
class LinearAttention(Attention): | |||
def __init__(self, normalize=False): | |||
super(LinearAttention, self).__init__(normalize) | |||
def _atten_forward(self, query, memory): | |||
raise NotImplementedError |
@@ -0,0 +1,9 @@ | |||
import torch | |||
def mask_softmax(matrix, mask): | |||
if mask is None: | |||
result = torch.nn.functional.softmax(matrix, dim=-1) | |||
else: | |||
raise NotImplementedError | |||
return result |
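# Illustrative sketch (assumption, not the project's implementation): one common way to handle
# the masked branch is to fill padded positions with -inf before the softmax.
def _masked_softmax_sketch(matrix, mask):
    # mask: 1 for valid positions, 0 for padding; same shape as (or broadcastable to) matrix
    filled = matrix.masked_fill(mask == 0, float("-inf"))
    return torch.nn.functional.softmax(filled, dim=-1)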
@@ -0,0 +1,110 @@ | |||
# Byte-compiled / optimized / DLL files | |||
__pycache__/ | |||
*.py[cod] | |||
*$py.class | |||
# C extensions | |||
*.so | |||
# Distribution / packaging | |||
.Python | |||
build/ | |||
develop-eggs/ | |||
dist/ | |||
downloads/ | |||
eggs/ | |||
.eggs/ | |||
lib/ | |||
lib64/ | |||
parts/ | |||
sdist/ | |||
var/ | |||
wheels/ | |||
*.egg-info/ | |||
.installed.cfg | |||
*.egg | |||
MANIFEST | |||
# PyInstaller | |||
# Usually these files are written by a python script from a template | |||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | |||
*.manifest | |||
*.spec | |||
# Installer logs | |||
pip-log.txt | |||
pip-delete-this-directory.txt | |||
# Unit test / coverage reports | |||
htmlcov/ | |||
.tox/ | |||
.coverage | |||
.coverage.* | |||
.cache | |||
nosetests.xml | |||
coverage.xml | |||
*.cover | |||
.hypothesis/ | |||
.pytest_cache/ | |||
# Translations | |||
*.mo | |||
*.pot | |||
# Django stuff: | |||
*.log | |||
local_settings.py | |||
db.sqlite3 | |||
# Flask stuff: | |||
instance/ | |||
.webassets-cache | |||
# Scrapy stuff: | |||
.scrapy | |||
# Sphinx documentation | |||
docs/_build/ | |||
# PyBuilder | |||
target/ | |||
# Jupyter Notebook | |||
.ipynb_checkpoints | |||
# pyenv | |||
.python-version | |||
# celery beat schedule file | |||
celerybeat-schedule | |||
# SageMath parsed files | |||
*.sage.py | |||
# Environments | |||
.env | |||
.venv | |||
env/ | |||
venv/ | |||
ENV/ | |||
env.bak/ | |||
venv.bak/ | |||
# Spyder project settings | |||
.spyderproject | |||
.spyproject | |||
# Rope project settings | |||
.ropeproject | |||
# mkdocs documentation | |||
/site | |||
# mypy | |||
.mypy_cache | |||
#custom | |||
GoogleNews-vectors-negative300.bin/ | |||
GoogleNews-vectors-negative300.bin.gz | |||
models/ | |||
*.swp |
@@ -0,0 +1,77 @@ | |||
## Introduction | |||
This is the implementation of [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) paper in PyTorch. | |||
* MR dataset, non-static model (word2vec trained by Mikolov et al. (2013) on 100 billion words of Google News)
* It can be run on both CPU and GPU
* The best accuracy is 82.61%, which is better than 81.5% in the paper | |||
(by Jingyuan Liu @ Fudan University; Email: fdjingyuan@outlook.com. Discussion is welcome!)
## Requirement | |||
* python 3.6 | |||
* pytorch > 0.1 | |||
* numpy | |||
* gensim | |||
## Run | |||
STEP 1 | |||
Install the required packages, e.g. gensim (the other needed packages are installed the same way):
``` | |||
pip install gensim | |||
``` | |||
STEP 2 | |||
Download the MR dataset and the word2vec resources:
* MR dataset: you can download it from https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz
* word2vec: you can download the file from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
Since this file is larger than 1.5 GB, it is not included in the repository. If you download it, remember to modify the path in the function `def word_embeddings(path = './GoogleNews-vectors-negative300.bin/'):`.
STEP 3 | |||
Train the model:
``` | |||
python train.py | |||
``` | |||
You will see output like the following printed to the screen:
``` | |||
Epoch [1/20], Iter [100/192] Loss: 0.7008 | |||
Test Accuracy: 71.869159 % | |||
Epoch [2/20], Iter [100/192] Loss: 0.5957 | |||
Test Accuracy: 75.700935 % | |||
Epoch [3/20], Iter [100/192] Loss: 0.4934 | |||
Test Accuracy: 78.130841 % | |||
...... | |||
Epoch [20/20], Iter [100/192] Loss: 0.0364 | |||
Test Accuracy: 81.495327 % | |||
Best Accuracy: 82.616822 % | |||
Best Model: models/cnn.pkl | |||
``` | |||
## Hyperparameters | |||
According to the paper and my experiments, I set:
|Epoch|Kernel Size|dropout|learning rate|batch size| | |||
|---|---|---|---|---| | |||
|20|\(h,300,100\)|0.5|0.0001|50| | |||
h = [3,4,5] | |||
If the test accuracy does not improve, the learning rate is multiplied by 0.8.
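As a sketch, this decay rule can be written as a small helper around the optimizer used in train.py (the function name and the `factor` argument are illustrative; the script applies the rule inline):
```
def maybe_decay_lr(optimizer, acc, best_acc, lr, factor=0.8):
    # Keep the best accuracy seen so far; if the current accuracy is not an
    # improvement, multiply the learning rate by `factor` and write it back
    # into every parameter group of the optimizer so it actually takes effect.
    if best_acc is None or acc > best_acc:
        return acc, lr
    lr = lr * factor
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return best_acc, lr
```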
## Result | |||
I only tried one dataset: MR. (The paper also evaluates other datasets: SST-1, SST-2, TREC, CR, MPQA.)
There are four models in the paper: CNN-rand, CNN-static, CNN-non-static, CNN-multichannel.
I have tried CNN-non-static: a model initialized with pre-trained vectors from word2vec.
All words, including the unknown ones that are randomly initialized, are fine-tuned for each task along with the pretrained vectors
(this variant has nearly the best performance and is the most difficult to implement among the four models).
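In code, "non-static" means the pretrained vectors are copied into the `nn.Embedding` layer and left trainable (this mirrors the embedding setup in model.py); a minimal sketch with a placeholder weight matrix:
```
import numpy as np
import torch
import torch.nn as nn

vocab_size, embed_dim = 1000, 300                       # placeholder sizes
pretrained = np.random.uniform(-0.25, 0.25, (vocab_size, embed_dim))

embedding = nn.Embedding(vocab_size, embed_dim)
embedding.weight.data.copy_(torch.from_numpy(pretrained))
# No freezing: the weights keep requires_grad=True, so both the pretrained
# vectors and the randomly initialized unknown words are fine-tuned.
```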
|Dataset|Class Size|Best Result|Kim's Paper Result| | |||
|---|---|---|---| | |||
|MR|2|82.617%(CNN-non-static)|81.5%(CNN-nonstatic)| | |||
## Reference | |||
* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) | |||
* https://github.com/Shawn1993/cnn-text-classification-pytorch | |||
* https://github.com/junwang4/CNN-sentence-classification-pytorch-2017/blob/master/utils.py | |||
@@ -0,0 +1,136 @@ | |||
import codecs | |||
import random | |||
import re | |||
import gensim | |||
import numpy as np | |||
from gensim import corpora | |||
from torch.utils.data import Dataset | |||
def clean_str(string): | |||
""" | |||
Tokenization/string cleaning for all datasets except for SST. | |||
Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py | |||
""" | |||
string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) | |||
string = re.sub(r"\'s", " \'s", string) | |||
string = re.sub(r"\'ve", " \'ve", string) | |||
string = re.sub(r"n\'t", " n\'t", string) | |||
string = re.sub(r"\'re", " \'re", string) | |||
string = re.sub(r"\'d", " \'d", string) | |||
string = re.sub(r"\'ll", " \'ll", string) | |||
string = re.sub(r",", " , ", string) | |||
string = re.sub(r"!", " ! ", string) | |||
string = re.sub(r"\(", " \( ", string) | |||
string = re.sub(r"\)", " \) ", string) | |||
string = re.sub(r"\?", " \? ", string) | |||
string = re.sub(r"\s{2,}", " ", string) | |||
return string.strip() | |||
def pad_sentences(sentence, padding_word=" <PAD/>"): | |||
sequence_length = 64 | |||
sent = sentence.split() | |||
padded_sentence = sentence + padding_word * (sequence_length - len(sent)) | |||
return padded_sentence | |||
# data loader | |||
class MRDataset(Dataset): | |||
def __init__(self): | |||
        # load positive and negative sentences from files
with codecs.open("./rt-polaritydata/rt-polarity.pos", encoding='ISO-8859-1') as f: | |||
positive_examples = list(f.readlines()) | |||
with codecs.open("./rt-polaritydata/rt-polarity.neg", encoding='ISO-8859-1') as f: | |||
negative_examples = list(f.readlines()) | |||
        # s.strip(): remove the trailing "\n"; then clean_str(); then pad_sentences()
positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples] | |||
negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples] | |||
self.examples = positive_examples + negative_examples | |||
self.sentences_texts = [sample.split() for sample in self.examples] | |||
# word dictionary | |||
dictionary = corpora.Dictionary(self.sentences_texts) | |||
self.word2id_dict = dictionary.token2id # transform to dict, like {"human":0, "a":1,...} | |||
        # set labels: positive is 1; negative is 0
        positive_labels = [1 for _ in positive_examples]
        negative_labels = [0 for _ in negative_examples]
        self.labels = positive_labels + negative_labels
        examples_labels = list(zip(self.examples, self.labels))
        random.shuffle(examples_labels)
        self.MRDataset_frame = examples_labels
# transform word to id | |||
self.MRDataset_wordid = \ | |||
[( | |||
np.array([self.word2id_dict[word] for word in sent[0].split()], dtype=np.int64), | |||
sent[1] | |||
) for sent in self.MRDataset_frame] | |||
    def word_embeddings(self, path="./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin"):
        # load the pretrained Google News word2vec vectors
        print('Please wait ... (it could take a while to load the file : {})'.format(path))
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
word_dict = self.word2id_dict | |||
embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300)) | |||
for word in word_dict: | |||
word_id = word_dict[word] | |||
if word in model.wv.vocab: | |||
embedding_weights[word_id, :] = model[word] | |||
return embedding_weights | |||
def __len__(self): | |||
return len(self.MRDataset_frame) | |||
def __getitem__(self, idx): | |||
sample = self.MRDataset_wordid[idx] | |||
return sample | |||
def getsent(self, idx): | |||
sample = self.MRDataset_wordid[idx][0] | |||
return sample | |||
def getlabel(self, idx): | |||
label = self.MRDataset_wordid[idx][1] | |||
return label | |||
def word2id(self): | |||
return self.word2id_dict | |||
def id2word(self): | |||
id2word_dict = dict([val, key] for key, val in self.word2id_dict.items()) | |||
return id2word_dict | |||
class train_set(Dataset): | |||
def __init__(self, samples): | |||
self.train_frame = samples | |||
def __len__(self): | |||
return len(self.train_frame) | |||
def __getitem__(self, idx): | |||
return self.train_frame[idx] | |||
class test_set(Dataset): | |||
def __init__(self, samples): | |||
self.test_frame = samples | |||
def __len__(self): | |||
return len(self.test_frame) | |||
def __getitem__(self, idx): | |||
return self.test_frame[idx] |
@@ -0,0 +1,35 @@ | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
class CNN_text(nn.Module): | |||
def __init__(self, kernel_h=[3, 4, 5], kernel_num=100, embed_num=1000, embed_dim=300, dropout=0.5, L2_constrain=3, | |||
batchsize=50, pretrained_embeddings=None): | |||
super(CNN_text, self).__init__() | |||
self.embedding = nn.Embedding(embed_num, embed_dim) | |||
self.dropout = nn.Dropout(dropout) | |||
if pretrained_embeddings is not None: | |||
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings)) | |||
# the network structure | |||
# Conv2d: input- N,C,H,W output- (50,100,62,1) | |||
self.conv1 = nn.ModuleList([nn.Conv2d(1, 100, (K, 300)) for K in kernel_h]) | |||
self.fc1 = nn.Linear(300, 2) | |||
    def max_pooling(self, x, conv):
        # helper (unused by forward, which inlines the same operations):
        # one convolution followed by max-over-time pooling
        x = F.relu(conv(x)).squeeze(3)  # N,C,L - (50,100,62)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        # x.size(2)=62 squeeze: (50,100,1) -> (50,100)
        return x
def forward(self, x): | |||
x = self.embedding(x) # output: (N,H,W) = (50,64,300) | |||
x = x.unsqueeze(1) # (N,C,H,W) | |||
x = [F.relu(conv(x)).squeeze(3) for conv in self.conv1] # [N, C, H(50,100,62),(50,100,61),(50,100,60)] | |||
x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] # [N,C(50,100),(50,100),(50,100)] | |||
x = torch.cat(x, 1) | |||
x = self.dropout(x) | |||
x = self.fc1(x) | |||
return x |
@@ -0,0 +1,93 @@ | |||
import os | |||
import torch
import torch.nn as nn
import torch.utils.data

import dataset as dst
from model import CNN_text
from torch.autograd import Variable | |||
# Hyper Parameters | |||
batch_size = 50 | |||
learning_rate = 0.0001 | |||
num_epochs = 20 | |||
cuda = True | |||
# split Dataset | |||
dataset = dst.MRDataset() | |||
length = len(dataset) | |||
train_dataset = dataset[:int(0.9 * length)] | |||
test_dataset = dataset[int(0.9 * length):] | |||
train_dataset = dst.train_set(train_dataset) | |||
test_dataset = dst.test_set(test_dataset) | |||
# Data Loader | |||
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, | |||
batch_size=batch_size, | |||
shuffle=True) | |||
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, | |||
batch_size=batch_size, | |||
shuffle=False) | |||
# cnn | |||
cnn = CNN_text(embed_num=len(dataset.word2id()), pretrained_embeddings=dataset.word_embeddings()) | |||
if cuda: | |||
cnn.cuda() | |||
# Loss and Optimizer | |||
criterion = nn.CrossEntropyLoss() | |||
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate) | |||
# train and test | |||
best_acc = None | |||
for epoch in range(num_epochs): | |||
# Train the Model | |||
cnn.train() | |||
for i, (sents, labels) in enumerate(train_loader): | |||
sents = Variable(sents) | |||
labels = Variable(labels) | |||
if cuda: | |||
sents = sents.cuda() | |||
labels = labels.cuda() | |||
optimizer.zero_grad() | |||
outputs = cnn(sents) | |||
loss = criterion(outputs, labels) | |||
loss.backward() | |||
optimizer.step() | |||
if (i + 1) % 100 == 0: | |||
print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f' | |||
% (epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size, loss.data[0])) | |||
# Test the Model | |||
cnn.eval() | |||
correct = 0 | |||
total = 0 | |||
for sents, labels in test_loader: | |||
sents = Variable(sents) | |||
if cuda: | |||
sents = sents.cuda() | |||
labels = labels.cuda() | |||
outputs = cnn(sents) | |||
_, predicted = torch.max(outputs.data, 1) | |||
total += labels.size(0) | |||
correct += (predicted == labels).sum() | |||
acc = 100. * correct / total | |||
print('Test Accuracy: %f %%' % (acc)) | |||
if best_acc is None or acc > best_acc: | |||
best_acc = acc | |||
if os.path.exists("models") is False: | |||
os.makedirs("models") | |||
torch.save(cnn.state_dict(), 'models/cnn.pkl') | |||
    else:
        # decay the learning rate and update the optimizer so the new rate takes effect
        learning_rate = learning_rate * 0.8
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate
print("Best Accuracy: %f %%" % best_acc) | |||
print("Best Model: models/cnn.pkl") |
@@ -0,0 +1,21 @@ | |||
MIT License | |||
Copyright (c) 2017 | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
in the Software without restriction, including without limitation the rights | |||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
copies of the Software, and to permit persons to whom the Software is | |||
furnished to do so, subject to the following conditions: | |||
The above copyright notice and this permission notice shall be included in all | |||
copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
SOFTWARE. |
@@ -0,0 +1,40 @@ | |||
# PyTorch-Character-Aware-Neural-Language-Model | |||
This is the PyTorch implementation of character-aware neural language model proposed in this [paper](https://arxiv.org/abs/1508.06615) by Yoon Kim. | |||
## Requirements
The code is run and tested with **Python 3.5.2** and **PyTorch 0.3.1**. | |||
## HyperParameters | |||
| HyperParam | value | | |||
| ------ | :-------| | |||
| LSTM batch size | 20 | | |||
| LSTM sequence length | 35 | | |||
| LSTM hidden units | 300 | | |||
| epochs | 35 | | |||
| initial learning rate | 1.0 | | |||
| character embedding dimension | 15 | | |||
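These values correspond to the configuration namedtuple built in train.py; a sketch (max_word_len is computed from the corpus during preprocessing, so the value here is a placeholder):
```
from collections import namedtuple

char_embedding_dim = 15          # passed to the charLM constructor separately

Options = namedtuple("Options", [
    "cnn_batch_size", "init_lr", "lstm_seq_len",
    "max_word_len", "lstm_batch_size", "epochs",
    "word_embed_dim"])

opt = Options(cnn_batch_size=35 * 20,   # lstm_seq_len * lstm_batch_size
              init_lr=1.0,
              lstm_seq_len=35,
              max_word_len=21,          # placeholder; set from the data
              lstm_batch_size=20,
              epochs=35,
              word_embed_dim=300)       # also the number of LSTM hidden units
```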
## Demo | |||
Train the model with split train/valid/test data. | |||
`python train.py` | |||
The trained model will be saved in `cache/net.pkl`.
Test the model. | |||
`python test.py` | |||
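Under the hood, train.py saves the whole model object to `cache/net.pkl` and test.py simply reloads it; a sketch:
```
import torch

net = torch.load("cache/net.pkl")  # full pickled model saved by train.py
net.eval()                         # switch to evaluation mode before testing
```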
Best result on test set: | |||
PPL=127.2163
cross entropy loss=4.8459 | |||
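The two numbers are consistent: perplexity is the exponential of the average cross-entropy loss (in nats), as a quick check shows:
```
import math

cross_entropy = 4.8459
print(math.exp(cross_entropy))  # ~127.2, matching the reported PPL
```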
## Acknowledgement | |||
This implementation borrowed ideas from | |||
https://github.com/jarfo/kchar | |||
https://github.com/cronos123/Character-Aware-Neural-Language-Models | |||
@@ -0,0 +1,145 @@ | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
class Highway(nn.Module): | |||
"""Highway network""" | |||
def __init__(self, input_size): | |||
super(Highway, self).__init__() | |||
self.fc1 = nn.Linear(input_size, input_size, bias=True) | |||
self.fc2 = nn.Linear(input_size, input_size, bias=True) | |||
def forward(self, x): | |||
t = F.sigmoid(self.fc1(x)) | |||
return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x) | |||
class charLM(nn.Module): | |||
"""CNN + highway network + LSTM | |||
# Input: | |||
4D tensor with shape [batch_size, in_channel, height, width] | |||
# Output: | |||
2D Tensor with shape [batch_size, vocab_size] | |||
# Arguments: | |||
        char_emb_dim: the size of each character embedding
        word_emb_dim: the size of each word embedding
vocab_size: num of unique words | |||
num_char: num of characters | |||
use_gpu: True or False | |||
""" | |||
def __init__(self, char_emb_dim, word_emb_dim, | |||
vocab_size, num_char, use_gpu): | |||
super(charLM, self).__init__() | |||
self.char_emb_dim = char_emb_dim | |||
self.word_emb_dim = word_emb_dim | |||
self.vocab_size = vocab_size | |||
        # character embedding layer
self.char_embed = nn.Embedding(num_char, char_emb_dim) | |||
# convolutions of filters with different sizes | |||
        # use an nn.ModuleList so the convolution parameters are registered with
        # the module (and therefore visible to the optimizer and the state_dict)
        self.convolutions = nn.ModuleList()
# list of tuples: (the number of filter, width) | |||
self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)] | |||
for out_channel, filter_width in self.filter_num_width: | |||
self.convolutions.append( | |||
nn.Conv2d( | |||
1, # in_channel | |||
out_channel, # out_channel | |||
kernel_size=(char_emb_dim, filter_width), # (height, width) | |||
bias=True | |||
) | |||
) | |||
self.highway_input_dim = sum([x for x, y in self.filter_num_width]) | |||
self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False) | |||
# highway net | |||
self.highway1 = Highway(self.highway_input_dim) | |||
self.highway2 = Highway(self.highway_input_dim) | |||
# LSTM | |||
self.lstm_num_layers = 2 | |||
self.lstm = nn.LSTM(input_size=self.highway_input_dim, | |||
hidden_size=self.word_emb_dim, | |||
num_layers=self.lstm_num_layers, | |||
bias=True, | |||
dropout=0.5, | |||
batch_first=True) | |||
# output layer | |||
self.dropout = nn.Dropout(p=0.5) | |||
self.linear = nn.Linear(self.word_emb_dim, self.vocab_size) | |||
if use_gpu is True: | |||
for x in range(len(self.convolutions)): | |||
self.convolutions[x] = self.convolutions[x].cuda() | |||
self.highway1 = self.highway1.cuda() | |||
self.highway2 = self.highway2.cuda() | |||
self.lstm = self.lstm.cuda() | |||
self.dropout = self.dropout.cuda() | |||
self.char_embed = self.char_embed.cuda() | |||
self.linear = self.linear.cuda() | |||
self.batch_norm = self.batch_norm.cuda() | |||
def forward(self, x, hidden): | |||
# Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2] | |||
# Return: Variable of Tensor with shape [num_words, len(word_dict)] | |||
lstm_batch_size = x.size()[0] | |||
lstm_seq_len = x.size()[1] | |||
x = x.contiguous().view(-1, x.size()[2]) | |||
# [num_seq*seq_len, max_word_len+2] | |||
x = self.char_embed(x) | |||
# [num_seq*seq_len, max_word_len+2, char_emb_dim] | |||
x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3) | |||
# [num_seq*seq_len, 1, max_word_len+2, char_emb_dim] | |||
x = self.conv_layers(x) | |||
# [num_seq*seq_len, total_num_filters] | |||
x = self.batch_norm(x) | |||
# [num_seq*seq_len, total_num_filters] | |||
x = self.highway1(x) | |||
x = self.highway2(x) | |||
# [num_seq*seq_len, total_num_filters] | |||
x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1) | |||
# [num_seq, seq_len, total_num_filters] | |||
x, hidden = self.lstm(x, hidden) | |||
        # [num_seq, seq_len, hidden_size] (batch_first=True)
        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]
x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1) | |||
# [num_seq*seq_len, hidden_size] | |||
x = self.linear(x) | |||
# [num_seq*seq_len, vocab_size] | |||
return x, hidden | |||
def conv_layers(self, x): | |||
chosen_list = list() | |||
for conv in self.convolutions: | |||
feature_map = F.tanh(conv(x)) | |||
# (batch_size, out_channel, 1, max_word_len-width+1) | |||
chosen = torch.max(feature_map, 3)[0] | |||
# (batch_size, out_channel, 1) | |||
chosen = chosen.squeeze() | |||
# (batch_size, out_channel) | |||
chosen_list.append(chosen) | |||
# (batch_size, total_num_filers) | |||
return torch.cat(chosen_list, 1) |
@@ -0,0 +1,117 @@ | |||
import os | |||
from collections import namedtuple | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
from torch.autograd import Variable | |||
from utilities import * | |||
def to_var(x): | |||
if torch.cuda.is_available(): | |||
x = x.cuda() | |||
return Variable(x) | |||
def test(net, data, opt): | |||
net.eval() | |||
test_input = torch.from_numpy(data.test_input) | |||
test_label = torch.from_numpy(data.test_label) | |||
num_seq = test_input.size()[0] // opt.lstm_seq_len | |||
test_input = test_input[:num_seq * opt.lstm_seq_len, :] | |||
# [num_seq, seq_len, max_word_len+2] | |||
test_input = test_input.view(-1, opt.lstm_seq_len, opt.max_word_len + 2) | |||
criterion = nn.CrossEntropyLoss() | |||
loss_list = [] | |||
num_hits = 0 | |||
total = 0 | |||
iterations = test_input.size()[0] // opt.lstm_batch_size | |||
test_generator = batch_generator(test_input, opt.lstm_batch_size) | |||
label_generator = batch_generator(test_label, opt.lstm_batch_size * opt.lstm_seq_len) | |||
hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)), | |||
to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim))) | |||
add_loss = 0.0 | |||
for t in range(iterations): | |||
batch_input = test_generator.__next__() | |||
batch_label = label_generator.__next__() | |||
net.zero_grad() | |||
hidden = [state.detach() for state in hidden] | |||
test_output, hidden = net(to_var(batch_input), hidden) | |||
test_loss = criterion(test_output, to_var(batch_label)).data | |||
loss_list.append(test_loss) | |||
add_loss += test_loss | |||
print("Test Loss={0:.4f}".format(float(add_loss) / iterations)) | |||
print("Test PPL={0:.4f}".format(float(np.exp(add_loss / iterations)))) | |||
############################################################# | |||
if __name__ == "__main__": | |||
word_embed_dim = 300 | |||
char_embedding_dim = 15 | |||
if os.path.exists("cache/prep.pt") is False: | |||
print("Cannot find prep.pt") | |||
objetcs = torch.load("cache/prep.pt") | |||
word_dict = objetcs["word_dict"] | |||
char_dict = objetcs["char_dict"] | |||
reverse_word_dict = objetcs["reverse_word_dict"] | |||
max_word_len = objetcs["max_word_len"] | |||
num_words = len(word_dict) | |||
print("word/char dictionary built. Start making inputs.") | |||
if os.path.exists("cache/data_sets.pt") is False: | |||
test_text = read_data("./test.txt") | |||
test_set = np.array(text2vec(test_text, char_dict, max_word_len)) | |||
        # Labels are next-word indices in word_dict, with the same length as the inputs
        test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])
        category = {"test": test_set, "tlabel": test_label}
        torch.save(category, "cache/data_sets.pt")
        # Only the test split is built here; the train split comes from the full
        # data_sets.pt written by train.py, so leave the train fields empty.
        train_set, train_label = None, None
else: | |||
data_sets = torch.load("cache/data_sets.pt") | |||
test_set = data_sets["test"] | |||
test_label = data_sets["tlabel"] | |||
train_set = data_sets["tdata"] | |||
train_label = data_sets["trlabel"] | |||
DataTuple = namedtuple("DataTuple", "test_input test_label train_input train_label ") | |||
data = DataTuple(test_input=test_set, | |||
test_label=test_label, train_label=train_label, train_input=train_set) | |||
print("Loaded data sets. Start building network.") | |||
USE_GPU = True | |||
cnn_batch_size = 700 | |||
lstm_seq_len = 35 | |||
lstm_batch_size = 20 | |||
net = torch.load("cache/net.pkl") | |||
Options = namedtuple("Options", ["cnn_batch_size", "lstm_seq_len", | |||
"max_word_len", "lstm_batch_size", "word_embed_dim"]) | |||
opt = Options(cnn_batch_size=lstm_seq_len * lstm_batch_size, | |||
lstm_seq_len=lstm_seq_len, | |||
max_word_len=max_word_len, | |||
lstm_batch_size=lstm_batch_size, | |||
word_embed_dim=word_embed_dim) | |||
print("Network built. Start testing.") | |||
test(net, data, opt) |
@@ -0,0 +1,263 @@ | |||
import os | |||
from collections import namedtuple | |||
import numpy as np | |||
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

from model import charLM
from test import test
from utilities import *
def preprocess(): | |||
word_dict, char_dict = create_word_char_dict("charlm.txt", "train.txt", "test.txt") | |||
num_words = len(word_dict) | |||
num_char = len(char_dict) | |||
char_dict["BOW"] = num_char + 1 | |||
char_dict["EOW"] = num_char + 2 | |||
char_dict["PAD"] = 0 | |||
# dict of (int, string) | |||
reverse_word_dict = {value: key for key, value in word_dict.items()} | |||
max_word_len = max([len(word) for word in word_dict]) | |||
objects = { | |||
"word_dict": word_dict, | |||
"char_dict": char_dict, | |||
"reverse_word_dict": reverse_word_dict, | |||
"max_word_len": max_word_len | |||
} | |||
torch.save(objects, "cache/prep.pt") | |||
print("Preprocess done.") | |||
def to_var(x): | |||
if torch.cuda.is_available(): | |||
x = x.cuda() | |||
return Variable(x) | |||
def train(net, data, opt): | |||
""" | |||
:param net: the pytorch models | |||
:param data: numpy array | |||
:param opt: named tuple | |||
1. random seed | |||
2. define local input | |||
3. training settting: learning rate, loss, etc | |||
4. main loop epoch | |||
5. batchify | |||
6. validation | |||
7. save models | |||
""" | |||
torch.manual_seed(1024) | |||
train_input = torch.from_numpy(data.train_input) | |||
train_label = torch.from_numpy(data.train_label) | |||
valid_input = torch.from_numpy(data.valid_input) | |||
valid_label = torch.from_numpy(data.valid_label) | |||
# [num_seq, seq_len, max_word_len+2] | |||
num_seq = train_input.size()[0] // opt.lstm_seq_len | |||
train_input = train_input[:num_seq * opt.lstm_seq_len, :] | |||
train_input = train_input.view(-1, opt.lstm_seq_len, opt.max_word_len + 2) | |||
num_seq = valid_input.size()[0] // opt.lstm_seq_len | |||
valid_input = valid_input[:num_seq * opt.lstm_seq_len, :] | |||
valid_input = valid_input.view(-1, opt.lstm_seq_len, opt.max_word_len + 2) | |||
num_epoch = opt.epochs | |||
num_iter_per_epoch = train_input.size()[0] // opt.lstm_batch_size | |||
learning_rate = opt.init_lr | |||
old_PPL = 100000 | |||
best_PPL = 100000 | |||
# Log-SoftMax | |||
criterion = nn.CrossEntropyLoss() | |||
# word_emb_dim == hidden_size / num of hidden units | |||
hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)), | |||
to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim))) | |||
for epoch in range(num_epoch): | |||
################ Validation #################### | |||
net.eval() | |||
loss_batch = [] | |||
PPL_batch = [] | |||
iterations = valid_input.size()[0] // opt.lstm_batch_size | |||
valid_generator = batch_generator(valid_input, opt.lstm_batch_size) | |||
vlabel_generator = batch_generator(valid_label, opt.lstm_batch_size * opt.lstm_seq_len) | |||
for t in range(iterations): | |||
batch_input = valid_generator.__next__() | |||
batch_label = vlabel_generator.__next__() | |||
hidden = [state.detach() for state in hidden] | |||
valid_output, hidden = net(to_var(batch_input), hidden) | |||
length = valid_output.size()[0] | |||
# [num_sample-1, len(word_dict)] vs [num_sample-1] | |||
valid_loss = criterion(valid_output, to_var(batch_label)) | |||
PPL = torch.exp(valid_loss.data) | |||
loss_batch.append(float(valid_loss)) | |||
PPL_batch.append(float(PPL)) | |||
PPL = np.mean(PPL_batch) | |||
print("[epoch {}] valid PPL={}".format(epoch, PPL)) | |||
print("valid loss={}".format(np.mean(loss_batch))) | |||
print("PPL decrease={}".format(float(old_PPL - PPL))) | |||
# Preserve the best models | |||
if best_PPL > PPL: | |||
best_PPL = PPL | |||
torch.save(net.state_dict(), "cache/models.pt") | |||
torch.save(net, "cache/net.pkl") | |||
# Adjust the learning rate | |||
if float(old_PPL - PPL) <= 1.0: | |||
learning_rate /= 2 | |||
print("halved lr:{}".format(learning_rate)) | |||
old_PPL = PPL | |||
################################################## | |||
#################### Training #################### | |||
net.train() | |||
optimizer = optim.SGD(net.parameters(), | |||
lr=learning_rate, | |||
momentum=0.85) | |||
# split the first dim | |||
input_generator = batch_generator(train_input, opt.lstm_batch_size) | |||
label_generator = batch_generator(train_label, opt.lstm_batch_size * opt.lstm_seq_len) | |||
for t in range(num_iter_per_epoch): | |||
batch_input = input_generator.__next__() | |||
batch_label = label_generator.__next__() | |||
# detach hidden state of LSTM from last batch | |||
hidden = [state.detach() for state in hidden] | |||
output, hidden = net(to_var(batch_input), hidden) | |||
# [num_word, vocab_size] | |||
loss = criterion(output, to_var(batch_label)) | |||
net.zero_grad() | |||
loss.backward() | |||
torch.nn.utils.clip_grad_norm(net.parameters(), 5, norm_type=2) | |||
optimizer.step() | |||
if (t + 1) % 100 == 0: | |||
print("[epoch {} step {}] train loss={}, Perplexity={}".format(epoch + 1, | |||
t + 1, float(loss.data), | |||
float(np.exp(loss.data)))) | |||
torch.save(net.state_dict(), "cache/models.pt") | |||
print("Training finished.") | |||
################################################################ | |||
if __name__ == "__main__": | |||
word_embed_dim = 300 | |||
char_embedding_dim = 15 | |||
if os.path.exists("cache/prep.pt") is False: | |||
preprocess() | |||
    objects = torch.load("cache/prep.pt")
    word_dict = objects["word_dict"]
    char_dict = objects["char_dict"]
    reverse_word_dict = objects["reverse_word_dict"]
    max_word_len = objects["max_word_len"]
num_words = len(word_dict) | |||
print("word/char dictionary built. Start making inputs.") | |||
if os.path.exists("cache/data_sets.pt") is False: | |||
train_text = read_data("./train.txt") | |||
valid_text = read_data("./charlm.txt") | |||
test_text = read_data("./test.txt") | |||
train_set = np.array(text2vec(train_text, char_dict, max_word_len)) | |||
valid_set = np.array(text2vec(valid_text, char_dict, max_word_len)) | |||
test_set = np.array(text2vec(test_text, char_dict, max_word_len)) | |||
# Labels are next-word index in word_dict with the same length as inputs | |||
train_label = np.array([word_dict[w] for w in train_text[1:]] + [word_dict[train_text[-1]]]) | |||
valid_label = np.array([word_dict[w] for w in valid_text[1:]] + [word_dict[valid_text[-1]]]) | |||
test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]]) | |||
category = {"tdata": train_set, "vdata": valid_set, "test": test_set, | |||
"trlabel": train_label, "vlabel": valid_label, "tlabel": test_label} | |||
torch.save(category, "cache/data_sets.pt") | |||
else: | |||
data_sets = torch.load("cache/data_sets.pt") | |||
train_set = data_sets["tdata"] | |||
valid_set = data_sets["vdata"] | |||
test_set = data_sets["test"] | |||
train_label = data_sets["trlabel"] | |||
valid_label = data_sets["vlabel"] | |||
test_label = data_sets["tlabel"] | |||
DataTuple = namedtuple("DataTuple", | |||
"train_input train_label valid_input valid_label test_input test_label") | |||
data = DataTuple(train_input=train_set, | |||
train_label=train_label, | |||
valid_input=valid_set, | |||
valid_label=valid_label, | |||
test_input=test_set, | |||
test_label=test_label) | |||
print("Loaded data sets. Start building network.") | |||
USE_GPU = True | |||
cnn_batch_size = 700 | |||
lstm_seq_len = 35 | |||
lstm_batch_size = 20 | |||
# cnn_batch_size == lstm_seq_len * lstm_batch_size | |||
net = charLM(char_embedding_dim, | |||
word_embed_dim, | |||
num_words, | |||
len(char_dict), | |||
use_gpu=USE_GPU) | |||
for param in net.parameters(): | |||
nn.init.uniform(param.data, -0.05, 0.05) | |||
Options = namedtuple("Options", [ | |||
"cnn_batch_size", "init_lr", "lstm_seq_len", | |||
"max_word_len", "lstm_batch_size", "epochs", | |||
"word_embed_dim"]) | |||
opt = Options(cnn_batch_size=lstm_seq_len * lstm_batch_size, | |||
init_lr=1.0, | |||
lstm_seq_len=lstm_seq_len, | |||
max_word_len=max_word_len, | |||
lstm_batch_size=lstm_batch_size, | |||
epochs=35, | |||
word_embed_dim=word_embed_dim) | |||
print("Network built. Start training.") | |||
# You can stop training anytime by "ctrl+C" | |||
try: | |||
train(net, data, opt) | |||
except KeyboardInterrupt: | |||
print('-' * 89) | |||
print('Exiting from training early') | |||
torch.save(net, "cache/net.pkl") | |||
print("save net") | |||
test(net, data, opt) |
@@ -0,0 +1,82 @@ | |||
import torch | |||
import torch.nn.functional as F | |||
def batch_generator(x, batch_size): | |||
# x: [num_words, in_channel, height, width] | |||
# partitions x into batches | |||
num_step = x.size()[0] // batch_size | |||
for t in range(num_step): | |||
yield x[t * batch_size:(t + 1) * batch_size] | |||
def text2vec(words, char_dict, max_word_len): | |||
""" Return list of list of int """ | |||
word_vec = [] | |||
for word in words: | |||
vec = [char_dict[ch] for ch in word] | |||
if len(vec) < max_word_len: | |||
vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))] | |||
vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]] | |||
word_vec.append(vec) | |||
return word_vec | |||
def seq2vec(input_words, char_embedding, char_embedding_dim, char_table): | |||
""" convert the input strings into character embeddings """ | |||
# input_words == list of string | |||
# char_embedding == torch.nn.Embedding | |||
# char_embedding_dim == int | |||
# char_table == list of unique chars | |||
# Returns: tensor of shape [len(input_words), char_embedding_dim, max_word_len+2] | |||
max_word_len = max([len(word) for word in input_words]) | |||
print("max_word_len={}".format(max_word_len)) | |||
tensor_list = [] | |||
start_column = torch.ones(char_embedding_dim, 1) | |||
end_column = torch.ones(char_embedding_dim, 1) | |||
for word in input_words: | |||
        # convert the string into a matrix of character embeddings
        # (char_embedding_lookup is assumed to be provided elsewhere; it is not defined in this file)
        word_encoding = char_embedding_lookup(word, char_embedding, char_table)
# add start and end columns | |||
word_encoding = torch.cat([start_column, word_encoding, end_column], 1) | |||
# zero-pad right columns | |||
word_encoding = F.pad(word_encoding, (0, max_word_len - word_encoding.size()[1] + 2)).data | |||
# create dimension | |||
word_encoding = word_encoding.unsqueeze(0) | |||
tensor_list.append(word_encoding) | |||
return torch.cat(tensor_list, 0) | |||
def read_data(file_name): | |||
# Return: list of strings | |||
with open(file_name, 'r') as f: | |||
corpus = f.read().lower() | |||
import re | |||
corpus = re.sub(r"<unk>", "unk", corpus) | |||
return corpus.split() | |||
def get_char_dict(vocabulary): | |||
# vocabulary == dict of (word, int) | |||
# Return: dict of (char, int), starting from 1 | |||
char_dict = dict() | |||
count = 1 | |||
for word in vocabulary: | |||
for ch in word: | |||
if ch not in char_dict: | |||
char_dict[ch] = count | |||
count += 1 | |||
return char_dict | |||
def create_word_char_dict(*file_name): | |||
text = [] | |||
for file in file_name: | |||
text += read_data(file) | |||
word_dict = {word: ix for ix, word in enumerate(set(text))} | |||
char_dict = get_char_dict(word_dict) | |||
return word_dict, char_dict |
@@ -0,0 +1,36 @@ | |||
## Introduction | |||
This is the implementation of [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf) paper in PyTorch. | |||
* Dataset is 600k documents extracted from [Yelp 2018](https://www.yelp.com/dataset) customer reviews | |||
* Use [NLTK](http://www.nltk.org/) and [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) to tokenize documents and sentences | |||
* Both CPU & GPU support | |||
* The best accuracy is 71%, matching the performance reported in the paper
## Requirement | |||
* python 3.6 | |||
* pytorch = 0.3.0 | |||
* numpy | |||
* gensim | |||
* nltk | |||
* coreNLP | |||
## Parameters | |||
According to the paper and my experiments, I set the model parameters:
|word embedding dimension|GRU hidden size|GRU layer|word/sentence context vector dimension| | |||
|---|---|---|---| | |||
|200|50|1|100| | |||
And the training parameters: | |||
|Epoch|learning rate|momentum|batch size| | |||
|---|---|---|---| | |||
|3|0.01|0.9|64| | |||
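These settings map directly onto the network and optimizer construction in train.py; a sketch:
```
import torch
from model import HAN

net = HAN(input_size=200,                     # word embedding dimension
          output_size=5,                      # 1-5 star review classes
          word_hidden_size=50, word_num_layers=1, word_context_size=100,
          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)

optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
```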
## Run | |||
1. Prepare the dataset. Download the [data set](https://www.yelp.com/dataset) and unzip the customer reviews as a file. Use preprocess.py to transform the file into a data set for model input (a command sketch follows this list).
2. Train the model. The word embeddings of the training data are stored in 'yelp.word2vec'. The model is trained and automatically saved to 'models.dict'.
``` | |||
python train.py
``` | |||
3. Test the model. | |||
``` | |||
python evaluate.py
``` |
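For step 1, the tokenization script is run directly; before running it, point the `JAVAHOME` and Stanford CoreNLP jar paths inside preprocess.py at your local installation:
```
python preprocess.py
```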
@@ -0,0 +1,45 @@ | |||
from model import * | |||
from train import * | |||
def evaluate(net, dataset, batch_size=64, use_cuda=False):
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate, num_workers=0)
count = 0 | |||
if use_cuda: | |||
net.cuda() | |||
for i, batch_samples in enumerate(dataloader): | |||
x, y = batch_samples | |||
doc_list = [] | |||
for sample in x: | |||
doc = [] | |||
for sent_vec in sample: | |||
if use_cuda: | |||
sent_vec = sent_vec.cuda() | |||
doc.append(Variable(sent_vec, volatile=True)) | |||
doc_list.append(pack_sequence(doc)) | |||
if use_cuda: | |||
y = y.cuda() | |||
predicts = net(doc_list) | |||
p, idx = torch.max(predicts, dim=1) | |||
idx = idx.data | |||
count += torch.sum(torch.eq(idx, y)) | |||
return count | |||
if __name__ == '__main__': | |||
''' | |||
Evaluate the performance of models | |||
''' | |||
from gensim.models import Word2Vec | |||
embed_model = Word2Vec.load('yelp.word2vec') | |||
embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size) | |||
del embed_model | |||
net = HAN(input_size=200, output_size=5, | |||
word_hidden_size=50, word_num_layers=1, word_context_size=100, | |||
sent_hidden_size=50, sent_num_layers=1, sent_context_size=100) | |||
net.load_state_dict(torch.load('models.dict')) | |||
test_dataset = YelpDocSet('reviews', 199, 4, embedding) | |||
    correct = evaluate(net, test_dataset, use_cuda=True)
print('accuracy {}'.format(correct / len(test_dataset))) |
@@ -0,0 +1,113 @@ | |||
import torch | |||
import torch.nn as nn | |||
from torch.autograd import Variable | |||
def pack_sequence(tensor_seq, padding_value=0.0): | |||
if len(tensor_seq) <= 0: | |||
return | |||
length = [v.size(0) for v in tensor_seq] | |||
max_len = max(length) | |||
size = [len(tensor_seq), max_len] | |||
size.extend(list(tensor_seq[0].size()[1:])) | |||
ans = torch.Tensor(*size).fill_(padding_value) | |||
if tensor_seq[0].data.is_cuda: | |||
ans = ans.cuda() | |||
ans = Variable(ans) | |||
for i, v in enumerate(tensor_seq): | |||
ans[i, :length[i], :] = v | |||
return ans | |||
class HAN(nn.Module): | |||
def __init__(self, input_size, output_size, | |||
word_hidden_size, word_num_layers, word_context_size, | |||
sent_hidden_size, sent_num_layers, sent_context_size): | |||
super(HAN, self).__init__() | |||
self.word_layer = AttentionNet(input_size, | |||
word_hidden_size, | |||
word_num_layers, | |||
word_context_size) | |||
self.sent_layer = AttentionNet(2 * word_hidden_size, | |||
sent_hidden_size, | |||
sent_num_layers, | |||
sent_context_size) | |||
self.output_layer = nn.Linear(2 * sent_hidden_size, output_size) | |||
self.softmax = nn.LogSoftmax(dim=1) | |||
def forward(self, batch_doc): | |||
# input is a sequence of matrix | |||
doc_vec_list = [] | |||
for doc in batch_doc: | |||
sent_mat = self.word_layer(doc) # doc's dim (num_sent, seq_len, word_dim) | |||
doc_vec_list.append(sent_mat) # sent_mat's dim (num_sent, vec_dim) | |||
doc_vec = self.sent_layer(pack_sequence(doc_vec_list)) | |||
output = self.softmax(self.output_layer(doc_vec)) | |||
return output | |||
class AttentionNet(nn.Module): | |||
def __init__(self, input_size, gru_hidden_size, gru_num_layers, context_vec_size): | |||
super(AttentionNet, self).__init__() | |||
self.input_size = input_size | |||
self.gru_hidden_size = gru_hidden_size | |||
self.gru_num_layers = gru_num_layers | |||
self.context_vec_size = context_vec_size | |||
# Encoder | |||
self.gru = nn.GRU(input_size=input_size, | |||
hidden_size=gru_hidden_size, | |||
num_layers=gru_num_layers, | |||
batch_first=True, | |||
bidirectional=True) | |||
# Attention | |||
self.fc = nn.Linear(2 * gru_hidden_size, context_vec_size) | |||
self.tanh = nn.Tanh() | |||
self.softmax = nn.Softmax(dim=1) | |||
# context vector | |||
self.context_vec = nn.Parameter(torch.Tensor(context_vec_size, 1)) | |||
self.context_vec.data.uniform_(-0.1, 0.1) | |||
def forward(self, inputs): | |||
# GRU part | |||
h_t, hidden = self.gru(inputs) # inputs's dim (batch_size, seq_len, word_dim) | |||
u = self.tanh(self.fc(h_t)) | |||
# Attention part | |||
alpha = self.softmax(torch.matmul(u, self.context_vec)) # u's dim (batch_size, seq_len, context_vec_size) | |||
output = torch.bmm(torch.transpose(h_t, 1, 2), alpha) # alpha's dim (batch_size, seq_len, 1) | |||
return torch.squeeze(output, dim=2) # output's dim (batch_size, 2*hidden_size, 1) | |||
if __name__ == '__main__': | |||
''' | |||
    Test the model's correctness
''' | |||
import numpy as np | |||
use_cuda = True | |||
net = HAN(input_size=200, output_size=5, | |||
word_hidden_size=50, word_num_layers=1, word_context_size=100, | |||
sent_hidden_size=50, sent_num_layers=1, sent_context_size=100) | |||
optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9) | |||
criterion = nn.NLLLoss() | |||
test_time = 10 | |||
batch_size = 64 | |||
if use_cuda: | |||
net.cuda() | |||
print('test training') | |||
for step in range(test_time): | |||
x_data = [torch.randn(np.random.randint(1, 10), 200, 200) for i in range(batch_size)] | |||
y_data = torch.LongTensor([np.random.randint(0, 5) for i in range(batch_size)]) | |||
if use_cuda: | |||
x_data = [x_i.cuda() for x_i in x_data] | |||
y_data = y_data.cuda() | |||
x = [Variable(x_i) for x_i in x_data] | |||
y = Variable(y_data) | |||
predict = net(x) | |||
loss = criterion(predict, y) | |||
optimizer.zero_grad() | |||
loss.backward() | |||
optimizer.step() | |||
print(loss.data[0]) |
@@ -0,0 +1,50 @@ | |||
'''
Tokenize yelp dataset's documents using stanford core nlp | |||
''' | |||
import json | |||
import os | |||
import pickle | |||
import nltk | |||
from nltk.tokenize import stanford | |||
input_filename = 'review.json' | |||
# config for stanford core nlp | |||
os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe' | |||
path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar' | |||
tokenizer = stanford.CoreNLPTokenizer() | |||
in_dirname = 'review' | |||
out_dirname = 'reviews' | |||
f = open(input_filename, encoding='utf-8') | |||
samples = [] | |||
j = 0 | |||
for i, line in enumerate(f.readlines()): | |||
review = json.loads(line) | |||
samples.append((review['stars'], review['text'])) | |||
if (i + 1) % 5000 == 0: | |||
print(i) | |||
pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb')) | |||
j += 1 | |||
samples = [] | |||
pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb')) | |||
# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb')) | |||
# print(samples[0]) | |||
for fn in os.listdir(in_dirname): | |||
print(fn) | |||
precessed = [] | |||
for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')): | |||
tokens = [] | |||
sents = nltk.tokenize.sent_tokenize(text) | |||
for s in sents: | |||
tokens.append(tokenizer.tokenize(s)) | |||
precessed.append((stars, tokens)) | |||
# print(tokens) | |||
if len(precessed) % 100 == 0: | |||
print(len(precessed)) | |||
pickle.dump(precessed, open(os.path.join(out_dirname, fn), 'wb')) |
@@ -0,0 +1,171 @@ | |||
import os | |||
import pickle | |||
import numpy as np | |||
import torch | |||
from model import * | |||
class SentIter: | |||
def __init__(self, dirname, count): | |||
self.dirname = dirname | |||
self.count = int(count) | |||
def __iter__(self): | |||
        for fname in os.listdir(self.dirname)[:self.count]:
            with open(os.path.join(self.dirname, fname), 'rb') as f:
for y, x in pickle.load(f): | |||
for sent in x: | |||
yield sent | |||
def train_word_vec(): | |||
# load data | |||
dirname = 'reviews' | |||
sents = SentIter(dirname, 238) | |||
# define models and train | |||
model = models.Word2Vec(size=200, sg=0, workers=4, min_count=5) | |||
model.build_vocab(sents) | |||
model.train(sents, total_examples=model.corpus_count, epochs=10) | |||
model.save('yelp.word2vec') | |||
print(model.wv.similarity('woman', 'man')) | |||
print(model.wv.similarity('nice', 'awful')) | |||
class Embedding_layer: | |||
def __init__(self, wv, vector_size): | |||
self.wv = wv | |||
self.vector_size = vector_size | |||
def get_vec(self, w): | |||
try: | |||
v = self.wv[w] | |||
except KeyError as e: | |||
v = np.random.randn(self.vector_size) | |||
return v | |||
from torch.utils.data import DataLoader, Dataset | |||
class YelpDocSet(Dataset): | |||
def __init__(self, dirname, start_file, num_files, embedding): | |||
self.dirname = dirname | |||
self.num_files = num_files | |||
self._files = os.listdir(dirname)[start_file:start_file + num_files] | |||
self.embedding = embedding | |||
self._cache = [(-1, None) for i in range(5)] | |||
def get_doc(self, n): | |||
file_id = n // 5000 | |||
idx = file_id % 5 | |||
if self._cache[idx][0] != file_id: | |||
with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f: | |||
self._cache[idx] = (file_id, pickle.load(f)) | |||
y, x = self._cache[idx][1][n % 5000] | |||
sents = [] | |||
for s_list in x: | |||
sents.append(' '.join(s_list)) | |||
x = '\n'.join(sents) | |||
return x, y - 1 | |||
def __len__(self): | |||
return len(self._files) * 5000 | |||
def __getitem__(self, n): | |||
file_id = n // 5000 | |||
idx = file_id % 5 | |||
if self._cache[idx][0] != file_id: | |||
print('load {} to {}'.format(file_id, idx)) | |||
with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f: | |||
self._cache[idx] = (file_id, pickle.load(f)) | |||
y, x = self._cache[idx][1][n % 5000] | |||
doc = [] | |||
for sent in x: | |||
if len(sent) == 0: | |||
continue | |||
sent_vec = [] | |||
for word in sent: | |||
vec = self.embedding.get_vec(word) | |||
sent_vec.append(vec.tolist()) | |||
sent_vec = torch.Tensor(sent_vec) | |||
doc.append(sent_vec) | |||
if len(doc) == 0: | |||
doc = [torch.zeros(1, 200)] | |||
return doc, y - 1 | |||
def collate(iterable): | |||
y_list = [] | |||
x_list = [] | |||
for x, y in iterable: | |||
y_list.append(y) | |||
x_list.append(x) | |||
return x_list, torch.LongTensor(y_list) | |||
def train(net, dataset, num_epoch, batch_size, print_size=10, use_cuda=False): | |||
optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9) | |||
criterion = nn.NLLLoss() | |||
dataloader = DataLoader(dataset, | |||
batch_size=batch_size, | |||
collate_fn=collate, | |||
num_workers=0) | |||
running_loss = 0.0 | |||
if use_cuda: | |||
net.cuda() | |||
print('start training') | |||
for epoch in range(num_epoch): | |||
for i, batch_samples in enumerate(dataloader): | |||
x, y = batch_samples | |||
doc_list = [] | |||
for sample in x: | |||
doc = [] | |||
for sent_vec in sample: | |||
if use_cuda: | |||
sent_vec = sent_vec.cuda() | |||
doc.append(Variable(sent_vec)) | |||
doc_list.append(pack_sequence(doc)) | |||
if use_cuda: | |||
y = y.cuda() | |||
y = Variable(y) | |||
predict = net(doc_list) | |||
loss = criterion(predict, y) | |||
optimizer.zero_grad() | |||
loss.backward() | |||
optimizer.step() | |||
running_loss += loss.data[0] | |||
if i % print_size == print_size - 1: | |||
print('{}, {}'.format(i + 1, running_loss / print_size)) | |||
running_loss = 0.0 | |||
torch.save(net.state_dict(), 'models.dict') | |||
torch.save(net.state_dict(), 'models.dict') | |||
if __name__ == '__main__': | |||
''' | |||
Train process | |||
''' | |||
from gensim.models import Word2Vec | |||
from gensim import models | |||
train_word_vec() | |||
embed_model = Word2Vec.load('yelp.word2vec') | |||
embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size) | |||
del embed_model | |||
start_file = 0 | |||
dataset = YelpDocSet('reviews', start_file, 120 - start_file, embedding) | |||
print('training data size {}'.format(len(dataset))) | |||
net = HAN(input_size=200, output_size=5, | |||
word_hidden_size=50, word_num_layers=1, word_context_size=100, | |||
sent_hidden_size=50, sent_num_layers=1, sent_context_size=100) | |||
try: | |||
net.load_state_dict(torch.load('models.dict')) | |||
print("last time trained models has loaded") | |||
except Exception: | |||
print("cannot load models, train the inital models") | |||
train(net, dataset, num_epoch=5, batch_size=64, use_cuda=True) |
@@ -0,0 +1,3 @@ | |||
numpy==1.14.2 | |||
torch==0.4.0 | |||
torchvision==0.1.8 |
@@ -0,0 +1,32 @@ | |||
from loader.base_loader import ToyLoader0 | |||
from model.char_language_model import CharLM | |||
from fastNLP.action import Tester | |||
from fastNLP.action.trainer import Trainer | |||
def test_charlm(): | |||
train_config = Trainer.TrainConfig(epochs=1, validate=True, save_when_better=True, | |||
log_per_step=10, log_validation=True, batch_size=160) | |||
trainer = Trainer(train_config) | |||
model = CharLM(lstm_batch_size=16, lstm_seq_len=10) | |||
train_data = ToyLoader0("load_train", "./data_for_tests/charlm.txt").load() | |||
valid_data = ToyLoader0("load_valid", "./data_for_tests/charlm.txt").load() | |||
trainer.train(model, train_data, valid_data) | |||
trainer.save_model(model) | |||
test_config = Tester.TestConfig(save_output=True, validate_in_training=True, | |||
save_dev_input=True, save_loss=True, batch_size=160) | |||
tester = Tester(test_config) | |||
test_data = ToyLoader0("load_test", "./data_for_tests/charlm.txt").load() | |||
tester.test(model, test_data) | |||
if __name__ == "__main__": | |||
test_charlm() |
@@ -0,0 +1,10 @@ | |||
import unittest | |||
class MyTestCase(unittest.TestCase): | |||
def test_something(self): | |||
self.assertEqual(True, False) | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -0,0 +1,21 @@ | |||
from collections import namedtuple | |||
import numpy as np | |||
from model.base_model import ToyModel | |||
from fastNLP.action.trainer import Trainer | |||
def test_trainer(): | |||
Config = namedtuple("config", ["epochs", "validate", "save_when_better"]) | |||
train_config = Config(epochs=5, validate=True, save_when_better=True) | |||
trainer = Trainer(train_config) | |||
net = ToyModel() | |||
data = np.random.rand(20, 6) | |||
dev_data = np.random.rand(20, 6) | |||
trainer.train(net, data, dev_data) | |||
if __name__ == "__main__": | |||
test_trainer() |
@@ -0,0 +1,28 @@ | |||
from fastNLP.action.tester import Tester | |||
from fastNLP.action.trainer import WordSegTrainer | |||
from fastNLP.loader.base_loader import BaseLoader | |||
from fastNLP.models.word_seg_model import WordSeg | |||
def test_wordseg(): | |||
train_config = WordSegTrainer.TrainConfig(epochs=5, validate=False, save_when_better=False, | |||
log_per_step=10, log_validation=False, batch_size=254) | |||
trainer = WordSegTrainer(train_config) | |||
model = WordSeg(100, 2, 1000) | |||
train_data = BaseLoader("load_train", "./data_for_tests/cws_train").load_lines() | |||
trainer.train(model, train_data) | |||
test_config = Tester.TestConfig(save_output=False, validate_in_training=False, | |||
save_dev_input=False, save_loss=False, batch_size=254) | |||
tester = Tester(test_config) | |||
test_data = BaseLoader("load_test", "./data_for_tests/cws_test").load_lines() | |||
tester.test(model, test_data) | |||
if __name__ == "__main__": | |||
test_wordseg() |