From c78811f87f121984a7fabb995df3b1c35fae1837 Mon Sep 17 00:00:00 2001
From: lyhuang18 <42239874+lyhuang18@users.noreply.github.com>
Date: Sun, 16 Jun 2019 23:43:37 +0800
Subject: [PATCH] add TC/MTL16Loader

---
 .../text_classification/data/MTL16Loader.py   | 99 +++++++++++++++++++
 .../text_classification/test/sample_MTL16.txt | 10 +++
 .../test/test_MTL16Loader.py                  | 10 +++
 3 files changed, 119 insertions(+)
 create mode 100644 reproduction/text_classification/data/MTL16Loader.py
 create mode 100644 reproduction/text_classification/test/sample_MTL16.txt
 create mode 100644 reproduction/text_classification/test/test_MTL16Loader.py

diff --git a/reproduction/text_classification/data/MTL16Loader.py b/reproduction/text_classification/data/MTL16Loader.py
new file mode 100644
index 00000000..1b3e6245
--- /dev/null
+++ b/reproduction/text_classification/data/MTL16Loader.py
@@ -0,0 +1,99 @@
+from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
+from fastNLP.core.vocabulary import VocabularyOption
+from fastNLP.io.base_loader import DataSetLoader, DataInfo
+from typing import Union, Dict, List, Iterator
+from fastNLP import DataSet
+from fastNLP import Instance
+from fastNLP import Vocabulary
+from fastNLP import Const
+from reproduction.utils import check_dataloader_paths
+from functools import partial
+
+
+class MTL16Loader(DataSetLoader):
+    """
+    Load the MTL16 dataset. Each loaded DataSet contains the following fields:
+
+        words: list(str), the whitespace-split text to classify
+        target: str, the label of the text
+
+    Data source: https://pan.baidu.com/s/1c2L6vdA
+
+    """
+
+    def __init__(self):
+        super(MTL16Loader, self).__init__()
+
+    def _load(self, path):
+        """
+        Read one MTL16 file into a DataSet.
+
+        Every non-blank line must look like ``label<TAB>text``. Only the first
+        tab separates the two fields, so a tab inside the text stays part of
+        the text instead of silently truncating it.
+
+        :param path: path of a UTF-8 encoded MTL16 file.
+        :return: DataSet with one Instance(words, target) per non-blank line.
+        :raises ValueError: if a non-blank line has no tab-separated text.
+        :raises RuntimeError: if the file yields no instances at all.
+        """
+        dataset = DataSet()
+        with open(path, 'r', encoding="utf-8") as f:
+            for line_no, line in enumerate(f, start=1):
+                line = line.strip()
+                if not line:
+                    continue
+                # partition() cuts at the first tab only; indexing the result
+                # of split('\t') would drop everything after a second tab.
+                target, sep, text = line.partition('\t')
+                if not sep:
+                    raise ValueError(
+                        f"{path}, line {line_no}: expected 'label<TAB>text'.")
+                dataset.append(Instance(words=text.split(), target=target))
+        if len(dataset) == 0:
+            raise RuntimeError(f"{path} has no valid data.")
+
+        return dataset
+
+    def process(self,
+                paths: Union[str, Dict[str, str]],
+                src_vocab_opt: VocabularyOption = None,
+                tgt_vocab_opt: VocabularyOption = None,
+                src_embed_opt: EmbeddingOption = None):
+        """
+        Load every split, build both vocabularies on 'train', index all splits.
+
+        :param paths: a single path or a dict mapping split name to file path,
+            normalised by check_dataloader_paths. A 'train' split is required.
+        :param src_vocab_opt: keyword options for the 'words' Vocabulary.
+        :param tgt_vocab_opt: keyword options for the 'target' Vocabulary; when
+            omitted it is built without unknown and padding entries.
+        :param src_embed_opt: when given, keyword options forwarded to
+            EmbedLoader.load_with_vocab to load embeddings for 'words'.
+        :return: DataInfo carrying datasets, vocabs and optional embeddings.
+        """
+        paths = check_dataloader_paths(paths)
+        datasets = {name: self.load(path) for name, path in paths.items()}
+        info = DataInfo()
+
+        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
+        src_vocab.from_dataset(datasets['train'], field_name='words')
+        src_vocab.index_dataset(*datasets.values(), field_name='words')
+
+        tgt_vocab = Vocabulary(unknown=None, padding=None) \
+            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
+        tgt_vocab.from_dataset(datasets['train'], field_name='target')
+        tgt_vocab.index_dataset(*datasets.values(), field_name='target')
+
+        info.vocabs = {
+            "words": src_vocab,
+            "target": tgt_vocab
+        }
+
+        info.datasets = datasets
+
+        if src_embed_opt is not None:
+            embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
+            info.embeddings['words'] = embed
+
+        return info
diff --git a/reproduction/text_classification/test/sample_MTL16.txt b/reproduction/text_classification/test/sample_MTL16.txt
new file mode 100644
index 00000000..f1efeb4c
--- /dev/null
+++ b/reproduction/text_classification/test/sample_MTL16.txt
@@ -0,0 +1,10 @@
+1 the only thing better than these sunglasses is the customer service i got , after i dropped and broke the lenses on these i called 80 's purple and they actually sent me out a replacement free of charge . i was blown away
+0 this light worked for one day . i should have known better because in the past , i bought a tap light , and it worked for only a few days , too . do n't waste your money
+1 i 've tried 6 different nursing bras . this one , with the center snap closure , is the easiest to use . it is also the lightest and most comfortable , while providing good support . my only complaint is that after about 50 washes the underwire begins to poke free from the fabric .
even when i try to sew it back into place , it breaks loose after a few washes . perhaps if i handwashed the bra instead of using a machine , it would last longer . this bra is less durabe than my other nursing bras ( particularly the leading lady bra , which seems to be indestructible ) , but it is well worth the sacrifice for comfort , lightness , and ease of use . it is by far my favorite +0 i have had my bag for a couple of months . the liner on the inside has already ripped +0 the photo is quite deceiving . this suit is made out of cheap polyester fabric that looks cheap , shiny , and is horrible to the touch . my three year olds hate the uncomfortable stiffness . spend the extra money for a decent fabric that is actually practical for a toddler if they really need a suit +1 i had bought a bra of this model at a discount store , just got lucky . it quickly became my favorite , and i was glad to find it at amazon . +0 lookslike it would be a nice product , but it 's only for very small babies up to 12 pounds and 23 inches . my baby is very long and just does n't fit - wish target/amazon would have been more upfront with the sizing +0 i purchased the non-premium kit ( $ 9.99 ) with a silicone skin case cover and 2 screen protectors ( one for each screen ) , but it is the same case . the problem is that the silicone skin cover is slippery , twice as slippery as the nintendo lite without the cover . we thought that washing them in dove dish soap would wash away the slipperyness , but that did n't work . after handling the cover , your hands have a slippery residue on them . the other issue is that the cover is so thin that it is little more than scratch protection , not impact protection . the screen covers that come with the non-premium kit are ok , i guess , but one of them had 2 defect particles that were raised ( trust me , the screen was clean ) . 
i purchased 2 kits , and i had one screen protector defect and my wife accidentally broke one of the silicone covers hinge straps with little effort . i do not recommend this product at all
+1 good quality jeans at an affordable price . size is just right , quite comfortable
+0 not the best fabric , scratchy and see thru . you get what you pay for on these
diff --git a/reproduction/text_classification/test/test_MTL16Loader.py b/reproduction/text_classification/test/test_MTL16Loader.py
new file mode 100644
index 00000000..0ae6adc1
--- /dev/null
+++ b/reproduction/text_classification/test/test_MTL16Loader.py
@@ -0,0 +1,10 @@
+import os
+import unittest
+from reproduction.text_classification.data.MTL16Loader import MTL16Loader
+
+
+class TestDataLoader(unittest.TestCase):
+    def test_MTL16Loader(self):
+        """Load the bundled sample independently of the working directory."""
+        sample = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'sample_MTL16.txt')
+        self.assertGreater(len(MTL16Loader().process(sample).datasets['train']), 0)