Browse Source

add TC/MTL16Loader

tags/v0.4.10
lyhuang18 5 years ago
parent
commit
c78811f87f
3 changed files with 95 additions and 0 deletions
  1. +75
    -0
      reproduction/text_classification/data/MTL16Loader.py
  2. +10
    -0
      reproduction/text_classification/test/sample_MTL16.txt
  3. +10
    -0
      reproduction/text_classification/test/test_MTL16Loader.py

+ 75
- 0
reproduction/text_classification/data/MTL16Loader.py View File

@@ -0,0 +1,75 @@
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict, List, Iterator
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP import Const
from reproduction.utils import check_dataloader_paths
from functools import partial

class MTL16Loader(DataSetLoader):
    """
    Load the MTL16 dataset. The resulting DataSet contains the following fields:

        words: list(str), the whitespace-separated tokens of the text to classify
        target: str, the label of the text

    Each non-blank line of a data file must consist of the label, a single
    tab character, and then the text.

    Data source: https://pan.baidu.com/s/1c2L6vdA

    """

    def __init__(self):
        super(MTL16Loader, self).__init__()

    def _load(self, path):
        """
        Read one MTL16 file into a DataSet.

        :param path: path of a UTF-8 text file with one ``label<TAB>text`` sample per line.
        :return: a DataSet with one Instance (fields ``words`` and ``target``) per non-blank line.
        :raises RuntimeError: if a non-blank line has no tab separator, or if the
            file yields no instances at all.
        """
        dataset = DataSet()
        with open(path, 'r', encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                # Split on the FIRST tab only: everything after it is text, so a
                # stray tab inside the text no longer truncates the sample.
                target, sep, text = line.partition('\t')
                if not sep:
                    # Fail with the location instead of a bare IndexError.
                    raise RuntimeError(
                        f"{path}: line {line_no} is not in '<label>\\t<text>' format.")
                # str.split() with no argument treats tabs as whitespace too.
                dataset.append(Instance(words=text.split(), target=target))
        if len(dataset) == 0:
            raise RuntimeError(f"{path} has no valid data.")

        return dataset

    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None):
        """
        Load every split, build the vocabularies from the ``train`` split and
        index all splits with them.

        :param paths: a single path or a mapping from split name to path; it is
            normalised by ``check_dataloader_paths``. The result must contain a
            ``'train'`` entry, since both vocabularies are built from that split.
        :param src_vocab_opt: keyword arguments for the ``words`` Vocabulary;
            a default ``Vocabulary()`` is used when None.
        :param tgt_vocab_opt: keyword arguments for the ``target`` Vocabulary;
            when None the vocabulary is created with ``unknown=None, padding=None``.
        :param src_embed_opt: keyword arguments forwarded to
            ``EmbedLoader.load_with_vocab``; no embedding is loaded when None.
        :return: a DataInfo whose ``datasets`` maps split name to DataSet and whose
            ``vocabs`` holds the ``"words"`` and ``"target"`` vocabularies;
            ``embeddings['words']`` is set only when ``src_embed_opt`` is given.
        """
        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataInfo()
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        # Vocabularies are built from the training split only, then applied to
        # every loaded split.
        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')
        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {
            "words": src_vocab,
            "target": tgt_vocab
        }

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
            info.embeddings['words'] = embed

        return info

+ 10
- 0
reproduction/text_classification/test/sample_MTL16.txt View File

@@ -0,0 +1,10 @@
1 the only thing better than these sunglasses is the customer service i got , after i dropped and broke the lenses on these i called 80 's purple and they actually sent me out a replacement free of charge . i was blown away
0 this light worked for one day . i should have known better because in the past , i bought a tap light , and it worked for only a few days , too . do n't waste your money
1 i 've tried 6 different nursing bras . this one , with the center snap closure , is the easiest to use . it is also the lightest and most comfortable , while providing good support . my only complaint is that after about 50 washes the underwire begins to poke free from the fabric . even when i try to sew it back into place , it breaks loose after a few washes . perhaps if i handwashed the bra instead of using a machine , it would last longer . this bra is less durabe than my other nursing bras ( particularly the leading lady bra , which seems to be indestructible ) , but it is well worth the sacrifice for comfort , lightness , and ease of use . it is by far my favorite
0 i have had my bag for a couple of months . the liner on the inside has already ripped
0 the photo is quite deceiving . this suit is made out of cheap polyester fabric that looks cheap , shiny , and is horrible to the touch . my three year olds hate the uncomfortable stiffness . spend the extra money for a decent fabric that is actually practical for a toddler if they really need a suit
1 i had bought a bra of this model at a discount store , just got lucky . it quickly became my favorite , and i was glad to find it at amazon .
0 lookslike it would be a nice product , but it 's only for very small babies up to 12 pounds and 23 inches . my baby is very long and just does n't fit - wish target/amazon would have been more upfront with the sizing
0 i purchased the non-premium kit ( $ 9.99 ) with a silicone skin case cover and 2 screen protectors ( one for each screen ) , but it is the same case . the problem is that the silicone skin cover is slippery , twice as slippery as the nintendo lite without the cover . we thought that washing them in dove dish soap would wash away the slipperyness , but that did n't work . after handling the cover , your hands have a slippery residue on them . the other issue is that the cover is so thin that it is little more than scratch protection , not impact protection . the screen covers that come with the non-premium kit are ok , i guess , but one of them had 2 defect particles that were raised ( trust me , the screen was clean ) . i purchased 2 kits , and i had one screen protector defect and my wife accidentally broke one of the silicone covers hinge straps with little effort . i do not recommend this product at all
1 good quality jeans at an affordable price . size is just right , quite comfortable
0 not the best fabric , scratchy and see thru . you get what you pay for on these

+ 10
- 0
reproduction/text_classification/test/test_MTL16Loader.py View File

@@ -0,0 +1,10 @@
import unittest
from reproduction.text_classification.data.MTL16Loader import MTL16Loader


class TestDataLoader(unittest.TestCase):
    """Checks that MTL16Loader can process the bundled sample file."""

    def test_MTL16Loader(self):
        # Local import so this block does not depend on the file's import header.
        import os

        # Resolve the sample next to this test file so the test does not depend
        # on the working directory it is launched from.
        sample_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'sample_MTL16.txt')

        loader = MTL16Loader()
        data = loader.process(sample_path)
        print(data.datasets)

        # process() builds its vocabularies from the 'train' split, so a single
        # file path must have been loaded under that name.
        self.assertIn('train', data.datasets)
        # The sample file holds ten non-blank, tab-separated samples.
        self.assertEqual(len(data.datasets['train']), 10)
        self.assertIn('words', data.vocabs)
        self.assertIn('target', data.vocabs)


Loading…
Cancel
Save