@@ -417,6 +417,55 @@ class PeopleDailyCorpusLoader(DataSetLoader): | |||
data_set.set_input("seq_len") | |||
return data_set | |||
class Conll2003Loader(DataSetLoader):
    """Self-defined loader of the CoNLL-2003 dataset.

    Every non-blank line of the input file is expected to hold a token
    followed by its tag columns, separated by whitespace, for example
    ``SOCCER NN B-NP O``. Sentences are delimited by blank lines or by
    ``-DOCSTART- -X- -X- O`` marker lines.

    More information about the given dataset could be found on
    https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data
    """

    def __init__(self):
        super(Conll2003Loader, self).__init__()

    def load(self, dataset_path):
        """Parse a CoNLL-2003 formatted file into a DataSet.

        :param dataset_path: path of a UTF-8 encoded text file.
        :return: whatever :meth:`convert` builds from the parsed sentences.
        """
        parsed_data = []
        sentence = []
        tokens = []
        with open(dataset_path, "r", encoding="utf-8") as f:
            # Iterate lazily instead of reading the whole file up front.
            for line in f:
                columns = line.split()
                # A blank (or whitespace-only) line or a document marker
                # closes the sentence collected so far.
                if not columns or '-DOCSTART- -X- -X- O' in line:
                    if sentence:
                        parsed_data.append((sentence, tokens))
                        sentence = []
                        tokens = []
                    continue
                sentence.append(columns[0])
                # Keep at most the three tag columns following the token.
                tokens.append(columns[1:4])
        # The file may end without a trailing blank line; do not lose the
        # sentence that is still being accumulated.
        if sentence:
            parsed_data.append((sentence, tokens))
        return self.convert(parsed_data)

    def convert(self, parsed_data):
        """Turn parsed sentences into a DataSet of Instances.

        :param parsed_data: iterable of ``(token_list, tag_rows)`` pairs,
            where ``tag_rows`` holds one sequence of (at least) three tags
            per token. A shorter row raises ``IndexError``.
        :return: a DataSet with one Instance per sentence, carrying the
            fields ``token_list``, ``label0_list``, ``label1_list`` and
            ``label2_list`` (one list per tag column).
        """
        dataset = DataSet()
        for sample in parsed_data:
            tag_rows = sample[1]
            dataset.append(Instance(
                token_list=sample[0],
                label0_list=[tags[0] for tags in tag_rows],
                label1_list=[tags[1] for tags in tag_rows],
                label2_list=[tags[2] for tags in tag_rows],
            ))
        return dataset
class SNLIDataSetLoader(DataSetLoader): | |||
"""A data set loader for SNLI data set. | |||
@@ -0,0 +1,442 @@ | |||
-DOCSTART- -X- -X- O | |||
SOCCER NN B-NP O | |||
- : O O | |||
JAPAN NNP B-NP B-LOC | |||
GET VB B-VP O | |||
LUCKY NNP B-NP O | |||
WIN NNP I-NP O | |||
, , O O | |||
CHINA NNP B-NP B-PER | |||
IN IN B-PP O | |||
SURPRISE DT B-NP O | |||
DEFEAT NN I-NP O | |||
. . O O | |||
Nadim NNP B-NP B-PER | |||
Ladki NNP I-NP I-PER | |||
AL-AIN NNP B-NP B-LOC | |||
, , O O | |||
United NNP B-NP B-LOC | |||
Arab NNP I-NP I-LOC | |||
Emirates NNPS I-NP I-LOC | |||
1996-12-06 CD I-NP O | |||
Japan NNP B-NP B-LOC | |||
began VBD B-VP O | |||
the DT B-NP O | |||
defence NN I-NP O | |||
of IN B-PP O | |||
their PRP$ B-NP O | |||
Asian JJ I-NP B-MISC | |||
Cup NNP I-NP I-MISC | |||
title NN I-NP O | |||
with IN B-PP O | |||
a DT B-NP O | |||
lucky JJ I-NP O | |||
2-1 CD I-NP O | |||
win VBP B-VP O | |||
against IN B-PP O | |||
Syria NNP B-NP B-LOC | |||
in IN B-PP O | |||
a DT B-NP O | |||
Group NNP I-NP O | |||
C NNP I-NP O | |||
championship NN I-NP O | |||
match NN I-NP O | |||
on IN B-PP O | |||
Friday NNP B-NP O | |||
. . O O | |||
But CC O O | |||
China NNP B-NP B-LOC | |||
saw VBD B-VP O | |||
their PRP$ B-NP O | |||
luck NN I-NP O | |||
desert VB B-VP O | |||
them PRP B-NP O | |||
in IN B-PP O | |||
the DT B-NP O | |||
second NN I-NP O | |||
match NN I-NP O | |||
of IN B-PP O | |||
the DT B-NP O | |||
group NN I-NP O | |||
, , O O | |||
crashing VBG B-VP O | |||
to TO B-PP O | |||
a DT B-NP O | |||
surprise NN I-NP O | |||
2-0 CD I-NP O | |||
defeat NN I-NP O | |||
to TO B-PP O | |||
newcomers NNS B-NP O | |||
Uzbekistan NNP I-NP B-LOC | |||
. . O O | |||
China NNP B-NP B-LOC | |||
controlled VBD B-VP O | |||
most JJS B-NP O | |||
of IN B-PP O | |||
the DT B-NP O | |||
match NN I-NP O | |||
and CC O O | |||
saw VBD B-VP O | |||
several JJ B-NP O | |||
chances NNS I-NP O | |||
missed VBD B-VP O | |||
until IN B-SBAR O | |||
the DT B-NP O | |||
78th JJ I-NP O | |||
minute NN I-NP O | |||
when WRB B-ADVP O | |||
Uzbek NNP B-NP B-MISC | |||
striker NN I-NP O | |||
Igor JJ B-NP B-PER | |||
Shkvyrin NNP I-NP I-PER | |||
took VBD B-VP O | |||
advantage NN B-NP O | |||
of IN B-PP O | |||
a DT B-NP O | |||
misdirected JJ I-NP O | |||
defensive JJ I-NP O | |||
header NN I-NP O | |||
to TO B-VP O | |||
lob VB I-VP O | |||
the DT B-NP O | |||
ball NN I-NP O | |||
over IN B-PP O | |||
the DT B-NP O | |||
advancing VBG I-NP O | |||
Chinese JJ I-NP B-MISC | |||
keeper NN I-NP O | |||
and CC O O | |||
into IN B-PP O | |||
an DT B-NP O | |||
empty JJ I-NP O | |||
net NN I-NP O | |||
. . O O | |||
Oleg NNP B-NP B-PER | |||
Shatskiku NNP I-NP I-PER | |||
made VBD B-VP O | |||
sure JJ B-ADJP O | |||
of IN B-PP O | |||
the DT B-NP O | |||
win VBP B-VP O | |||
in IN B-PP O | |||
injury NN B-NP O | |||
time NN I-NP O | |||
, , O O | |||
hitting VBG B-VP O | |||
an DT B-NP O | |||
unstoppable JJ I-NP O | |||
left VBD B-VP O | |||
foot NN B-NP O | |||
shot NN I-NP O | |||
from IN B-PP O | |||
just RB B-NP O | |||
outside IN B-PP O | |||
the DT B-NP O | |||
area NN I-NP O | |||
. . O O | |||
The DT B-NP O | |||
former JJ I-NP O | |||
Soviet JJ I-NP B-MISC | |||
republic NN I-NP O | |||
was VBD B-VP O | |||
playing VBG I-VP O | |||
in IN B-PP O | |||
an DT B-NP O | |||
Asian NNP I-NP B-MISC | |||
Cup NNP I-NP I-MISC | |||
finals NNS I-NP O | |||
tie NN I-NP O | |||
for IN B-PP O | |||
the DT B-NP O | |||
first JJ I-NP O | |||
time NN I-NP O | |||
. . O O | |||
Despite IN B-PP O | |||
winning VBG B-VP O | |||
the DT B-NP O | |||
Asian JJ I-NP B-MISC | |||
Games NNPS I-NP I-MISC | |||
title NN I-NP O | |||
two CD B-NP O | |||
years NNS I-NP O | |||
ago RB B-ADVP O | |||
, , O O | |||
Uzbekistan NNP B-NP B-LOC | |||
are VBP B-VP O | |||
in IN B-PP O | |||
the DT B-NP O | |||
finals NNS I-NP O | |||
as IN B-SBAR O | |||
outsiders NNS B-NP O | |||
. . O O | |||
Two CD B-NP O | |||
goals NNS I-NP O | |||
from IN B-PP O | |||
defensive JJ B-NP O | |||
errors NNS I-NP O | |||
in IN B-PP O | |||
the DT B-NP O | |||
last JJ I-NP O | |||
six CD I-NP O | |||
minutes NNS I-NP O | |||
allowed VBD B-VP O | |||
Japan NNP B-NP B-LOC | |||
to TO B-VP O | |||
come VB I-VP O | |||
from IN B-PP O | |||
behind NN B-NP O | |||
and CC O O | |||
collect VB B-VP O | |||
all DT B-NP O | |||
three CD I-NP O | |||
points NNS I-NP O | |||
from IN B-PP O | |||
their PRP$ B-NP O | |||
opening NN I-NP O | |||
meeting NN I-NP O | |||
against IN B-PP O | |||
Syria NNP B-NP B-LOC | |||
. . O O | |||
Takuya NNP B-NP B-PER | |||
Takagi NNP I-NP I-PER | |||
scored VBD B-VP O | |||
the DT B-NP O | |||
winner NN I-NP O | |||
in IN B-PP O | |||
the DT B-NP O | |||
88th JJ I-NP O | |||
minute NN I-NP O | |||
, , O O | |||
rising VBG B-VP O | |||
to TO I-VP O | |||
head VB I-VP O | |||
a DT B-NP O | |||
Hiroshige NNP I-NP B-PER | |||
Yanagimoto NNP I-NP I-PER | |||
cross VB B-VP O | |||
towards IN B-PP O | |||
the DT B-NP O | |||
Syrian JJ I-NP B-MISC | |||
goal NN I-NP O | |||
which WDT B-NP O | |||
goalkeeper VBD B-VP O | |||
Salem NNP B-NP B-PER | |||
Bitar NNP I-NP I-PER | |||
appeared VBD B-VP O | |||
to TO I-VP O | |||
have VB I-VP O | |||
covered VBN I-VP O | |||
but CC O O | |||
then RB B-VP O | |||
allowed VBN I-VP O | |||
to TO I-VP O | |||
slip VB I-VP O | |||
into IN B-PP O | |||
the DT B-NP O | |||
net NN I-NP O | |||
. . O O | |||
It PRP B-NP O | |||
was VBD B-VP O | |||
the DT B-NP O | |||
second JJ I-NP O | |||
costly JJ I-NP O | |||
blunder NN I-NP O | |||
by IN B-PP O | |||
Syria NNP B-NP B-LOC | |||
in IN B-PP O | |||
four CD B-NP O | |||
minutes NNS I-NP O | |||
. . O O | |||
Defender NNP B-NP O | |||
Hassan NNP I-NP B-PER | |||
Abbas NNP I-NP I-PER | |||
rose VBD B-VP O | |||
to TO I-VP O | |||
intercept VB I-VP O | |||
a DT B-NP O | |||
long JJ I-NP O | |||
ball NN I-NP O | |||
into IN B-PP O | |||
the DT B-NP O | |||
area NN I-NP O | |||
in IN B-PP O | |||
the DT B-NP O | |||
84th JJ I-NP O | |||
minute NN I-NP O | |||
but CC O O | |||
only RB B-ADVP O | |||
managed VBD B-VP O | |||
to TO I-VP O | |||
divert VB I-VP O | |||
it PRP B-NP O | |||
into IN B-PP O | |||
the DT B-NP O | |||
top JJ I-NP O | |||
corner NN I-NP O | |||
of IN B-PP O | |||
Bitar NN B-NP B-PER | |||
's POS B-NP O | |||
goal NN I-NP O | |||
. . O O | |||
Nader NNP B-NP B-PER | |||
Jokhadar NNP I-NP I-PER | |||
had VBD B-VP O | |||
given VBN I-VP O | |||
Syria NNP B-NP B-LOC | |||
the DT B-NP O | |||
lead NN I-NP O | |||
with IN B-PP O | |||
a DT B-NP O | |||
well-struck NN I-NP O | |||
header NN I-NP O | |||
in IN B-PP O | |||
the DT B-NP O | |||
seventh JJ I-NP O | |||
minute NN I-NP O | |||
. . O O | |||
Japan NNP B-NP B-LOC | |||
then RB B-ADVP O | |||
laid VBD B-VP O | |||
siege NN B-NP O | |||
to TO B-PP O | |||
the DT B-NP O | |||
Syrian JJ I-NP B-MISC | |||
penalty NN I-NP O | |||
area NN I-NP O | |||
for IN B-PP O | |||
most JJS B-NP O | |||
of IN B-PP O | |||
the DT B-NP O | |||
game NN I-NP O | |||
but CC O O | |||
rarely RB B-VP O | |||
breached VBD I-VP O | |||
the DT B-NP O | |||
Syrian JJ I-NP B-MISC | |||
defence NN I-NP O | |||
. . O O | |||
Bitar NN B-NP B-PER | |||
pulled VBD B-VP O | |||
off RP B-PRT O | |||
fine JJ B-NP O | |||
saves VBZ B-VP O | |||
whenever WRB B-ADVP O | |||
they PRP B-NP O | |||
did VBD B-VP O | |||
. . O O | |||
Japan NNP B-NP B-LOC | |||
coach NN I-NP O | |||
Shu NNP I-NP B-PER | |||
Kamo NNP I-NP I-PER | |||
said VBD B-VP O | |||
: : O O | |||
' '' O O | |||
' POS B-NP O | |||
The DT I-NP O | |||
Syrian JJ I-NP B-MISC | |||
own JJ I-NP O | |||
goal NN I-NP O | |||
proved VBD B-VP O | |||
lucky JJ B-ADJP O | |||
for IN B-PP O | |||
us PRP B-NP O | |||
. . O O | |||
The DT B-NP O | |||
Syrians NNPS I-NP B-MISC | |||
scored VBD B-VP O | |||
early JJ B-NP O | |||
and CC O O | |||
then RB B-VP O | |||
played VBN I-VP O | |||
defensively RB B-ADVP O | |||
and CC O O | |||
adopted VBD B-VP O | |||
long RB I-VP O | |||
balls VBZ I-VP O | |||
which WDT B-NP O | |||
made VBD B-VP O | |||
it PRP B-NP O | |||
hard JJ B-ADJP O | |||
for IN B-PP O | |||
us PRP B-NP O | |||
. . O O | |||
' '' O O | |||
' '' O O | |||
Japan NNP B-NP B-LOC | |||
, , O O | |||
co-hosts VBZ B-VP O | |||
of IN B-PP O | |||
the DT B-NP O | |||
World NNP I-NP B-MISC | |||
Cup NNP I-NP I-MISC | |||
in IN B-PP O | |||
2002 CD B-NP O | |||
and CC O O | |||
ranked VBD B-VP O | |||
20th JJ B-NP O | |||
in IN B-PP O | |||
the DT B-NP O | |||
world NN I-NP O | |||
by IN B-PP O | |||
FIFA NNP B-NP B-ORG | |||
, , O O | |||
are VBP B-VP O | |||
favourites JJ B-ADJP O | |||
to TO B-VP O | |||
regain VB I-VP O | |||
their PRP$ B-NP O | |||
title NN I-NP O | |||
here RB B-ADVP O | |||
. . O O | |||
Hosts NNPS B-NP O | |||
UAE NNP I-NP B-LOC | |||
play NN I-NP O | |||
Kuwait NNP I-NP B-LOC | |||
and CC O O | |||
South NNP B-NP B-LOC | |||
Korea NNP I-NP I-LOC | |||
take VBP B-VP O | |||
on IN B-PP O | |||
Indonesia NNP B-NP B-LOC | |||
on IN B-PP O | |||
Saturday NNP B-NP O | |||
in IN B-PP O | |||
Group NNP B-NP O | |||
A NNP I-NP O | |||
matches VBZ B-VP O | |||
. . O O | |||
All DT B-NP O | |||
four CD I-NP O | |||
teams NNS I-NP O | |||
are VBP B-VP O | |||
level NN B-NP O | |||
with IN B-PP O | |||
one CD B-NP O | |||
point NN I-NP O | |||
each DT B-NP O | |||
from IN B-PP O | |||
one CD B-NP O | |||
game NN I-NP O | |||
. . O O |
@@ -0,0 +1,23 @@ | |||
import os | |||
import unittest | |||
from fastNLP.io.dataset_loader import Conll2003Loader | |||
class TestDatasetLoader(unittest.TestCase):
    """Tests for the dataset loaders in ``fastNLP.io.dataset_loader``."""

    def test_case_1(self):
        """Test the loader of the Conll2003 dataset.

        Loads the bundled example file and checks that every instance has
        non-empty, equally long token and label lists.
        """
        dataset_path = "test/data_for_tests/conll_2003_example.txt"
        loader = Conll2003Loader()
        dataset_2003 = loader.load(dataset_path)

        num_items = 0
        for item in dataset_2003:
            num_items += 1
            len0 = len(item["label0_list"])
            len1 = len(item["label1_list"])
            len2 = len(item["label2_list"])
            lentoken = len(item["token_list"])
            self.assertNotEqual(len0, 0)
            self.assertEqual(len0, len1)
            self.assertEqual(len1, len2)
            # Every token must have exactly one label per label column.
            self.assertEqual(len2, lentoken)

        # Guard against the loop passing vacuously on an empty dataset.
        self.assertGreater(num_items, 0)