Browse Source

Add a loader for conll2003 dataset

tags/v0.3.0
hazelnutsgz 6 years ago
parent
commit
5f4ab131ac
3 changed files with 514 additions and 0 deletions
  1. +49
    -0
      fastNLP/io/dataset_loader.py
  2. +442
    -0
      test/data_for_tests/conll_2003_example.txt
  3. +23
    -0
      test/io/test_dataset_loader.py

+ 49
- 0
fastNLP/io/dataset_loader.py View File

@@ -417,6 +417,55 @@ class PeopleDailyCorpusLoader(DataSetLoader):
data_set.set_input("seq_len") data_set.set_input("seq_len")
return data_set return data_set


class Conll2003Loader(DataSetLoader):
    """Loader for CoNLL-2003 formatted files.

    Each non-blank line of the file holds one token followed by its label
    columns, separated by whitespace. Sentences are separated by blank lines
    and documents are introduced by a ``-DOCSTART-`` marker line.

    More information about the dataset can be found at
    https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data
    """

    def __init__(self):
        super(Conll2003Loader, self).__init__()

    def load(self, dataset_path):
        """Read a CoNLL-2003 file and return it as a ``DataSet``.

        :param dataset_path: path of the UTF-8 encoded file to read.
        :return: the value produced by :meth:`convert`, holding one instance
            per sentence.
        """
        parsed_data = []
        sentence = []
        tokens = []
        with open(dataset_path, "r", encoding="utf-8") as f:
            # Stream the file instead of materialising every line up front.
            for line in f:
                stripped = line.strip()
                # A sentence ends at a blank line (including "\r\n" and
                # whitespace-only lines) or at a document marker. Both the
                # "-DOCSTART- -X- -X- O" and "-DOCSTART- -X- O O" spellings
                # of the marker are recognised.
                is_boundary = (
                    not stripped
                    or stripped.startswith('-DOCSTART-')
                    or '-DOCSTART- -X- -X- O' in line
                )
                if is_boundary:
                    if sentence:
                        parsed_data.append((sentence, tokens))
                        sentence = []
                        tokens = []
                    continue
                # split() without an argument tolerates tabs and repeated
                # spaces, which split(" ") would turn into empty columns.
                temp = stripped.split()
                sentence.append(temp[0])
                tokens.append(temp[1:4])
        # Flush the final sentence when the file does not end with a blank
        # line; otherwise it would be silently dropped.
        if sentence:
            parsed_data.append((sentence, tokens))
        return self.convert(parsed_data)

    def convert(self, parsed_data):
        """Turn parsed sentences into a ``DataSet``.

        :param parsed_data: iterable of ``(token_list, label_rows)`` pairs,
            where ``label_rows`` holds one list of at least three labels per
            token (as produced by :meth:`load`).
        :return: a ``DataSet`` with one ``Instance`` per sentence, exposing
            the fields ``token_list``, ``label0_list``, ``label1_list`` and
            ``label2_list`` (the first, second and third label column).
        :raises IndexError: if a label row has fewer than three labels.
        """
        dataset = DataSet()
        for sample in parsed_data:
            label_rows = sample[1]
            dataset.append(Instance(
                token_list=sample[0],
                label0_list=[labels[0] for labels in label_rows],
                label1_list=[labels[1] for labels in label_rows],
                label2_list=[labels[2] for labels in label_rows],
            ))
        return dataset


class SNLIDataSetLoader(DataSetLoader): class SNLIDataSetLoader(DataSetLoader):
"""A data set loader for SNLI data set. """A data set loader for SNLI data set.


+ 442
- 0
test/data_for_tests/conll_2003_example.txt View File

@@ -0,0 +1,442 @@
-DOCSTART- -X- -X- O

SOCCER NN B-NP O
- : O O
JAPAN NNP B-NP B-LOC
GET VB B-VP O
LUCKY NNP B-NP O
WIN NNP I-NP O
, , O O
CHINA NNP B-NP B-PER
IN IN B-PP O
SURPRISE DT B-NP O
DEFEAT NN I-NP O
. . O O

Nadim NNP B-NP B-PER
Ladki NNP I-NP I-PER

AL-AIN NNP B-NP B-LOC
, , O O
United NNP B-NP B-LOC
Arab NNP I-NP I-LOC
Emirates NNPS I-NP I-LOC
1996-12-06 CD I-NP O

Japan NNP B-NP B-LOC
began VBD B-VP O
the DT B-NP O
defence NN I-NP O
of IN B-PP O
their PRP$ B-NP O
Asian JJ I-NP B-MISC
Cup NNP I-NP I-MISC
title NN I-NP O
with IN B-PP O
a DT B-NP O
lucky JJ I-NP O
2-1 CD I-NP O
win VBP B-VP O
against IN B-PP O
Syria NNP B-NP B-LOC
in IN B-PP O
a DT B-NP O
Group NNP I-NP O
C NNP I-NP O
championship NN I-NP O
match NN I-NP O
on IN B-PP O
Friday NNP B-NP O
. . O O

But CC O O
China NNP B-NP B-LOC
saw VBD B-VP O
their PRP$ B-NP O
luck NN I-NP O
desert VB B-VP O
them PRP B-NP O
in IN B-PP O
the DT B-NP O
second NN I-NP O
match NN I-NP O
of IN B-PP O
the DT B-NP O
group NN I-NP O
, , O O
crashing VBG B-VP O
to TO B-PP O
a DT B-NP O
surprise NN I-NP O
2-0 CD I-NP O
defeat NN I-NP O
to TO B-PP O
newcomers NNS B-NP O
Uzbekistan NNP I-NP B-LOC
. . O O

China NNP B-NP B-LOC
controlled VBD B-VP O
most JJS B-NP O
of IN B-PP O
the DT B-NP O
match NN I-NP O
and CC O O
saw VBD B-VP O
several JJ B-NP O
chances NNS I-NP O
missed VBD B-VP O
until IN B-SBAR O
the DT B-NP O
78th JJ I-NP O
minute NN I-NP O
when WRB B-ADVP O
Uzbek NNP B-NP B-MISC
striker NN I-NP O
Igor JJ B-NP B-PER
Shkvyrin NNP I-NP I-PER
took VBD B-VP O
advantage NN B-NP O
of IN B-PP O
a DT B-NP O
misdirected JJ I-NP O
defensive JJ I-NP O
header NN I-NP O
to TO B-VP O
lob VB I-VP O
the DT B-NP O
ball NN I-NP O
over IN B-PP O
the DT B-NP O
advancing VBG I-NP O
Chinese JJ I-NP B-MISC
keeper NN I-NP O
and CC O O
into IN B-PP O
an DT B-NP O
empty JJ I-NP O
net NN I-NP O
. . O O

Oleg NNP B-NP B-PER
Shatskiku NNP I-NP I-PER
made VBD B-VP O
sure JJ B-ADJP O
of IN B-PP O
the DT B-NP O
win VBP B-VP O
in IN B-PP O
injury NN B-NP O
time NN I-NP O
, , O O
hitting VBG B-VP O
an DT B-NP O
unstoppable JJ I-NP O
left VBD B-VP O
foot NN B-NP O
shot NN I-NP O
from IN B-PP O
just RB B-NP O
outside IN B-PP O
the DT B-NP O
area NN I-NP O
. . O O

The DT B-NP O
former JJ I-NP O
Soviet JJ I-NP B-MISC
republic NN I-NP O
was VBD B-VP O
playing VBG I-VP O
in IN B-PP O
an DT B-NP O
Asian NNP I-NP B-MISC
Cup NNP I-NP I-MISC
finals NNS I-NP O
tie NN I-NP O
for IN B-PP O
the DT B-NP O
first JJ I-NP O
time NN I-NP O
. . O O

Despite IN B-PP O
winning VBG B-VP O
the DT B-NP O
Asian JJ I-NP B-MISC
Games NNPS I-NP I-MISC
title NN I-NP O
two CD B-NP O
years NNS I-NP O
ago RB B-ADVP O
, , O O
Uzbekistan NNP B-NP B-LOC
are VBP B-VP O
in IN B-PP O
the DT B-NP O
finals NNS I-NP O
as IN B-SBAR O
outsiders NNS B-NP O
. . O O

Two CD B-NP O
goals NNS I-NP O
from IN B-PP O
defensive JJ B-NP O
errors NNS I-NP O
in IN B-PP O
the DT B-NP O
last JJ I-NP O
six CD I-NP O
minutes NNS I-NP O
allowed VBD B-VP O
Japan NNP B-NP B-LOC
to TO B-VP O
come VB I-VP O
from IN B-PP O
behind NN B-NP O
and CC O O
collect VB B-VP O
all DT B-NP O
three CD I-NP O
points NNS I-NP O
from IN B-PP O
their PRP$ B-NP O
opening NN I-NP O
meeting NN I-NP O
against IN B-PP O
Syria NNP B-NP B-LOC
. . O O

Takuya NNP B-NP B-PER
Takagi NNP I-NP I-PER
scored VBD B-VP O
the DT B-NP O
winner NN I-NP O
in IN B-PP O
the DT B-NP O
88th JJ I-NP O
minute NN I-NP O
, , O O
rising VBG B-VP O
to TO I-VP O
head VB I-VP O
a DT B-NP O
Hiroshige NNP I-NP B-PER
Yanagimoto NNP I-NP I-PER
cross VB B-VP O
towards IN B-PP O
the DT B-NP O
Syrian JJ I-NP B-MISC
goal NN I-NP O
which WDT B-NP O
goalkeeper VBD B-VP O
Salem NNP B-NP B-PER
Bitar NNP I-NP I-PER
appeared VBD B-VP O
to TO I-VP O
have VB I-VP O
covered VBN I-VP O
but CC O O
then RB B-VP O
allowed VBN I-VP O
to TO I-VP O
slip VB I-VP O
into IN B-PP O
the DT B-NP O
net NN I-NP O
. . O O

It PRP B-NP O
was VBD B-VP O
the DT B-NP O
second JJ I-NP O
costly JJ I-NP O
blunder NN I-NP O
by IN B-PP O
Syria NNP B-NP B-LOC
in IN B-PP O
four CD B-NP O
minutes NNS I-NP O
. . O O

Defender NNP B-NP O
Hassan NNP I-NP B-PER
Abbas NNP I-NP I-PER
rose VBD B-VP O
to TO I-VP O
intercept VB I-VP O
a DT B-NP O
long JJ I-NP O
ball NN I-NP O
into IN B-PP O
the DT B-NP O
area NN I-NP O
in IN B-PP O
the DT B-NP O
84th JJ I-NP O
minute NN I-NP O
but CC O O
only RB B-ADVP O
managed VBD B-VP O
to TO I-VP O
divert VB I-VP O
it PRP B-NP O
into IN B-PP O
the DT B-NP O
top JJ I-NP O
corner NN I-NP O
of IN B-PP O
Bitar NN B-NP B-PER
's POS B-NP O
goal NN I-NP O
. . O O

Nader NNP B-NP B-PER
Jokhadar NNP I-NP I-PER
had VBD B-VP O
given VBN I-VP O
Syria NNP B-NP B-LOC
the DT B-NP O
lead NN I-NP O
with IN B-PP O
a DT B-NP O
well-struck NN I-NP O
header NN I-NP O
in IN B-PP O
the DT B-NP O
seventh JJ I-NP O
minute NN I-NP O
. . O O

Japan NNP B-NP B-LOC
then RB B-ADVP O
laid VBD B-VP O
siege NN B-NP O
to TO B-PP O
the DT B-NP O
Syrian JJ I-NP B-MISC
penalty NN I-NP O
area NN I-NP O
for IN B-PP O
most JJS B-NP O
of IN B-PP O
the DT B-NP O
game NN I-NP O
but CC O O
rarely RB B-VP O
breached VBD I-VP O
the DT B-NP O
Syrian JJ I-NP B-MISC
defence NN I-NP O
. . O O

Bitar NN B-NP B-PER
pulled VBD B-VP O
off RP B-PRT O
fine JJ B-NP O
saves VBZ B-VP O
whenever WRB B-ADVP O
they PRP B-NP O
did VBD B-VP O
. . O O

Japan NNP B-NP B-LOC
coach NN I-NP O
Shu NNP I-NP B-PER
Kamo NNP I-NP I-PER
said VBD B-VP O
: : O O
' '' O O
' POS B-NP O
The DT I-NP O
Syrian JJ I-NP B-MISC
own JJ I-NP O
goal NN I-NP O
proved VBD B-VP O
lucky JJ B-ADJP O
for IN B-PP O
us PRP B-NP O
. . O O

The DT B-NP O
Syrians NNPS I-NP B-MISC
scored VBD B-VP O
early JJ B-NP O
and CC O O
then RB B-VP O
played VBN I-VP O
defensively RB B-ADVP O
and CC O O
adopted VBD B-VP O
long RB I-VP O
balls VBZ I-VP O
which WDT B-NP O
made VBD B-VP O
it PRP B-NP O
hard JJ B-ADJP O
for IN B-PP O
us PRP B-NP O
. . O O
' '' O O

' '' O O

Japan NNP B-NP B-LOC
, , O O
co-hosts VBZ B-VP O
of IN B-PP O
the DT B-NP O
World NNP I-NP B-MISC
Cup NNP I-NP I-MISC
in IN B-PP O
2002 CD B-NP O
and CC O O
ranked VBD B-VP O
20th JJ B-NP O
in IN B-PP O
the DT B-NP O
world NN I-NP O
by IN B-PP O
FIFA NNP B-NP B-ORG
, , O O
are VBP B-VP O
favourites JJ B-ADJP O
to TO B-VP O
regain VB I-VP O
their PRP$ B-NP O
title NN I-NP O
here RB B-ADVP O
. . O O

Hosts NNPS B-NP O
UAE NNP I-NP B-LOC
play NN I-NP O
Kuwait NNP I-NP B-LOC
and CC O O
South NNP B-NP B-LOC
Korea NNP I-NP I-LOC
take VBP B-VP O
on IN B-PP O
Indonesia NNP B-NP B-LOC
on IN B-PP O
Saturday NNP B-NP O
in IN B-PP O
Group NNP B-NP O
A NNP I-NP O
matches VBZ B-VP O
. . O O

All DT B-NP O
four CD I-NP O
teams NNS I-NP O
are VBP B-VP O
level NN B-NP O
with IN B-PP O
one CD B-NP O
point NN I-NP O
each DT B-NP O
from IN B-PP O
one CD B-NP O
game NN I-NP O
. . O O

+ 23
- 0
test/io/test_dataset_loader.py View File

@@ -0,0 +1,23 @@
import os
import unittest

from fastNLP.io.dataset_loader import Conll2003Loader
class TestDatasetLoader(unittest.TestCase):
    """Tests for the dataset loaders in ``fastNLP.io.dataset_loader``."""

    def test_case_1(self):
        """Test the loader of the Conll2003 dataset.

        Every loaded sentence must be non-empty and its token list must be
        aligned with each of the three label lists.
        """
        dataset_path = "test/data_for_tests/conll_2003_example.txt"
        loader = Conll2003Loader()
        dataset_2003 = loader.load(dataset_path)

        num_items = 0
        for item in dataset_2003:
            num_items += 1
            len0 = len(item["label0_list"])
            len1 = len(item["label1_list"])
            len2 = len(item["label2_list"])
            lentoken = len(item["token_list"])
            self.assertNotEqual(len0, 0)
            # Tokens and labels must line up one-to-one.
            self.assertEqual(lentoken, len0)
            self.assertEqual(len0, len1)
            self.assertEqual(len1, len2)
        # Guard against the loop passing vacuously on an empty dataset.
        self.assertGreater(num_items, 0)

Loading…
Cancel
Save