class Conll2003Loader(DataSetLoader):
    """Loader for data files in the CoNLL-2003 column format.

    Every non-blank line holds one token followed by its tag columns,
    separated by whitespace.  Sentences are separated by blank lines and
    documents are introduced by a ``-DOCSTART-`` marker line.

    More information about the dataset can be found at
    https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data

    """

    def __init__(self):
        super(Conll2003Loader, self).__init__()

    def load(self, dataset_path):
        """Read a CoNLL-2003 style file and build a data set from it.

        :param dataset_path: path of the UTF-8 encoded text file to read.
        :return: the object produced by :meth:`convert`, holding one
            instance per sentence found in the file.
        """
        parsed_data = []
        sentence = []
        tokens = []

        # Stream the file instead of materialising every line up front.
        with open(dataset_path, "r", encoding="utf-8") as f:
            for line in f:
                # ``split()`` without arguments tolerates tabs, repeated
                # spaces and trailing whitespace, and yields an empty list
                # for blank or whitespace-only lines.
                fields = line.split()

                # A blank line ends the current sentence; a DOCSTART marker
                # is document metadata and never part of a sentence.
                if not fields or fields[0] == "-DOCSTART-":
                    if sentence:
                        parsed_data.append((sentence, tokens))
                        sentence = []
                        tokens = []
                    continue

                sentence.append(fields[0])
                # Keep (at most) the three tag columns following the token.
                tokens.append(fields[1:4])

        # Flush the last sentence: files are not required to end with a
        # blank line, and without this the final sentence would be dropped.
        if sentence:
            parsed_data.append((sentence, tokens))

        return self.convert(parsed_data)

    def convert(self, parsed_data):
        """Turn parsed sentences into a ``DataSet`` of ``Instance`` objects.

        :param parsed_data: iterable of ``(words, tags)`` pairs where
            ``words`` is the list of tokens of a sentence and ``tags`` is a
            parallel list holding the three tag columns of each token.
        :return: a ``DataSet`` with one ``Instance`` per sentence, carrying
            the fields ``token_list``, ``label0_list``, ``label1_list`` and
            ``label2_list`` (first, second and third tag column).
        :raises IndexError: if a token has fewer than three tag columns.
        """
        dataset = DataSet()
        for words, tags in parsed_data:
            dataset.append(Instance(
                token_list=words,
                label0_list=[labels[0] for labels in tags],
                label1_list=[labels[1] for labels in tags],
                label2_list=[labels[2] for labels in tags]))

        return dataset
diff --git a/test/data_for_tests/conll_2003_example.txt b/test/data_for_tests/conll_2003_example.txt new file mode 100644 index 00000000..d11a8264 --- /dev/null +++ b/test/data_for_tests/conll_2003_example.txt @@ -0,0 +1,442 @@ +-DOCSTART- -X- -X- O + +SOCCER NN B-NP O +- : O O +JAPAN NNP B-NP B-LOC +GET VB B-VP O +LUCKY NNP B-NP O +WIN NNP I-NP O +, , O O +CHINA NNP B-NP B-PER +IN IN B-PP O +SURPRISE DT B-NP O +DEFEAT NN I-NP O +. . O O + +Nadim NNP B-NP B-PER +Ladki NNP I-NP I-PER + +AL-AIN NNP B-NP B-LOC +, , O O +United NNP B-NP B-LOC +Arab NNP I-NP I-LOC +Emirates NNPS I-NP I-LOC +1996-12-06 CD I-NP O + +Japan NNP B-NP B-LOC +began VBD B-VP O +the DT B-NP O +defence NN I-NP O +of IN B-PP O +their PRP$ B-NP O +Asian JJ I-NP B-MISC +Cup NNP I-NP I-MISC +title NN I-NP O +with IN B-PP O +a DT B-NP O +lucky JJ I-NP O +2-1 CD I-NP O +win VBP B-VP O +against IN B-PP O +Syria NNP B-NP B-LOC +in IN B-PP O +a DT B-NP O +Group NNP I-NP O +C NNP I-NP O +championship NN I-NP O +match NN I-NP O +on IN B-PP O +Friday NNP B-NP O +. . O O + +But CC O O +China NNP B-NP B-LOC +saw VBD B-VP O +their PRP$ B-NP O +luck NN I-NP O +desert VB B-VP O +them PRP B-NP O +in IN B-PP O +the DT B-NP O +second NN I-NP O +match NN I-NP O +of IN B-PP O +the DT B-NP O +group NN I-NP O +, , O O +crashing VBG B-VP O +to TO B-PP O +a DT B-NP O +surprise NN I-NP O +2-0 CD I-NP O +defeat NN I-NP O +to TO B-PP O +newcomers NNS B-NP O +Uzbekistan NNP I-NP B-LOC +. . 
O O + +China NNP B-NP B-LOC +controlled VBD B-VP O +most JJS B-NP O +of IN B-PP O +the DT B-NP O +match NN I-NP O +and CC O O +saw VBD B-VP O +several JJ B-NP O +chances NNS I-NP O +missed VBD B-VP O +until IN B-SBAR O +the DT B-NP O +78th JJ I-NP O +minute NN I-NP O +when WRB B-ADVP O +Uzbek NNP B-NP B-MISC +striker NN I-NP O +Igor JJ B-NP B-PER +Shkvyrin NNP I-NP I-PER +took VBD B-VP O +advantage NN B-NP O +of IN B-PP O +a DT B-NP O +misdirected JJ I-NP O +defensive JJ I-NP O +header NN I-NP O +to TO B-VP O +lob VB I-VP O +the DT B-NP O +ball NN I-NP O +over IN B-PP O +the DT B-NP O +advancing VBG I-NP O +Chinese JJ I-NP B-MISC +keeper NN I-NP O +and CC O O +into IN B-PP O +an DT B-NP O +empty JJ I-NP O +net NN I-NP O +. . O O + +Oleg NNP B-NP B-PER +Shatskiku NNP I-NP I-PER +made VBD B-VP O +sure JJ B-ADJP O +of IN B-PP O +the DT B-NP O +win VBP B-VP O +in IN B-PP O +injury NN B-NP O +time NN I-NP O +, , O O +hitting VBG B-VP O +an DT B-NP O +unstoppable JJ I-NP O +left VBD B-VP O +foot NN B-NP O +shot NN I-NP O +from IN B-PP O +just RB B-NP O +outside IN B-PP O +the DT B-NP O +area NN I-NP O +. . O O + +The DT B-NP O +former JJ I-NP O +Soviet JJ I-NP B-MISC +republic NN I-NP O +was VBD B-VP O +playing VBG I-VP O +in IN B-PP O +an DT B-NP O +Asian NNP I-NP B-MISC +Cup NNP I-NP I-MISC +finals NNS I-NP O +tie NN I-NP O +for IN B-PP O +the DT B-NP O +first JJ I-NP O +time NN I-NP O +. . O O + +Despite IN B-PP O +winning VBG B-VP O +the DT B-NP O +Asian JJ I-NP B-MISC +Games NNPS I-NP I-MISC +title NN I-NP O +two CD B-NP O +years NNS I-NP O +ago RB B-ADVP O +, , O O +Uzbekistan NNP B-NP B-LOC +are VBP B-VP O +in IN B-PP O +the DT B-NP O +finals NNS I-NP O +as IN B-SBAR O +outsiders NNS B-NP O +. . 
O O + +Two CD B-NP O +goals NNS I-NP O +from IN B-PP O +defensive JJ B-NP O +errors NNS I-NP O +in IN B-PP O +the DT B-NP O +last JJ I-NP O +six CD I-NP O +minutes NNS I-NP O +allowed VBD B-VP O +Japan NNP B-NP B-LOC +to TO B-VP O +come VB I-VP O +from IN B-PP O +behind NN B-NP O +and CC O O +collect VB B-VP O +all DT B-NP O +three CD I-NP O +points NNS I-NP O +from IN B-PP O +their PRP$ B-NP O +opening NN I-NP O +meeting NN I-NP O +against IN B-PP O +Syria NNP B-NP B-LOC +. . O O + +Takuya NNP B-NP B-PER +Takagi NNP I-NP I-PER +scored VBD B-VP O +the DT B-NP O +winner NN I-NP O +in IN B-PP O +the DT B-NP O +88th JJ I-NP O +minute NN I-NP O +, , O O +rising VBG B-VP O +to TO I-VP O +head VB I-VP O +a DT B-NP O +Hiroshige NNP I-NP B-PER +Yanagimoto NNP I-NP I-PER +cross VB B-VP O +towards IN B-PP O +the DT B-NP O +Syrian JJ I-NP B-MISC +goal NN I-NP O +which WDT B-NP O +goalkeeper VBD B-VP O +Salem NNP B-NP B-PER +Bitar NNP I-NP I-PER +appeared VBD B-VP O +to TO I-VP O +have VB I-VP O +covered VBN I-VP O +but CC O O +then RB B-VP O +allowed VBN I-VP O +to TO I-VP O +slip VB I-VP O +into IN B-PP O +the DT B-NP O +net NN I-NP O +. . O O + +It PRP B-NP O +was VBD B-VP O +the DT B-NP O +second JJ I-NP O +costly JJ I-NP O +blunder NN I-NP O +by IN B-PP O +Syria NNP B-NP B-LOC +in IN B-PP O +four CD B-NP O +minutes NNS I-NP O +. . O O + +Defender NNP B-NP O +Hassan NNP I-NP B-PER +Abbas NNP I-NP I-PER +rose VBD B-VP O +to TO I-VP O +intercept VB I-VP O +a DT B-NP O +long JJ I-NP O +ball NN I-NP O +into IN B-PP O +the DT B-NP O +area NN I-NP O +in IN B-PP O +the DT B-NP O +84th JJ I-NP O +minute NN I-NP O +but CC O O +only RB B-ADVP O +managed VBD B-VP O +to TO I-VP O +divert VB I-VP O +it PRP B-NP O +into IN B-PP O +the DT B-NP O +top JJ I-NP O +corner NN I-NP O +of IN B-PP O +Bitar NN B-NP B-PER +'s POS B-NP O +goal NN I-NP O +. . 
O O + +Nader NNP B-NP B-PER +Jokhadar NNP I-NP I-PER +had VBD B-VP O +given VBN I-VP O +Syria NNP B-NP B-LOC +the DT B-NP O +lead NN I-NP O +with IN B-PP O +a DT B-NP O +well-struck NN I-NP O +header NN I-NP O +in IN B-PP O +the DT B-NP O +seventh JJ I-NP O +minute NN I-NP O +. . O O + +Japan NNP B-NP B-LOC +then RB B-ADVP O +laid VBD B-VP O +siege NN B-NP O +to TO B-PP O +the DT B-NP O +Syrian JJ I-NP B-MISC +penalty NN I-NP O +area NN I-NP O +for IN B-PP O +most JJS B-NP O +of IN B-PP O +the DT B-NP O +game NN I-NP O +but CC O O +rarely RB B-VP O +breached VBD I-VP O +the DT B-NP O +Syrian JJ I-NP B-MISC +defence NN I-NP O +. . O O + +Bitar NN B-NP B-PER +pulled VBD B-VP O +off RP B-PRT O +fine JJ B-NP O +saves VBZ B-VP O +whenever WRB B-ADVP O +they PRP B-NP O +did VBD B-VP O +. . O O + +Japan NNP B-NP B-LOC +coach NN I-NP O +Shu NNP I-NP B-PER +Kamo NNP I-NP I-PER +said VBD B-VP O +: : O O +' '' O O +' POS B-NP O +The DT I-NP O +Syrian JJ I-NP B-MISC +own JJ I-NP O +goal NN I-NP O +proved VBD B-VP O +lucky JJ B-ADJP O +for IN B-PP O +us PRP B-NP O +. . O O + +The DT B-NP O +Syrians NNPS I-NP B-MISC +scored VBD B-VP O +early JJ B-NP O +and CC O O +then RB B-VP O +played VBN I-VP O +defensively RB B-ADVP O +and CC O O +adopted VBD B-VP O +long RB I-VP O +balls VBZ I-VP O +which WDT B-NP O +made VBD B-VP O +it PRP B-NP O +hard JJ B-ADJP O +for IN B-PP O +us PRP B-NP O +. . O O +' '' O O + +' '' O O + +Japan NNP B-NP B-LOC +, , O O +co-hosts VBZ B-VP O +of IN B-PP O +the DT B-NP O +World NNP I-NP B-MISC +Cup NNP I-NP I-MISC +in IN B-PP O +2002 CD B-NP O +and CC O O +ranked VBD B-VP O +20th JJ B-NP O +in IN B-PP O +the DT B-NP O +world NN I-NP O +by IN B-PP O +FIFA NNP B-NP B-ORG +, , O O +are VBP B-VP O +favourites JJ B-ADJP O +to TO B-VP O +regain VB I-VP O +their PRP$ B-NP O +title NN I-NP O +here RB B-ADVP O +. . 
class TestDatasetLoader(unittest.TestCase):
    """Tests for the loaders defined in ``fastNLP.io.dataset_loader``."""

    def test_case_1(self):
        """Test the loader of the Conll2003 dataset.

        Every loaded instance must contain at least one token, and its three
        label lists must have exactly one entry per token.
        """
        dataset_path = "test/data_for_tests/conll_2003_example.txt"
        loader = Conll2003Loader()
        dataset_2003 = loader.load(dataset_path)

        # Count the instances so the per-item assertions below cannot pass
        # vacuously on an empty data set.
        num_items = 0
        for item in dataset_2003:
            num_items += 1
            len0 = len(item["label0_list"])
            len1 = len(item["label1_list"])
            len2 = len(item["label2_list"])
            lentoken = len(item["token_list"])
            self.assertNotEqual(len0, 0)
            self.assertEqual(len0, len1)
            self.assertEqual(len1, len2)
            # Labels are only meaningful if they align with the tokens.
            self.assertEqual(lentoken, len0)

        self.assertGreater(num_items, 0)