| @@ -417,6 +417,55 @@ class PeopleDailyCorpusLoader(DataSetLoader): | |||
| data_set.set_input("seq_len") | |||
| return data_set | |||
class Conll2003Loader(DataSetLoader):
    """Loader for the CoNLL-2003 dataset.

    Each non-empty line of the input file is expected to hold one token
    followed by three space-separated label columns (in the standard
    CoNLL-2003 layout: POS tag, chunk tag, NER tag). Sentences are separated
    by blank lines, and ``-DOCSTART-`` marker lines are skipped.

    More information about the dataset can be found at
    https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data
    """

    def __init__(self):
        super(Conll2003Loader, self).__init__()

    def load(self, dataset_path):
        """Read a CoNLL-2003 formatted file and convert it into a DataSet.

        :param dataset_path: path of the UTF-8 encoded text file to read.
        :return: the DataSet produced by :meth:`convert`, one instance per
            sentence.
        """
        parsed_data = []
        sentence = []
        tokens = []
        with open(dataset_path, "r", encoding="utf-8") as f:
            # Stream the file instead of materialising every line up front.
            for line in f:
                stripped = line.strip()
                # A blank (or whitespace-only) line or a document marker ends
                # the sentence being accumulated.
                if not stripped or '-DOCSTART- -X- -X- O' in line:
                    if sentence:
                        parsed_data.append((sentence, tokens))
                        sentence = []
                        tokens = []
                    continue
                temp = stripped.split(" ")
                sentence.append(temp[0])
                tokens.append(temp[1:4])
        # Flush the last sentence: a file that does not end with a blank line
        # would otherwise silently lose its final sentence.
        if sentence:
            parsed_data.append((sentence, tokens))
        return self.convert(parsed_data)

    def convert(self, parsed_data):
        """Build a DataSet from parsed sentences.

        :param parsed_data: iterable of ``(token_list, label_rows)`` pairs,
            where ``label_rows[i]`` holds the three labels of the i-th token.
        :return: a DataSet whose instances carry the fields ``token_list``,
            ``label0_list``, ``label1_list`` and ``label2_list``.
        """
        dataset = DataSet()
        for sample in parsed_data:
            token_list, label_rows = sample[0], sample[1]
            # Transpose the per-token label rows into one list per column.
            label0_list = [labels[0] for labels in label_rows]
            label1_list = [labels[1] for labels in label_rows]
            label2_list = [labels[2] for labels in label_rows]
            dataset.append(Instance(token_list=token_list,
                                    label0_list=label0_list,
                                    label1_list=label1_list,
                                    label2_list=label2_list))
        return dataset
| class SNLIDataSetLoader(DataSetLoader): | |||
| """A data set loader for SNLI data set. | |||
| @@ -0,0 +1,442 @@ | |||
| -DOCSTART- -X- -X- O | |||
| SOCCER NN B-NP O | |||
| - : O O | |||
| JAPAN NNP B-NP B-LOC | |||
| GET VB B-VP O | |||
| LUCKY NNP B-NP O | |||
| WIN NNP I-NP O | |||
| , , O O | |||
| CHINA NNP B-NP B-PER | |||
| IN IN B-PP O | |||
| SURPRISE DT B-NP O | |||
| DEFEAT NN I-NP O | |||
| . . O O | |||
| Nadim NNP B-NP B-PER | |||
| Ladki NNP I-NP I-PER | |||
| AL-AIN NNP B-NP B-LOC | |||
| , , O O | |||
| United NNP B-NP B-LOC | |||
| Arab NNP I-NP I-LOC | |||
| Emirates NNPS I-NP I-LOC | |||
| 1996-12-06 CD I-NP O | |||
| Japan NNP B-NP B-LOC | |||
| began VBD B-VP O | |||
| the DT B-NP O | |||
| defence NN I-NP O | |||
| of IN B-PP O | |||
| their PRP$ B-NP O | |||
| Asian JJ I-NP B-MISC | |||
| Cup NNP I-NP I-MISC | |||
| title NN I-NP O | |||
| with IN B-PP O | |||
| a DT B-NP O | |||
| lucky JJ I-NP O | |||
| 2-1 CD I-NP O | |||
| win VBP B-VP O | |||
| against IN B-PP O | |||
| Syria NNP B-NP B-LOC | |||
| in IN B-PP O | |||
| a DT B-NP O | |||
| Group NNP I-NP O | |||
| C NNP I-NP O | |||
| championship NN I-NP O | |||
| match NN I-NP O | |||
| on IN B-PP O | |||
| Friday NNP B-NP O | |||
| . . O O | |||
| But CC O O | |||
| China NNP B-NP B-LOC | |||
| saw VBD B-VP O | |||
| their PRP$ B-NP O | |||
| luck NN I-NP O | |||
| desert VB B-VP O | |||
| them PRP B-NP O | |||
| in IN B-PP O | |||
| the DT B-NP O | |||
| second NN I-NP O | |||
| match NN I-NP O | |||
| of IN B-PP O | |||
| the DT B-NP O | |||
| group NN I-NP O | |||
| , , O O | |||
| crashing VBG B-VP O | |||
| to TO B-PP O | |||
| a DT B-NP O | |||
| surprise NN I-NP O | |||
| 2-0 CD I-NP O | |||
| defeat NN I-NP O | |||
| to TO B-PP O | |||
| newcomers NNS B-NP O | |||
| Uzbekistan NNP I-NP B-LOC | |||
| . . O O | |||
| China NNP B-NP B-LOC | |||
| controlled VBD B-VP O | |||
| most JJS B-NP O | |||
| of IN B-PP O | |||
| the DT B-NP O | |||
| match NN I-NP O | |||
| and CC O O | |||
| saw VBD B-VP O | |||
| several JJ B-NP O | |||
| chances NNS I-NP O | |||
| missed VBD B-VP O | |||
| until IN B-SBAR O | |||
| the DT B-NP O | |||
| 78th JJ I-NP O | |||
| minute NN I-NP O | |||
| when WRB B-ADVP O | |||
| Uzbek NNP B-NP B-MISC | |||
| striker NN I-NP O | |||
| Igor JJ B-NP B-PER | |||
| Shkvyrin NNP I-NP I-PER | |||
| took VBD B-VP O | |||
| advantage NN B-NP O | |||
| of IN B-PP O | |||
| a DT B-NP O | |||
| misdirected JJ I-NP O | |||
| defensive JJ I-NP O | |||
| header NN I-NP O | |||
| to TO B-VP O | |||
| lob VB I-VP O | |||
| the DT B-NP O | |||
| ball NN I-NP O | |||
| over IN B-PP O | |||
| the DT B-NP O | |||
| advancing VBG I-NP O | |||
| Chinese JJ I-NP B-MISC | |||
| keeper NN I-NP O | |||
| and CC O O | |||
| into IN B-PP O | |||
| an DT B-NP O | |||
| empty JJ I-NP O | |||
| net NN I-NP O | |||
| . . O O | |||
| Oleg NNP B-NP B-PER | |||
| Shatskiku NNP I-NP I-PER | |||
| made VBD B-VP O | |||
| sure JJ B-ADJP O | |||
| of IN B-PP O | |||
| the DT B-NP O | |||
| win VBP B-VP O | |||
| in IN B-PP O | |||
| injury NN B-NP O | |||
| time NN I-NP O | |||
| , , O O | |||
| hitting VBG B-VP O | |||
| an DT B-NP O | |||
| unstoppable JJ I-NP O | |||
| left VBD B-VP O | |||
| foot NN B-NP O | |||
| shot NN I-NP O | |||
| from IN B-PP O | |||
| just RB B-NP O | |||
| outside IN B-PP O | |||
| the DT B-NP O | |||
| area NN I-NP O | |||
| . . O O | |||
| The DT B-NP O | |||
| former JJ I-NP O | |||
| Soviet JJ I-NP B-MISC | |||
| republic NN I-NP O | |||
| was VBD B-VP O | |||
| playing VBG I-VP O | |||
| in IN B-PP O | |||
| an DT B-NP O | |||
| Asian NNP I-NP B-MISC | |||
| Cup NNP I-NP I-MISC | |||
| finals NNS I-NP O | |||
| tie NN I-NP O | |||
| for IN B-PP O | |||
| the DT B-NP O | |||
| first JJ I-NP O | |||
| time NN I-NP O | |||
| . . O O | |||
| Despite IN B-PP O | |||
| winning VBG B-VP O | |||
| the DT B-NP O | |||
| Asian JJ I-NP B-MISC | |||
| Games NNPS I-NP I-MISC | |||
| title NN I-NP O | |||
| two CD B-NP O | |||
| years NNS I-NP O | |||
| ago RB B-ADVP O | |||
| , , O O | |||
| Uzbekistan NNP B-NP B-LOC | |||
| are VBP B-VP O | |||
| in IN B-PP O | |||
| the DT B-NP O | |||
| finals NNS I-NP O | |||
| as IN B-SBAR O | |||
| outsiders NNS B-NP O | |||
| . . O O | |||
| Two CD B-NP O | |||
| goals NNS I-NP O | |||
| from IN B-PP O | |||
| defensive JJ B-NP O | |||
| errors NNS I-NP O | |||
| in IN B-PP O | |||
| the DT B-NP O | |||
| last JJ I-NP O | |||
| six CD I-NP O | |||
| minutes NNS I-NP O | |||
| allowed VBD B-VP O | |||
| Japan NNP B-NP B-LOC | |||
| to TO B-VP O | |||
| come VB I-VP O | |||
| from IN B-PP O | |||
| behind NN B-NP O | |||
| and CC O O | |||
| collect VB B-VP O | |||
| all DT B-NP O | |||
| three CD I-NP O | |||
| points NNS I-NP O | |||
| from IN B-PP O | |||
| their PRP$ B-NP O | |||
| opening NN I-NP O | |||
| meeting NN I-NP O | |||
| against IN B-PP O | |||
| Syria NNP B-NP B-LOC | |||
| . . O O | |||
| Takuya NNP B-NP B-PER | |||
| Takagi NNP I-NP I-PER | |||
| scored VBD B-VP O | |||
| the DT B-NP O | |||
| winner NN I-NP O | |||
| in IN B-PP O | |||
| the DT B-NP O | |||
| 88th JJ I-NP O | |||
| minute NN I-NP O | |||
| , , O O | |||
| rising VBG B-VP O | |||
| to TO I-VP O | |||
| head VB I-VP O | |||
| a DT B-NP O | |||
| Hiroshige NNP I-NP B-PER | |||
| Yanagimoto NNP I-NP I-PER | |||
| cross VB B-VP O | |||
| towards IN B-PP O | |||
| the DT B-NP O | |||
| Syrian JJ I-NP B-MISC | |||
| goal NN I-NP O | |||
| which WDT B-NP O | |||
| goalkeeper VBD B-VP O | |||
| Salem NNP B-NP B-PER | |||
| Bitar NNP I-NP I-PER | |||
| appeared VBD B-VP O | |||
| to TO I-VP O | |||
| have VB I-VP O | |||
| covered VBN I-VP O | |||
| but CC O O | |||
| then RB B-VP O | |||
| allowed VBN I-VP O | |||
| to TO I-VP O | |||
| slip VB I-VP O | |||
| into IN B-PP O | |||
| the DT B-NP O | |||
| net NN I-NP O | |||
| . . O O | |||
| It PRP B-NP O | |||
| was VBD B-VP O | |||
| the DT B-NP O | |||
| second JJ I-NP O | |||
| costly JJ I-NP O | |||
| blunder NN I-NP O | |||
| by IN B-PP O | |||
| Syria NNP B-NP B-LOC | |||
| in IN B-PP O | |||
| four CD B-NP O | |||
| minutes NNS I-NP O | |||
| . . O O | |||
| Defender NNP B-NP O | |||
| Hassan NNP I-NP B-PER | |||
| Abbas NNP I-NP I-PER | |||
| rose VBD B-VP O | |||
| to TO I-VP O | |||
| intercept VB I-VP O | |||
| a DT B-NP O | |||
| long JJ I-NP O | |||
| ball NN I-NP O | |||
| into IN B-PP O | |||
| the DT B-NP O | |||
| area NN I-NP O | |||
| in IN B-PP O | |||
| the DT B-NP O | |||
| 84th JJ I-NP O | |||
| minute NN I-NP O | |||
| but CC O O | |||
| only RB B-ADVP O | |||
| managed VBD B-VP O | |||
| to TO I-VP O | |||
| divert VB I-VP O | |||
| it PRP B-NP O | |||
| into IN B-PP O | |||
| the DT B-NP O | |||
| top JJ I-NP O | |||
| corner NN I-NP O | |||
| of IN B-PP O | |||
| Bitar NN B-NP B-PER | |||
| 's POS B-NP O | |||
| goal NN I-NP O | |||
| . . O O | |||
| Nader NNP B-NP B-PER | |||
| Jokhadar NNP I-NP I-PER | |||
| had VBD B-VP O | |||
| given VBN I-VP O | |||
| Syria NNP B-NP B-LOC | |||
| the DT B-NP O | |||
| lead NN I-NP O | |||
| with IN B-PP O | |||
| a DT B-NP O | |||
| well-struck NN I-NP O | |||
| header NN I-NP O | |||
| in IN B-PP O | |||
| the DT B-NP O | |||
| seventh JJ I-NP O | |||
| minute NN I-NP O | |||
| . . O O | |||
| Japan NNP B-NP B-LOC | |||
| then RB B-ADVP O | |||
| laid VBD B-VP O | |||
| siege NN B-NP O | |||
| to TO B-PP O | |||
| the DT B-NP O | |||
| Syrian JJ I-NP B-MISC | |||
| penalty NN I-NP O | |||
| area NN I-NP O | |||
| for IN B-PP O | |||
| most JJS B-NP O | |||
| of IN B-PP O | |||
| the DT B-NP O | |||
| game NN I-NP O | |||
| but CC O O | |||
| rarely RB B-VP O | |||
| breached VBD I-VP O | |||
| the DT B-NP O | |||
| Syrian JJ I-NP B-MISC | |||
| defence NN I-NP O | |||
| . . O O | |||
| Bitar NN B-NP B-PER | |||
| pulled VBD B-VP O | |||
| off RP B-PRT O | |||
| fine JJ B-NP O | |||
| saves VBZ B-VP O | |||
| whenever WRB B-ADVP O | |||
| they PRP B-NP O | |||
| did VBD B-VP O | |||
| . . O O | |||
| Japan NNP B-NP B-LOC | |||
| coach NN I-NP O | |||
| Shu NNP I-NP B-PER | |||
| Kamo NNP I-NP I-PER | |||
| said VBD B-VP O | |||
| : : O O | |||
| ' '' O O | |||
| ' POS B-NP O | |||
| The DT I-NP O | |||
| Syrian JJ I-NP B-MISC | |||
| own JJ I-NP O | |||
| goal NN I-NP O | |||
| proved VBD B-VP O | |||
| lucky JJ B-ADJP O | |||
| for IN B-PP O | |||
| us PRP B-NP O | |||
| . . O O | |||
| The DT B-NP O | |||
| Syrians NNPS I-NP B-MISC | |||
| scored VBD B-VP O | |||
| early JJ B-NP O | |||
| and CC O O | |||
| then RB B-VP O | |||
| played VBN I-VP O | |||
| defensively RB B-ADVP O | |||
| and CC O O | |||
| adopted VBD B-VP O | |||
| long RB I-VP O | |||
| balls VBZ I-VP O | |||
| which WDT B-NP O | |||
| made VBD B-VP O | |||
| it PRP B-NP O | |||
| hard JJ B-ADJP O | |||
| for IN B-PP O | |||
| us PRP B-NP O | |||
| . . O O | |||
| ' '' O O | |||
| ' '' O O | |||
| Japan NNP B-NP B-LOC | |||
| , , O O | |||
| co-hosts VBZ B-VP O | |||
| of IN B-PP O | |||
| the DT B-NP O | |||
| World NNP I-NP B-MISC | |||
| Cup NNP I-NP I-MISC | |||
| in IN B-PP O | |||
| 2002 CD B-NP O | |||
| and CC O O | |||
| ranked VBD B-VP O | |||
| 20th JJ B-NP O | |||
| in IN B-PP O | |||
| the DT B-NP O | |||
| world NN I-NP O | |||
| by IN B-PP O | |||
| FIFA NNP B-NP B-ORG | |||
| , , O O | |||
| are VBP B-VP O | |||
| favourites JJ B-ADJP O | |||
| to TO B-VP O | |||
| regain VB I-VP O | |||
| their PRP$ B-NP O | |||
| title NN I-NP O | |||
| here RB B-ADVP O | |||
| . . O O | |||
| Hosts NNPS B-NP O | |||
| UAE NNP I-NP B-LOC | |||
| play NN I-NP O | |||
| Kuwait NNP I-NP B-LOC | |||
| and CC O O | |||
| South NNP B-NP B-LOC | |||
| Korea NNP I-NP I-LOC | |||
| take VBP B-VP O | |||
| on IN B-PP O | |||
| Indonesia NNP B-NP B-LOC | |||
| on IN B-PP O | |||
| Saturday NNP B-NP O | |||
| in IN B-PP O | |||
| Group NNP B-NP O | |||
| A NNP I-NP O | |||
| matches VBZ B-VP O | |||
| . . O O | |||
| All DT B-NP O | |||
| four CD I-NP O | |||
| teams NNS I-NP O | |||
| are VBP B-VP O | |||
| level NN B-NP O | |||
| with IN B-PP O | |||
| one CD B-NP O | |||
| point NN I-NP O | |||
| each DT B-NP O | |||
| from IN B-PP O | |||
| one CD B-NP O | |||
| game NN I-NP O | |||
| . . O O | |||
| @@ -0,0 +1,23 @@ | |||
| import os | |||
| import unittest | |||
| from fastNLP.io.dataset_loader import Conll2003Loader | |||
class TestDatasetLoader(unittest.TestCase):
    """Tests for the dataset loaders."""

    def test_case_1(self):
        """Test the loader of the Conll2003 dataset.

        Every loaded instance must be non-empty, and its three label lists
        must be aligned one-to-one with its token list.
        """
        # NOTE: the path is relative, so the test must be run from the
        # repository root.
        dataset_path = "test/data_for_tests/conll_2003_example.txt"
        loader = Conll2003Loader()
        dataset_2003 = loader.load(dataset_path)

        num_items = 0
        for item in dataset_2003:
            num_items += 1
            len0 = len(item["label0_list"])
            len1 = len(item["label1_list"])
            len2 = len(item["label2_list"])
            lentoken = len(item["token_list"])
            self.assertNotEqual(len0, 0)
            self.assertEqual(len0, len1)
            self.assertEqual(len1, len2)
            # Labels must line up with the tokens they annotate.
            self.assertEqual(len2, lentoken)
        # Guard against the loop passing vacuously on an empty dataset.
        self.assertGreater(num_items, 0)