diff --git a/test/data_for_tests/io/MSRA_NER/dev.conll b/test/data_for_tests/io/MSRA_NER/dev.conll new file mode 100755 index 00000000..792efce8 --- /dev/null +++ b/test/data_for_tests/io/MSRA_NER/dev.conll @@ -0,0 +1,38 @@ +把 O +欧 B-LOC + +美 B-LOC +、 O + +港 B-LOC +台 B-LOC + +流 O +行 O + +的 O +食 O + +品 O +类 O + +图 O +谱 O + +马 B-PER +列 B-PER + +主 O +义 O + +在 O +中 B-LOC + +国 I-LOC +传 O + +播 O +的 O + +历 O +史 O \ No newline at end of file diff --git a/test/data_for_tests/io/MSRA_NER/test.conll b/test/data_for_tests/io/MSRA_NER/test.conll new file mode 100755 index 00000000..d611fcdd --- /dev/null +++ b/test/data_for_tests/io/MSRA_NER/test.conll @@ -0,0 +1,31 @@ +中 B-ORG +共 I-ORG + +中 I-ORG +央 I-ORG + +致 O +中 B-ORG + +国 I-ORG +致 I-ORG + +公 I-ORG +党 I-ORG + +十 I-ORG +一 I-ORG + +大 I-ORG +的 O + +贺 O +词 O + + +各 O + +位 O +代 O + +表 O diff --git a/test/data_for_tests/io/MSRA_NER/train.conll b/test/data_for_tests/io/MSRA_NER/train.conll new file mode 100755 index 00000000..9edd3aef --- /dev/null +++ b/test/data_for_tests/io/MSRA_NER/train.conll @@ -0,0 +1,60 @@ +是 O +我 O + +们 O +收 O + +藏 O +北 B-LOC + +京 I-LOC +史 O + +料 O + +调 O +查 O + +范 O +围 O + +涉 O +及 O + +故 B-LOC +宫 I-LOC + +、 O +历 B-LOC + +博 I-LOC +、 O + +古 B-ORG +研 I-ORG + +所 I-ORG +、 O + +北 B-LOC +大 I-LOC + +清 I-LOC +华 I-LOC + +图 I-LOC +书 I-LOC + +馆 I-LOC +. O + +夏 B-PER +财 I-PER + +兴 I-PER +家 O + +分 O +到 O + +田 O diff --git a/test/data_for_tests/io/peopledaily/dev.txt b/test/data_for_tests/io/peopledaily/dev.txt new file mode 100755 index 00000000..4769eb79 --- /dev/null +++ b/test/data_for_tests/io/peopledaily/dev.txt @@ -0,0 +1,7 @@ +中 B-ORG +共 I-ORG +中 I-ORG +央 I-ORG + +致 O +中 B-ORG diff --git a/test/data_for_tests/io/peopledaily/test.txt b/test/data_for_tests/io/peopledaily/test.txt new file mode 100755 index 00000000..1a983ebd --- /dev/null +++ b/test/data_for_tests/io/peopledaily/test.txt @@ -0,0 +1,41 @@ +美 B-LOC +国 I-LOC + +的 O +华 B-PER + +莱 B-PER +士 B-PER + +中 B-ORG +共 I-ORG + +中 I-ORG +央 I-ORG + +举 O +办 O + +《 O +“ O + +一 O +国 O + +两 O +制 O + +” O +与 O + +香 B-LOC +港 I-LOC + +基 O +本 O + +法 O +》 O + +讲 O +座 O diff --git a/test/data_for_tests/io/peopledaily/train.txt b/test/data_for_tests/io/peopledaily/train.txt new file mode 100755 index 00000000..4fb5f61b --- /dev/null +++ b/test/data_for_tests/io/peopledaily/train.txt @@ -0,0 +1,46 @@ +我 O +们 O + +收 O +藏 O + +北 B-LOC +京 I-LOC + +史 O +料 O + +历 B-LOC +博 I-LOC + +、 O +古 B-ORG +研 I-ORG +所 I-ORG + +、 O +北 B-LOC + +大 I-LOC +清 I-LOC + +华 I-LOC +图 I-LOC + +书 I-LOC +馆 I-LOC + +我 O +们 O + +是 O +受 O + +到 O +郑 B-PER + +振 I-PER +铎 I-PER + +先 O +生 O diff --git a/test/data_for_tests/io/weibo_NER/dev.conll b/test/data_for_tests/io/weibo_NER/dev.conll new file mode 100755 index 00000000..11db48f8 --- /dev/null +++ b/test/data_for_tests/io/weibo_NER/dev.conll @@ -0,0 +1,21 @@ +老 B-PER.NOM +百 I-PER.NOM +姓 I-PER.NOM + +心 O + +新 B-GPE.NAM +乡 I-GPE.NAM + +年 O + +大 B-ORG.NOM +学 I-ORG.NOM + +同 O + +宿 B-LOC.NOM +舍 I-LOC.NOM + +三 O +年 O diff --git a/test/data_for_tests/io/weibo_NER/test.conll b/test/data_for_tests/io/weibo_NER/test.conll new file mode 100755 index 00000000..b92e7efa --- /dev/null +++ b/test/data_for_tests/io/weibo_NER/test.conll @@ -0,0 +1,17 @@ +感 O +动 O + +了 O + +李 B-PER.NAM +开 I-PER.NAM +复 I-PER.NAM + +小 B-ORG.NOM +学 I-ORG.NOM + +美 O +术 O + +新 O +课 O \ No newline at end of file diff --git a/test/data_for_tests/io/weibo_NER/train.conll b/test/data_for_tests/io/weibo_NER/train.conll new file mode 100755 index 00000000..6d6182c0 --- /dev/null +++ b/test/data_for_tests/io/weibo_NER/train.conll @@ -0,0 +1,69 @@ +坏 O +男 B-PER.NOM +人 I-PER.NOM + +男 B-PER.NOM +人 I-PER.NOM +帮 I-PER.NOM + + +不 O + +南 B-GPE.NAM +都 I-GPE.NAM + +南 B-GPE.NAM +方 I-GPE.NAM +都 I-GPE.NAM +市 I-GPE.NAM + +的 O + +那 B-LOC.NOM +座 I-LOC.NOM + +来 O + +学 B-ORG.NOM +校 I-ORG.NOM + +的 O + +卫 B-ORG.NAM +生 I-ORG.NAM +部 I-ORG.NAM + +台 B-GPE.NAM +灣 I-GPE.NAM + +火 B-LOC.NAM +焰 I-LOC.NAM +山 I-LOC.NAM + +的 O + +成 O +李 B-PER.NAM +力 I-PER.NAM +帆 I-PER.NAM + +我 O + +南 B-GPE.NAM +都 I-GPE.NAM + +深 B-GPE.NAM +圳 I-GPE.NAM + +一 O +个 O + +国 B-GPE.NOM +家 I-GPE.NOM + +以 O + +民 B-PER.NOM + +为 O +本 O diff --git a/test/io/loader/test_classification_loader.py b/test/io/loader/test_classification_loader.py index fdfc9008..d866edec 100644 --- a/test/io/loader/test_classification_loader.py +++ b/test/io/loader/test_classification_loader.py @@ -31,6 +31,7 @@ class TestLoad(unittest.TestCase): 'sst-2': ('test/data_for_tests/io/SST-2', SST2Loader, (5, 5, 5), True), 'sst': ('test/data_for_tests/io/SST', SSTLoader, (6, 6, 6), False), 'imdb': ('test/data_for_tests/io/imdb', IMDBLoader, (6, 6, 6), False), + 'ChnSentiCorp': ('test/data_for_tests/io/ChnSentiCorp', ChnSentiCorpLoader, (6, 6, 6), False), } for k, v in data_set_dict.items(): path, loader, data_set, warns = v diff --git a/test/io/pipe/test_classification.py b/test/io/pipe/test_classification.py index 88bf6921..be6127eb 100644 --- a/test/io/pipe/test_classification.py +++ b/test/io/pipe/test_classification.py @@ -40,15 +40,19 @@ class TestRunClassificationPipe(unittest.TestCase): 'sst-2': ('test/data_for_tests/io/SST-2', SST2Pipe, (5, 5, 5), (139, 2), True), 'sst': ('test/data_for_tests/io/SST', SSTPipe, (6, 354, 6), (232, 5), False), 'imdb': ('test/data_for_tests/io/imdb', IMDBPipe, (6, 6, 6), (1670, 2), False), + 'ChnSentiCorp': ('test/data_for_tests/io/ChnSentiCorp', ChnSentiCorpPipe, (6, 6, 6), (529, 1296, 1483, 2), False), } for k, v in data_set_dict.items(): path, pipe, data_set, vocab, warns = v with self.subTest(pipe=pipe): - if warns: - with self.assertWarns(Warning): + if 'Chn' not in k: + if warns: + with self.assertWarns(Warning): + data_bundle = pipe(tokenizer='raw').process_from_file(path) + else: data_bundle = pipe(tokenizer='raw').process_from_file(path) else: - data_bundle = pipe(tokenizer='raw').process_from_file(path) + data_bundle = pipe(bigrams=True, trigrams=True).process_from_file(path) self.assertTrue(isinstance(data_bundle, DataBundle)) self.assertEqual(len(data_set), data_bundle.num_dataset) diff --git a/test/io/pipe/test_conll.py b/test/io/pipe/test_conll.py index 4ecd7969..d60094c2 100644 --- a/test/io/pipe/test_conll.py +++ b/test/io/pipe/test_conll.py @@ -22,3 +22,19 @@ class TestRunPipe(unittest.TestCase): print(pipe) data_bundle = pipe().process_from_file('test/data_for_tests/conll_2003_example.txt') print(data_bundle) + + +class TestNERPipe(unittest.TestCase): + def test_process_from_file(self): + data_dict = { + 'weibo_NER': WeiboNERPipe, + 'peopledaily': PeopleDailyPipe, + 'MSRA_NER': MsraNERPipe, + } + for k, v in data_dict.items(): + pipe = v + with self.subTest(pipe=pipe): + data_bundle = pipe(bigrams=True, trigrams=True).process_from_file(f'test/data_for_tests/io/{k}') + print(data_bundle) + data_bundle = pipe(encoding_type='bioes').process_from_file(f'test/data_for_tests/io/{k}') + print(data_bundle)