@@ -38,6 +38,7 @@ __all__ = [ | |||||
'JsonLoader', | 'JsonLoader', | ||||
'CWSLoader', | 'CWSLoader', | ||||
"CWSPipe", | |||||
'MNLILoader', | 'MNLILoader', | ||||
"QuoraLoader", | "QuoraLoader", | ||||
@@ -10,6 +10,8 @@ Pipe用于处理通过 Loader 读取的数据,所有的 Pipe 都包含 ``proce | |||||
__all__ = [ | __all__ = [ | ||||
"Pipe", | "Pipe", | ||||
"CWSPipe", | |||||
"YelpFullPipe", | "YelpFullPipe", | ||||
"YelpPolarityPipe", | "YelpPolarityPipe", | ||||
"SSTPipe", | "SSTPipe", | ||||
@@ -43,3 +45,4 @@ from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe | |||||
MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe | MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe | ||||
from .pipe import Pipe | from .pipe import Pipe | ||||
from .conll import Conll2003Pipe | from .conll import Conll2003Pipe | ||||
from .cws import CWSPipe |
@@ -0,0 +1,30 @@ | |||||
以下是使用各中文 NER Pipe 自动下载的数据集的统计信息。 | |||||
| MsraNERPipe | # of sents | # of tokens | | |||||
| ----------- | ---------- | ----------- | | |||||
| train | 41747 | 1954374 | | |||||
| dev | 4617 | 215505 | | |||||
| test | 4365 | 172601 | | |||||
| total | 50729 | 2342480 | | |||||
这里报道的统计数据与 [https://arxiv.org/pdf/1805.02023.pdf](https://arxiv.org/pdf/1805.02023.pdf) 报道的一致。 | |||||
| WeiboNERPipe | # of sents | # of tokens | | |||||
| ------------ | ---------- | ----------- | | |||||
| train | 1350 | 73778 | | |||||
| dev | 270 | 14509 | | |||||
| test | 270 | 14842 | | |||||
| total | 1890 | 103129 | | |||||
这里报道的统计数据与 [https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf](https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf) 一致。 | |||||
| PeopleDailyPipe | # of sents | # of tokens | | |||||
| --------------- | ---------- | ----------- | | |||||
| train | 50658 | 2169879 | | |||||
| dev | 4631 | 172601 | | |||||
| test | 68 | 2270 | | |||||
| total | 55357 | 2344750 | | |||||
这里使用的数据与 [https://arxiv.org/pdf/1906.08101.pdf](https://arxiv.org/pdf/1906.08101.pdf) 的数据是一致的。 |
@@ -0,0 +1,32 @@ | |||||
四个数据集的统计信息,最原始的数据可以从 [http://sighan.cs.uchicago.edu/bakeoff2005/](http://sighan.cs.uchicago.edu/bakeoff2005/) 下载。 | |||||
| pku | # of sents | # of tokens | | |||||
| ----- | ---------- | ----------- | | |||||
| train | 17173 | 1650222 | | |||||
| dev | 1881 | 176226 | | |||||
| test | 1944 | 172733 | | |||||
| total | 20998 | 1999181 | | |||||
| cityu | # of sents | # of tokens | | |||||
| ----- | ---------- | ----------- | | |||||
| train | 47696 | 2164907 | | |||||
| dev | 5323 | 238447 | | |||||
| test | 1492 | 67690 | | |||||
| total | 54511 | 2471044 | | |||||
| msra | # of sents | # of tokens | | |||||
| ----- | ---------- | ----------- | | |||||
| train | 78242 | 3644550 | | |||||
| dev | 8676 | 405919 | | |||||
| test | 3985 | 184355 | | |||||
| total | 90903 | 4234824 | | |||||
| as | # of sents | # of tokens | | |||||
| ----- | ---------- | ----------- | | |||||
| train | 638273 | 7536586 | | |||||
| dev | 70680 | 831464 | | |||||
| test | 14429 | 197681 | | |||||
| total | 723382 | 8565731 | |
@@ -1,17 +0,0 @@ | |||||
import unittest | |||||
from ..data.CWSDataLoader import SigHanLoader | |||||
from fastNLP.core.vocabulary import VocabularyOption | |||||
class TestCWSDataLoader(unittest.TestCase):
    """Smoke tests that run ``SigHanLoader.process`` on ``pku_demo.txt``.

    NOTE(review): neither test asserts anything about the result; they only
    print ``data.datasets``, so they fail only if loading/processing raises.
    """

    def test_case1(self):
        """Process the demo file with ``target_type='bmes'`` and no extra options."""
        cws_loader = SigHanLoader(target_type='bmes')
        data = cws_loader.process('pku_demo.txt')
        print(data.datasets)

    def test_case2(self):
        """Process the demo file while also passing ``bigram_vocab_opt``."""
        # Renamed from the misspelled ``test_calse2`` to match ``test_case1``.
        cws_loader = SigHanLoader(target_type='bmes')
        data = cws_loader.process('pku_demo.txt', bigram_vocab_opt=VocabularyOption())
        print(data.datasets)